In [1]:
%load_ext autoreload
%autoreload 2

## DB Setup

We assume you already have a postgres database ready.

In [14]:
DATABASE = "postgres"
USER = "postgres"
PASSWORD = "mysecretpassword"
HOST = "localhost"
PORT = 5432
TABLES = []  # list of tables to load or [] to load all tables

In [15]:
from db_connectors import PostgresConnector
from prompt_formatters import RajkumarFormatter, SqlCoderFormatter

# Get the connector and formatter
postgres_connector = PostgresConnector(
    user=USER, password=PASSWORD, dbname=DATABASE, host=HOST, port=PORT
)
postgres_connector.connect()
if len(TABLES) <= 0:
    TABLES.extend(postgres_connector.get_tables())

print(f"Loading tables: {TABLES}")

db_schema = [postgres_connector.get_schema(table) for table in TABLES]
formatter = SqlCoderFormatter(db_schema)

Loading tables: ['companies', 'stores', 'categories', 'products', 'inventory', 'restocks', 'transactions', 'productdemandforecast']


ValidationError: 2 validation errors for Table
columns.0
  Input should be a valid dictionary or instance of TableColumn [type=model_type, input_value=TableColumn(name='company...ne, foreign_column=None), input_type=TableColumn]
    For further information visit https://errors.pydantic.dev/2.5/v/model_type
columns.1
  Input should be a valid dictionary or instance of TableColumn [type=model_type, input_value=TableColumn(name='company...ne, foreign_column=None), input_type=TableColumn]
    For further information visit https://errors.pydantic.dev/2.5/v/model_type

## Using LLama-2-7b-chat to enhance the SQL definition

In [4]:
from llm_utils import *

formatter.table_str = add_comment_to_sql(formatter.table_str)
print(formatter.table_str)

  from .autonotebook import tqdm as notebook_tqdm


Loading model...
Using device mps.


Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.74s/it]


Tokenizing....
Generating...

CREATE TABLE companies (
company_id SERIAL PRIMARY KEY,
company_name VARCHAR(255) -- company_id is the company unique ID
)

CREATE TABLE stores (
company_id integer, -- foreign key referencing companies.company_id
store_id VARCHAR(255) PRIMARY KEY, -- store_id is the store unique ID
store_name VARCHAR(255), -- name of the store
address text, -- address of the store
phone_number VARCHAR(20) -- phone number of the store
)

CREATE TABLE categories (
category_id VARCHAR(255) PRIMARY KEY, -- category_id is the category unique ID
category_name_en VARCHAR(255) -- name of the category in English
)

CREATE TABLE products (
product_id VARCHAR(255) PRIMARY KEY, -- product_id is the product unique ID
article_id VARCHAR(255), -- article number of the product
product_name_en VARCHAR(255), -- name of the product in English
uom VARCHAR(50), -- unit of measurement for the product
buom VARCHAR(50), -- bought quantity of the product
category_id VARCHAR(255), -- foreign key r

In [6]:
# Writing to the file
with open("sample_table_str.txt", 'w') as file:
    file.write(formatter.table_str)

## Model Setup

In a separate screen or window, first install [Manifest](https://github.com/HazyResearch/manifest)
```bash
pip install manifest-ml\[all\]
```

Then run
```bash
python3 -m manifest.api.app \
    --model_type huggingface \
    --model_generation_type text-generation \
    --model_name_or_path NumbersStation/nsql-350M \
    --device 0
```

If successful, you will see an output like
```bash
* Running on http://127.0.0.1:5000
```

## Using Specialised LLM to answer question with SQL

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained("defog/sqlcoder-7b")
model = AutoModelForCausalLM.from_pretrained("defog/sqlcoder-7b", torch_dtype=torch.float16)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# Move the model to the MPS device
model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


Loading model...


Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.18s/it]


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm()
  

In [25]:
# Reading from the file
with open("sample_table_str.txt", 'r') as file:
    formatter.table_str = file.read()
print("Tokenizing....")
prompt = formatter.format_prompt("How many products exists in the store named `Store 1` ?", database='Postgres')
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Move the input to the MPS device
input_ids = input_ids.to(device)

print("Generating...")
generated_ids = model.generate(input_ids, max_length=2000)
response = tokenizer.decode(generated_ids[0], skip_special_tokens=True)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Tokenizing....
Generating...


In [26]:
from llm_utils import extract_sql_code
print(extract_sql_code(response))
query = extract_sql_code(response)

# Execute the query and format the output
try:
    result = postgres_connector.run_sql_as_df(query)
    print("Query executed successfully. The result is:")
    print(result.to_json())
except Exception as e:
    print(f"Error executing the query: {e}")


 SELECT COUNT(DISTINCT products.product_id) AS number_of_products FROM products JOIN inventory ON products.product_id = inventory.product_id JOIN stores ON inventory.store_id = stores.store_id WHERE stores.store_name = 'Store 1';

Query executed successfully. The result is:
{"number_of_products":{"0":3}}


## Manifest-ml deploying and using server side

In [8]:
from manifest import Manifest

manifest_client = Manifest(client_name="huggingface", client_connection="http://127.0.0.1:5001")

def get_sql(instruction: str, max_tokens: int = 300) -> str:
    prompt = formatter.format_prompt(instruction)
    res = manifest_client.run(prompt, max_tokens=max_tokens)
    return formatter.format_model_output(res)

In [9]:
print(manifest_client.client_pool.get_current_client().get_model_params())
print(manifest_client.client_pool.get_current_client().get_model_inputs())

{'model_name': 'NumbersStation/nsql-llama-2-7B', 'model_path': 'NumbersStation/nsql-llama-2-7B', 'client_name': 'huggingface'}
['temperature', 'max_tokens', 'n', 'top_p', 'top_k', 'repetition_penalty', 'do_sample']


In [10]:
sql = get_sql("How many products are sold in Store 001 ?")
print(sql)

prompt='CREATE TABLE companies (\n    company_id integer  primary key,\n    company_name character varying \n)\n\nCREATE TABLE stores (\n    company_id integer ,\n    store_id character varying  primary key,\n    store_name character varying ,\n    address text ,\n    phone_number character varying ,\n    foreign key (company_id) references companies(company_id)\n)\n\nCREATE TABLE categories (\n    category_id character varying  primary key,\n    category_name_en character varying \n)\n\nCREATE TABLE products (\n    product_id character varying  primary key,\n    article_id character varying ,\n    product_name_en character varying ,\n    uom character varying ,\n    buom character varying ,\n    category_id character varying ,\n    brand character varying ,\n    gtin character varying ,\n    varient_en character varying ,\n    foreign key (category_id) references categories(category_id)\n)\n\nCREATE TABLE inventory (\n    inventory_id integer  primary key,\n    current_stock integer ,

ReadTimeout: HTTPConnectionPool(host='127.0.0.1', port=5001): Read timed out. (read timeout=60)

In [12]:
print(postgres_connector.run_sql_as_df(sql))

ProgrammingError: (psycopg2.errors.UndefinedColumn) column t1.store_id does not exist
LINE 1: ...duct_id) FROM products AS t1 JOIN stores AS t2 ON t1.store_i...
                                                             ^
HINT:  Perhaps you meant to reference the column "t2.store_id".

[SQL: SELECT COUNT(DISTINCT t1.product_id) FROM products AS t1 JOIN stores AS t2 ON t1.store_id = t2.store_id WHERE t2.company_name = "A";]
(Background on this error at: https://sqlalche.me/e/14/f405)