In [20]:
# Download the sql-sidekick wheel from https://github.com/h2oai/sql-sidekick/releases, then uncomment the line below to install it.
#!python3 -m pip install --force-reinstall sql_sidekick-0.2.2-py3-none-any.whl

In [1]:
import logging
import sys

# Send log records to stdout so they show up in the notebook output.
# basicConfig attaches a stdout StreamHandler to the root logger when none is
# configured yet. The previous extra addHandler(StreamHandler(stdout)) call put
# a second handler next to it, so every record was printed twice (once with the
# "LEVEL:name:" prefix, once bare), and re-running the cell stacked yet another
# handler. Only the basicConfig call is kept; it is a no-op on re-run.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [3]:
from sidekick.prompter import db_setup, ask
from sidekick.schema_generator import generate_schema
from sidekick.utils import setup_dir, list_models

In [4]:
# Root of the working directory; also passed to setup_dir and db_setup.
base_path = "./"
# Drop the trailing slash before joining so the cache directory is
# "./var/lib/tmp" rather than ".//var/lib/tmp".
cache_path = f"{base_path.rstrip('/')}/var/lib/tmp"
# Hand the base directory to sidekick's setup_dir helper before any other sidekick call.
# NOTE(review): presumably creates the working folders under base_path — confirm against sidekick.utils.
setup_dir(base_path)

In [5]:
import pandas as pd

# Read the dataset into a DataFrame. The sidekick calls further down are given
# the CSV path itself, not this frame.
f = pd.read_csv(filepath_or_buffer="./sleep_health_and_lifestyle_dataset.csv")

In [6]:
# Env variables
import os

# Endpoints and API keys for the model services. Fill in the blank values
# (or export them in the shell before starting the kernel).
# To get access to h2ogpte endpoint, reach out to cloud-feedback@h2o.ai
_sidekick_env = {
    'OPENAI_API_KEY': "",
    'H2O_BASE_MODEL_URL': 'http://38.128.233.247',
    'H2O_BASE_MODEL_API_KEY': "",
    'RECOMMENDATION_MODEL_REMOTE_URL': "https://h2ogpte.genai.h2o.ai",  # e.g. https://<>.h2ogpte.h2o.ai
    'RECOMMENDATION_MODEL_API_KEY': "",
}

for _env_name, _env_value in _sidekick_env.items():
    # A blank placeholder must not wipe out a key that is already exported in
    # the environment; non-empty values are always applied, and a missing
    # variable is still created (as an empty string) exactly as before.
    if _env_value or _env_name not in os.environ:
        os.environ[_env_name] = _env_value

In [7]:
# Dataset and target table.
# Given .csv file, auto-generate schema
# Download dataset --> https://www.kaggle.com/datasets/uom190346a/sleep-health-and-lifestyle-dataset
data_path = "./sleep_health_and_lifestyle_dataset.csv"
table_name = "sleep_health_eda"

# Connection settings forwarded to db_setup.
DB_NAME = "querydb"
HOST_NAME = "localhost"
PORT = "5432"
USER_NAME = "sqlite"
PASSWORD = "abc"

# Derive the schema description from the CSV; the result is written under the
# cache directory and its path is returned for the later calls.
r, table_info_path = generate_schema(
    output_path=f"{cache_path}/{table_name}_table_info.jsonl",
    data_path=data_path,
)

# Register the table with the schema above and feed it the CSV as sample rows.
llm_response, err = db_setup(
    local_base_path=base_path,
    table_name=table_name,
    table_samples_path=data_path,
    table_info_path=table_info_path,
    port=PORT,
    password=PASSWORD,
    user_name=USER_NAME,
    hostname=HOST_NAME,
    db_name=DB_NAME,
)

 Information supplied:
 querydb, localhost, sqlite, abc, 5432
Database already exists!
Table name: sleep_health_eda


[32m2024-01-27 20:35:06.568[0m | [34m[1mDEBUG   [0m | [36msidekick.db_config[0m:[36m_extract_schema_info[0m:[36m162[0m - [34m[1mUsing schema information from: .//var/lib/tmp/sleep_health_eda_table_info.jsonl[0m
[32m2024-01-27 20:35:06.572[0m | [34m[1mDEBUG   [0m | [36msidekick.db_config[0m:[36mcreate_table[0m:[36m186[0m - [34m[1mSchema info used for creating table:
 Person_ID NUMERIC,
Gender TEXT COLLATE NOCASE,
Age NUMERIC,
Occupation TEXT COLLATE NOCASE,
Sleep_Duration NUMERIC,
Quality_of_Sleep NUMERIC,
Physical_Activity_Level NUMERIC,
Stress_Level NUMERIC,
BMI_Category TEXT COLLATE NOCASE,
Blood_Pressure TEXT COLLATE NOCASE,
Heart_Rate NUMERIC,
Daily_Steps NUMERIC,
Sleep_Disorder TEXT COLLATE NOCASE[0m
[32m2024-01-27 20:35:06.578[0m | [1mINFO    [0m | [36msidekick.db_config[0m:[36mcreate_table[0m:[36m198[0m - [1mTable created: sleep_health_eda[0m


Checked table sleep_health_eda exists in the DB.


[32m2024-01-27 20:35:06.586[0m | [34m[1mDEBUG   [0m | [36msidekick.db_config[0m:[36madd_samples[0m:[36m222[0m - [34m[1mAdding sample values to table: ./sleep_health_and_lifestyle_dataset.csv[0m
[32m2024-01-27 20:35:06.597[0m | [34m[1mDEBUG   [0m | [36msidekick.db_config[0m:[36madd_samples[0m:[36m228[0m - [34m[1mInserting chunk: 0[0m
[32m2024-01-27 20:35:06.755[0m | [1mINFO    [0m | [36msidekick.db_config[0m:[36madd_samples[0m:[36m233[0m - [1mData inserted into table: sleep_health_eda[0m
[32m2024-01-27 20:35:06.759[0m | [1mINFO    [0m | [36msidekick.db_config[0m:[36madd_samples[0m:[36m238[0m - [1mNumber of rows inserted: 2618[0m


Created a Database querydb. Inserted sample values from ./sleep_health_and_lifestyle_dataset.csv into table sleep_health_eda, please ask questions!


In [9]:
# List supported models; the returned names are the values accepted as `model_name` by `ask`.
list_models()

['h2ogpt-sql-sqlcoder2-4bit',
 'h2ogpt-sql-sqlcoder-34b-alpha-4bit',
 'h2ogpt-sql-nsql-llama-2-7B-4bit',
 'h2ogpt-sql-sqlcoder2',
 'h2ogpt-sql-sqlcoder-34b-alpha',
 'h2ogpt-sql-nsql-llama-2-7B',
 'gpt-3.5-turbo',
 'gpt-4-8k',
 'gpt-4-1106-preview-128k']

In [10]:
# Use `list_models()` to check other supported models

def query(
    question: str,
    table_name: str,
    table_info_path: str,
    sample_qna_path: str,
    regenerate: bool = False,
    regenerate_with_options: bool = False,
    model_name: str = "h2ogpt-sql-sqlcoder2-4bit",
    local_base_path: str = "./",
):
    """Ask a natural-language question and return the generated SQL.

    Thin wrapper around ``sidekick.prompter.ask`` that only generates the
    query (``execute_query=False``) and never treats the input as a command
    (``is_command=False``).

    Args:
        question: Question in plain English.
        table_name: Table the question refers to.
        table_info_path: Path of the schema file produced by ``generate_schema``.
        sample_qna_path: Path of sample question/answer pairs, forwarded as
            ``sample_queries_path``; the notebook passes ``None``.
        regenerate: Forwarded as ``is_regenerate``.
        regenerate_with_options: Forwarded as ``is_regen_with_options``.
        model_name: Model used for generation; see ``list_models()`` for the
            supported names. Other default model option:
            h2ogpt-sql-sqlcoder-34b-alpha
        local_base_path: Base directory forwarded to ``ask``.

    Returns:
        Whatever ``ask`` returns, unchanged. The notebook reads the question
        text from ``res[0][0]``, the SQL from ``res[0][1]`` and, when options
        are requested, the candidates from ``res[1]``.
    """
    # self_correction is enabled by default, set to False if not needed.
    return ask(
        question=question,
        table_info_path=table_info_path,
        sample_queries_path=sample_qna_path,
        table_name=table_name,
        is_command=False,
        model_name=model_name,
        is_regenerate=regenerate,
        is_regen_with_options=regenerate_with_options,
        execute_query=False,
        local_base_path=local_base_path,
    )

In [11]:
# First question. The table name comes from the `table_name` variable that was
# registered with db_setup, instead of repeating the literal here.
res = query(
    "What is the average sleep duration for each gender?",
    table_name=table_name,
    table_info_path=table_info_path,
    sample_qna_path=None,
)

[32m2024-01-27 20:35:33.226[0m | [1mINFO    [0m | [36msidekick.prompter[0m:[36mask[0m:[36m500[0m - [1mTable in use: ['sleep_health_eda'][0m
[32m2024-01-27 20:35:33.229[0m | [1mINFO    [0m | [36msidekick.prompter[0m:[36mask[0m:[36m501[0m - [1mSQL dialect for generation: sqlite[0m
[32m2024-01-27 20:35:33.231[0m | [1mINFO    [0m | [36msidekick.prompter[0m:[36mask[0m:[36m534[0m - [1mSetting context...[0m
[32m2024-01-27 20:35:33.232[0m | [1mINFO    [0m | [36msidekick.prompter[0m:[36mask[0m:[36m535[0m - [1mQuestion: What is the average sleep duration for each gender?[0m
[32m2024-01-27 20:35:33.234[0m | [34m[1mDEBUG   [0m | [36msidekick.prompter[0m:[36mask[0m:[36m553[0m - [34m[1mTable info path: .//var/lib/tmp/sleep_health_eda_table_info.jsonl[0m
[32m2024-01-27 20:35:33.235[0m | [1mINFO    [0m | [36msidekick.utils[0m:[36mis_resource_low[0m:[36m355[0m - [1mNumber of GPUs: 1[0m
[32m2024-01-27 20:35:33.236[0m | [34m[1m

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/442k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/532 [00:00<?, ?B/s]

[32m2024-01-27 20:36:20.189[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36mload_embedding_model[0m:[36m100[0m - [34m[1mLoading embedding model from: .//models/sentence_transformers[0m


Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: .//models/sentence_transformers/models--BAAI--bge-base-en/snapshots/17d124a4b773c4c9248ca816b0b0901e3c49a243/
Load pretrained SentenceTransformer: .//models/sentence_transformers/models--BAAI--bge-base-en/snapshots/17d124a4b773c4c9248ca816b0b0901e3c49a243/


[32m2024-01-27 20:36:20.805[0m | [1mINFO    [0m | [36msidekick.utils[0m:[36m_check_file_info[0m:[36m469[0m - [1mUsing information info from path .//var/lib/tmp/sleep_health_eda_table_info.jsonl[0m
[32m2024-01-27 20:36:20.807[0m | [1mINFO    [0m | [36msidekick.prompter[0m:[36mask[0m:[36m603[0m - [1mComputing user request ...[0m
[32m2024-01-27 20:36:20.811[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m155[0m - [34m[1mInput questions: # query: what is the average sleep duration for each gender?[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2024-01-27 20:36:22.408[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m160[0m - [34m[1mProbable context: ["if patterns like 'current time' or 'now' occurs in question", "if patterns like 'total number', or 'list' occurs in question", 'detailed summary', 'summary'][0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2024-01-27 20:36:22.444[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m168[0m - [34m[1mSimilarity score for: if patterns like 'current time' or 'now' occurs in question: 0.8459207869447033[0m
[32m2024-01-27 20:36:22.446[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m168[0m - [34m[1mSimilarity score for: if patterns like 'total number', or 'list' occurs in question: 0.8319947353454415[0m
[32m2024-01-27 20:36:22.448[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m168[0m - [34m[1mSimilarity score for: detailed summary: 0.8346069603076574[0m
[32m2024-01-27 20:36:22.449[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m168[0m - [34m[1mSimilarity score for: summary: 0.8394152180082535[0m
[32m2024-01-27 20:36:22.450[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m184[0m - [34m[1mSorted co

Exiting...


In [12]:
# res[0] holds the echoed question followed by the generated SQL.
print(f"Question = {res[0][0]}", "----", f"Generated SQL = {res[0][1]}", sep="\n")

Question = **Generated response for question,**
What is the average sleep duration for each gender?

----
Generated SQL = ``` sql
SELECT "gender",
       AVG("sleep_duration") AS "average_sleep_duration"
FROM "sleep_health_eda"
GROUP BY "gender"
LIMIT 100
```




In [16]:
# With the regeneration flag set, the temperature is toggled between 0 and 1,
# alternating between low values (focused/conservative generation) and high
# values (random/creative generation).
# The table name comes from the `table_name` variable that was registered with
# db_setup, instead of repeating the literal here.
res = query(
    "What are the most common occupations among individuals in the dataset?",
    table_name=table_name,
    table_info_path=table_info_path,
    sample_qna_path=None,
    regenerate=True,
)

[32m2024-01-27 20:39:50.016[0m | [1mINFO    [0m | [36msidekick.prompter[0m:[36mask[0m:[36m500[0m - [1mTable in use: ['sleep_health_eda'][0m
[32m2024-01-27 20:39:50.017[0m | [1mINFO    [0m | [36msidekick.prompter[0m:[36mask[0m:[36m501[0m - [1mSQL dialect for generation: sqlite[0m
[32m2024-01-27 20:39:50.018[0m | [1mINFO    [0m | [36msidekick.prompter[0m:[36mask[0m:[36m534[0m - [1mSetting context...[0m
[32m2024-01-27 20:39:50.019[0m | [1mINFO    [0m | [36msidekick.prompter[0m:[36mask[0m:[36m535[0m - [1mQuestion: What are the most common occupations among individuals in the dataset?[0m
[32m2024-01-27 20:39:50.020[0m | [34m[1mDEBUG   [0m | [36msidekick.prompter[0m:[36mask[0m:[36m553[0m - [34m[1mTable info path: .//var/lib/tmp/sleep_health_eda_table_info.jsonl[0m
[32m2024-01-27 20:39:50.021[0m | [1mINFO    [0m | [36msidekick.utils[0m:[36mis_resource_low[0m:[36m355[0m - [1mNumber of GPUs: 1[0m
[32m2024-01-27 20:39:50.

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2024-01-27 20:39:50.099[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m160[0m - [34m[1mProbable context: ["if patterns like 'current time' or 'now' occurs in question", "if patterns like 'total number', or 'list' occurs in question", 'detailed summary', 'summary'][0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2024-01-27 20:39:50.139[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m168[0m - [34m[1mSimilarity score for: if patterns like 'current time' or 'now' occurs in question: 0.8284876985286928[0m
[32m2024-01-27 20:39:50.141[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m168[0m - [34m[1mSimilarity score for: if patterns like 'total number', or 'list' occurs in question: 0.8591431101102107[0m
[32m2024-01-27 20:39:50.143[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m168[0m - [34m[1mSimilarity score for: detailed summary: 0.8650206131706182[0m
[32m2024-01-27 20:39:50.146[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m168[0m - [34m[1mSimilarity score for: summary: 0.8724867083448907[0m
[32m2024-01-27 20:39:50.147[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m184[0m - [34m[1mSorted co

Exiting...


In [17]:
# res[0] holds the echoed question followed by the generated SQL.
print(f"Question = {res[0][0]}", "----", f"Generated SQL = {res[0][1]}", sep="\n")

Question = **Generated response for question,**
What are the most common occupations among individuals in the dataset?

----
Generated SQL = ``` sql
SELECT "occupation",
       COUNT(1) AS "COUNT"
FROM "sleep_health_eda"
GROUP BY "occupation"
ORDER BY "COUNT" DESC
LIMIT 100
```




In [18]:
# Alternate options: ask for several candidate queries instead of a single one.
# The table name comes from the `table_name` variable that was registered with
# db_setup, instead of repeating the literal here.
res = query(
    "What is the average sleep duration for each gender?",
    table_name=table_name,
    table_info_path=table_info_path,
    sample_qna_path=None,
    regenerate_with_options=True,
)

[32m2024-01-27 20:39:56.595[0m | [1mINFO    [0m | [36msidekick.prompter[0m:[36mask[0m:[36m500[0m - [1mTable in use: ['sleep_health_eda'][0m
[32m2024-01-27 20:39:56.597[0m | [1mINFO    [0m | [36msidekick.prompter[0m:[36mask[0m:[36m501[0m - [1mSQL dialect for generation: sqlite[0m
[32m2024-01-27 20:39:56.598[0m | [1mINFO    [0m | [36msidekick.prompter[0m:[36mask[0m:[36m534[0m - [1mSetting context...[0m
[32m2024-01-27 20:39:56.599[0m | [1mINFO    [0m | [36msidekick.prompter[0m:[36mask[0m:[36m535[0m - [1mQuestion: What is the average sleep duration for each gender?[0m
[32m2024-01-27 20:39:56.601[0m | [34m[1mDEBUG   [0m | [36msidekick.prompter[0m:[36mask[0m:[36m553[0m - [34m[1mTable info path: .//var/lib/tmp/sleep_health_eda_table_info.jsonl[0m
[32m2024-01-27 20:39:56.602[0m | [1mINFO    [0m | [36msidekick.utils[0m:[36mis_resource_low[0m:[36m355[0m - [1mNumber of GPUs: 1[0m
[32m2024-01-27 20:39:56.604[0m | [34m[1m

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2024-01-27 20:39:56.699[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m160[0m - [34m[1mProbable context: ["if patterns like 'current time' or 'now' occurs in question", "if patterns like 'total number', or 'list' occurs in question", 'detailed summary', 'summary'][0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[32m2024-01-27 20:39:56.752[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m168[0m - [34m[1mSimilarity score for: if patterns like 'current time' or 'now' occurs in question: 0.8459207869447033[0m
[32m2024-01-27 20:39:56.755[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m168[0m - [34m[1mSimilarity score for: if patterns like 'total number', or 'list' occurs in question: 0.8319947353454415[0m
[32m2024-01-27 20:39:56.759[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m168[0m - [34m[1mSimilarity score for: detailed summary: 0.8346069603076574[0m
[32m2024-01-27 20:39:56.763[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m168[0m - [34m[1mSimilarity score for: summary: 0.8394152180082535[0m
[32m2024-01-27 20:39:56.764[0m | [34m[1mDEBUG   [0m | [36msidekick.utils[0m:[36msemantic_search[0m:[36m184[0m - [34m[1mSorted co

Exiting...


In [19]:
# Echo the question, then list every candidate found in res[1], one per print.
print(f"Question = {res[0][0]}", "----Options----", sep="\n")
for _r in res[1]:
    print(_r)

Question = **Generated response for question,**
What is the average sleep duration for each gender?

----Options----
Option 1: (_probability_: 0.381034255027771)
``` sql
SELECT gender,
       AVG(sleep_duration) AS average_sleep_duration
FROM sleep_health_eda
GROUP BY gender
ORDER BY average_sleep_duration DESC NULLS LAST
LIMIT 100;
```



Option 2: (_probability_: 0.2624567449092865)
``` sql
SELECT AVG(sleep_duration) AS average_sleep_duration,
       gender
FROM sleep_health_eda
GROUP BY gender
LIMIT 100;
```



Option 3: (_probability_: 0.22498156130313873)
``` sql
SELECT Gender,
       AVG(Sleep_Duration) AS average_duration
FROM sleep_health_eda
GROUP BY Gender
ORDER BY average_duration DESC NULLS LAST
LIMIT 100;
```



Option 4: (_probability_: 0.13085876405239105)
``` sql
SELECT 'Gender',
       AVG('Sleep_Duration') AS average_sleep_duration
FROM'sleep_health_eda'
GROUP BY 'Gender'
ORDER BY average_sleep_duration DESC NULLS LAST
LIMIT 100;
```



Option 5: (_probability_: 0.000