# Fine-tune premsql

In [20]:
import os
import torch
from datasets import load_dataset
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

import warnings
warnings.filterwarnings('ignore')

In [21]:
print(torch.cuda.is_available())

True


In [22]:
import wandb # do from scipt later
wandb.login()
%env WANDB_PROJECT=sql-fine-tuning

env: WANDB_PROJECT=sql-fine-tuning


In [23]:
import warnings
warnings.filterwarnings('ignore')

In [24]:
output_dir = './sql_output_dir_llama3'
logging_dir = './sql_logging_dir_llama3'

In [25]:
import sys,gc,traceback
import torch
def clean_ipython_hist():
    # Code in this function mainly copied from IPython source
    if not 'get_ipython' in globals(): return
    ip = get_ipython()
    user_ns = ip.user_ns
    ip.displayhook.flush()
    pc = ip.displayhook.prompt_count + 1
    for n in range(1, pc): user_ns.pop('_i'+repr(n),None)
    user_ns.update(dict(_i='',_ii='',_iii=''))
    hm = ip.history_manager
    hm.input_hist_parsed[:] = [''] * pc
    hm.input_hist_raw[:] = [''] * pc
    hm._i = hm._ii = hm._iii = hm._i00 =  ''



def clean_tb():
    # h/t Piotr Czapla
    if hasattr(sys, 'last_traceback'):
        traceback.clear_frames(sys.last_traceback)
        delattr(sys, 'last_traceback')
    if hasattr(sys, 'last_type'): delattr(sys, 'last_type')
    if hasattr(sys, 'last_value'): delattr(sys, 'last_value')

def clean_mem():
    clean_tb()
    clean_ipython_hist()
    gc.collect()
    torch.cuda.empty_cache()

In [26]:
clean_mem()

## Load and Prepare the Data

<|begin▁of▁sentence|> Question: What is the total sales amount? Context: The sales table contains information about each sale including date, amount, and customer ID. Answer: SELECT SUM(amount) FROM sales; <|end▁of▁sentence|>

In [27]:

from transformers import AutoTokenizer
from datasets import load_dataset

# Load a tokenizer to use its chat template
template_tokenizer = AutoTokenizer.from_pretrained(
    "premai-io/prem-1B-SQL"
)

# def format_prompt(sample):

#     """Given a sample dictionary with keys "title" and "abstract" format into a prompt.

#     Args:
#       sample: A sample dictionary from a Hugging Face dataset.

#     Returns:
#       sample: sample dictionary with "text" key for the formatted prompt.
#     """
#     #sample['text']=f"[INST] <> Write SQL code to answer the question based on the context. Please wrap your code answer using ```: <> {sample['question']} {sample['context']} [/INST] {sample['answer']}"
#     sample['text']=f"""<|begin▁of▁sentence|> You are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.
# Context: {sample['context']}
# Question: {sample['question']}
# SQL Query:{sample['answer']} <|end▁of▁sentence|>"""
#     return {"text":sample['text']}

def format_prompt(sample):
    sample['text'] = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context: {sample['context']}
Question: {sample['question']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>SQL Query:{sample['answer']} <|end▁of▁sentence|>"""
    return {"text":sample['text']}
  

# Load and format the data using the template TinyLLama is using
dataset = (
    load_dataset("dpv/exoplanets-sql")#, split="train")
      .shuffle(seed=42)
)
dataset = dataset.map(format_prompt)

In [28]:
dataset

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer', 'results', 'text'],
        num_rows: 50
    })
    valid: Dataset({
        features: ['context', 'question', 'answer', 'results', 'text'],
        num_rows: 10
    })
})

In [29]:
dataset['train']['text'][0]

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nContext: CREATE TABLE exoplanets (\n    name TEXT,\n    distance REAL,\n    stellar_magnitude REAL,\n    planet_type TEXT,\n    discovery_year INTEGER,\n    mass_multiplier REAL,\n    mass_wrt TEXT,\n    radius_multiplier REAL,\n    radius_wrt TEXT,\n    orbital_radius REAL,\n    orbital_period REAL,\n    eccentricity REAL,\n    detection_method TEXT\n); CREATE TABLE reference_planets (name TEXT, mass REAL);  'mass_wrt' in exoplanets table has a one-to-one match to 'name' in reference_planets table, and  'name' refers to either Earth or Jupyter, with Jupyter having 317.8 the mass of Earth.\nQuestion: What is the average mass of exoplanets discovered each year?<|eot_id|><|start_header_id|>assistant<|end_header_id|>SQL Query:SELECT \n    e.discovery_y

- SFTTrainer expect a single 'text' column

## Evaluate the original model

In [30]:
from transformers import AutoModelForCausalLM, AutoTokenizer
MODEL_ID='meta-llama/Llama-3.2-3B-Instruct'
model_orig = AutoModelForCausalLM.from_pretrained(MODEL_ID)#.to("cuda:1")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]


In [15]:
import sqlite3
import json
import pandas as pd
import re
pd.set_option('display.max_colwidth',1000)
conn = sqlite3.connect('./databases/exoplanets_db.db')
cursor = conn.cursor()
cursor.execute(f"SELECT * FROM exoplanets LIMIT 5;")
def extract_sql_statement(text):
    # Regular expression to match the SQL statement
    #pattern = re.compile(r'SQL Query:(.*?;)', re.DOTALL)
    pattern = re.compile(r'SQL Query:\s*```sql\s*(SELECT.*?|WITH.*?)\s*```', re.DOTALL | re.IGNORECASE)

#    pattern = re.compile(r'SELECT.*?;', re.DOTALL)
    match = pattern.search(text)
    if match:
        return match.group(1).strip()
    return None

def run_query(query, cursor):
    cursor.execute(query)
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    result_dict = [dict(zip(columns, row)) for row in rows]
    return json.dumps(result_dict)
def compare_results(query_results, expected_results):
    """Compare the query results with the expected results."""
    # Sort the results for comparison
    query_results_sorted = sorted(query_results, key=lambda x: json.dumps(x, sort_keys=True))
    expected_results_sorted = sorted(expected_results, key=lambda x: json.dumps(x, sort_keys=True))
    return query_results_sorted == expected_results_sorted

valid_loaded = load_dataset('json', data_files='./databases/valid.json', split='train')

def generate_sql(context, question, max_new_tokens = 256, debug=False):
    full_question = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context: {context}
Question: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>SQL Query:"""
    model_input = tokenizer(full_question, return_tensors="pt")#.to("cuda:1")
    model_orig.eval()
    with torch.no_grad():
        out = tokenizer.decode(model_orig.generate(**model_input,max_new_tokens=max_new_tokens)[0],skip_special_tokens=True)
    #print('out: ',out)
    #import sys; sys.exit()
    #res = out[out.find("""```sql```""")+3:out.find(""";\n```""")]
    #res = out[out.find('SQL Query:')+10:]
    res = extract_sql_statement(out)
    if debug: print(res)
    run_query(res, cursor)
    #return out
    return res
num_opp_errors = 0
num_other_exceptions = 0
op_errors = []
exceptions  = []
generations = []
bad_idxs = []
result_comparisons = [False]*len(valid_loaded)
for idx,example in enumerate(valid_loaded):
    generation = ''
    try:
        print('idx: ',idx)
        generation=generate_sql(context=example['context'], question=example['question'],debug=True)
        #print(generation); break
    except sqlite3.OperationalError as op_err:
        num_opp_errors += 1
        op_errors.append((idx,op_err))
        bad_idxs.append(idx)
        print(op_err)
    except Exception as e:
        num_other_exceptions +=1
        exceptions.append((idx,e))
        bad_idxs.append(idx)
        print(e)
    else:
        result_comparisons[idx]=compare_results(run_query(generation,cursor), example['results'])
    finally:
        generations.append(generation)
print(f"num_opp_errors: {num_opp_errors}, num_other_exceptions:{num_other_exceptions},\nError rate: {(num_other_exceptions+num_opp_errors)*100/len(valid_loaded)}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


idx:  0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    discovery_year, 
    COUNT(*) as num_discoveries
FROM 
    exoplanets
GROUP BY 
    discovery_year
ORDER BY 
    num_discoveries DESC;
idx:  1


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT detection_method
FROM exoplanets
GROUP BY detection_method
ORDER BY SUM(mass_wrt) DESC
LIMIT 1;
idx:  2


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    discovery_year, 
    SUM(mass_wrt * mass_multiplier) AS total_mass
FROM 
    exoplanets
GROUP BY 
    discovery_year
ORDER BY 
    total_mass DESC
LIMIT 5;
idx:  3


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    e.name, 
    e.orbital_period, 
    e.discovery_year, 
    ROW_NUMBER() OVER (PARTITION BY e.discovery_year ORDER BY e.orbital_period DESC) as rank
FROM 
    exoplanets e
WHERE 
    e.name IN (
        SELECT 
            name
        FROM 
            reference_planets
        WHERE 
            name = 'Earth'
    )
ORDER BY 
    rank;
idx:  4


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT * 
FROM exoplanets 
WHERE discovery_year < 2000 
AND eccentricity < 0.2;
idx:  5


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT DISTINCT planet_type 
FROM exoplanets 
WHERE discovery_year < 2020;
idx:  6


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    e.name, 
    e.mass_wrt * r.mass AS mass
FROM 
    exoplanets e
JOIN 
    reference_planets r ON e.mass_wrt = r.name
WHERE 
    e.discovery_year = (SELECT MIN(discovery_year) FROM exoplanets)
ORDER BY 
    e.discovery_year;
idx:  7


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT e.name, e.discovery_year, e.detection_method
FROM exoplanets e
WHERE e.discovery_year >= (CURRENT_YEAR - 2)
AND e.name NOT IN (
  SELECT r.name
  FROM reference_planets r
  WHERE r.mass_wrt = 'Jupiter'
);
no such column: CURRENT_YEAR
idx:  8


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    planet_type, 
    AVG(orbital_radius) AS average_orbital_radius
FROM 
    exoplanets
GROUP BY 
    planet_type
ORDER BY 
    average_orbital_radius DESC;
idx:  9
SELECT 
    e.name, 
    e.mass_wrt / e.radius_wrt AS mass_to_radius_ratio
FROM 
    exoplanets e
WHERE 
    e.planet_type = 'Terrestrial' AND 
    e.mass_wrt > 0 AND 
    e.radius_wrt > 0
ORDER BY 
    e.mass_wrt / e.radius_wrt DESC
LIMIT 5;
num_opp_errors: 1, num_other_exceptions:0,
Error rate: 10.0


In [16]:
op_errors, exceptions, result_comparisons

([(7, sqlite3.OperationalError('no such column: CURRENT_YEAR'))],
 [],
 [False, False, False, False, False, False, False, False, False, False])

- OK, 80-90% execution rate out of the box.  Looks like query 7 used improper SQLite syntax -> consider fine tuning on umesh16071973/SQLite_Training_Dataset first to align syntax.  Still not getting correct answers, it seems, must dig deeper.

In [17]:
generations

['SELECT \n    discovery_year, \n    COUNT(*) as num_discoveries\nFROM \n    exoplanets\nGROUP BY \n    discovery_year\nORDER BY \n    num_discoveries DESC;',
 'SELECT detection_method\nFROM exoplanets\nGROUP BY detection_method\nORDER BY SUM(mass_wrt) DESC\nLIMIT 1;',
 'SELECT \n    discovery_year, \n    SUM(mass_wrt * mass_multiplier) AS total_mass\nFROM \n    exoplanets\nGROUP BY \n    discovery_year\nORDER BY \n    total_mass DESC\nLIMIT 5;',
 "SELECT \n    e.name, \n    e.orbital_period, \n    e.discovery_year, \n    ROW_NUMBER() OVER (PARTITION BY e.discovery_year ORDER BY e.orbital_period DESC) as rank\nFROM \n    exoplanets e\nWHERE \n    e.name IN (\n        SELECT \n            name\n        FROM \n            reference_planets\n        WHERE \n            name = 'Earth'\n    )\nORDER BY \n    rank;",
 'SELECT * \nFROM exoplanets \nWHERE discovery_year < 2000 \nAND eccentricity < 0.2;',
 'SELECT DISTINCT planet_type \nFROM exoplanets \nWHERE discovery_year < 2020;',
 'SELECT 

In [25]:
valid_loaded['answer']

['SELECT discovery_year, COUNT(*) AS count\nFROM exoplanets\nGROUP BY discovery_year\nORDER BY count DESC\nLIMIT 1;',
 'SELECT detection_method,\n       SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\nFROM exoplanets\nGROUP BY detection_method\nORDER BY total_mass DESC\nLIMIT 1;',
 'SELECT discovery_year, \nSUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\nFROM exoplanets\nWHERE mass_multiplier IS NOT NULL\nGROUP BY discovery_year\nORDER BY total_mass DESC\nLIMIT 5;',
 'SELECT discovery_year, name, orbital_period FROM exoplanets WHERE (discovery_year, name, orbital_period) IN (   SELECT discovery_year, name, orbital_period    FROM exoplanets    ORDER BY discovery_year, orbital_period DESC) GROUP BY discovery_year LIMIT 3;',
 'SELECT name, discovery_year, eccentricity\nFROM exoplanets\nWHERE discovery_year < 2000 AND eccentricity < 0.2;',
 'SELECT DISTINCT \n    planet_type\nFROM \n    exopla

In [24]:
run_query(generations[-1],cursor), valid_loaded['results'][-1]


('[{"name": "EPIC 201497682 b", "mass_to_radius_ratio": null}, {"name": "EPIC 201757695.02", "mass_to_radius_ratio": null}, {"name": "EPIC 201833600 c", "mass_to_radius_ratio": null}, {"name": "EPIC 206215704 b", "mass_to_radius_ratio": null}, {"name": "EPIC 206317286 b", "mass_to_radius_ratio": null}]',
 '[{"name": "HD 100546 b", "mass_radius_ratio": 2.068544927536232e+29}, {"name": "K2-52 b", "mass_radius_ratio": 1.7018181818181818e+29}, {"name": "Kepler-718 b", "mass_radius_ratio": 1.683398781313473e+29}, {"name": "PH2 b", "mass_radius_ratio": 1.6815060908084162e+29}, {"name": "Kepler-488 b", "mass_radius_ratio": 1.670347764371895e+29}]')

In [26]:
valid_loaded['question'][-1]

'Find the exoplanets with the highest mass-to-radius ratio, showing their name and calculated ratio (return top 5).'

In [23]:
compare_results(run_query(generations[-1],cursor), valid_loaded['results'][-1])

False

In [27]:
run_query(generations[0],cursor), valid_loaded['results'][0]


('[{"discovery_year": 2016, "num_discoveries": 1517}, {"discovery_year": 2014, "num_discoveries": 875}, {"discovery_year": 2021, "num_discoveries": 525}, {"discovery_year": 2022, "num_discoveries": 338}, {"discovery_year": 2018, "num_discoveries": 326}, {"discovery_year": 2020, "num_discoveries": 234}, {"discovery_year": 2019, "num_discoveries": 203}, {"discovery_year": 2015, "num_discoveries": 157}, {"discovery_year": 2017, "num_discoveries": 153}, {"discovery_year": 2012, "num_discoveries": 138}, {"discovery_year": 2011, "num_discoveries": 138}, {"discovery_year": 2013, "num_discoveries": 126}, {"discovery_year": 2010, "num_discoveries": 97}, {"discovery_year": 2009, "num_discoveries": 94}, {"discovery_year": 2008, "num_discoveries": 65}, {"discovery_year": 2007, "num_discoveries": 52}, {"discovery_year": 2005, "num_discoveries": 36}, {"discovery_year": 2006, "num_discoveries": 31}, {"discovery_year": 2002, "num_discoveries": 29}, {"discovery_year": 2004, "num_discoveries": 27}, {"di

In [28]:
valid_loaded['question'][0]

'Determine the number of exoplanets discovered each year, and show the year with the highest number of discoveries.'

In [29]:
compare_results(run_query(generations[0],cursor), valid_loaded['results'][0])

False

In [27]:
bad_idxs

[7, 8]

In [29]:
for idx in bad_idxs:
    print(run_query(generations[idx],cursor))
    print(valid_loaded[idx]['results'])
    print('_'*100)
#generations[2]

In [31]:
#run_query("""SELECT name, mass, discovery_year FROM (SELECT name, mass, discovery_year, ROW_NUMBER() OVER (PARTITION BY discovery_year ORDER BY mass ASC) as rn FROM exoplanets) t WHERE rn = 1;""",cursor)
run_query("""SELECT 
    p.planet_type,
    AVG(exoplanets.orbital_radius) AS avg_orbital_radius
FROM 
    exoplanets
JOIN 
    reference_planets r ON exoplanets.mass_wrt = r.name
GROUP BY 
    exoplanets.planet_type
ORDER BY 
    avg_orbital_radius DESC;""",cursor)


## Configure Model, Data, Tokenizer, Trainer

In [31]:
clean_mem()

In [32]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_config,get_peft_model
MODEL_ID='meta-llama/Llama-3.2-3B-Instruct'
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map='auto',
    # quantization_config = bnb_config
)

model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# tokenizer.pad_token = "<pad>"
# tokenizer.padding_side = "left"
# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<pad>'})

# Set the padding side
tokenizer.padding_side = "left"
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=128,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=  ["k_proj", "gate_proj", "v_proj", "up_proj", "q_proj", "o_proj", "down_proj"],# Layers to target
   # dtype="float32"                   	# Keep LoRA parameters in float32 precision

)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
# Ensure LoRA weights are in full precision
for name, param in model.named_parameters():
    if "lora" in name:
        param.data = param.data.to(torch.float32)

# Verify the precision of LoRA weights
for name, param in model.named_parameters():
    if "lora" in name:
        print(f"{name}: {param.dtype}")


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.16s/it]


base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.float32
base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight: torch.float32
base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight: torch.float32
base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight: torch.float32
base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight

In [33]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<pad>'})

# Set the padding side
tokenizer.padding_side = "left"

# Verify the tokenizer configuration
print(f"Pad token: {tokenizer.pad_token}")
print(f"Pad token ID: {tokenizer.pad_token_id}")
print(f"Padding side: {tokenizer.padding_side}")


Pad token: <pad>
Pad token ID: 128256
Padding side: left


In [34]:
from transformers import DataCollatorForLanguageModeling
# Step 3: Launch training with SFTTrainer
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_train_epochs = 20.0,
    logging_steps=10,
    fp16=True,
    gradient_checkpointing=True
)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['valid'],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
   # peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=256,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=True,
)

trainer.train()


Generating train split: 56 examples [00:00, 177.21 examples/s]
Generating train split: 10 examples [00:00, 4252.13 examples/s]


  7%|▋         | 10/140 [00:15<03:14,  1.50s/it]

{'loss': 1.582, 'grad_norm': 0.33631056547164917, 'learning_rate': 0.00019749279121818235, 'epoch': 1.43}


 14%|█▍        | 20/140 [00:30<03:00,  1.51s/it]

{'loss': 0.4596, 'grad_norm': 0.19766917824745178, 'learning_rate': 0.0001900968867902419, 'epoch': 2.86}


 21%|██▏       | 30/140 [00:45<02:45,  1.50s/it]

{'loss': 0.2819, 'grad_norm': 0.1442306935787201, 'learning_rate': 0.000178183148246803, 'epoch': 4.29}


 29%|██▊       | 40/140 [01:00<02:30,  1.50s/it]

{'loss': 0.1905, 'grad_norm': 0.13105623424053192, 'learning_rate': 0.00016234898018587337, 'epoch': 5.71}


 36%|███▌      | 50/140 [01:15<02:15,  1.51s/it]

{'loss': 0.1291, 'grad_norm': 0.10289920121431351, 'learning_rate': 0.00014338837391175582, 'epoch': 7.14}


 43%|████▎     | 60/140 [01:30<02:00,  1.51s/it]

{'loss': 0.0789, 'grad_norm': 0.14818541705608368, 'learning_rate': 0.00012225209339563145, 'epoch': 8.57}


 50%|█████     | 70/140 [01:45<01:45,  1.51s/it]

{'loss': 0.0518, 'grad_norm': 0.10121903568506241, 'learning_rate': 0.0001, 'epoch': 10.0}


 57%|█████▋    | 80/140 [02:00<01:30,  1.51s/it]

{'loss': 0.0376, 'grad_norm': 0.07688477635383606, 'learning_rate': 7.774790660436858e-05, 'epoch': 11.43}


 64%|██████▍   | 90/140 [02:16<01:15,  1.51s/it]

{'loss': 0.0316, 'grad_norm': 0.13318626582622528, 'learning_rate': 5.6611626088244194e-05, 'epoch': 12.86}


 71%|███████▏  | 100/140 [02:31<01:00,  1.51s/it]

{'loss': 0.0287, 'grad_norm': 0.1262512058019638, 'learning_rate': 3.7651019814126654e-05, 'epoch': 14.29}


 79%|███████▊  | 110/140 [02:46<00:45,  1.51s/it]

{'loss': 0.025, 'grad_norm': 0.08413591980934143, 'learning_rate': 2.181685175319702e-05, 'epoch': 15.71}


 86%|████████▌ | 120/140 [03:01<00:30,  1.52s/it]

{'loss': 0.0231, 'grad_norm': 0.05600259453058243, 'learning_rate': 9.903113209758096e-06, 'epoch': 17.14}


 93%|█████████▎| 130/140 [03:16<00:15,  1.52s/it]

{'loss': 0.0239, 'grad_norm': 0.06649579107761383, 'learning_rate': 2.5072087818176382e-06, 'epoch': 18.57}


100%|██████████| 140/140 [03:31<00:00,  1.51s/it]

{'loss': 0.0224, 'grad_norm': 0.0636662095785141, 'learning_rate': 0.0, 'epoch': 20.0}


100%|██████████| 140/140 [03:34<00:00,  1.53s/it]

{'train_runtime': 214.5772, 'train_samples_per_second': 5.22, 'train_steps_per_second': 0.652, 'train_loss': 0.21186797097325324, 'epoch': 20.0}





TrainOutput(global_step=140, training_loss=0.21186797097325324, metrics={'train_runtime': 214.5772, 'train_samples_per_second': 5.22, 'train_steps_per_second': 0.652, 'total_flos': 5183768415436800.0, 'train_loss': 0.21186797097325324, 'epoch': 20.0})

In [33]:
# trainer.model.save_pretrained(output_dir)
# from peft import AutoPeftModelForCausalLM
# model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
# model = model.merge_and_unload()

# output_merged_dir = os.path.join(output_dir, "final_merged_checkpoint")
# model.save_pretrained(output_merged_dir, safe_serialization=True)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.18it/s]


In [35]:
import sqlite3
import json
import pandas as pd
import re
pd.set_option('display.max_colwidth',1000)
conn = sqlite3.connect('./databases/exoplanets_db.db')
cursor = conn.cursor()
cursor.execute(f"SELECT * FROM exoplanets LIMIT 5;")
def extract_sql_statement(text):
    # Regular expression to match the SQL statement
    #pattern = re.compile(r'SELECT.*?;', re.DOTALL)
    #pattern = re.compile(r'(WITH.*?SELECT.*?;|SELECT.*?;)', re.DOTALL | re.IGNORECASE)
    pattern = re.compile(r'SQL Query:(.*?;)', re.DOTALL)

    match = pattern.search(text)
    if match:
        print('EXTRACTED: ', match.group(1).strip())
        return match.group(1).strip()
    return None

def run_query(query, cursor):
    cursor.execute(query)
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    result_dict = [dict(zip(columns, row)) for row in rows]
    return json.dumps(result_dict)
def compare_results(query_results, expected_results):
    """Compare the query results with the expected results."""
    # Sort the results for comparison
    query_results_sorted = sorted(query_results, key=lambda x: json.dumps(x, sort_keys=True))
    expected_results_sorted = sorted(expected_results, key=lambda x: json.dumps(x, sort_keys=True))
    return query_results_sorted == expected_results_sorted

valid_loaded = load_dataset('json', data_files='./databases/valid.json', split='train')

def generate_sql(context, question, max_new_tokens = 256, debug=False):
    full_question = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context: {context}
Question: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>SQL Query:"""
    model_input = tokenizer(full_question, return_tensors="pt")#.to("cuda:1")
    model.eval()
    with torch.no_grad():
        out = tokenizer.decode(model.generate(**model_input,max_new_tokens=max_new_tokens)[0],skip_special_tokens=True)
    #print('out: ',out)
    #import sys; sys.exit()
    #res = out[out.find("""```sql```""")+3:out.find(""";\n```""")]
    #res = out[out.find('SQL Query:')+10:]
    #print('out:\n',out)
    res = extract_sql_statement(out)
    #if debug: print(res)
    run_query(res, cursor)
    #return out
    return res
num_opp_errors = 0
num_other_exceptions = 0
op_errors = []
exceptions  = []
generations = []
bad_idxs = []
result_comparisons = [False]*len(valid_loaded)
for idx,example in enumerate(valid_loaded):
    generation = '' # OK, had mistake here, query 9 is still wrong due to AS, though!
    result = None
    try:
        print('idx: ',idx)
        generation=generate_sql(context=example['context'], question=example['question'],debug=True)
        result =  run_query(generation,cursor)
        #print("generation: ")
        print(generation)
    except sqlite3.OperationalError as op_err:
        num_opp_errors += 1
        op_errors.append((idx,op_err))
        bad_idxs.append(idx)
        print(op_err)
    except Exception as e:
        num_other_exceptions +=1
        exceptions.append((idx,e))
        bad_idxs.append(idx)
        print(e)
    else:
        if result: result_comparisons[idx]=compare_results(result, example['results'])
    finally:
        generations.append(generation)
print(f"num_opp_errors: {num_opp_errors}, num_other_exceptions:{num_other_exceptions},\nError rate: {(num_other_exceptions+num_opp_errors)*100/len(valid_loaded)}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


idx:  0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  WITH YearlyDiscoveries AS (
    SELECT 
        discovery_year,
        COUNT(*) AS num_discoveries
    FROM exoplanets
    GROUP BY discovery_year
)

SELECT 
    discovery_year,
    num_discoveries
FROM YearlyDiscoveries
ORDER BY num_discoveries DESC
LIMIT 1;
WITH YearlyDiscoveries AS (
    SELECT 
        discovery_year,
        COUNT(*) AS num_discoveries
    FROM exoplanets
    GROUP BY discovery_year
)

SELECT 
    discovery_year,
    num_discoveries
FROM YearlyDiscoveries
ORDER BY num_discoveries DESC
LIMIT 1;
idx:  1


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  WITH MethodMass AS (
    SELECT 
        detection_method,
        SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass
    FROM exoplanets
    GROUP BY detection_method
)

SELECT detection_method
FROM MethodMass
WHERE total_mass IS NOT NULL
ORDER BY total_mass DESC
LIMIT 1;
WITH MethodMass AS (
    SELECT 
        detection_method,
        SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass
    FROM exoplanets
    GROUP BY detection_method
)

SELECT detection_method
FROM MethodMass
WHERE total_mass IS NOT NULL
ORDER BY total_mass DESC
LIMIT 1;
idx:  2


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  WITH YearlyMass AS (
    SELECT discovery_year, 
           SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass
    FROM exoplanets
    GROUP BY discovery_year
)

SELECT discovery_year, total_mass
FROM YearlyMass
ORDER BY total_mass DESC
LIMIT 5;
WITH YearlyMass AS (
    SELECT discovery_year, 
           SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass
    FROM exoplanets
    GROUP BY discovery_year
)

SELECT discovery_year, total_mass
FROM YearlyMass
ORDER BY total_mass DESC
LIMIT 5;
idx:  3


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  WITH RankedExoplanets AS (
    SELECT 
        discovery_year,
        name,
        orbital_period,
        RANK() OVER (PARTITION BY discovery_year ORDER BY orbital_period DESC) AS period_rank
    FROM exoplanets
)
SELECT * 
FROM RankedExoplanets
WHERE period_rank <= 3;
WITH RankedExoplanets AS (
    SELECT 
        discovery_year,
        name,
        orbital_period,
        RANK() OVER (PARTITION BY discovery_year ORDER BY orbital_period DESC) AS period_rank
    FROM exoplanets
)
SELECT * 
FROM RankedExoplanets
WHERE period_rank <= 3;
idx:  4


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT name, eccentricity
FROM exoplanets
WHERE discovery_year < 2000 AND eccentricity < 0.2;
SELECT name, eccentricity
FROM exoplanets
WHERE discovery_year < 2000 AND eccentricity < 0.2;
idx:  5


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT planet_type
FROM exoplanets
WHERE discovery_year < 2020;
SELECT planet_type
FROM exoplanets
WHERE discovery_year < 2020;
idx:  6


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  WITH RankedExoplanets AS (
    SELECT 
        name,
        mass_multiplier * (SELECT mass FROM reference_planets AS ref_planets WHERE ref_planets.name = mass_wrt) AS mass,
        discovery_year,
        RANK() OVER (PARTITION BY discovery_year ORDER BY mass_multiplier * (SELECT mass FROM reference_planets AS ref_planets WHERE ref_planets.name = mass_wrt) DESC) AS mass_rank
    FROM exoplanets
)

SELECT name, mass, discovery_year
FROM RankedExoplanets
WHERE mass_rank = 1;
WITH RankedExoplanets AS (
    SELECT 
        name,
        mass_multiplier * (SELECT mass FROM reference_planets AS ref_planets WHERE ref_planets.name = mass_wrt) AS mass,
        discovery_year,
        RANK() OVER (PARTITION BY discovery_year ORDER BY mass_multiplier * (SELECT mass FROM reference_planets AS ref_planets WHERE ref_planets.name = mass_wrt) DESC) AS mass_rank
    FROM exoplanets
)

SELECT name, mass, discovery_year
FROM RankedExoplanets
WHERE mass_rank = 1;
idx:  7


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT 
    e.name,
    e.discovery_year,
    e.detection_method
FROM exoplanets e
WHERE e.discovery_year > CURRENT_YEAR() - 2  -- Last two years
ORDER BY e.discovery_year;
no such function: CURRENT_YEAR
idx:  8


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT planet_type, AVG(orbital_radius) AS avg_orbital_radius
FROM exoplanets
GROUP BY planet_type
ORDER BY avg_orbital_radius DESC;
SELECT planet_type, AVG(orbital_radius) AS avg_orbital_radius
FROM exoplanets
GROUP BY planet_type
ORDER BY avg_orbital_radius DESC;
idx:  9
EXTRACTED:  WITH MassFactors AS (
    SELECT 
        mass_wrt,
        AVG(mass) AS avg_mass
    FROM reference_planets
    GROUP BY mass_wrt
)

SELECT e.name, e.mass_multiplier * r.avg_mass AS mass, e.radius_multiplier * r.avg_mass AS radius, 
       (e.mass_multiplier * r.avg_mass / (e.radius_multiplier * r.avg_mass)) AS mass_to_radius_ratio
FROM exoplanets e
JOIN MassFactors r ON e.mass_wrt = r.mass_wrt
ORDER BY mass_to_radius_ratio DESC
LIMIT 5;
no such column: mass_wrt
num_opp_errors: 2, num_other_exceptions:0,
Error rate: 20.0


- After 10 epochs of training with QLoRA set with default precision for it, got the following, execution rate down from 80% to 70%, still not getting correct queries.  Trying full precision for adaptors LoRA.

- Same deal with full precision adaptor training with QLoRA.

- Same full precision LoRA (not 'Q', comment out peft_config), get 90% execution rate.  

In [36]:
op_errors, exceptions,result_comparisons

([(7, sqlite3.OperationalError('no such function: CURRENT_YEAR')),
  (9, sqlite3.OperationalError('no such column: mass_wrt'))],
 [],
 [False, False, True, False, False, False, False, False, True, False])

- Next steps: Dive into results divergence  + Possibly do preliminary fine tune on SQLite ds (umesh16071973/SQLite_Training_Dataset).

In [37]:
valid_loaded['answer']

['SELECT discovery_year, COUNT(*) AS count\nFROM exoplanets\nGROUP BY discovery_year\nORDER BY count DESC\nLIMIT 1;',
 'SELECT detection_method,\n       SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\nFROM exoplanets\nGROUP BY detection_method\nORDER BY total_mass DESC\nLIMIT 1;',
 'SELECT discovery_year, \nSUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\nFROM exoplanets\nWHERE mass_multiplier IS NOT NULL\nGROUP BY discovery_year\nORDER BY total_mass DESC\nLIMIT 5;',
 'WITH RankedExoplanets AS ( SELECT discovery_year, name, orbital_period, RANK() OVER (PARTITION BY discovery_year ORDER BY orbital_period DESC) AS period_rank FROM exoplanets) SELECT * FROM RankedExoplanets WHERE period_rank <= 3;',
 'SELECT name, discovery_year, eccentricity\nFROM exoplanets\nWHERE discovery_year < 2000 AND eccentricity < 0.2;',
 'SELECT DISTINCT \n    planet_type\nFROM \n    exoplanets\nWHERE \n    planet_ty

In [38]:
generations

['WITH YearlyDiscoveries AS (\n    SELECT \n        discovery_year,\n        COUNT(*) AS num_discoveries\n    FROM exoplanets\n    GROUP BY discovery_year\n)\n\nSELECT \n    discovery_year,\n    num_discoveries\nFROM YearlyDiscoveries\nORDER BY num_discoveries DESC\nLIMIT 1;',
 'WITH MethodMass AS (\n    SELECT \n        detection_method,\n        SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\n    FROM exoplanets\n    GROUP BY detection_method\n)\n\nSELECT detection_method\nFROM MethodMass\nWHERE total_mass IS NOT NULL\nORDER BY total_mass DESC\nLIMIT 1;',
 'WITH YearlyMass AS (\n    SELECT discovery_year, \n           SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\n    FROM exoplanets\n    GROUP BY discovery_year\n)\n\nSELECT discovery_year, total_mass\nFROM YearlyMass\nORDER BY total_mass DESC\nLIMIT 5;',
 'WITH RankedExoplanets AS (\n    SELECT \n        discovery_year,\n        name,\

In [39]:
generations[0],valid_loaded['answer'][0]

('SELECT discovery_year, COUNT(*) AS num_discoveries\nFROM exoplanets\nGROUP BY discovery_year\nORDER BY num_discoveries DESC\nLIMIT 1;',
 'SELECT discovery_year, COUNT(*) AS count\nFROM exoplanets\nGROUP BY discovery_year\nORDER BY count DESC\nLIMIT 1;')

- Results for query 0 are equivalent, just used different alias/structure +1

In [39]:
i=0
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"discovery_year": 2016, "count": 1517}]
gen_result:
 [{"discovery_year": 2016, "num_discoveries": 1517}]
How are the following two queries different? WITH YearlyDiscoveries AS (
    SELECT 
        discovery_year,
        COUNT(*) AS num_discoveries
    FROM exoplanets
    GROUP BY discovery_year
)

SELECT 
    discovery_year,
    num_discoveries
FROM YearlyDiscoveries
ORDER BY num_discoveries DESC
LIMIT 1;, SELECT discovery_year, COUNT(*) AS count
FROM exoplanets
GROUP BY discovery_year
ORDER BY count DESC
LIMIT 1; and which one better answers the following question: Determine the number of exoplanets discovered each year, and show the year with the highest number of discoveries.?


- Results for query 1: OK, just includes an extra column, slightly more complex, +1

In [40]:
i=1
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"detection_method": "Radial Velocity", "total_mass": 6.38296985206948e+30}]
gen_result:
 [{"detection_method": "Radial Velocity"}]
How are the following two queries different? WITH MethodMass AS (
    SELECT 
        detection_method,
        SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass
    FROM exoplanets
    GROUP BY detection_method
)

SELECT detection_method
FROM MethodMass
WHERE total_mass IS NOT NULL
ORDER BY total_mass DESC
LIMIT 1;, SELECT detection_method,
       SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass
FROM exoplanets
GROUP BY detection_method
ORDER BY total_mass DESC
LIMIT 1; and which one better answers the following question: Display the detection method that has produced the most massive exoplanets (based on total mass).?


- Query 3 is good, just and extra column +1

In [41]:
i=3
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"discovery_year": 1992, "name": "PSR B1257+12 c", "orbital_period": 0.18206708}, {"discovery_year": 1994, "name": "PSR B1257+12 b", "orbital_period": 0.06926762}, {"discovery_year": 1995, "name": "51 Pegasi b", "orbital_period": 0.011498973}]
gen_result:
 [{"discovery_year": 1992, "name": "PSR B1257+12 d", "orbital_period": 0.26885694, "period_rank": 1}, {"discovery_year": 1992, "name": "PSR B1257+12 c", "orbital_period": 0.18206708, "period_rank": 2}, {"discovery_year": 1994, "name": "PSR B1257+12 b", "orbital_period": 0.06926762, "period_rank": 1}, {"discovery_year": 1995, "name": "51 Pegasi b", "orbital_period": 0.011498973, "period_rank": 1}, {"discovery_year": 1996, "name": "47 Ursae Majoris b", "orbital_period": 3.0, "period_rank": 1}, {"discovery_year": 1996, "name": "16 Cygni B b", "orbital_period": 2.2, "period_rank": 2}, {"discovery_year": 1996, "name": "70 Virginis b", "orbital_period": 0.31950718, "period_rank": 3}, {"discovery_year": 1997, "name": "Rho Cor

- Query 4 is good, just and extra column +1

In [42]:
i=4
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"name": "47 Ursae Majoris b", "discovery_year": 1996, "eccentricity": 0.03}, {"name": "51 Pegasi b", "discovery_year": 1995, "eccentricity": 0.01}, {"name": "55 Cancri b", "discovery_year": 1996, "eccentricity": 0.0}, {"name": "GJ 86 b", "discovery_year": 1999, "eccentricity": 0.04}, {"name": "GJ 876 b", "discovery_year": 1998, "eccentricity": 0.03}, {"name": "HD 10697 b", "discovery_year": 1999, "eccentricity": 0.1}, {"name": "HD 130322 b", "discovery_year": 1999, "eccentricity": 0.03}, {"name": "HD 177830 b", "discovery_year": 1999, "eccentricity": 0.1}, {"name": "HD 187123 b", "discovery_year": 1998, "eccentricity": 0.01}, {"name": "HD 192263 b", "discovery_year": 1999, "eccentricity": 0.05}, {"name": "HD 195019 b", "discovery_year": 1998, "eccentricity": 0.01}, {"name": "HD 209458 b", "discovery_year": 1999, "eccentricity": 0.0}, {"name": "HD 217107 b", "discovery_year": 1998, "eccentricity": 0.13}, {"name": "HD 75289 b", "discovery_year": 1999, "eccentricity": 0.0

- Overall, ~70% accuracy post 2 minutes of fine tuning and uncovered a mistake in my validation set!

- Query 5 is off

In [43]:
i=5
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"planet_type": "Unknown"}]
gen_result:
 [{"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Super Earth"}, {"planet_type": "Gas Giant"}, {"planet_type": "Neptune-like"}, {"plane

- Query 6 returned the maximum, not minimum mass!

In [44]:
i=6
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"discovery_year": 2001, "name": "47 Ursae Majoris c", "exoplanet_mass": 1.02492e+27}, {"discovery_year": 1995, "name": "51 Pegasi b", "exoplanet_mass": 8.730800000000001e+26}, {"discovery_year": 2004, "name": "55 Cancri e", "exoplanet_mass": 4.771628000000001e+25}, {"discovery_year": 2019, "name": "EPIC 201497682 b", "exoplanet_mass": 1.5527200000000002e+24}, {"discovery_year": 2007, "name": "GJ 581 c", "exoplanet_mass": 3.2846e+25}, {"discovery_year": 2009, "name": "GJ 581 e", "exoplanet_mass": 1.01524e+25}, {"discovery_year": 1998, "name": "HD 187123 b", "exoplanet_mass": 9.926540000000001e+26}, {"discovery_year": 2003, "name": "HD 3651 b", "exoplanet_mass": 4.32744e+26}, {"discovery_year": 2000, "name": "HD 46375 b", "exoplanet_mass": 4.28948e+26}, {"discovery_year": 2002, "name": "HD 49674 b", "exoplanet_mass": 1.89808076e+26}, {"discovery_year": 2006, "name": "HD 69830 b", "exoplanet_mass": 6.09144e+25}, {"discovery_year": 1999, "name": "HD 75289 b", "exoplanet_ma

- Query 3 is good, just and extra column +1

In [46]:
i=8
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"planet_type": "Gas Giant", "avg_orbital_radius": 21.5154491121673}, {"planet_type": "Unknown", "avg_orbital_radius": 16.65}, {"planet_type": "Neptune-like", "avg_orbital_radius": 0.2249016860670194}, {"planet_type": "Super Earth", "avg_orbital_radius": 0.10995212107023411}, {"planet_type": "Terrestrial", "avg_orbital_radius": 0.06238086486486486}]
gen_result:
 [{"planet_type": "Gas Giant", "avg_orbital_radius": 21.5154491121673}, {"planet_type": "Unknown", "avg_orbital_radius": 16.65}, {"planet_type": "Neptune-like", "avg_orbital_radius": 0.2249016860670194}, {"planet_type": "Super Earth", "avg_orbital_radius": 0.10995212107023411}, {"planet_type": "Terrestrial", "avg_orbital_radius": 0.06238086486486486}]
How are the following two queries different? SELECT planet_type, AVG(orbital_radius) AS avg_orbital_radius
FROM exoplanets
GROUP BY planet_type
ORDER BY avg_orbital_radius DESC;, SELECT 
    e.planet_type,
    AVG(e.orbital_radius) AS avg_orbital_radius
FROM exoplan

In [11]:
import os
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(os.path.join(output_dir, "final_merged_checkpoint"))

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]


In [16]:
MODEL_ID='meta-llama/Llama-3.2-3B-Instruct'

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<pad>'})

# Set the padding side
tokenizer.padding_side = "left"

# Verify the tokenizer configuration
print(f"Pad token: {tokenizer.pad_token}")
print(f"Pad token ID: {tokenizer.pad_token_id}")
print(f"Padding side: {tokenizer.padding_side}")

Pad token: <pad>
Pad token ID: 128256
Padding side: left


In [18]:
import sqlite3
import json
import pandas as pd
import re
pd.set_option('display.max_colwidth',1000)
conn = sqlite3.connect('./databases/exoplanets_db.db')
cursor = conn.cursor()
def extract_sql_statement(text):
    # Regular expression to match the SQL statement
    #pattern = re.compile(r'SELECT.*?;', re.DOTALL)
    #pattern = re.compile(r'(WITH.*?SELECT.*?;|SELECT.*?;)', re.DOTALL | re.IGNORECASE)
    pattern = re.compile(r'SQL Query:(.*?;)', re.DOTALL)

    match = pattern.search(text)
    if match:
        print('EXTRACTED: ', match.group(1).strip())
        return match.group(1).strip()
    return None

def run_query(query, cursor):
    cursor.execute(query)
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    result_dict = [dict(zip(columns, row)) for row in rows]
    return json.dumps(result_dict)
def compare_results(query_results, expected_results):
    """Compare the query results with the expected results."""
    # Sort the results for comparison
    query_results_sorted = sorted(query_results, key=lambda x: json.dumps(x, sort_keys=True))
    expected_results_sorted = sorted(expected_results, key=lambda x: json.dumps(x, sort_keys=True))
    return query_results_sorted == expected_results_sorted

valid_loaded = load_dataset('json', data_files='./databases/valid.json', split='train')

def generate_sql(context, question, max_new_tokens = 256, debug=False):
    full_question = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context: {context}
Question: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>SQL Query:"""
    model_input = tokenizer(full_question, return_tensors="pt")#.to("cuda:1")
    model.eval()
    with torch.no_grad():
        out = tokenizer.decode(model.generate(**model_input,max_new_tokens=max_new_tokens)[0],skip_special_tokens=True)
    #print('out: ',out)
    #import sys; sys.exit()
    #res = out[out.find("""```sql```""")+3:out.find(""";\n```""")]
    #res = out[out.find('SQL Query:')+10:]
    #print('out:\n',out)
    res = extract_sql_statement(out)
    #if debug: print(res)
    #run_query(res, cursor)
    #return out
    return res
num_opp_errors = 0
num_other_exceptions = 0
op_errors = []
exceptions  = []
generations = []
bad_idxs = []
result_comparisons = [False]*len(valid_loaded)
# for idx,example in enumerate(valid_loaded):
#     generation = '' # OK, had mistake here, query 9 is still wrong due to AS, though!
#     result = None
#     try:
#         print('idx: ',idx)
#         generation=generate_sql(context=example['context'], question=example['question'],debug=True)
#         result =  run_query(generation,cursor)
#         #print("generation: ")
#         print(generation)
#     except sqlite3.OperationalError as op_err:
#         num_opp_errors += 1
#         op_errors.append((idx,op_err))
#         bad_idxs.append(idx)
#         print(op_err)
#     except Exception as e:
#         num_other_exceptions +=1
#         exceptions.append((idx,e))
#         bad_idxs.append(idx)
#         print(e)
#     else:
#         if result: result_comparisons[idx]=compare_results(result, example['results'])
#     finally:
#         generations.append(generation)
# print(f"num_opp_errors: {num_opp_errors}, num_other_exceptions:{num_other_exceptions},\nError rate: {(num_other_exceptions+num_opp_errors)*100/len(valid_loaded)}")

In [19]:
example = valid_loaded[0]
generate_sql(context=example['context'], question=example['question'],debug=True)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  WITH DiscoveryCounts AS (
    SELECT 
        discovery_year,
        COUNT(*) AS count_exoplanets
    FROM exoplanets
    GROUP BY discovery_year
)
SELECT 
    discovery_year,
    count_exoplanets
FROM DiscoveryCounts
ORDER BY count_exoplanets DESC
LIMIT 1;


'WITH DiscoveryCounts AS (\n    SELECT \n        discovery_year,\n        COUNT(*) AS count_exoplanets\n    FROM exoplanets\n    GROUP BY discovery_year\n)\nSELECT \n    discovery_year,\n    count_exoplanets\nFROM DiscoveryCounts\nORDER BY count_exoplanets DESC\nLIMIT 1;'