# Fine-tune premsql

In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


[2024-10-14 13:47:11,329] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/mainuser/anaconda3/envs/sqlft/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




In [2]:
print(torch.cuda.is_available())

True


In [3]:
import wandb # do from scipt later
wandb.login()
%env WANDB_PROJECT=sql-fine-tuning

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdpopovvelasco[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=sql-fine-tuning


In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
output_dir = './sql_output_dir_llama3'
logging_dir = './sql_logging_dir_llama3'

In [6]:
import sys,gc,traceback
import torch
def clean_ipython_hist():
    # Code in this function mainly copied from IPython source
    if not 'get_ipython' in globals(): return
    ip = get_ipython()
    user_ns = ip.user_ns
    ip.displayhook.flush()
    pc = ip.displayhook.prompt_count + 1
    for n in range(1, pc): user_ns.pop('_i'+repr(n),None)
    user_ns.update(dict(_i='',_ii='',_iii=''))
    hm = ip.history_manager
    hm.input_hist_parsed[:] = [''] * pc
    hm.input_hist_raw[:] = [''] * pc
    hm._i = hm._ii = hm._iii = hm._i00 =  ''



def clean_tb():
    # h/t Piotr Czapla
    if hasattr(sys, 'last_traceback'):
        traceback.clear_frames(sys.last_traceback)
        delattr(sys, 'last_traceback')
    if hasattr(sys, 'last_type'): delattr(sys, 'last_type')
    if hasattr(sys, 'last_value'): delattr(sys, 'last_value')

def clean_mem():
    clean_tb()
    clean_ipython_hist()
    gc.collect()
    torch.cuda.empty_cache()

In [7]:
clean_mem()

## Load and Prepare the Data

<|begin▁of▁sentence|> Question: What is the total sales amount? Context: The sales table contains information about each sale including date, amount, and customer ID. Answer: SELECT SUM(amount) FROM sales; <|end▁of▁sentence|>

In [8]:
from transformers import AutoTokenizer
from datasets import load_dataset

def format_prompt(sample):
    sample['text'] = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context: {sample['context']}
Question: {sample['question']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>SQL Query:{sample['answer']} <|end▁of▁sentence|>"""
    return {"text":sample['text']}
  

# Load and format the data using the template TinyLLama is using
dataset = (
    load_dataset("dpv/exoplanets-sql")#, split="train")
      .shuffle(seed=42)
)
dataset = dataset.map(format_prompt)

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer', 'results', 'text'],
        num_rows: 50
    })
    valid: Dataset({
        features: ['context', 'question', 'answer', 'results', 'text'],
        num_rows: 10
    })
})

In [10]:
dataset['train']['text'][0]

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nContext: CREATE TABLE exoplanets (\n    name TEXT,\n    distance REAL,\n    stellar_magnitude REAL,\n    planet_type TEXT,\n    discovery_year INTEGER,\n    mass_multiplier REAL,\n    mass_wrt TEXT,\n    radius_multiplier REAL,\n    radius_wrt TEXT,\n    orbital_radius REAL,\n    orbital_period REAL,\n    eccentricity REAL,\n    detection_method TEXT\n); CREATE TABLE reference_planets (name TEXT, mass REAL);  'mass_wrt' in exoplanets table has a one-to-one match to 'name' in reference_planets table, and  'name' refers to either Earth or Jupyter, with Jupyter having 317.8 the mass of Earth.\nQuestion: What is the average mass of exoplanets discovered each year?<|eot_id|><|start_header_id|>assistant<|end_header_id|>SQL Query:SELECT \n    e.discovery_y

In [11]:
dataset['valid']['results'][4]

'[{"discovery_year": 1992, "name": "PSR B1257+12 d", "orbital_period": 0.26885694, "period_rank": 1}, {"discovery_year": 1992, "name": "PSR B1257+12 c", "orbital_period": 0.18206708, "period_rank": 2}, {"discovery_year": 1994, "name": "PSR B1257+12 b", "orbital_period": 0.06926762, "period_rank": 1}, {"discovery_year": 1995, "name": "51 Pegasi b", "orbital_period": 0.011498973, "period_rank": 1}, {"discovery_year": 1996, "name": "47 Ursae Majoris b", "orbital_period": 3.0, "period_rank": 1}, {"discovery_year": 1996, "name": "16 Cygni B b", "orbital_period": 2.2, "period_rank": 2}, {"discovery_year": 1996, "name": "70 Virginis b", "orbital_period": 0.31950718, "period_rank": 3}, {"discovery_year": 1997, "name": "Rho Coronae Borealis b", "orbital_period": 0.10896646, "period_rank": 1}, {"discovery_year": 1998, "name": "HD 210277 b", "orbital_period": 1.2106776, "period_rank": 1}, {"discovery_year": 1998, "name": "GJ 876 b", "orbital_period": 0.16728269, "period_rank": 2}, {"discovery_yea

- SFTTrainer expect a single 'text' column

## Evaluate the original model

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer
MODEL_ID='meta-llama/Llama-3.2-3B-Instruct'
model_orig = AutoModelForCausalLM.from_pretrained(MODEL_ID)#.to("cuda:1")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]


In [15]:
import sqlite3
import json
import pandas as pd
import re
pd.set_option('display.max_colwidth',1000)
conn = sqlite3.connect('./databases/exoplanets_db.db')
cursor = conn.cursor()
cursor.execute(f"SELECT * FROM exoplanets LIMIT 5;")
def extract_sql_statement(text):
    # Regular expression to match the SQL statement
    #pattern = re.compile(r'SQL Query:(.*?;)', re.DOTALL)
    pattern = re.compile(r'SQL Query:\s*```sql\s*(SELECT.*?|WITH.*?)\s*```', re.DOTALL | re.IGNORECASE)

#    pattern = re.compile(r'SELECT.*?;', re.DOTALL)
    match = pattern.search(text)
    if match:
        return match.group(1).strip()
    return None

def run_query(query, cursor):
    cursor.execute(query)
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    result_dict = [dict(zip(columns, row)) for row in rows]
    return json.dumps(result_dict)
def compare_results(query_results, expected_results):
    """Compare the query results with the expected results."""
    # Sort the results for comparison
    query_results_sorted = sorted(query_results, key=lambda x: json.dumps(x, sort_keys=True))
    expected_results_sorted = sorted(expected_results, key=lambda x: json.dumps(x, sort_keys=True))
    return query_results_sorted == expected_results_sorted

valid_loaded = load_dataset('json', data_files='./databases/valid.json', split='train')

def generate_sql(context, question, max_new_tokens = 256, debug=False):
    full_question = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context: {context}
Question: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>SQL Query:"""
    model_input = tokenizer(full_question, return_tensors="pt")#.to("cuda:1")
    model_orig.eval()
    with torch.no_grad():
        out = tokenizer.decode(model_orig.generate(**model_input,max_new_tokens=max_new_tokens)[0],skip_special_tokens=True)
    #print('out: ',out)
    #import sys; sys.exit()
    #res = out[out.find("""```sql```""")+3:out.find(""";\n```""")]
    #res = out[out.find('SQL Query:')+10:]
    res = extract_sql_statement(out)
    if debug: print(res)
    run_query(res, cursor)
    #return out
    return res
num_opp_errors = 0
num_other_exceptions = 0
op_errors = []
exceptions  = []
generations = []
bad_idxs = []
result_comparisons = [False]*len(valid_loaded)
for idx,example in enumerate(valid_loaded):
    generation = ''
    try:
        print('idx: ',idx)
        generation=generate_sql(context=example['context'], question=example['question'],debug=True)
        #print(generation); break
    except sqlite3.OperationalError as op_err:
        num_opp_errors += 1
        op_errors.append((idx,op_err))
        bad_idxs.append(idx)
        print(op_err)
    except Exception as e:
        num_other_exceptions +=1
        exceptions.append((idx,e))
        bad_idxs.append(idx)
        print(e)
    else:
        result_comparisons[idx]=compare_results(run_query(generation,cursor), example['results'])
    finally:
        generations.append(generation)
print(f"num_opp_errors: {num_opp_errors}, num_other_exceptions:{num_other_exceptions},\nError rate: {(num_other_exceptions+num_opp_errors)*100/len(valid_loaded)}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


idx:  0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    discovery_year, 
    COUNT(*) as num_discoveries
FROM 
    exoplanets
GROUP BY 
    discovery_year
ORDER BY 
    num_discoveries DESC;
idx:  1


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT detection_method
FROM exoplanets
GROUP BY detection_method
ORDER BY SUM(mass_wrt) DESC
LIMIT 1;
idx:  2


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    discovery_year, 
    SUM(mass_wrt * mass_multiplier) AS total_mass
FROM 
    exoplanets
GROUP BY 
    discovery_year
ORDER BY 
    total_mass DESC
LIMIT 5;
idx:  3


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    e.name, 
    e.orbital_period, 
    e.discovery_year, 
    ROW_NUMBER() OVER (PARTITION BY e.discovery_year ORDER BY e.orbital_period DESC) as rank
FROM 
    exoplanets e
WHERE 
    e.name IN (
        SELECT 
            name
        FROM 
            reference_planets
        WHERE 
            name = 'Earth'
    )
ORDER BY 
    rank;
idx:  4


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT * 
FROM exoplanets 
WHERE discovery_year < 2000 
AND eccentricity < 0.2;
idx:  5


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT DISTINCT planet_type 
FROM exoplanets 
WHERE discovery_year < 2020;
idx:  6


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    e.name, 
    e.mass_wrt * r.mass AS mass
FROM 
    exoplanets e
JOIN 
    reference_planets r ON e.mass_wrt = r.name
WHERE 
    e.discovery_year = (SELECT MIN(discovery_year) FROM exoplanets)
ORDER BY 
    e.discovery_year;
idx:  7


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT e.name, e.discovery_year, e.detection_method
FROM exoplanets e
WHERE e.discovery_year >= (CURRENT_YEAR - 2)
AND e.name NOT IN (
  SELECT r.name
  FROM reference_planets r
  WHERE r.mass_wrt = 'Jupiter'
);
no such column: CURRENT_YEAR
idx:  8


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    planet_type, 
    AVG(orbital_radius) AS average_orbital_radius
FROM 
    exoplanets
GROUP BY 
    planet_type
ORDER BY 
    average_orbital_radius DESC;
idx:  9
SELECT 
    e.name, 
    e.mass_wrt / e.radius_wrt AS mass_to_radius_ratio
FROM 
    exoplanets e
WHERE 
    e.planet_type = 'Terrestrial' AND 
    e.mass_wrt > 0 AND 
    e.radius_wrt > 0
ORDER BY 
    e.mass_wrt / e.radius_wrt DESC
LIMIT 5;
num_opp_errors: 1, num_other_exceptions:0,
Error rate: 10.0


In [16]:
op_errors, exceptions, result_comparisons

([(7, sqlite3.OperationalError('no such column: CURRENT_YEAR'))],
 [],
 [False, False, False, False, False, False, False, False, False, False])

- OK, 80% execution rate out of the box.  Looks like query 7 used improper SQLite syntax -> consider fine tuning on umesh16071973/SQLite_Training_Dataset first to align syntax.  Still not getting correct answers, it seems, must dig deeper.

In [17]:
generations

['SELECT \n    discovery_year, \n    COUNT(*) as num_discoveries\nFROM \n    exoplanets\nGROUP BY \n    discovery_year\nORDER BY \n    num_discoveries DESC;',
 'SELECT detection_method\nFROM exoplanets\nGROUP BY detection_method\nORDER BY SUM(mass_wrt) DESC\nLIMIT 1;',
 'SELECT \n    discovery_year, \n    SUM(mass_wrt * mass_multiplier) AS total_mass\nFROM \n    exoplanets\nGROUP BY \n    discovery_year\nORDER BY \n    total_mass DESC\nLIMIT 5;',
 "SELECT \n    e.name, \n    e.orbital_period, \n    e.discovery_year, \n    ROW_NUMBER() OVER (PARTITION BY e.discovery_year ORDER BY e.orbital_period DESC) as rank\nFROM \n    exoplanets e\nWHERE \n    e.name IN (\n        SELECT \n            name\n        FROM \n            reference_planets\n        WHERE \n            name = 'Earth'\n    )\nORDER BY \n    rank;",
 'SELECT * \nFROM exoplanets \nWHERE discovery_year < 2000 \nAND eccentricity < 0.2;',
 'SELECT DISTINCT planet_type \nFROM exoplanets \nWHERE discovery_year < 2020;',
 'SELECT 

In [25]:
valid_loaded['answer']

['SELECT discovery_year, COUNT(*) AS count\nFROM exoplanets\nGROUP BY discovery_year\nORDER BY count DESC\nLIMIT 1;',
 'SELECT detection_method,\n       SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\nFROM exoplanets\nGROUP BY detection_method\nORDER BY total_mass DESC\nLIMIT 1;',
 'SELECT discovery_year, \nSUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\nFROM exoplanets\nWHERE mass_multiplier IS NOT NULL\nGROUP BY discovery_year\nORDER BY total_mass DESC\nLIMIT 5;',
 'SELECT discovery_year, name, orbital_period FROM exoplanets WHERE (discovery_year, name, orbital_period) IN (   SELECT discovery_year, name, orbital_period    FROM exoplanets    ORDER BY discovery_year, orbital_period DESC) GROUP BY discovery_year LIMIT 3;',
 'SELECT name, discovery_year, eccentricity\nFROM exoplanets\nWHERE discovery_year < 2000 AND eccentricity < 0.2;',
 'SELECT DISTINCT \n    planet_type\nFROM \n    exopla

In [24]:
run_query(generations[-1],cursor), valid_loaded['results'][-1]


('[{"name": "EPIC 201497682 b", "mass_to_radius_ratio": null}, {"name": "EPIC 201757695.02", "mass_to_radius_ratio": null}, {"name": "EPIC 201833600 c", "mass_to_radius_ratio": null}, {"name": "EPIC 206215704 b", "mass_to_radius_ratio": null}, {"name": "EPIC 206317286 b", "mass_to_radius_ratio": null}]',
 '[{"name": "HD 100546 b", "mass_radius_ratio": 2.068544927536232e+29}, {"name": "K2-52 b", "mass_radius_ratio": 1.7018181818181818e+29}, {"name": "Kepler-718 b", "mass_radius_ratio": 1.683398781313473e+29}, {"name": "PH2 b", "mass_radius_ratio": 1.6815060908084162e+29}, {"name": "Kepler-488 b", "mass_radius_ratio": 1.670347764371895e+29}]')

In [26]:
valid_loaded['question'][-1]

'Find the exoplanets with the highest mass-to-radius ratio, showing their name and calculated ratio (return top 5).'

In [23]:
compare_results(run_query(generations[-1],cursor), valid_loaded['results'][-1])

False

In [27]:
run_query(generations[0],cursor), valid_loaded['results'][0]


('[{"discovery_year": 2016, "num_discoveries": 1517}, {"discovery_year": 2014, "num_discoveries": 875}, {"discovery_year": 2021, "num_discoveries": 525}, {"discovery_year": 2022, "num_discoveries": 338}, {"discovery_year": 2018, "num_discoveries": 326}, {"discovery_year": 2020, "num_discoveries": 234}, {"discovery_year": 2019, "num_discoveries": 203}, {"discovery_year": 2015, "num_discoveries": 157}, {"discovery_year": 2017, "num_discoveries": 153}, {"discovery_year": 2012, "num_discoveries": 138}, {"discovery_year": 2011, "num_discoveries": 138}, {"discovery_year": 2013, "num_discoveries": 126}, {"discovery_year": 2010, "num_discoveries": 97}, {"discovery_year": 2009, "num_discoveries": 94}, {"discovery_year": 2008, "num_discoveries": 65}, {"discovery_year": 2007, "num_discoveries": 52}, {"discovery_year": 2005, "num_discoveries": 36}, {"discovery_year": 2006, "num_discoveries": 31}, {"discovery_year": 2002, "num_discoveries": 29}, {"discovery_year": 2004, "num_discoveries": 27}, {"di

In [28]:
valid_loaded['question'][0]

'Determine the number of exoplanets discovered each year, and show the year with the highest number of discoveries.'

In [29]:
compare_results(run_query(generations[0],cursor), valid_loaded['results'][0])

False

In [27]:
bad_idxs

[7, 8]

In [29]:
for idx in bad_idxs:
    print(run_query(generations[idx],cursor))
    print(valid_loaded[idx]['results'])
    print('_'*100)
#generations[2]

In [31]:
#run_query("""SELECT name, mass, discovery_year FROM (SELECT name, mass, discovery_year, ROW_NUMBER() OVER (PARTITION BY discovery_year ORDER BY mass ASC) as rn FROM exoplanets) t WHERE rn = 1;""",cursor)
run_query("""SELECT 
    p.planet_type,
    AVG(exoplanets.orbital_radius) AS avg_orbital_radius
FROM 
    exoplanets
JOIN 
    reference_planets r ON exoplanets.mass_wrt = r.name
GROUP BY 
    exoplanets.planet_type
ORDER BY 
    avg_orbital_radius DESC;""",cursor)


## Configure Model, Data, Tokenizer, Trainer

In [12]:
clean_mem()

In [14]:
MODEL_ID='meta-llama/Llama-3.2-3B-Instruct'
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map='auto',
    # quantization_config = bnb_config
)

model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<pad>'})

# Set the padding side
tokenizer.padding_side = "left"

# Verify the tokenizer configuration
print(f"Pad token: {tokenizer.pad_token}")
print(f"Pad token ID: {tokenizer.pad_token_id}")
print(f"Padding side: {tokenizer.padding_side}")


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]


Pad token: <pad>
Pad token ID: 128256
Padding side: left


In [15]:
from transformers import DataCollatorForLanguageModeling
# Step 3: Launch training with SFTTrainer
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    learning_rate=2e-6,
    lr_scheduler_type="cosine",
    num_train_epochs = 25.0,
    logging_steps=10,
    fp16=True,
    gradient_checkpointing=True
)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['valid'],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
   # peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=256,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=True,
)

trainer.train()




  6%|▌         | 10/175 [00:59<16:35,  6.03s/it]

{'loss': 1.564, 'grad_norm': 6.856932640075684, 'learning_rate': 1.9839295885986295e-06, 'epoch': 1.43}


 11%|█▏        | 20/175 [02:00<15:37,  6.05s/it]

{'loss': 0.4872, 'grad_norm': 4.28322696685791, 'learning_rate': 1.936234870639737e-06, 'epoch': 2.86}


 17%|█▋        | 30/175 [03:00<14:41,  6.08s/it]

{'loss': 0.3071, 'grad_norm': 2.6900346279144287, 'learning_rate': 1.858448793601866e-06, 'epoch': 4.29}


 23%|██▎       | 40/175 [04:01<13:36,  6.05s/it]

{'loss': 0.2039, 'grad_norm': 2.4317920207977295, 'learning_rate': 1.753071466003611e-06, 'epoch': 5.71}


 29%|██▊       | 50/175 [05:02<12:37,  6.06s/it]

{'loss': 0.1444, 'grad_norm': 1.9937301874160767, 'learning_rate': 1.6234898018587336e-06, 'epoch': 7.14}


 34%|███▍      | 60/175 [06:02<11:38,  6.07s/it]

{'loss': 0.1186, 'grad_norm': 1.6364166736602783, 'learning_rate': 1.4738686624729987e-06, 'epoch': 8.57}


 40%|████      | 70/175 [07:03<10:37,  6.07s/it]

{'loss': 0.0969, 'grad_norm': 1.6010419130325317, 'learning_rate': 1.3090169943749473e-06, 'epoch': 10.0}


 46%|████▌     | 80/175 [08:04<09:36,  6.07s/it]

{'loss': 0.0882, 'grad_norm': 0.9637448787689209, 'learning_rate': 1.1342332658176555e-06, 'epoch': 11.43}


 51%|█████▏    | 90/175 [09:04<08:35,  6.07s/it]

{'loss': 0.0822, 'grad_norm': 1.1562446355819702, 'learning_rate': 9.551351696494853e-07, 'epoch': 12.86}


 57%|█████▋    | 100/175 [10:05<07:33,  6.05s/it]

{'loss': 0.0802, 'grad_norm': 0.6770322322845459, 'learning_rate': 7.774790660436857e-07, 'epoch': 14.29}


 63%|██████▎   | 110/175 [11:06<06:34,  6.07s/it]

{'loss': 0.0739, 'grad_norm': 0.7221038937568665, 'learning_rate': 6.069749683460764e-07, 'epoch': 15.71}


 69%|██████▊   | 120/175 [12:06<05:33,  6.06s/it]

{'loss': 0.0731, 'grad_norm': 0.9319068193435669, 'learning_rate': 4.4910301854789755e-07, 'epoch': 17.14}


 74%|███████▍  | 130/175 [13:07<04:33,  6.07s/it]

{'loss': 0.0697, 'grad_norm': 0.5880176424980164, 'learning_rate': 3.0893735101313535e-07, 'epoch': 18.57}


 80%|████████  | 140/175 [14:08<03:32,  6.07s/it]

{'loss': 0.0692, 'grad_norm': 0.8344286680221558, 'learning_rate': 1.9098300562505264e-07, 'epoch': 20.0}


 86%|████████▌ | 150/175 [15:09<02:31,  6.07s/it]

{'loss': 0.0685, 'grad_norm': 0.6844998002052307, 'learning_rate': 9.903113209758096e-08, 'epoch': 21.43}


 91%|█████████▏| 160/175 [16:09<01:31,  6.07s/it]

{'loss': 0.0668, 'grad_norm': 0.7809587717056274, 'learning_rate': 3.6037139304146756e-08, 'epoch': 22.86}


 97%|█████████▋| 170/175 [17:10<00:30,  6.08s/it]

{'loss': 0.0679, 'grad_norm': 0.6463181376457214, 'learning_rate': 4.025706004760931e-09, 'epoch': 24.29}


100%|██████████| 175/175 [18:06<00:00,  6.21s/it]

{'train_runtime': 1086.4086, 'train_samples_per_second': 1.289, 'train_steps_per_second': 0.161, 'train_loss': 0.21112436754362923, 'epoch': 25.0}





TrainOutput(global_step=175, training_loss=0.21112436754362923, metrics={'train_runtime': 1086.4086, 'train_samples_per_second': 1.289, 'train_steps_per_second': 0.161, 'total_flos': 6061434391756800.0, 'train_loss': 0.21112436754362923, 'epoch': 25.0})

In [33]:
# trainer.model.save_pretrained(output_dir)
# from peft import AutoPeftModelForCausalLM
# model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
# model = model.merge_and_unload()

# output_merged_dir = os.path.join(output_dir, "final_merged_checkpoint")
# model.save_pretrained(output_merged_dir, safe_serialization=True)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.18it/s]


In [18]:
import sqlite3
import json
import pandas as pd
import re
pd.set_option('display.max_colwidth',1000)
conn = sqlite3.connect('./databases/exoplanets_db.db')
cursor = conn.cursor()
cursor.execute(f"SELECT * FROM exoplanets LIMIT 5;")
def extract_sql_statement(text):
    # Regular expression to match the SQL statement
    #pattern = re.compile(r'SELECT.*?;', re.DOTALL)
    #pattern = re.compile(r'(WITH.*?SELECT.*?;|SELECT.*?;)', re.DOTALL | re.IGNORECASE)
    pattern = re.compile(r'SQL Query:(.*?;)', re.DOTALL)

    match = pattern.search(text)
    if match:
        print('EXTRACTED: ', match.group(1).strip())
        return match.group(1).strip()
    return None

def run_query(query, cursor):
    cursor.execute(query)
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    result_dict = [dict(zip(columns, row)) for row in rows]
    return json.dumps(result_dict)
def compare_results(query_results, expected_results):
    """Compare the query results with the expected results."""
    # Sort the results for comparison
    query_results_sorted = sorted(query_results, key=lambda x: json.dumps(x, sort_keys=True))
    expected_results_sorted = sorted(expected_results, key=lambda x: json.dumps(x, sort_keys=True))
    return query_results_sorted == expected_results_sorted

valid_loaded = load_dataset('json', data_files='./databases/valid.json', split='train')

def generate_sql(context, question, max_new_tokens = 256, debug=False):
    full_question = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context: {context}
Question: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>SQL Query:"""
    model_input = tokenizer(full_question, return_tensors="pt")#.to("cuda:1")
    model.eval()
    with torch.no_grad():
        out = tokenizer.decode(model.generate(**model_input,max_new_tokens=max_new_tokens)[0],skip_special_tokens=True)
    #print('out: ',out)
    #import sys; sys.exit()
    #res = out[out.find("""```sql```""")+3:out.find(""";\n```""")]
    #res = out[out.find('SQL Query:')+10:]
    #print('out:\n',out)
    res = extract_sql_statement(out)
    #if debug: print(res)
    run_query(res, cursor)
    #return out
    return res
num_opp_errors = 0
num_other_exceptions = 0
op_errors = []
exceptions  = []
generations = []
bad_idxs = []
result_comparisons = [False]*len(valid_loaded)
for idx,example in enumerate(valid_loaded):
    generation = '' # OK, had mistake here, query 9 is still wrong due to AS, though!
    result = None
    try:
        print('idx: ',idx)
        generation=generate_sql(context=example['context'], question=example['question'],debug=True)
        result =  run_query(generation,cursor)
        #print("generation: ")
        print(generation)
    except sqlite3.OperationalError as op_err:
        num_opp_errors += 1
        op_errors.append((idx,op_err))
        bad_idxs.append(idx)
        print(op_err)
    except Exception as e:
        num_other_exceptions +=1
        exceptions.append((idx,e))
        bad_idxs.append(idx)
        print(e)
    else:
        if result: result_comparisons[idx]=compare_results(result, example['results'])
    finally:
        generations.append(generation)
print(f"num_opp_errors: {num_opp_errors}, num_other_exceptions:{num_other_exceptions},\nError rate: {(num_other_exceptions+num_opp_errors)*100/len(valid_loaded)}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


idx:  0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT discovery_year, COUNT(*) AS num_discoveries
FROM exoplanets
GROUP BY discovery_year
ORDER BY num_discoveries DESC
LIMIT 1;
SELECT discovery_year, COUNT(*) AS num_discoveries
FROM exoplanets
GROUP BY discovery_year
ORDER BY num_discoveries DESC
LIMIT 1;
idx:  1


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  WITH MassByMethod AS (
    SELECT 
        detection_method, 
        SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass
    FROM exoplanets
    GROUP BY detection_method
)
SELECT detection_method, total_mass
FROM MassByMethod
ORDER BY total_mass DESC
LIMIT 1;
WITH MassByMethod AS (
    SELECT 
        detection_method, 
        SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass
    FROM exoplanets
    GROUP BY detection_method
)
SELECT detection_method, total_mass
FROM MassByMethod
ORDER BY total_mass DESC
LIMIT 1;
idx:  2


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT 
    e.discovery_year,
    SUM(e.mass_multiplier * r.mass) AS total_mass
FROM exoplanets e
JOIN reference_planets r ON e.mass_wrt = r.name
GROUP BY e.discovery_year
ORDER BY total_mass DESC
LIMIT 5;
SELECT 
    e.discovery_year,
    SUM(e.mass_multiplier * r.mass) AS total_mass
FROM exoplanets e
JOIN reference_planets r ON e.mass_wrt = r.name
GROUP BY e.discovery_year
ORDER BY total_mass DESC
LIMIT 5;
idx:  3
EXTRACTED:  SELECT 
    e.name,
    e.orbital_period,
    RANK() OVER (PARTITION BY e.discovery_year ORDER BY e.orbital_period DESC) AS period_rank
FROM exoplanets e
WHERE e.orbital_period IS NOT NULL
ORDER BY e.discovery_year, e.orbital_period DESC;
SELECT 
    e.name,
    e.orbital_period,
    RANK() OVER (PARTITION BY e.discovery_year ORDER BY e.orbital_period DESC) AS period_rank
FROM exoplanets e
WHERE e.orbital_period IS NOT NULL
ORDER BY e.discovery_year, e.orbital_period DESC;


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


idx:  4


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT name, eccentricity
FROM exoplanets
WHERE discovery_year < 2000 AND eccentricity < 0.2;
SELECT name, eccentricity
FROM exoplanets
WHERE discovery_year < 2000 AND eccentricity < 0.2;
idx:  5


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT planet_type 
FROM exoplanets 
WHERE discovery_year < 2020;
SELECT planet_type 
FROM exoplanets 
WHERE discovery_year < 2020;
idx:  6


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


execute() argument 1 must be str, not None
idx:  7


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT 
    e.name,
    e.discovery_year,
    e.detection_method
FROM exoplanets e
WHERE e.discovery_year >= DATE('now', '-2 year')
ORDER BY e.discovery_year;
SELECT 
    e.name,
    e.discovery_year,
    e.detection_method
FROM exoplanets e
WHERE e.discovery_year >= DATE('now', '-2 year')
ORDER BY e.discovery_year;
idx:  8


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT planet_type, AVG(orbital_radius) AS avg_orbital_radius
FROM exoplanets
GROUP BY planet_type
ORDER BY avg_orbital_radius DESC;
SELECT planet_type, AVG(orbital_radius) AS avg_orbital_radius
FROM exoplanets
GROUP BY planet_type
ORDER BY avg_orbital_radius DESC;
idx:  9
execute() argument 1 must be str, not None
num_opp_errors: 0, num_other_exceptions:2,
Error rate: 20.0


- After 10 epochs of training with QLoRA set with default precision for it, got the following, execution rate down from 80% to 70%, still not getting correct queries.  Trying full precision for adaptors LoRA.

- Same deal with full precision adaptor training with QLoRA.

- Same full precision LoRA (not 'Q', comment out peft_config), get 90% execution rate.  

In [19]:
op_errors, exceptions,result_comparisons

([],
 [(6, TypeError('execute() argument 1 must be str, not None')),
  (9, TypeError('execute() argument 1 must be str, not None'))],
 [False, True, True, False, False, False, False, False, True, False])

- Next steps: Dive into results divergence  + Possibly do preliminary fine tune on SQLite ds (umesh16071973/SQLite_Training_Dataset).

In [20]:
valid_loaded['answer']

['SELECT discovery_year, COUNT(*) AS count\nFROM exoplanets\nGROUP BY discovery_year\nORDER BY count DESC\nLIMIT 1;',
 'SELECT detection_method,\n       SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\nFROM exoplanets\nGROUP BY detection_method\nORDER BY total_mass DESC\nLIMIT 1;',
 'SELECT discovery_year, \nSUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\nFROM exoplanets\nWHERE mass_multiplier IS NOT NULL\nGROUP BY discovery_year\nORDER BY total_mass DESC\nLIMIT 5;',
 'SELECT discovery_year, name, orbital_period FROM exoplanets WHERE (discovery_year, name, orbital_period) IN (   SELECT discovery_year, name, orbital_period    FROM exoplanets    ORDER BY discovery_year, orbital_period DESC) GROUP BY discovery_year LIMIT 3;',
 'SELECT name, discovery_year, eccentricity\nFROM exoplanets\nWHERE discovery_year < 2000 AND eccentricity < 0.2;',
 'SELECT DISTINCT \n    planet_type\nFROM \n    exopla

In [21]:
generations

['SELECT discovery_year, COUNT(*) AS num_discoveries\nFROM exoplanets\nGROUP BY discovery_year\nORDER BY num_discoveries DESC\nLIMIT 1;',
 'WITH MassByMethod AS (\n    SELECT \n        detection_method, \n        SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\n    FROM exoplanets\n    GROUP BY detection_method\n)\nSELECT detection_method, total_mass\nFROM MassByMethod\nORDER BY total_mass DESC\nLIMIT 1;',
 'SELECT \n    e.discovery_year,\n    SUM(e.mass_multiplier * r.mass) AS total_mass\nFROM exoplanets e\nJOIN reference_planets r ON e.mass_wrt = r.name\nGROUP BY e.discovery_year\nORDER BY total_mass DESC\nLIMIT 5;',
 'SELECT \n    e.name,\n    e.orbital_period,\n    RANK() OVER (PARTITION BY e.discovery_year ORDER BY e.orbital_period DESC) AS period_rank\nFROM exoplanets e\nWHERE e.orbital_period IS NOT NULL\nORDER BY e.discovery_year, e.orbital_period DESC;',
 'SELECT name, eccentricity\nFROM exoplanets\nWHERE discovery_year < 2000 AN

In [22]:
i=0 # Finally, both correct!!!
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"discovery_year": 2016, "count": 1517}]
gen_result:
 [{"discovery_year": 2016, "num_discoveries": 1517}]
How are the following two queries different? SELECT discovery_year, COUNT(*) AS num_discoveries
FROM exoplanets
GROUP BY discovery_year
ORDER BY num_discoveries DESC
LIMIT 1;, SELECT discovery_year, COUNT(*) AS count
FROM exoplanets
GROUP BY discovery_year
ORDER BY count DESC
LIMIT 1; and which one better answers the following question: Determine the number of exoplanets discovered each year, and show the year with the highest number of discoveries.?


In [23]:
i=3 # Not quite
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"discovery_year": 1992, "name": "PSR B1257+12 c", "orbital_period": 0.18206708}, {"discovery_year": 1994, "name": "PSR B1257+12 b", "orbital_period": 0.06926762}, {"discovery_year": 1995, "name": "51 Pegasi b", "orbital_period": 0.011498973}]
gen_result:
 [{"name": "PSR B1257+12 d", "orbital_period": 0.26885694, "period_rank": 1}, {"name": "PSR B1257+12 c", "orbital_period": 0.18206708, "period_rank": 2}, {"name": "PSR B1257+12 b", "orbital_period": 0.06926762, "period_rank": 1}, {"name": "51 Pegasi b", "orbital_period": 0.011498973, "period_rank": 1}, {"name": "47 Ursae Majoris b", "orbital_period": 3.0, "period_rank": 1}, {"name": "16 Cygni B b", "orbital_period": 2.2, "period_rank": 2}, {"name": "70 Virginis b", "orbital_period": 0.31950718, "period_rank": 3}, {"name": "55 Cancri b", "orbital_period": 0.040246405, "period_rank": 4}, {"name": "Upsilon Andromedae b", "orbital_period": 0.012594113, "period_rank": 5}, {"name": "Tau Bootis b", "orbital_period": 0.0090349

In [25]:
i=4 # OK, just didn't include discovery year
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"name": "47 Ursae Majoris b", "discovery_year": 1996, "eccentricity": 0.03}, {"name": "51 Pegasi b", "discovery_year": 1995, "eccentricity": 0.01}, {"name": "55 Cancri b", "discovery_year": 1996, "eccentricity": 0.0}, {"name": "GJ 86 b", "discovery_year": 1999, "eccentricity": 0.04}, {"name": "GJ 876 b", "discovery_year": 1998, "eccentricity": 0.03}, {"name": "HD 10697 b", "discovery_year": 1999, "eccentricity": 0.1}, {"name": "HD 130322 b", "discovery_year": 1999, "eccentricity": 0.03}, {"name": "HD 177830 b", "discovery_year": 1999, "eccentricity": 0.1}, {"name": "HD 187123 b", "discovery_year": 1998, "eccentricity": 0.01}, {"name": "HD 192263 b", "discovery_year": 1999, "eccentricity": 0.05}, {"name": "HD 195019 b", "discovery_year": 1998, "eccentricity": 0.01}, {"name": "HD 209458 b", "discovery_year": 1999, "eccentricity": 0.0}, {"name": "HD 217107 b", "discovery_year": 1998, "eccentricity": 0.13}, {"name": "HD 75289 b", "discovery_year": 1999, "eccentricity": 0.0

In [26]:
i=5 # Not quite
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"planet_type": "Unknown"}]
gen_result:
 [{"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Super Earth"}, {"planet_type": "Gas Giant"}, {"planet_type": "Neptune-like"}, {"plane

In [None]:
i=5 # Not quite
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"planet_type": "Unknown"}]
gen_result:
 [{"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Super Earth"}, {"planet_type": "Gas Giant"}, {"planet_type": "Neptune-like"}, {"plane

In [27]:
i=7 # About the same
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"name": "GJ 1151 c", "discovery_year": 2023, "detection_method": "Radial Velocity"}, {"name": "TOI-1669 b", "discovery_year": 2023, "detection_method": "Radial Velocity"}, {"name": "TOI-1694 b", "discovery_year": 2023, "detection_method": "Transit"}, {"name": "TOI-1694 c", "discovery_year": 2023, "detection_method": "Radial Velocity"}, {"name": "TOI-4342 b", "discovery_year": 2023, "detection_method": "Transit"}, {"name": "TOI-4342 c", "discovery_year": 2023, "detection_method": "Transit"}, {"name": "TOI-4562 b", "discovery_year": 2023, "detection_method": "Transit"}, {"name": "TOI-4582 b", "discovery_year": 2023, "detection_method": "Transit"}, {"name": "TOI-700 e", "discovery_year": 2023, "detection_method": "Transit"}, {"name": "AB Aurigae b", "discovery_year": 2022, "detection_method": "Direct Imaging"}, {"name": "CoRoT-35 b", "discovery_year": 2022, "detection_method": "Transit"}, {"name": "CoRoT-36 b", "discovery_year": 2022, "detection_method": "Transit"}, {"nam

- Interestingly, 'INTERVAL' was still used in generated query after 15 epochs, but not after 25 epochs of fine tuning.  This suggests that doing stage 1 SQLite fine tune and longer training are possible substitutes. 

- Overall, ~50-60% accuracy post 18 minutes of fine tuning, seems like LoRA/DoRA is possibly at least as good for the task.