In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


[2024-10-11 14:59:33,636] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/mainuser/anaconda3/envs/sqlft/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




In [2]:
print(torch.cuda.is_available())

True


In [3]:
import wandb # do from scipt later
wandb.login()
%env WANDB_PROJECT=sql-fine-tuning

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdpopovvelasco[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=sql-fine-tuning


In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
output_dir = './sql_output_dir_llama3'
logging_dir = './sql_logging_dir_llama3'

In [6]:
import sys,gc,traceback
import torch
def clean_ipython_hist():
    # Code in this function mainly copied from IPython source
    if not 'get_ipython' in globals(): return
    ip = get_ipython()
    user_ns = ip.user_ns
    ip.displayhook.flush()
    pc = ip.displayhook.prompt_count + 1
    for n in range(1, pc): user_ns.pop('_i'+repr(n),None)
    user_ns.update(dict(_i='',_ii='',_iii=''))
    hm = ip.history_manager
    hm.input_hist_parsed[:] = [''] * pc
    hm.input_hist_raw[:] = [''] * pc
    hm._i = hm._ii = hm._iii = hm._i00 =  ''



def clean_tb():
    # h/t Piotr Czapla
    if hasattr(sys, 'last_traceback'):
        traceback.clear_frames(sys.last_traceback)
        delattr(sys, 'last_traceback')
    if hasattr(sys, 'last_type'): delattr(sys, 'last_type')
    if hasattr(sys, 'last_value'): delattr(sys, 'last_value')

def clean_mem():
    clean_tb()
    clean_ipython_hist()
    gc.collect()
    torch.cuda.empty_cache()

In [7]:
clean_mem()

## Load and Prepare the Data

<|begin▁of▁sentence|> Question: What is the total sales amount? Context: The sales table contains information about each sale including date, amount, and customer ID. Answer: SELECT SUM(amount) FROM sales; <|end▁of▁sentence|>

In [8]:

from transformers import AutoTokenizer
from datasets import load_dataset

# Load a tokenizer to use its chat template
template_tokenizer = AutoTokenizer.from_pretrained(
    "premai-io/prem-1B-SQL"
)

# def format_prompt(sample):

#     """Given a sample dictionary with keys "title" and "abstract" format into a prompt.

#     Args:
#       sample: A sample dictionary from a Hugging Face dataset.

#     Returns:
#       sample: sample dictionary with "text" key for the formatted prompt.
#     """
#     #sample['text']=f"[INST] <> Write SQL code to answer the question based on the context. Please wrap your code answer using ```: <> {sample['question']} {sample['context']} [/INST] {sample['answer']}"
#     sample['text']=f"""<|begin▁of▁sentence|> You are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.
# Context: {sample['context']}
# Question: {sample['question']}
# SQL Query:{sample['answer']} <|end▁of▁sentence|>"""
#     return {"text":sample['text']}

def format_prompt(sample):
    sample['text'] = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context: {sample['context']}
Question: {sample['question']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>SQL Query:{sample['answer']} <|end▁of▁sentence|>"""
    return {"text":sample['text']}
  

# Load and format the data using the template TinyLLama is using
dataset = (
    load_dataset("dpv/exoplanets-sql")#, split="train")
      .shuffle(seed=42)
)
dataset = dataset.map(format_prompt)

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer', 'results', 'text'],
        num_rows: 50
    })
    valid: Dataset({
        features: ['context', 'question', 'answer', 'results', 'text'],
        num_rows: 10
    })
})

In [10]:
dataset['train']['text'][0]

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nContext: CREATE TABLE exoplanets (\n    name TEXT,\n    distance REAL,\n    stellar_magnitude REAL,\n    planet_type TEXT,\n    discovery_year INTEGER,\n    mass_multiplier REAL,\n    mass_wrt TEXT,\n    radius_multiplier REAL,\n    radius_wrt TEXT,\n    orbital_radius REAL,\n    orbital_period REAL,\n    eccentricity REAL,\n    detection_method TEXT\n); CREATE TABLE reference_planets (name TEXT, mass REAL);  'mass_wrt' in exoplanets table has a one-to-one match to 'name' in reference_planets table, and  'name' refers to either Earth or Jupyter, with Jupyter having 317.8 the mass of Earth.\nQuestion: What is the average mass of exoplanets discovered each year?<|eot_id|><|start_header_id|>assistant<|end_header_id|>SQL Query:SELECT \n    e.discovery_y

- SFTTrainer expect a single 'text' column

## Evaluate the original model

In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer
MODEL_ID='meta-llama/Llama-3.2-3B-Instruct'
model_orig = AutoModelForCausalLM.from_pretrained(MODEL_ID)#.to("cuda:1")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]


In [15]:
import sqlite3
import json
import pandas as pd
import re
pd.set_option('display.max_colwidth',1000)
conn = sqlite3.connect('./databases/exoplanets_db.db')
cursor = conn.cursor()
cursor.execute(f"SELECT * FROM exoplanets LIMIT 5;")
def extract_sql_statement(text):
    # Regular expression to match the SQL statement
    #pattern = re.compile(r'SQL Query:(.*?;)', re.DOTALL)
    pattern = re.compile(r'SQL Query:\s*```sql\s*(SELECT.*?|WITH.*?)\s*```', re.DOTALL | re.IGNORECASE)

#    pattern = re.compile(r'SELECT.*?;', re.DOTALL)
    match = pattern.search(text)
    if match:
        return match.group(1).strip()
    return None

def run_query(query, cursor):
    cursor.execute(query)
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    result_dict = [dict(zip(columns, row)) for row in rows]
    return json.dumps(result_dict)
def compare_results(query_results, expected_results):
    """Compare the query results with the expected results."""
    # Sort the results for comparison
    query_results_sorted = sorted(query_results, key=lambda x: json.dumps(x, sort_keys=True))
    expected_results_sorted = sorted(expected_results, key=lambda x: json.dumps(x, sort_keys=True))
    return query_results_sorted == expected_results_sorted

valid_loaded = load_dataset('json', data_files='./databases/valid.json', split='train')

def generate_sql(context, question, max_new_tokens = 256, debug=False):
    full_question = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context: {context}
Question: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>SQL Query:"""
    model_input = tokenizer(full_question, return_tensors="pt")#.to("cuda:1")
    model_orig.eval()
    with torch.no_grad():
        out = tokenizer.decode(model_orig.generate(**model_input,max_new_tokens=max_new_tokens)[0],skip_special_tokens=True)
    #print('out: ',out)
    #import sys; sys.exit()
    #res = out[out.find("""```sql```""")+3:out.find(""";\n```""")]
    #res = out[out.find('SQL Query:')+10:]
    res = extract_sql_statement(out)
    if debug: print(res)
    run_query(res, cursor)
    #return out
    return res
num_opp_errors = 0
num_other_exceptions = 0
op_errors = []
exceptions  = []
generations = []
bad_idxs = []
result_comparisons = [False]*len(valid_loaded)
for idx,example in enumerate(valid_loaded):
    generation = ''
    try:
        print('idx: ',idx)
        generation=generate_sql(context=example['context'], question=example['question'],debug=True)
        #print(generation); break
    except sqlite3.OperationalError as op_err:
        num_opp_errors += 1
        op_errors.append((idx,op_err))
        bad_idxs.append(idx)
        print(op_err)
    except Exception as e:
        num_other_exceptions +=1
        exceptions.append((idx,e))
        bad_idxs.append(idx)
        print(e)
    else:
        result_comparisons[idx]=compare_results(run_query(generation,cursor), example['results'])
    finally:
        generations.append(generation)
print(f"num_opp_errors: {num_opp_errors}, num_other_exceptions:{num_other_exceptions},\nError rate: {(num_other_exceptions+num_opp_errors)*100/len(valid_loaded)}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


idx:  0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    discovery_year, 
    COUNT(*) as num_discoveries
FROM 
    exoplanets
GROUP BY 
    discovery_year
ORDER BY 
    num_discoveries DESC;
idx:  1


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT detection_method
FROM exoplanets
GROUP BY detection_method
ORDER BY SUM(mass_wrt) DESC
LIMIT 1;
idx:  2


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    discovery_year, 
    SUM(mass_wrt * mass_multiplier) AS total_mass
FROM 
    exoplanets
GROUP BY 
    discovery_year
ORDER BY 
    total_mass DESC
LIMIT 5;
idx:  3


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    e.name, 
    e.orbital_period, 
    e.discovery_year, 
    ROW_NUMBER() OVER (PARTITION BY e.discovery_year ORDER BY e.orbital_period DESC) as rank
FROM 
    exoplanets e
WHERE 
    e.name IN (
        SELECT 
            name
        FROM 
            reference_planets
        WHERE 
            name = 'Earth'
    )
ORDER BY 
    rank;
idx:  4


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT * 
FROM exoplanets 
WHERE discovery_year < 2000 
AND eccentricity < 0.2;
idx:  5


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT DISTINCT planet_type 
FROM exoplanets 
WHERE discovery_year < 2020;
idx:  6


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    e.name, 
    e.mass_wrt * r.mass AS mass
FROM 
    exoplanets e
JOIN 
    reference_planets r ON e.mass_wrt = r.name
WHERE 
    e.discovery_year = (SELECT MIN(discovery_year) FROM exoplanets)
ORDER BY 
    e.discovery_year;
idx:  7


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT e.name, e.discovery_year, e.detection_method
FROM exoplanets e
WHERE e.discovery_year >= (CURRENT_YEAR - 2)
AND e.name NOT IN (
  SELECT r.name
  FROM reference_planets r
  WHERE r.mass_wrt = 'Jupiter'
);
no such column: CURRENT_YEAR
idx:  8


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


SELECT 
    planet_type, 
    AVG(orbital_radius) AS average_orbital_radius
FROM 
    exoplanets
GROUP BY 
    planet_type
ORDER BY 
    average_orbital_radius DESC;
idx:  9
SELECT 
    e.name, 
    e.mass_wrt / e.radius_wrt AS mass_to_radius_ratio
FROM 
    exoplanets e
WHERE 
    e.planet_type = 'Terrestrial' AND 
    e.mass_wrt > 0 AND 
    e.radius_wrt > 0
ORDER BY 
    e.mass_wrt / e.radius_wrt DESC
LIMIT 5;
num_opp_errors: 1, num_other_exceptions:0,
Error rate: 10.0


In [16]:
op_errors, exceptions, result_comparisons

([(7, sqlite3.OperationalError('no such column: CURRENT_YEAR'))],
 [],
 [False, False, False, False, False, False, False, False, False, False])

- OK, 80% execution rate out of the box.  Looks like query 7 used improper SQLite syntax -> consider fine tuning on umesh16071973/SQLite_Training_Dataset first to align syntax.  Still not getting correct answers, it seems, must dig deeper.

In [17]:
generations

['SELECT \n    discovery_year, \n    COUNT(*) as num_discoveries\nFROM \n    exoplanets\nGROUP BY \n    discovery_year\nORDER BY \n    num_discoveries DESC;',
 'SELECT detection_method\nFROM exoplanets\nGROUP BY detection_method\nORDER BY SUM(mass_wrt) DESC\nLIMIT 1;',
 'SELECT \n    discovery_year, \n    SUM(mass_wrt * mass_multiplier) AS total_mass\nFROM \n    exoplanets\nGROUP BY \n    discovery_year\nORDER BY \n    total_mass DESC\nLIMIT 5;',
 "SELECT \n    e.name, \n    e.orbital_period, \n    e.discovery_year, \n    ROW_NUMBER() OVER (PARTITION BY e.discovery_year ORDER BY e.orbital_period DESC) as rank\nFROM \n    exoplanets e\nWHERE \n    e.name IN (\n        SELECT \n            name\n        FROM \n            reference_planets\n        WHERE \n            name = 'Earth'\n    )\nORDER BY \n    rank;",
 'SELECT * \nFROM exoplanets \nWHERE discovery_year < 2000 \nAND eccentricity < 0.2;',
 'SELECT DISTINCT planet_type \nFROM exoplanets \nWHERE discovery_year < 2020;',
 'SELECT 

In [25]:
valid_loaded['answer']

['SELECT discovery_year, COUNT(*) AS count\nFROM exoplanets\nGROUP BY discovery_year\nORDER BY count DESC\nLIMIT 1;',
 'SELECT detection_method,\n       SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\nFROM exoplanets\nGROUP BY detection_method\nORDER BY total_mass DESC\nLIMIT 1;',
 'SELECT discovery_year, \nSUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\nFROM exoplanets\nWHERE mass_multiplier IS NOT NULL\nGROUP BY discovery_year\nORDER BY total_mass DESC\nLIMIT 5;',
 'SELECT discovery_year, name, orbital_period FROM exoplanets WHERE (discovery_year, name, orbital_period) IN (   SELECT discovery_year, name, orbital_period    FROM exoplanets    ORDER BY discovery_year, orbital_period DESC) GROUP BY discovery_year LIMIT 3;',
 'SELECT name, discovery_year, eccentricity\nFROM exoplanets\nWHERE discovery_year < 2000 AND eccentricity < 0.2;',
 'SELECT DISTINCT \n    planet_type\nFROM \n    exopla

In [24]:
run_query(generations[-1],cursor), valid_loaded['results'][-1]


('[{"name": "EPIC 201497682 b", "mass_to_radius_ratio": null}, {"name": "EPIC 201757695.02", "mass_to_radius_ratio": null}, {"name": "EPIC 201833600 c", "mass_to_radius_ratio": null}, {"name": "EPIC 206215704 b", "mass_to_radius_ratio": null}, {"name": "EPIC 206317286 b", "mass_to_radius_ratio": null}]',
 '[{"name": "HD 100546 b", "mass_radius_ratio": 2.068544927536232e+29}, {"name": "K2-52 b", "mass_radius_ratio": 1.7018181818181818e+29}, {"name": "Kepler-718 b", "mass_radius_ratio": 1.683398781313473e+29}, {"name": "PH2 b", "mass_radius_ratio": 1.6815060908084162e+29}, {"name": "Kepler-488 b", "mass_radius_ratio": 1.670347764371895e+29}]')

In [26]:
valid_loaded['question'][-1]

'Find the exoplanets with the highest mass-to-radius ratio, showing their name and calculated ratio (return top 5).'

In [23]:
compare_results(run_query(generations[-1],cursor), valid_loaded['results'][-1])

False

In [27]:
run_query(generations[0],cursor), valid_loaded['results'][0]


('[{"discovery_year": 2016, "num_discoveries": 1517}, {"discovery_year": 2014, "num_discoveries": 875}, {"discovery_year": 2021, "num_discoveries": 525}, {"discovery_year": 2022, "num_discoveries": 338}, {"discovery_year": 2018, "num_discoveries": 326}, {"discovery_year": 2020, "num_discoveries": 234}, {"discovery_year": 2019, "num_discoveries": 203}, {"discovery_year": 2015, "num_discoveries": 157}, {"discovery_year": 2017, "num_discoveries": 153}, {"discovery_year": 2012, "num_discoveries": 138}, {"discovery_year": 2011, "num_discoveries": 138}, {"discovery_year": 2013, "num_discoveries": 126}, {"discovery_year": 2010, "num_discoveries": 97}, {"discovery_year": 2009, "num_discoveries": 94}, {"discovery_year": 2008, "num_discoveries": 65}, {"discovery_year": 2007, "num_discoveries": 52}, {"discovery_year": 2005, "num_discoveries": 36}, {"discovery_year": 2006, "num_discoveries": 31}, {"discovery_year": 2002, "num_discoveries": 29}, {"discovery_year": 2004, "num_discoveries": 27}, {"di

In [28]:
valid_loaded['question'][0]

'Determine the number of exoplanets discovered each year, and show the year with the highest number of discoveries.'

In [29]:
compare_results(run_query(generations[0],cursor), valid_loaded['results'][0])

False

In [27]:
bad_idxs

[7, 8]

In [29]:
for idx in bad_idxs:
    print(run_query(generations[idx],cursor))
    print(valid_loaded[idx]['results'])
    print('_'*100)
#generations[2]

In [31]:
#run_query("""SELECT name, mass, discovery_year FROM (SELECT name, mass, discovery_year, ROW_NUMBER() OVER (PARTITION BY discovery_year ORDER BY mass ASC) as rn FROM exoplanets) t WHERE rn = 1;""",cursor)
run_query("""SELECT 
    p.planet_type,
    AVG(exoplanets.orbital_radius) AS avg_orbital_radius
FROM 
    exoplanets
JOIN 
    reference_planets r ON exoplanets.mass_wrt = r.name
GROUP BY 
    exoplanets.planet_type
ORDER BY 
    avg_orbital_radius DESC;""",cursor)


## Configure Model, Data, Tokenizer, Trainer

In [12]:
clean_mem()

### DoRA Training

In [12]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_config,get_peft_model
MODEL_ID='meta-llama/Llama-3.2-3B-Instruct'
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map='auto',
    # quantization_config = bnb_config
)

model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# tokenizer.pad_token = "<pad>"
# tokenizer.padding_side = "left"
# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<pad>'})

# Set the padding side
tokenizer.padding_side = "left"
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=128,
    bias="none",
    task_type="CAUSAL_LM",
    use_dora=True,
    target_modules=  ["k_proj", "gate_proj", "v_proj", "up_proj", "q_proj", "o_proj", "down_proj"],# Layers to target
   # dtype="float32"                   	# Keep LoRA parameters in float32 precision

)

#model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
# Ensure LoRA weights are in full precision
for name, param in model.named_parameters():
    if "lora" in name:
        param.data = param.data.to(torch.float32)

# Verify the precision of LoRA weights
for name, param in model.named_parameters():
    if "lora" in name:
        print(f"{name}: {param.dtype}")


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.11s/it]


base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.q_proj.lora_magnitude_vector.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.k_proj.lora_magnitude_vector.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.v_proj.lora_magnitude_vector.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.float32
base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.float32
base_mode

In [13]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<pad>'})

# Set the padding side
tokenizer.padding_side = "left"

# Verify the tokenizer configuration
print(f"Pad token: {tokenizer.pad_token}")
print(f"Pad token ID: {tokenizer.pad_token_id}")
print(f"Padding side: {tokenizer.padding_side}")


Pad token: <pad>
Pad token ID: 128256
Padding side: left


In [17]:
from transformers import DataCollatorForLanguageModeling
# Step 3: Launch training with SFTTrainer
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_train_epochs = 20.0,
    logging_steps=10,
    fp16=True,
    gradient_checkpointing=True
)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['valid'],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=256,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=True,
)

trainer.train()


                                                
  7%|▋         | 10/140 [00:40<08:41,  4.01s/it]

{'loss': 0.0966, 'grad_norm': 0.24958732724189758, 'learning_rate': 0.00019749279121818235, 'epoch': 1.43}


                                                
 14%|█▍        | 20/140 [01:20<08:05,  4.05s/it]

{'loss': 0.0648, 'grad_norm': 0.13939407467842102, 'learning_rate': 0.0001900968867902419, 'epoch': 2.86}


                                                
 21%|██▏       | 30/140 [02:01<07:26,  4.06s/it]

{'loss': 0.0457, 'grad_norm': 0.1012723445892334, 'learning_rate': 0.000178183148246803, 'epoch': 4.29}


                                                
 29%|██▊       | 40/140 [02:42<06:46,  4.07s/it]

{'loss': 0.0335, 'grad_norm': 0.13082222640514374, 'learning_rate': 0.00016234898018587337, 'epoch': 5.71}


                                                
 36%|███▌      | 50/140 [03:22<06:05,  4.06s/it]

{'loss': 0.0262, 'grad_norm': 0.07908134907484055, 'learning_rate': 0.00014338837391175582, 'epoch': 7.14}


                                                
 43%|████▎     | 60/140 [04:03<05:25,  4.07s/it]

{'loss': 0.0204, 'grad_norm': 0.0730372741818428, 'learning_rate': 0.00012225209339563145, 'epoch': 8.57}


                                                
 50%|█████     | 70/140 [04:44<04:45,  4.07s/it]

{'loss': 0.0171, 'grad_norm': 0.06214720383286476, 'learning_rate': 0.0001, 'epoch': 10.0}


                                                
 57%|█████▋    | 80/140 [05:24<04:04,  4.07s/it]

{'loss': 0.0159, 'grad_norm': 0.05327403545379639, 'learning_rate': 7.774790660436858e-05, 'epoch': 11.43}


                                                
 64%|██████▍   | 90/140 [06:05<03:23,  4.08s/it]

{'loss': 0.0143, 'grad_norm': 0.08106514811515808, 'learning_rate': 5.6611626088244194e-05, 'epoch': 12.86}


                                                 
 71%|███████▏  | 100/140 [06:46<02:42,  4.07s/it]

{'loss': 0.0147, 'grad_norm': 0.04750976711511612, 'learning_rate': 3.7651019814126654e-05, 'epoch': 14.29}


                                                 
 79%|███████▊  | 110/140 [07:27<02:02,  4.09s/it]

{'loss': 0.0132, 'grad_norm': 0.05265866592526436, 'learning_rate': 2.181685175319702e-05, 'epoch': 15.71}


                                                 
 86%|████████▌ | 120/140 [08:07<01:21,  4.09s/it]

{'loss': 0.0138, 'grad_norm': 0.04926436394453049, 'learning_rate': 9.903113209758096e-06, 'epoch': 17.14}


                                                 
 93%|█████████▎| 130/140 [08:48<00:40,  4.09s/it]

{'loss': 0.014, 'grad_norm': 0.05464477092027664, 'learning_rate': 2.5072087818176382e-06, 'epoch': 18.57}


                                                 
100%|██████████| 140/140 [09:29<00:00,  4.07s/it]

{'loss': 0.0127, 'grad_norm': 0.04553806409239769, 'learning_rate': 0.0, 'epoch': 20.0}


                                                 
100%|██████████| 140/140 [09:32<00:00,  4.09s/it]

{'train_runtime': 572.0296, 'train_samples_per_second': 1.958, 'train_steps_per_second': 0.245, 'train_loss': 0.0287760711141995, 'epoch': 20.0}





TrainOutput(global_step=140, training_loss=0.0287760711141995, metrics={'train_runtime': 572.0296, 'train_samples_per_second': 1.958, 'train_steps_per_second': 0.245, 'total_flos': 5185100190842880.0, 'train_loss': 0.0287760711141995, 'epoch': 20.0})

In [16]:
clean_mem()

  0%|          | 0/70 [03:52<?, ?it/s]


In [33]:
# trainer.model.save_pretrained(output_dir)
# from peft import AutoPeftModelForCausalLM
# model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
# model = model.merge_and_unload()

# output_merged_dir = os.path.join(output_dir, "final_merged_checkpoint")
# model.save_pretrained(output_merged_dir, safe_serialization=True)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.18it/s]


In [38]:
valid_loaded = load_dataset('json', data_files='./databases/valid.json', split='train')


In [18]:
import sqlite3
import json
import pandas as pd
import re
pd.set_option('display.max_colwidth',1000)
conn = sqlite3.connect('./databases/exoplanets_db.db')
cursor = conn.cursor()
cursor.execute(f"SELECT * FROM exoplanets LIMIT 5;")
def extract_sql_statement(text):
    # Regular expression to match the SQL statement
    #pattern = re.compile(r'SELECT.*?;', re.DOTALL)
    #pattern = re.compile(r'(WITH.*?SELECT.*?;|SELECT.*?;)', re.DOTALL | re.IGNORECASE)
    pattern = re.compile(r'SQL Query:(.*?;)', re.DOTALL)

    match = pattern.search(text)
    if match:
        print('EXTRACTED: ', match.group(1).strip())
        return match.group(1).strip()
    return None

def run_query(query, cursor):
    cursor.execute(query)
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    result_dict = [dict(zip(columns, row)) for row in rows]
    return json.dumps(result_dict)
def compare_results(query_results, expected_results):
    """Compare the query results with the expected results."""
    # Sort the results for comparison
    query_results_sorted = sorted(query_results, key=lambda x: json.dumps(x, sort_keys=True))
    expected_results_sorted = sorted(expected_results, key=lambda x: json.dumps(x, sort_keys=True))
    return query_results_sorted == expected_results_sorted

valid_loaded = load_dataset('json', data_files='./databases/valid.json', split='train') # should not have called it 'train' split in json, but ok, HF data has a clear train/test split
def generate_sql(context, question, max_new_tokens = 256, debug=False):
    full_question = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context: {context}
Question: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>SQL Query:"""
    model_input = tokenizer(full_question, return_tensors="pt")#.to("cuda:1")
    model.eval()
    with torch.no_grad():
        out = tokenizer.decode(model.generate(**model_input,max_new_tokens=max_new_tokens)[0],skip_special_tokens=True)
    #print('out: ',out)
    #import sys; sys.exit()
    #res = out[out.find("""```sql```""")+3:out.find(""";\n```""")]
    #res = out[out.find('SQL Query:')+10:]
    #print('out:\n',out)
    res = extract_sql_statement(out)
    #if debug: print(res)
    #return out
    return res
num_opp_errors = 0
num_other_exceptions = 0
op_errors = []
exceptions  = []
generations = []
bad_idxs = []
result_comparisons = [False]*len(valid_loaded)
for idx,example in enumerate(valid_loaded):
    generation = '' # OK, had mistake here, query 9 is still wrong due to AS, though!
    result = None
    try:
        print('idx: ',idx)
        generation=generate_sql(context=example['context'], question=example['question'],debug=True)
        result =  run_query(generation,cursor)
        #print("generation: ")
        print(generation)
    except sqlite3.OperationalError as op_err:
        num_opp_errors += 1
        op_errors.append((idx,op_err))
        bad_idxs.append(idx)
        print(op_err)
    except Exception as e:
        num_other_exceptions +=1
        exceptions.append((idx,e))
        bad_idxs.append(idx)
        print(e)
    else:
        if result: result_comparisons[idx]=compare_results(result, example['results'])
    finally:
        generations.append(generation)
print(f"num_opp_errors: {num_opp_errors}, num_other_exceptions:{num_other_exceptions},\nError rate: {(num_other_exceptions+num_opp_errors)*100/len(valid_loaded)}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


idx:  0


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  WITH YearlyDiscoveries AS (
    SELECT discovery_year, COUNT(*) AS num_discoveries
    FROM exoplanets
    GROUP BY discovery_year
)
SELECT * 
FROM YearlyDiscoveries
ORDER BY num_discoveries DESC
LIMIT 1;
WITH YearlyDiscoveries AS (
    SELECT discovery_year, COUNT(*) AS num_discoveries
    FROM exoplanets
    GROUP BY discovery_year
)
SELECT * 
FROM YearlyDiscoveries
ORDER BY num_discoveries DESC
LIMIT 1;
idx:  1


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  WITH DetectionMethods AS (
    SELECT 
        detection_method, 
        COUNT(*) AS num_exoplanets,
        SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass
    FROM exoplanets
    GROUP BY detection_method
)
SELECT detection_method
FROM DetectionMethods
ORDER BY total_mass DESC
LIMIT 1;
WITH DetectionMethods AS (
    SELECT 
        detection_method, 
        COUNT(*) AS num_exoplanets,
        SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass
    FROM exoplanets
    GROUP BY detection_method
)
SELECT detection_method
FROM DetectionMethods
ORDER BY total_mass DESC
LIMIT 1;
idx:  2


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  WITH YearlyMass AS (
    SELECT discovery_year, 
           SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass
    FROM exoplanets
    GROUP BY discovery_year
)
SELECT discovery_year, total_mass
FROM YearlyMass
ORDER BY total_mass DESC
LIMIT 5;
WITH YearlyMass AS (
    SELECT discovery_year, 
           SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass
    FROM exoplanets
    GROUP BY discovery_year
)
SELECT discovery_year, total_mass
FROM YearlyMass
ORDER BY total_mass DESC
LIMIT 5;
idx:  3


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  WITH RankedExoplanets AS (
    SELECT 
        e.name,
        e.orbital_period,
        RANK() OVER (PARTITION BY e.discovery_year ORDER BY e.orbital_period DESC) AS period_rank
    FROM exoplanets e
)
SELECT * 
FROM RankedExoplanets
WHERE period_rank <= 3;
WITH RankedExoplanets AS (
    SELECT 
        e.name,
        e.orbital_period,
        RANK() OVER (PARTITION BY e.discovery_year ORDER BY e.orbital_period DESC) AS period_rank
    FROM exoplanets e
)
SELECT * 
FROM RankedExoplanets
WHERE period_rank <= 3;
idx:  4


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT name, eccentricity
FROM exoplanets
WHERE discovery_year < 2000 AND eccentricity < 0.2;
SELECT name, eccentricity
FROM exoplanets
WHERE discovery_year < 2000 AND eccentricity < 0.2;
idx:  5


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT planet_type
FROM exoplanets
WHERE discovery_year < 2020;
SELECT planet_type
FROM exoplanets
WHERE discovery_year < 2020;
idx:  6


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT 
    e.name,
    e.mass_multiplier * r.mass AS mass
FROM exoplanets e
JOIN reference_planets r ON e.mass_wrt = r.name
GROUP BY e.discovery_year
ORDER BY e.discovery_year
LIMIT 1;
SELECT 
    e.name,
    e.mass_multiplier * r.mass AS mass
FROM exoplanets e
JOIN reference_planets r ON e.mass_wrt = r.name
GROUP BY e.discovery_year
ORDER BY e.discovery_year
LIMIT 1;
idx:  7


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT 
    e.name,
    e.discovery_year,
    e.detection_method
FROM exoplanets e
WHERE e.discovery_year > NOW() - INTERVAL 2 YEAR;
near "2": syntax error
idx:  8


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


EXTRACTED:  SELECT planet_type, AVG(orbital_radius) AS avg_orbital_radius
FROM exoplanets
GROUP BY planet_type
ORDER BY avg_orbital_radius DESC;
SELECT planet_type, AVG(orbital_radius) AS avg_orbital_radius
FROM exoplanets
GROUP BY planet_type
ORDER BY avg_orbital_radius DESC;
idx:  9
EXTRACTED:  WITH CalculatedMassRatio AS (
    SELECT 
        e.name,
        e.mass_multiplier * r.mass AS mass,
        e.radius_multiplier * r.radius AS radius,
        (e.mass_multiplier * r.mass) / (e.radius_multiplier * r.radius) AS mass_ratio
    FROM exoplanets e
    JOIN reference_planets r ON e.mass_wrt = r.name
)
SELECT * 
FROM CalculatedMassRatio
ORDER BY mass_ratio DESC
LIMIT 5;
no such column: r.radius
num_opp_errors: 2, num_other_exceptions:0,
Error rate: 20.0


- After 10 epochs of training with QLoRA set with default precision for it, got the following, execution rate down from 80% to 70%, still not getting correct queries.  Trying full precision for adaptors LoRA.

- Same deal with full precision adaptor training with QLoRA.

- Same full precision LoRA (not 'Q', comment out peft_config), get 90% execution rate.  

In [19]:
op_errors, exceptions,result_comparisons

([(7, sqlite3.OperationalError('near "2": syntax error')),
  (9, sqlite3.OperationalError('no such column: r.radius'))],
 [],
 [False, False, True, False, False, False, False, False, True, False])

- Next steps: Dive into results divergence  + Possibly do preliminary fine tune on SQLite ds (umesh16071973/SQLite_Training_Dataset).

In [20]:
valid_loaded['answer']

['SELECT discovery_year, COUNT(*) AS count\nFROM exoplanets\nGROUP BY discovery_year\nORDER BY count DESC\nLIMIT 1;',
 'SELECT detection_method,\n       SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\nFROM exoplanets\nGROUP BY detection_method\nORDER BY total_mass DESC\nLIMIT 1;',
 'SELECT discovery_year, \nSUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\nFROM exoplanets\nWHERE mass_multiplier IS NOT NULL\nGROUP BY discovery_year\nORDER BY total_mass DESC\nLIMIT 5;',
 'WITH RankedExoplanets AS ( SELECT discovery_year, name, orbital_period, RANK() OVER (PARTITION BY discovery_year ORDER BY orbital_period DESC) AS period_rank FROM exoplanets) SELECT * FROM RankedExoplanets WHERE period_rank <= 3;',
 'SELECT name, discovery_year, eccentricity\nFROM exoplanets\nWHERE discovery_year < 2000 AND eccentricity < 0.2;',
 'SELECT DISTINCT \n    planet_type\nFROM \n    exoplanets\nWHERE \n    planet_ty

In [21]:
generations

['WITH YearlyDiscoveries AS (\n    SELECT discovery_year, COUNT(*) AS num_discoveries\n    FROM exoplanets\n    GROUP BY discovery_year\n)\nSELECT * \nFROM YearlyDiscoveries\nORDER BY num_discoveries DESC\nLIMIT 1;',
 'WITH DetectionMethods AS (\n    SELECT \n        detection_method, \n        COUNT(*) AS num_exoplanets,\n        SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\n    FROM exoplanets\n    GROUP BY detection_method\n)\nSELECT detection_method\nFROM DetectionMethods\nORDER BY total_mass DESC\nLIMIT 1;',
 'WITH YearlyMass AS (\n    SELECT discovery_year, \n           SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass\n    FROM exoplanets\n    GROUP BY discovery_year\n)\nSELECT discovery_year, total_mass\nFROM YearlyMass\nORDER BY total_mass DESC\nLIMIT 5;',
 'WITH RankedExoplanets AS (\n    SELECT \n        e.name,\n        e.orbital_period,\n        RANK() OVER (PARTITION BY e.disc

In [21]:
# i=0 # syntax error
# valid_result=valid_loaded['results'][i]
# print('valid_result:\n',valid_result)
# gen_result=run_query(generations[i],cursor)
# print('gen_result:\n', gen_result)
# compare_results(valid_result,gen_result)
# print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

In [22]:
i=0 # Valid +1
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"discovery_year": 2016, "count": 1517}]
gen_result:
 [{"discovery_year": 2016, "num_discoveries": 1517}]
How are the following two queries different? WITH YearlyDiscoveries AS (
    SELECT discovery_year, COUNT(*) AS num_discoveries
    FROM exoplanets
    GROUP BY discovery_year
)
SELECT * 
FROM YearlyDiscoveries
ORDER BY num_discoveries DESC
LIMIT 1;, SELECT discovery_year, COUNT(*) AS count
FROM exoplanets
GROUP BY discovery_year
ORDER BY count DESC
LIMIT 1; and which one better answers the following question: Determine the number of exoplanets discovered each year, and show the year with the highest number of discoveries.?


In [23]:
i=1 # Valid +1
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"detection_method": "Radial Velocity", "total_mass": 6.38296985206948e+30}]
gen_result:
 [{"detection_method": "Radial Velocity"}]
How are the following two queries different? WITH DetectionMethods AS (
    SELECT 
        detection_method, 
        COUNT(*) AS num_exoplanets,
        SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass
    FROM exoplanets
    GROUP BY detection_method
)
SELECT detection_method
FROM DetectionMethods
ORDER BY total_mass DESC
LIMIT 1;, SELECT detection_method,
       SUM(mass_multiplier * (SELECT mass FROM reference_planets WHERE name = mass_wrt)) AS total_mass
FROM exoplanets
GROUP BY detection_method
ORDER BY total_mass DESC
LIMIT 1; and which one better answers the following question: Display the detection method that has produced the most massive exoplanets (based on total mass).?


In [40]:
i=3 # Valid +1ish,missing discovery year col in generation
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"discovery_year": 1992, "name": "PSR B1257+12 d", "orbital_period": 0.26885694, "period_rank": 1}, {"discovery_year": 1992, "name": "PSR B1257+12 c", "orbital_period": 0.18206708, "period_rank": 2}, {"discovery_year": 1994, "name": "PSR B1257+12 b", "orbital_period": 0.06926762, "period_rank": 1}, {"discovery_year": 1995, "name": "51 Pegasi b", "orbital_period": 0.011498973, "period_rank": 1}, {"discovery_year": 1996, "name": "47 Ursae Majoris b", "orbital_period": 3.0, "period_rank": 1}, {"discovery_year": 1996, "name": "16 Cygni B b", "orbital_period": 2.2, "period_rank": 2}, {"discovery_year": 1996, "name": "70 Virginis b", "orbital_period": 0.31950718, "period_rank": 3}, {"discovery_year": 1997, "name": "Rho Coronae Borealis b", "orbital_period": 0.10896646, "period_rank": 1}, {"discovery_year": 1998, "name": "HD 210277 b", "orbital_period": 1.2106776, "period_rank": 1}, {"discovery_year": 1998, "name": "GJ 876 b", "orbital_period": 0.16728269, "period_rank": 2}, {

In [42]:
i=4 # Valid +1ish,missing discovery year col in generation
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"name": "47 Ursae Majoris b", "discovery_year": 1996, "eccentricity": 0.03}, {"name": "51 Pegasi b", "discovery_year": 1995, "eccentricity": 0.01}, {"name": "55 Cancri b", "discovery_year": 1996, "eccentricity": 0.0}, {"name": "GJ 86 b", "discovery_year": 1999, "eccentricity": 0.04}, {"name": "GJ 876 b", "discovery_year": 1998, "eccentricity": 0.03}, {"name": "HD 10697 b", "discovery_year": 1999, "eccentricity": 0.1}, {"name": "HD 130322 b", "discovery_year": 1999, "eccentricity": 0.03}, {"name": "HD 177830 b", "discovery_year": 1999, "eccentricity": 0.1}, {"name": "HD 187123 b", "discovery_year": 1998, "eccentricity": 0.01}, {"name": "HD 192263 b", "discovery_year": 1999, "eccentricity": 0.05}, {"name": "HD 195019 b", "discovery_year": 1998, "eccentricity": 0.01}, {"name": "HD 209458 b", "discovery_year": 1999, "eccentricity": 0.0}, {"name": "HD 217107 b", "discovery_year": 1998, "eccentricity": 0.13}, {"name": "HD 75289 b", "discovery_year": 1999, "eccentricity": 0.0

In [43]:
i=5 # Not quite, in real life could flag and generate such examples for training!
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"planet_type": "Unknown"}]
gen_result:
 [{"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Super Earth"}, {"planet_type": "Gas Giant"}, {"planet_type": "Neptune-like"}, {"plane

In [44]:
i=6 # Not quite, in real life could flag and generate such examples for training!
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"discovery_year": 2001, "name": "47 Ursae Majoris c", "exoplanet_mass": 1.02492e+27}, {"discovery_year": 1995, "name": "51 Pegasi b", "exoplanet_mass": 8.730800000000001e+26}, {"discovery_year": 2004, "name": "55 Cancri e", "exoplanet_mass": 4.771628000000001e+25}, {"discovery_year": 2019, "name": "EPIC 201497682 b", "exoplanet_mass": 1.5527200000000002e+24}, {"discovery_year": 2007, "name": "GJ 581 c", "exoplanet_mass": 3.2846e+25}, {"discovery_year": 2009, "name": "GJ 581 e", "exoplanet_mass": 1.01524e+25}, {"discovery_year": 1998, "name": "HD 187123 b", "exoplanet_mass": 9.926540000000001e+26}, {"discovery_year": 2003, "name": "HD 3651 b", "exoplanet_mass": 4.32744e+26}, {"discovery_year": 2000, "name": "HD 46375 b", "exoplanet_mass": 4.28948e+26}, {"discovery_year": 2002, "name": "HD 49674 b", "exoplanet_mass": 1.89808076e+26}, {"discovery_year": 2006, "name": "HD 69830 b", "exoplanet_mass": 6.09144e+25}, {"discovery_year": 1999, "name": "HD 75289 b", "exoplanet_ma

In [47]:
i=8 # Not quite, in real life could flag and generate such examples for training!
valid_result=valid_loaded['results'][i]
print('valid_result:\n',valid_result)
gen_result=run_query(generations[i],cursor)
print('gen_result:\n', gen_result)
compare_results(valid_result,gen_result)
print(f"""How are the following two queries different? {generations[i]}, {valid_loaded['answer'][i]} and which one better answers the following question: {valid_loaded['question'][i]}?""")

valid_result:
 [{"planet_type": "Gas Giant", "avg_orbital_radius": 21.5154491121673}, {"planet_type": "Unknown", "avg_orbital_radius": 16.65}, {"planet_type": "Neptune-like", "avg_orbital_radius": 0.2249016860670194}, {"planet_type": "Super Earth", "avg_orbital_radius": 0.10995212107023411}, {"planet_type": "Terrestrial", "avg_orbital_radius": 0.06238086486486486}]
gen_result:
 [{"planet_type": "Gas Giant", "avg_orbital_radius": 21.5154491121673}, {"planet_type": "Unknown", "avg_orbital_radius": 16.65}, {"planet_type": "Neptune-like", "avg_orbital_radius": 0.2249016860670194}, {"planet_type": "Super Earth", "avg_orbital_radius": 0.10995212107023411}, {"planet_type": "Terrestrial", "avg_orbital_radius": 0.06238086486486486}]
How are the following two queries different? SELECT planet_type, AVG(orbital_radius) AS avg_orbital_radius
FROM exoplanets
GROUP BY planet_type
ORDER BY avg_orbital_radius DESC;, SELECT 
    e.planet_type,
    AVG(e.orbital_radius) AS avg_orbital_radius
FROM exoplan

In [33]:
valid_loaded['question'][3]

'Rank exoplanets based on their orbital period and return the top 3 longest-period exoplanets discovered each year.'

In [30]:
run_query("""WITH RankedExoplanets AS (
    SELECT 
        discovery_year,
        e.name,
        e.orbital_period,
        RANK() OVER (PARTITION BY e.discovery_year ORDER BY e.orbital_period DESC) AS period_rank
    FROM exoplanets e
)
SELECT * 
FROM RankedExoplanets
WHERE period_rank <= 3;""",cursor)

'[{"discovery_year": 1992, "name": "PSR B1257+12 d", "orbital_period": 0.26885694, "period_rank": 1}, {"discovery_year": 1992, "name": "PSR B1257+12 c", "orbital_period": 0.18206708, "period_rank": 2}, {"discovery_year": 1994, "name": "PSR B1257+12 b", "orbital_period": 0.06926762, "period_rank": 1}, {"discovery_year": 1995, "name": "51 Pegasi b", "orbital_period": 0.011498973, "period_rank": 1}, {"discovery_year": 1996, "name": "47 Ursae Majoris b", "orbital_period": 3.0, "period_rank": 1}, {"discovery_year": 1996, "name": "16 Cygni B b", "orbital_period": 2.2, "period_rank": 2}, {"discovery_year": 1996, "name": "70 Virginis b", "orbital_period": 0.31950718, "period_rank": 3}, {"discovery_year": 1997, "name": "Rho Coronae Borealis b", "orbital_period": 0.10896646, "period_rank": 1}, {"discovery_year": 1998, "name": "HD 210277 b", "orbital_period": 1.2106776, "period_rank": 1}, {"discovery_year": 1998, "name": "GJ 876 b", "orbital_period": 0.16728269, "period_rank": 2}, {"discovery_yea

In [None]:
"[{\"discovery_year\": 1992, \"name\": \"PSR B1257+12 d\", \"orbital_period\": 0.26885694, \"period_rank\": 1}, {\"discovery_year\": 1992, \"name\": \"PSR B1257+12 c\", \"orbital_period\": 0.18206708, \"period_rank\": 2}, {\"discovery_year\": 1994, \"name\": \"PSR B1257+12 b\", \"orbital_period\": 0.06926762, \"period_rank\": 1}, {\"discovery_year\": 1995, \"name\": \"51 Pegasi b\", \"orbital_period\": 0.011498973, \"period_rank\": 1}, {\"discovery_year\": 1996, \"name\": \"47 Ursae Majoris b\", \"orbital_period\": 3.0, \"period_rank\": 1}, {\"discovery_year\": 1996, \"name\": \"16 Cygni B b\", \"orbital_period\": 2.2, \"period_rank\": 2}, {\"discovery_year\": 1996, \"name\": \"70 Virginis b\", \"orbital_period\": 0.31950718, \"period_rank\": 3}, {\"discovery_year\": 1997, \"name\": \"Rho Coronae Borealis b\", \"orbital_period\": 0.10896646, \"period_rank\": 1}, {\"discovery_year\": 1998, \"name\": \"HD 210277 b\", \"orbital_period\": 1.2106776, \"period_rank\": 1}, {\"discovery_year\": 1998, \"name\": \"GJ 876 b\", \"orbital_period\": 0.16728269, \"period_rank\": 2}, {\"discovery_year\": 1998, \"name\": \"HD 168443 b\", \"orbital_period\": 0.15906912, \"period_rank\": 3}, {\"discovery_year\": 1999, \"name\": \"Upsilon Andromedae d\", \"orbital_period\": 3.5, \"period_rank\": 1}, {\"discovery_year\": 1999, \"name\": \"HD 10697 b\", \"orbital_period\": 2.9, \"period_rank\": 2}, {\"discovery_year\": 1999, \"name\": \"HD 222582 b\", \"orbital_period\": 1.6, \"period_rank\": 3}, {\"discovery_year\": 2000, \"name\": \"Epsilon Eridani b\", \"orbital_period\": 7.3, \"period_rank\": 1}, {\"discovery_year\": 2000, \"name\": \"HD 38529 c\", \"orbital_period\": 5.8, \"period_rank\": 2}, {\"discovery_year\": 2000, \"name\": \"HD 168443 c\", \"orbital_period\": 4.8, \"period_rank\": 3}, {\"discovery_year\": 2001, \"name\": \"47 Ursae Majoris c\", \"orbital_period\": 6.6, \"period_rank\": 1}, {\"discovery_year\": 2001, \"name\": \"HD 39091 b\", \"orbital_period\": 5.7, \"period_rank\": 2}, {\"discovery_year\": 2001, \"name\": \"HD 213240 b\", \"orbital_period\": 2.4, \"period_rank\": 3}, {\"discovery_year\": 2002, \"name\": \"55 Cancri d\", \"orbital_period\": 15.3, \"period_rank\": 1}, {\"discovery_year\": 2002, \"name\": \"HD 72659 b\", \"orbital_period\": 9.7, \"period_rank\": 2}, {\"discovery_year\": 2002, \"name\": \"HD 30177 b\", \"orbital_period\": 6.9, \"period_rank\": 3}, {\"discovery_year\": 2003, \"name\": \"PSR B1620-26 b\", \"orbital_period\": 95.0, \"period_rank\": 1}, {\"discovery_year\": 2003, \"name\": \"HD 190360 b\", \"orbital_period\": 7.8, \"period_rank\": 2}, {\"discovery_year\": 2003, \"name\": \"HD 74156 c\", \"orbital_period\": 6.7, \"period_rank\": 3}, {\"discovery_year\": 2004, \"name\": \"DH Tauri b\", \"orbital_period\": 10441.5, \"period_rank\": 1}, {\"discovery_year\": 2004, \"name\": \"2MASS J12073346-3932539 b\", \"orbital_period\": 2885.9, \"period_rank\": 2}, {\"discovery_year\": 2004, \"name\": \"GQ Lupi b\", \"orbital_period\": 1195.9, \"period_rank\": 3}, {\"discovery_year\": 2005, \"name\": \"AB Pictoris b\", \"orbital_period\": 4421.7, \"period_rank\": 1}, {\"discovery_year\": 2005, \"name\": \"HD 217107 c\", \"orbital_period\": 11.7, \"period_rank\": 2}, {\"discovery_year\": 2005, \"name\": \"OGLE-2005-BLG-071L b\", \"orbital_period\": 10.1, \"period_rank\": 3}, {\"discovery_year\": 2006, \"name\": \"HN Pegasi b\", \"orbital_period\": 20692.2, \"period_rank\": 1}, {\"discovery_year\": 2006, \"name\": \"Ophiuchi 11 b\", \"orbital_period\": 20000.0, \"period_rank\": 2}, {\"discovery_year\": 2006, \"name\": \"HD 203030 b\", \"orbital_period\": 11094.6, \"period_rank\": 3}, {\"discovery_year\": 2007, \"name\": \"HD 11506 b\", \"orbital_period\": 4.4, \"period_rank\": 1}, {\"discovery_year\": 2007, \"name\": \"HD 196885 A b\", \"orbital_period\": 3.7, \"period_rank\": 2}, {\"discovery_year\": 2007, \"name\": \"Kappa Coronae Borealis b\", \"orbital_period\": 3.4, \"period_rank\": 3}, {\"discovery_year\": 2008, \"name\": \"FU Tauri b\", \"orbital_period\": 101250.8, \"period_rank\": 1}, {\"discovery_year\": 2008, \"name\": \"USco CTIO 108 b\", \"orbital_period\": 70841.0, \"period_rank\": 2}, {\"discovery_year\": 2008, \"name\": \"1RXS J160929.1-210524 b\", \"orbital_period\": 6505.9, \"period_rank\": 3}, {\"discovery_year\": 2009, \"name\": \"HIP 70849 b\", \"orbital_period\": 47.5, \"period_rank\": 1}, {\"discovery_year\": 2009, \"name\": \"47 Ursae Majoris d\", \"orbital_period\": 38.4, \"period_rank\": 2}, {\"discovery_year\": 2009, \"name\": \"DP Leonis b\", \"orbital_period\": 28.0, \"period_rank\": 3}, {\"discovery_year\": 2010, \"name\": \"Ross 458 c\", \"orbital_period\": 48780.2, \"period_rank\": 1}, {\"discovery_year\": 2010, \"name\": \"HIP 78530 b\", \"orbital_period\": 12738.7, \"period_rank\": 2}, {\"discovery_year\": 2010, \"name\": \"GSC 06214-00210 b\", \"orbital_period\": 6037.4, \"period_rank\": 3}, {\"discovery_year\": 2011, \"name\": \"WD 0806-661 b\", \"orbital_period\": 158840.9, \"period_rank\": 1}, {\"discovery_year\": 2011, \"name\": \"CFBDSIR J145829+101343 b\", \"orbital_period\": 27.5, \"period_rank\": 2}, {\"discovery_year\": 2011, \"name\": \"HIP 5158 c\", \"orbital_period\": 24.7, \"period_rank\": 3}, {\"discovery_year\": 2012, \"name\": \"Kappa Andromedae b\", \"orbital_period\": 253.1, \"period_rank\": 1}, {\"discovery_year\": 2012, \"name\": \"WISEP J121756.91+162640.2 A b\", \"orbital_period\": 130.7, \"period_rank\": 2}, {\"discovery_year\": 2012, \"name\": \"HD 142 c\", \"orbital_period\": 27.8, \"period_rank\": 3}, {\"discovery_year\": 2013, \"name\": \"HD 106906 b\", \"orbital_period\": 13538.6, \"period_rank\": 1}, {\"discovery_year\": 2013, \"name\": \"ROXs 12 b\", \"orbital_period\": 3264.5, \"period_rank\": 2}, {\"discovery_year\": 2013, \"name\": \"ROXs 42 B b\", \"orbital_period\": 1968.3, \"period_rank\": 3}, {\"discovery_year\": 2014, \"name\": \"GU Piscium b\", \"orbital_period\": 155788.8, \"period_rank\": 1}, {\"discovery_year\": 2014, \"name\": \"HD 100546 b\", \"orbital_period\": 249.2, \"period_rank\": 2}, {\"discovery_year\": 2014, \"name\": \"OGLE-2008-BLG-092L b\", \"orbital_period\": 69.0, \"period_rank\": 3}, {\"discovery_year\": 2015, \"name\": \"2MASS J02192210-3925225 b\", \"orbital_period\": 5878.1, \"period_rank\": 1}, {\"discovery_year\": 2015, \"name\": \"VHS J125601.92-125723.9 b\", \"orbital_period\": 3895.8, \"period_rank\": 2}, {\"discovery_year\": 2015, \"name\": \"51 Eridani b\", \"orbital_period\": 32.0, \"period_rank\": 3}, {\"discovery_year\": 2016, \"name\": \"2MASS J22362452+4751425 b\", \"orbital_period\": 4505.7, \"period_rank\": 1}, {\"discovery_year\": 2016, \"name\": \"HR 2562 b\", \"orbital_period\": 80.3, \"period_rank\": 2}, {\"discovery_year\": 2016, \"name\": \"GJ 676 A c\", \"orbital_period\": 38.1, \"period_rank\": 3}, {\"discovery_year\": 2017, \"name\": \"HIP 65426 b\", \"orbital_period\": 630.7, \"period_rank\": 1}, {\"discovery_year\": 2017, \"name\": \"MOA-2012-BLG-006L b\", \"orbital_period\": 46.6, \"period_rank\": 2}, {\"discovery_year\": 2017, \"name\": \"OGLE-2016-BLG-0263L b\", \"orbital_period\": 34.8, \"period_rank\": 3}, {\"discovery_year\": 2018, \"name\": \"PDS 70 b\", \"orbital_period\": 119.2, \"period_rank\": 1}, {\"discovery_year\": 2018, \"name\": \"OGLE-2011-BLG-0173L b\", \"orbital_period\": 35.4, \"period_rank\": 2}, {\"discovery_year\": 2018, \"name\": \"HD 75784 c\", \"orbital_period\": 21.6, \"period_rank\": 3}, {\"discovery_year\": 2019, \"name\": \"USco1556 b\", \"orbital_period\": 360656.2, \"period_rank\": 1}, {\"discovery_year\": 2019, \"name\": \"USco1621 b\", \"orbital_period\": 257742.3, \"period_rank\": 2}, {\"discovery_year\": 2019, \"name\": \"HIP 79098 AB b\", \"orbital_period\": 3311.0, \"period_rank\": 3}, {\"discovery_year\": 2020, \"name\": \"TYC 8998-760-1 c\", \"orbital_period\": 5727.6, \"period_rank\": 1}, {\"discovery_year\": 2020, \"name\": \"TYC 8998-760-1 b\", \"orbital_period\": 2063.1, \"period_rank\": 2}, {\"discovery_year\": 2020, \"name\": \"OGLE-2016-BLG-1227L b\", \"orbital_period\": 19.8, \"period_rank\": 3}, {\"discovery_year\": 2021, \"name\": \"COCONUTS-2 b\", \"orbital_period\": 1101369.9, \"period_rank\": 1}, {\"discovery_year\": 2021, \"name\": \"BD+60 1417 b\", \"orbital_period\": 67794.6, \"period_rank\": 2}, {\"discovery_year\": 2021, \"name\": \"CFHTWIR-Oph 98 b\", \"orbital_period\": 22027.4, \"period_rank\": 3}, {\"discovery_year\": 2022, \"name\": \"mu2 Scorpii b\", \"orbital_period\": 1251.8, \"period_rank\": 1}, {\"discovery_year\": 2022, \"name\": \"AB Aurigae b\", \"orbital_period\": 587.7, \"period_rank\": 2}, {\"discovery_year\": 2022, \"name\": \"HD 105618 c\", \"orbital_period\": 211.3, \"period_rank\": 3}, {\"discovery_year\": 2023, \"name\": \"TOI-1669 b\", \"orbital_period\": 1.4, \"period_rank\": 1}, {\"discovery_year\": 2023, \"name\": \"GJ 1151 c\", \"orbital_period\": 1.0669404, \"period_rank\": 2}, {\"discovery_year\": 2023, \"name\": \"TOI-1694 c\", \"orbital_period\": 1.0655715, \"period_rank\": 3}]"


- Results for query 0 are equivalent, just used different alias:

In [48]:
valid_result=valid_loaded['results'][0]
valid_result

'[{"discovery_year": 2016, "count": 1517}]'

In [49]:
gen_result=run_query(generations[0],cursor)
gen_result

'[{"discovery_year": 2016, "num_discoveries": 1517}]'

In [50]:
compare_results(valid_result,gen_result)

False

- Results for query 3:  The subquery logic for my VALID QUERY is flawed because it does not correctly rank or filter the exoplanets within each discovery_year. The generated query was correct!

In [60]:
valid_loaded['question'][3]

'Rank exoplanets based on their orbital period and return the top 3 longest-period exoplanets discovered each year.'

In [53]:
valid_result=valid_loaded['results'][3]
valid_result

'[{"discovery_year": 1992, "name": "PSR B1257+12 c", "orbital_period": 0.18206708}, {"discovery_year": 1994, "name": "PSR B1257+12 b", "orbital_period": 0.06926762}, {"discovery_year": 1995, "name": "51 Pegasi b", "orbital_period": 0.011498973}]'

In [82]:
gen_result=run_query(generations[3],cursor)
gen_result

'[{"discovery_year": 1992, "name": "PSR B1257+12 d", "orbital_period": 0.26885694, "period_rank": 1}, {"discovery_year": 1992, "name": "PSR B1257+12 c", "orbital_period": 0.18206708, "period_rank": 2}, {"discovery_year": 1994, "name": "PSR B1257+12 b", "orbital_period": 0.06926762, "period_rank": 1}, {"discovery_year": 1995, "name": "51 Pegasi b", "orbital_period": 0.011498973, "period_rank": 1}, {"discovery_year": 1996, "name": "47 Ursae Majoris b", "orbital_period": 3.0, "period_rank": 1}, {"discovery_year": 1996, "name": "16 Cygni B b", "orbital_period": 2.2, "period_rank": 2}, {"discovery_year": 1996, "name": "70 Virginis b", "orbital_period": 0.31950718, "period_rank": 3}, {"discovery_year": 1997, "name": "Rho Coronae Borealis b", "orbital_period": 0.10896646, "period_rank": 1}, {"discovery_year": 1998, "name": "HD 210277 b", "orbital_period": 1.2106776, "period_rank": 1}, {"discovery_year": 1998, "name": "GJ 876 b", "orbital_period": 0.16728269, "period_rank": 2}, {"discovery_yea

In [55]:
compare_results(valid_result,gen_result)

False

- Results for query 4:  Generated query actually answers the question exactly while valid query also had discovery year (not wrong, just extra)

In [61]:
valid_loaded['question'][4]

'List exoplanets with a discovery year before 2000 and an eccentricity less than 0.2.'

In [56]:
valid_result=valid_loaded['results'][4]
valid_result

'[{"name": "47 Ursae Majoris b", "discovery_year": 1996, "eccentricity": 0.03}, {"name": "51 Pegasi b", "discovery_year": 1995, "eccentricity": 0.01}, {"name": "55 Cancri b", "discovery_year": 1996, "eccentricity": 0.0}, {"name": "GJ 86 b", "discovery_year": 1999, "eccentricity": 0.04}, {"name": "GJ 876 b", "discovery_year": 1998, "eccentricity": 0.03}, {"name": "HD 10697 b", "discovery_year": 1999, "eccentricity": 0.1}, {"name": "HD 130322 b", "discovery_year": 1999, "eccentricity": 0.03}, {"name": "HD 177830 b", "discovery_year": 1999, "eccentricity": 0.1}, {"name": "HD 187123 b", "discovery_year": 1998, "eccentricity": 0.01}, {"name": "HD 192263 b", "discovery_year": 1999, "eccentricity": 0.05}, {"name": "HD 195019 b", "discovery_year": 1998, "eccentricity": 0.01}, {"name": "HD 209458 b", "discovery_year": 1999, "eccentricity": 0.0}, {"name": "HD 217107 b", "discovery_year": 1998, "eccentricity": 0.13}, {"name": "HD 75289 b", "discovery_year": 1999, "eccentricity": 0.03}, {"name": "

In [57]:
gen_result=run_query(generations[4],cursor)
gen_result

'[{"name": "47 Ursae Majoris b", "eccentricity": 0.03}, {"name": "51 Pegasi b", "eccentricity": 0.01}, {"name": "55 Cancri b", "eccentricity": 0.0}, {"name": "GJ 86 b", "eccentricity": 0.04}, {"name": "GJ 876 b", "eccentricity": 0.03}, {"name": "HD 10697 b", "eccentricity": 0.1}, {"name": "HD 130322 b", "eccentricity": 0.03}, {"name": "HD 177830 b", "eccentricity": 0.1}, {"name": "HD 187123 b", "eccentricity": 0.01}, {"name": "HD 192263 b", "eccentricity": 0.05}, {"name": "HD 195019 b", "eccentricity": 0.01}, {"name": "HD 209458 b", "eccentricity": 0.0}, {"name": "HD 217107 b", "eccentricity": 0.13}, {"name": "HD 75289 b", "eccentricity": 0.03}, {"name": "HR 810 b", "eccentricity": 0.14}, {"name": "PSR B1257+12 b", "eccentricity": 0.0}, {"name": "PSR B1257+12 c", "eccentricity": 0.02}, {"name": "PSR B1257+12 d", "eccentricity": 0.03}, {"name": "Rho Coronae Borealis b", "eccentricity": 0.04}, {"name": "Tau Bootis b", "eccentricity": 0.01}, {"name": "Upsilon Andromedae b", "eccentricity"

In [None]:
compare_results(valid_result,gen_result)

False

- Results for query 5: OK, maybe the results of valid query are not satisfying, but it is correct compared to the generated query.

In [64]:
valid_loaded['question'][5]

'Identify the planet types that have not been discovered since 2020, meaning they have no entries from that year onward.'

In [62]:
valid_result=valid_loaded['results'][5]
valid_result

'[{"planet_type": "Unknown"}]'

In [66]:
valid_loaded

Dataset({
    features: ['context', 'question', 'answer', 'results'],
    num_rows: 10
})

In [67]:
run_query(valid_loaded['answer'][5],cursor)

'[{"planet_type": "Unknown"}]'

In [63]:
gen_result=run_query(generations[5],cursor)
gen_result

'[{"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Gas Giant"}, {"planet_type": "Super Earth"}, {"planet_type": "Gas Giant"}, {"planet_type": "Neptune-like"}, {"planet_type": "Neptune-like"}, {"planet_type": "Neptune-like"

In [None]:
compare_results(valid_result,gen_result)

False

- Results for query 6:  OK, valid one is correct; generated query only returns the single exoplanet with the overall minimum mass.

In [71]:
valid_loaded['question'][6]

'What are the names and masses of the exoplanets that have the minimum mass for each discovery year?'

In [72]:
valid_result=valid_loaded['results'][6]
valid_result

'[{"discovery_year": 2001, "name": "47 Ursae Majoris c", "exoplanet_mass": 1.02492e+27}, {"discovery_year": 1995, "name": "51 Pegasi b", "exoplanet_mass": 8.730800000000001e+26}, {"discovery_year": 2004, "name": "55 Cancri e", "exoplanet_mass": 4.771628000000001e+25}, {"discovery_year": 2019, "name": "EPIC 201497682 b", "exoplanet_mass": 1.5527200000000002e+24}, {"discovery_year": 2007, "name": "GJ 581 c", "exoplanet_mass": 3.2846e+25}, {"discovery_year": 2009, "name": "GJ 581 e", "exoplanet_mass": 1.01524e+25}, {"discovery_year": 1998, "name": "HD 187123 b", "exoplanet_mass": 9.926540000000001e+26}, {"discovery_year": 2003, "name": "HD 3651 b", "exoplanet_mass": 4.32744e+26}, {"discovery_year": 2000, "name": "HD 46375 b", "exoplanet_mass": 4.28948e+26}, {"discovery_year": 2002, "name": "HD 49674 b", "exoplanet_mass": 1.89808076e+26}, {"discovery_year": 2006, "name": "HD 69830 b", "exoplanet_mass": 6.09144e+25}, {"discovery_year": 1999, "name": "HD 75289 b", "exoplanet_mass": 9.3001999

In [73]:
gen_result=run_query(generations[6],cursor)
gen_result

'[{"name": "PSR B1257+12 d", "mass": 2.32908e+25}]'

In [None]:
compare_results(valid_result,gen_result)

False

- Results for query 7: Validation query gives discoveries within the last two years, whereas the generated query gives results within the last two years of data available in the data set.  Guess this is a toss up.

In [74]:
valid_loaded['question'][7]

'Find exoplanets discovered in the last two years and list them along with their discovery year and detection method.'

In [75]:
valid_result=valid_loaded['results'][7]
valid_result

'[{"name": "GJ 1151 c", "discovery_year": 2023, "detection_method": "Radial Velocity"}, {"name": "TOI-1669 b", "discovery_year": 2023, "detection_method": "Radial Velocity"}, {"name": "TOI-1694 b", "discovery_year": 2023, "detection_method": "Transit"}, {"name": "TOI-1694 c", "discovery_year": 2023, "detection_method": "Radial Velocity"}, {"name": "TOI-4342 b", "discovery_year": 2023, "detection_method": "Transit"}, {"name": "TOI-4342 c", "discovery_year": 2023, "detection_method": "Transit"}, {"name": "TOI-4562 b", "discovery_year": 2023, "detection_method": "Transit"}, {"name": "TOI-4582 b", "discovery_year": 2023, "detection_method": "Transit"}, {"name": "TOI-700 e", "discovery_year": 2023, "detection_method": "Transit"}, {"name": "AB Aurigae b", "discovery_year": 2022, "detection_method": "Direct Imaging"}, {"name": "CoRoT-35 b", "discovery_year": 2022, "detection_method": "Transit"}, {"name": "CoRoT-36 b", "discovery_year": 2022, "detection_method": "Transit"}, {"name": "CoRoT-7 d

In [76]:
gen_result=run_query(generations[7],cursor)
gen_result

'[{"name": "2M0437 b", "discovery_year": 2021, "detection_method": "Direct Imaging"}, {"name": "AU Microscopii c", "discovery_year": 2021, "detection_method": "Transit"}, {"name": "b Centauri AB b", "discovery_year": 2021, "detection_method": "Direct Imaging"}, {"name": "BD-00 4475 b", "discovery_year": 2021, "detection_method": "Radial Velocity"}, {"name": "BD+45 564 b", "discovery_year": 2021, "detection_method": "Radial Velocity"}, {"name": "BD+55 362 b", "discovery_year": 2021, "detection_method": "Radial Velocity"}, {"name": "BD+60 1417 b", "discovery_year": 2021, "detection_method": "Direct Imaging"}, {"name": "BD+63 1405 b", "discovery_year": 2021, "detection_method": "Radial Velocity"}, {"name": "CFHTWIR-Oph 98 b", "discovery_year": 2021, "detection_method": "Direct Imaging"}, {"name": "COCONUTS-2 b", "discovery_year": 2021, "detection_method": "Direct Imaging"}, {"name": "EPIC 201427007 b", "discovery_year": 2021, "detection_method": "Transit"}, {"name": "EPIC 201595106 b", "d

In [None]:
compare_results(valid_result,gen_result)

False

- Results for query 9, has a syntax error near 'AS' and attempts to calculate the average orbital radius for each planet type and orders the results by the average orbital radius in descending order, not exactly the question.

In [77]:
valid_loaded['question'][-1]

'Find the exoplanets with the highest mass-to-radius ratio, showing their name and calculated ratio (return top 5).'

In [78]:
valid_result=valid_loaded['results'][-1]
valid_result

'[{"name": "HD 100546 b", "mass_radius_ratio": 2.068544927536232e+29}, {"name": "K2-52 b", "mass_radius_ratio": 1.7018181818181818e+29}, {"name": "Kepler-718 b", "mass_radius_ratio": 1.683398781313473e+29}, {"name": "PH2 b", "mass_radius_ratio": 1.6815060908084162e+29}, {"name": "Kepler-488 b", "mass_radius_ratio": 1.670347764371895e+29}]'

In [79]:
gen_result=run_query(generations[-1],cursor)
gen_result

'[{"planet_type": "Gas Giant", "avg_orbital_radius": 21.5154491121673}, {"planet_type": "Unknown", "avg_orbital_radius": 16.65}, {"planet_type": "Neptune-like", "avg_orbital_radius": 0.2249016860670194}, {"planet_type": "Super Earth", "avg_orbital_radius": 0.10995212107023411}, {"planet_type": "Terrestrial", "avg_orbital_radius": 0.06238086486486486}]'

In [None]:
compare_results(valid_result,gen_result)

False

- Overall, ~70% accuracy post 2 minutes of fine tuning and uncovered a mistake in my validation set!