In [1]:
!pip install -U \
torch==2.9.0+cu128 torchvision==0.24.0+cu128 torchaudio==2.9.0+cu128 \
transformers==5.0.0 accelerate==1.12.0 datasets==4.5.0 evaluate==0.4.3 \
peft==0.18.1 trl==0.27.2 bitsandbytes==0.49.1 \
huggingface-hub==1.3.7 tokenizers==0.22.2 \
sqlite-utils==3.38 sqlalchemy==2.0.30

Collecting datasets==4.5.0
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate==0.4.3
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting trl==0.27.2
  Downloading trl-0.27.2-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes==0.49.1
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting huggingface-hub==1.3.7
  Downloading huggingface_hub-1.3.7-py3-none-any.whl.metadata (13 kB)
Collecting sqlite-utils==3.38
  Downloading sqlite_utils-3.38-py3-none-any.whl.metadata (7.5 kB)
Collecting sqlalchemy==2.0.30
  Downloading SQLAlchemy-2.0.30-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting pyarrow>=21.0.0 (from datasets==4.5.0)
  Downloading pyarrow-23.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting sqlite-fts4 (from sqlite-utils==3.38)
  Downloading sqlite_fts4-1.0.3-py3-none-any.whl.metadata (6.6 kB)
Collecting click-default-gro

In [2]:
!pip list | grep -E 'transformers|accelerate|datasets|sqlalchemy|sqlite-utils|moz-sql-parser|evaluate|torch|torchvision'

accelerate                               1.12.0
datasets                                 4.5.0
evaluate                                 0.4.3
sentence-transformers                    5.2.2
sqlalchemy-spanner                       1.17.2
sqlite-utils                             3.38
tensorflow-datasets                      4.9.9
torch                                    2.9.0+cu128
torchao                                  0.10.0
torchaudio                               2.9.0+cu128
torchcodec                               0.8.0+cu128
torchdata                                0.11.0
torchsummary                             1.5.1
torchtune                                0.6.1
torchvision                              0.24.0+cu128
transformers                             5.0.0
vega-datasets                            0.9.0


In [3]:
%%bash
python - <<'PY'
import random, numpy as np, torch, os
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
print("Seeds fixed to", SEED)
PY


Seeds fixed to 42


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from huggingface_hub import login
from google.colab import userdata

# Retrieve the Hugging Face API key from Colab's secrets manager
hf_token = userdata.get('HF_API_KEY')

# Log in to Hugging Face Hub using the retrieved token
login(hf_token)

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
from pathlib import Path
import json

DATA_DIR = Path("/content/drive/MyDrive/chichewa-text2sql/data")

train_path = DATA_DIR / "train.json"
dev_path   = DATA_DIR / "dev.json"
test_path  = DATA_DIR / "test.json"

print("Train exists:", train_path.exists())
print("Dev exists:", dev_path.exists())
print("Test exists:", test_path.exists())

Train exists: True
Dev exists: True
Test exists: True


In [7]:
def load_json(path: Path):
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)


train_data = load_json(train_path)
dev_data   = load_json(dev_path)
test_data  = load_json(test_path)

print(f"Train size: {len(train_data)}")
print(f"Dev size:   {len(dev_data)}")
print(f"Test size:  {len(test_data)}")

Train size: 280
Dev size:   60
Test size:  60


In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "meta-llama/Llama-3.1-8B-Instruct"  # use preferred model
tokenizer  = AutoTokenizer.from_pretrained(model_name)
model      = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
  

In [9]:
# reflect every .sqlite file to create a plain-text schema description

import sqlalchemy as sa

DB_PATH = Path("/content/drive/MyDrive/chichewa-text2sql/data/database/chichewa_text2sql.db")

def get_schema_string() -> str:
    """Return a compact textual schema for the given database."""

    engine  = sa.create_engine(f"sqlite:///{DB_PATH}")
    insp    = sa.inspect(engine)

    parts   = []
    for tbl in sorted(insp.get_table_names()):
        cols = [c["name"] for c in insp.get_columns(tbl)]
        parts.append(f"{tbl}({', '.join(cols)})")

    schema_str = ", ".join(parts)
    return schema_str

In [10]:
schema_str = get_schema_string()
print(schema_str)

commodity_prices(id, add_name, epa_name, district, market, month_name, year, commodity, price, collection_date), food_insecurity(id, district, analyzed_population, time_period, percentage_population, insecurity_level, insecurity_desc_short, insecurity_desc_long), mse_daily(id, counter_id, ticker, trade_date, print_time, company_name, sector, high_price, low_price, bid_price, ask_price, previous_close_price, close_price, volume, dividend_mwk, dividend_yield_pct, earnings_yield_pct, pe_ratio, pbv_ratio, market_cap_mwk_mn, profit_after_tax_mwk_mn, shares_outstanding), population(id, region_name, region_code, admin_status, district_code, ea_number, ea_code, ta_code, ta_name, population_male, population_female, number_households, district_name, total_population), production(id, district, crop, yield, season)


In [11]:
import random, numpy as np, torch, os, gc
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

# ──────────────────────────────────────────────────────────────────────────
# 1 ▪ zero-shot prompt builder (no demos)
# ──────────────────────────────────────────────────────────────────────────
def build_prompt_zero(nl_question: str, target_schema: str) -> str:
    return (
        "### Instruction:\n"
        "You are an expert SQL developer. Given the database schema and the "
        "question, return ONE valid SQL statement — output ONLY the SQL.\n\n"
        f"### Database Schema:\n{target_schema}\n\n"
        f"### Question:\n{nl_question}\n\n"
        "### SQL:\n"
    )

# ──────────────────────────────────────────────────────────────────────────
# 2 ▪ random-5-shot builder
# ──────────────────────────────────────────────────────────────────────────
NUM_SHOTS = 5

# Convert train_data to a Hugging Face Dataset object to use .shuffle() and .select()
from datasets import Dataset
train_dataset = Dataset.from_list(train_data)

DEMO_SET = [
    {**ex, "schema_str": get_schema_string()}
    for ex in train_dataset.shuffle(seed=SEED).select(range(NUM_SHOTS))
]

def build_prompt_random5(nl_question: str, target_schema: str) -> str:
    parts = ["### Instruction:\nReturn ONE SQL query only, based on the examples provided.\n"]
    for i, ex in enumerate(DEMO_SET, 1):
        parts += [
            f"### Example {i} Schema:\n{ex['schema_str']}",
            f"### Example {i} Question:\n{ex['question_ny']}",
            f"### Example {i} SQL:\n{ex['sql_statement'].strip()}",
        ]
    parts += [
        f"### Database Schema:\n{target_schema}",
        f"### Question:\n{nl_question}",
        "### SQL:\n",
    ]
    return "\n".join(parts)

# ──────────────────────────────────────────────────────────────────────────
# 3 ▪ retrieved-5-shot builder (SBERT nearest neighbours)
# ──────────────────────────────────────────────────────────────────────────
from sentence_transformers import SentenceTransformer, util
embedder   = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
train_emb  = embedder.encode(train_dataset["question_ny"], convert_to_tensor=True)

def get_k_shots(nl_question: str, k: int = 5):
    q_emb  = embedder.encode(nl_question, convert_to_tensor=True)
    hits   = util.semantic_search(q_emb, train_emb, top_k=k)[0]
    demos  = []
    for h in hits:
        ex = train_dataset[int(h["corpus_id"])]
        demos.append({
            "question"   : ex["question_ny"],
            "query"      : ex["sql_statement"],
            "schema_str" : get_schema_string(),
        })
    return demos

def build_prompt_retrieved5(nl_question: str, target_schema: str) -> str:
    demos = get_k_shots(nl_question, 5)
    parts = ["### Instruction:\nReturn ONE SQL query only, based on the examples provided.\n"]
    for i, ex in enumerate(demos, 1):
        parts += [
            f"### Example {i} Schema:\n{ex['schema_str']}",
            f"### Example {i} Question:\n{ex['question']}",
            f"### Example {i} SQL:\n{ex['query'].strip()}",
            "### End\n",
        ]
    parts += [
        f"### Database Schema:\n{target_schema}",
        f"### Question:\n{nl_question}",
        "### SQL:\n",
    ]
    return "\n".join(parts)

# ──────────────────────────────────────────────────────────────────────────
# 4 ▪ shared generate_sql that accepts a *builder* argument
# ──────────────────────────────────────────────────────────────────────────
import re

def make_generator(builder_fn, tokenizer, model): # Modified to accept tokenizer and model
    def _gen(nl_question: str, schema: str) -> str:
        prompt = builder_fn(nl_question, schema)
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        input_len = inputs["input_ids"].shape[1]

        model.config.pad_token_id = tokenizer.eos_token_id
        out = model.generate(
            **inputs,
            max_new_tokens=128,
            num_beams=3,
            early_stopping=True,
            do_sample=False,
        )
        gen_text = tokenizer.decode(out[0, input_len:], skip_special_tokens=True)
        sql = re.split(r"(###|\n\s*\n|```)", gen_text, 1)[0]
        sql = sql.split(";")[0].replace("\n", " ").strip()
        return sql.split(";", 1)[0].strip()
    return _gen

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/526 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
# choose the prompt mode here ─────────────────────────────────────────────
PROMPT_MODE = 0          # 0 = zero-shot, 1 = random-5, 2 = retrieved-5
builder = [build_prompt_zero,
           build_prompt_random5,
           build_prompt_retrieved5][PROMPT_MODE]
generate_sql = make_generator(builder, tokenizer, model)

In [13]:
example      = test_data[52]
display(example)
print("=="*50 + "\n")
schema_str   = get_schema_string()
predicted_sql= generate_sql(example["question_ny"], schema_str)
print("Gold SQL:", example["sql_statement"])
print("Pred SQL:", predicted_sql)

{'id': 330,
 'question_en': 'How many distinct time periods are recorded for Nkhotakota?',
 'question_ny': 'Ndi nthawi ziti zodziwika bwino zomwe zinalembedwa ku Nkhotakota?',
 'sql_statement': "SELECT COUNT(DISTINCT time_period) FROM food_insecurity WHERE district = 'Nkhotakota';",
 'sql_result': '[(2,)]',
 'difficulty_level': 'medium',
 'table': 'food_insecurity'}

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Gold SQL: SELECT COUNT(DISTINCT time_period) FROM food_insecurity WHERE district = 'Nkhotakota';
Pred SQL: 


In [14]:
test_data[2]

{'id': 11,
 'question_en': 'What was the yield of wheat in nkhotakota?',
 'question_ny': 'Ndi tiligu ochuluka bwanj adakololedwa ku Nkhotakota?',
 'sql_statement': "SELECT Yield FROM production WHERE District = 'Nkhotakota' AND Crop = 'Wheat';",
 'sql_result': '[(0.0,)]',
 'difficulty_level': 'easy',
 'table': 'production'}

In [15]:
import re
_ws   = re.compile(r"\s+")
_comm = re.compile(r"\s*,\s*")          # blank(s) ↔ comma ↔ blank(s)

def exact_match(pred: str, gold: str) -> bool:
    def norm(s: str) -> str:
        s = s.strip().lower()
        s = _ws.sub(" ", s)             # collapse runs of whitespace
        s = _comm.sub(", ", s)          # canonical “comma␣”
        s = s.rstrip(';')                # Remove trailing semicolon
        return s
    return norm(pred) == norm(gold)

In [16]:
from sqlglot import parse_one, expressions

def flatten_ast(node):
    """
    Recursively collect all node‐type names and literal values as lowercase strings.
    """
    out = set()

    def walk(n):
        # record the AST node type
        out.add(type(n).__name__.lower())

        # record any literal (e.g. identifiers, strings, numbers)
        if hasattr(n, "this") and isinstance(n.this, (str, int, float)):
            out.add(str(n.this).lower())

        # recurse into child expressions
        for arg in n.args.values():
            if isinstance(arg, list):
                for child in arg:
                    if isinstance(child, expressions.Expression):
                        walk(child)
            elif isinstance(arg, expressions.Expression):
                walk(arg)

    walk(node)
    return out

def component_match(pred_sql, gold_sql):
    try:
        pred_ast = parse_one(pred_sql)
        gold_ast = parse_one(gold_sql)
    except Exception:
        return 0.0

    pred_set = flatten_ast(pred_ast)
    gold_set = flatten_ast(gold_ast)
    if not gold_set:
        return 0.0
    return len(pred_set & gold_set) / len(gold_set)


In [17]:
import sqlite3, pandas as pd, numpy as np
from tqdm import tqdm
from datasets import Dataset

def run_query(sql: str, db_path: Path):
    """Return query result as a sorted list of tuples (order-independent)."""
    try:
        with sqlite3.connect(db_path) as conn:
            df = pd.read_sql_query(sql, conn)
        # sort rows + cols for order-invariant comparison
        return tuple(map(tuple, df.sort_index(axis=1).sort_values(list(df.columns)).to_numpy()))
    except Exception as e:

        return f"ERROR-{e}"

def execution_accuracy(dataset):
    """Compute Spider-style Execution Accuracy on the test split"""
    correct = 0
    for ex in tqdm(dataset, desc="Evaluating"):

        schema  = schema_str

        pred_sql= generate_sql(ex["question_en"], schema)

        # Use global DB_PATH
        gold    = run_query(ex["sql_statement"], DB_PATH)
        pred    = run_query(pred_sql, DB_PATH)

        if gold == pred:
            correct += 1

    return correct / len(dataset)

In [20]:
# choose the prompt mode here ─────────────────────────────────────────────
PROMPT_MODE = 0          # 0 = zero-shot, 1 = random-5, 2 = retrieved-5
builder = [build_prompt_zero,
           build_prompt_random5,
           build_prompt_retrieved5][PROMPT_MODE]
generate_sql = make_generator(builder, tokenizer, model)

example      = test_data[4]
schema_str   = get_schema_string()
predicted_sql= generate_sql(example["question_en"], schema_str)
print("Gold SQL:", example["sql_statement"])
print("Pred SQL:", predicted_sql)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Gold SQL: SELECT Crop, MAX(Yield) AS Max_Yield FROM production WHERE District = 'Dedza' AND Season = '2023-2024';
Pred SQL: SELECT T1.crop FROM production AS T1 INNER JOIN population AS T2 ON T1.district = T2.district_name WHERE T2.district_name = 'Dedza' AND T1.season = '2023-2024' ORDER BY T1.yield DESC LIMIT 1


In [21]:
exact_match(predicted_sql, example["sql_statement"])

False

In [22]:
component_match(predicted_sql, example["sql_statement"])

0.8421052631578947

In [23]:
run_query(example["sql_statement"], DB_PATH)

((418387.0, 'Sweet potatoes'),)

In [24]:
run_query(predicted_sql, DB_PATH)

(('Sweet potatoes',),)

In [None]:
# from datasets import Dataset

# # Convert test_data to a Hugging Face Dataset object
# test_dataset = Dataset.from_list(test_data)

# # Calculate and print execution accuracy
# accuracy = execution_accuracy(test_dataset)
# print(f"Execution Accuracy on test data: {accuracy}")

# zero shot

In [25]:
# Zero shot evaluation on the test data
import time, torch, numpy as np
from tqdm import tqdm
import warnings
from transformers import logging
import random
from datasets import Dataset

random.seed(42)

logging.set_verbosity_error()
warnings.filterwarnings('ignore')

# choose the prompt mode here ─────────────────────────────────────────────
PROMPT_MODE = 0          # 0 = zero-shot, 1 = random-5, 2 = retrieved-5
builder = [build_prompt_zero,
           build_prompt_random5,
           build_prompt_retrieved5][PROMPT_MODE]
generate_sql = make_generator(builder, tokenizer, model)

# 1)
# Use test_data as the evaluation dataset
# Ensure it's converted to a Hugging Face Dataset for shuffle/select methods
test_dataset = Dataset.from_list(test_data)
num_samples = len(test_dataset)
sample_test  = test_dataset.shuffle(seed=42).select(range(num_samples))


# 2) containers
em_scores, cm_scores, ex_scores, times = [], [], [], []

# 3) start fresh CUDA-peak tracking
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

# 4) main loop
for ex in tqdm(sample_test, desc="Evaluating"):
    # db_id = ex["db_id"] # Not used since get_schema_string does not take db_id
    # schema = get_schema_string(db_id) # Incorrect, get_schema_string takes no args
    schema = schema_str # Use the globally defined schema_str

    t0      = time.perf_counter()
    pred_sql= generate_sql(ex["question_ny"], schema)
    times.append(time.perf_counter() - t0)

    gold_sql= ex["sql_statement"]

    # exact + component match
    em_scores.append( float(exact_match(pred_sql, gold_sql)) )
    cm_scores.append( component_match(pred_sql, gold_sql) )

    # execution accuracy
    gold_res= run_query(gold_sql,  DB_PATH)
    pred_res= run_query(pred_sql, DB_PATH)
    ex_scores.append( int(gold_res == pred_res) )

# 5) aggregate & report
print("\n\nExact Match        :", np.mean(em_scores))
print("Component Match    :", np.mean(cm_scores))
print("Execution Accuracy :", np.mean(ex_scores))
print("Avg. Latency  (s)  :", np.mean(times))
print("95% Latency  (s)   :", np.percentile(times, 95))
if torch.cuda.is_available():
    print("GPU Mem Peak       :", torch.cuda.max_memory_allocated() / 1e9, "GB")
else:
    print("GPU Mem Peak       : N/A (CPU)")


Evaluating:   0%|          | 0/60 [00:00<?, ?it/s][A
Evaluating:   2%|▏         | 1/60 [00:08<08:50,  9.00s/it][A
Evaluating:   3%|▎         | 2/60 [00:18<08:43,  9.02s/it][A
Evaluating:   5%|▌         | 3/60 [00:27<08:35,  9.04s/it][A
Evaluating:   7%|▋         | 4/60 [00:36<08:27,  9.06s/it][A
Evaluating:   8%|▊         | 5/60 [00:45<08:18,  9.07s/it][A
Evaluating:  10%|█         | 6/60 [00:54<08:09,  9.07s/it][A
Evaluating:  12%|█▏        | 7/60 [01:03<08:01,  9.08s/it][A
Evaluating:  13%|█▎        | 8/60 [01:12<07:51,  9.08s/it][A
Evaluating:  15%|█▌        | 9/60 [01:21<07:43,  9.08s/it][A
Evaluating:  17%|█▋        | 10/60 [01:30<07:33,  9.08s/it][A
Evaluating:  18%|█▊        | 11/60 [01:39<07:25,  9.08s/it][A
Evaluating:  20%|██        | 12/60 [01:48<07:15,  9.08s/it][A
Evaluating:  22%|██▏       | 13/60 [01:57<07:06,  9.08s/it][A
Evaluating:  23%|██▎       | 14/60 [02:07<06:57,  9.08s/it][A
Evaluating:  25%|██▌       | 15/60 [02:16<06:48,  9.08s/it][A
Evaluatin



Exact Match        : 0.0
Component Match    : 0.16032448472580052
Execution Accuracy : 0.0
Avg. Latency  (s)  : 9.060781327266662
95% Latency  (s)   : 9.086080673699929
GPU Mem Peak       : 16.756563968 GB





# Few-shot (Random Sampling)

In [28]:
# choose the prompt mode here ─────────────────────────────────────────────
PROMPT_MODE = 1          # 0 = zero-shot, 1 = random-5, 2 = retrieved-5
builder = [build_prompt_zero,
           build_prompt_random5,
           build_prompt_retrieved5][PROMPT_MODE]
generate_sql = make_generator(builder, tokenizer, model)

example      = test_data[4]
display(example)
print("=="*50 + "\n")
schema_str   = get_schema_string()
predicted_sql= generate_sql(example["question_ny"], schema_str)
print("Gold SQL:", example["sql_statement"])
print("Pred SQL:", predicted_sql)

{'id': 21,
 'question_en': 'Which crop had the highest yield in Dedza during 2023-2024?',
 'question_ny': 'Ndi mbewu iti idali ndi zokolola zochuluka ku Dedza mu 2023-2024?',
 'sql_statement': "SELECT Crop, MAX(Yield) AS Max_Yield FROM production WHERE District = 'Dedza' AND Season = '2023-2024';",
 'sql_result': "[('Sweet potatoes', 418387.0)]",
 'difficulty_level': 'easy',
 'table': 'production'}


Gold SQL: SELECT Crop, MAX(Yield) AS Max_Yield FROM production WHERE District = 'Dedza' AND Season = '2023-2024';
Pred SQL: SELECT SUM(yield) FROM production WHERE district = 'Dedza' AND season = '2023-2024'


In [29]:
# Few Shot evaluation on a test data (random sampling)
import time, torch, numpy as np
from tqdm import tqdm
import warnings
from transformers import logging
import random
from datasets import Dataset # Import Dataset

random.seed(42)

logging.set_verbosity_error()
warnings.filterwarnings('ignore')

# choose the prompt mode here ─────────────────────────────────────────────
PROMPT_MODE = 1          # 0 = zero-shot, 1 = random-5, 2 = retrieved-5
builder = [build_prompt_zero,
           build_prompt_random5,
           build_prompt_retrieved5][PROMPT_MODE]
generate_sql = make_generator(builder, tokenizer, model)

# 1)
# Use test_data for evaluation
test_dataset = Dataset.from_list(test_data)
num_samples = len(test_dataset)
sample_test  = test_dataset.shuffle(seed=42).select(range(num_samples))


# 2) containers
em_scores, cm_scores, ex_scores, times = [], [], [], []

# 3) start fresh CUDA-peak tracking
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

# 4) main loop
for ex in tqdm(sample_test, desc="Evaluating"):
    # Use global schema_str
    schema  = schema_str

    t0      = time.perf_counter()
    pred_sql= generate_sql(ex["question_ny"], schema) # Corrected to question_ny
    times.append(time.perf_counter() - t0)

    gold_sql= ex["sql_statement"] # Corrected to sql_statement

    # exact + component match
    em_scores.append( float(exact_match(pred_sql, gold_sql)) )
    cm_scores.append( component_match(pred_sql, gold_sql) )

    # execution accuracy
    gold_res= run_query(gold_sql,  DB_PATH) # Use global DB_PATH
    pred_res= run_query(pred_sql, DB_PATH) # Use global DB_PATH
    ex_scores.append( int(gold_res == pred_res) )

# 5) aggregate & report
print("\n\nExact Match        :", np.mean(em_scores))
print("Component Match    :", np.mean(cm_scores))
print("Execution Accuracy :", np.mean(ex_scores))
print("Avg. Latency  (s)  :", np.mean(times))
print("95% Latency  (s)   :", np.percentile(times, 95))
if torch.cuda.is_available():
    print("GPU Mem Peak       :", torch.cuda.max_memory_allocated() / 1e9, "GB")
else:
    print("GPU Mem Peak       : N/A (CPU)")


Evaluating:   0%|          | 0/60 [00:00<?, ?it/s][A
Evaluating:   2%|▏         | 1/60 [00:11<11:19, 11.51s/it][A
Evaluating:   3%|▎         | 2/60 [00:23<11:09, 11.55s/it][A
Evaluating:   5%|▌         | 3/60 [00:34<10:58, 11.56s/it][A
Evaluating:   7%|▋         | 4/60 [00:46<10:47, 11.57s/it][A
Evaluating:   8%|▊         | 5/60 [00:57<10:35, 11.55s/it][A
Evaluating:  10%|█         | 6/60 [01:09<10:22, 11.53s/it][A
Evaluating:  12%|█▏        | 7/60 [01:20<10:10, 11.52s/it][A
Evaluating:  13%|█▎        | 8/60 [01:32<09:58, 11.52s/it][A
Evaluating:  15%|█▌        | 9/60 [01:43<09:47, 11.53s/it][A
Evaluating:  17%|█▋        | 10/60 [01:55<09:36, 11.53s/it][A
Evaluating:  18%|█▊        | 11/60 [02:06<09:25, 11.54s/it][A
Evaluating:  20%|██        | 12/60 [02:18<09:13, 11.54s/it][A
Evaluating:  22%|██▏       | 13/60 [02:30<09:03, 11.57s/it][A
Evaluating:  23%|██▎       | 14/60 [02:41<08:51, 11.55s/it][A
Evaluating:  25%|██▌       | 15/60 [02:53<08:39, 11.55s/it][A
Evaluatin



Exact Match        : 0.03333333333333333
Component Match    : 0.749403875736693
Execution Accuracy : 0.03333333333333333
Avg. Latency  (s)  : 11.52096076768333
95% Latency  (s)   : 11.59985701300011
GPU Mem Peak       : 17.703637504 GB





# Few-shot (all-MiniLM)

In [37]:
# choose the prompt mode here ─────────────────────────────────────────────
PROMPT_MODE = 2          # 0 = zero-shot, 1 = random-5, 2 = retrieved-5
builder = [build_prompt_zero,
           build_prompt_random5,
           build_prompt_retrieved5][PROMPT_MODE]
generate_sql = make_generator(builder, tokenizer, model)

example      = test_data[25]
display(example)
print("=="*50 + "\n")
schema_str   = get_schema_string()
predicted_sql= generate_sql(example["question_ny"], schema_str)
print("Gold SQL:", example["sql_statement"])
print("Pred SQL:", predicted_sql)

{'id': 183,
 'question_en': 'What is the average price of maize in Salima?',
 'question_ny': 'Kodi mtengo wapakatikati wa Chimanga ku Salima ndi uti?',
 'sql_statement': "SELECT AVG(price) FROM commodity_prices WHERE commodity = 'Maize' AND district = 'Salima';",
 'sql_result': '[(798.41875,)]',
 'difficulty_level': 'easy',
 'table': 'commodity_prices'}


Gold SQL: SELECT AVG(price) FROM commodity_prices WHERE commodity = 'Maize' AND district = 'Salima';
Pred SQL: SELECT AVG(price) FROM commodity_prices WHERE commodity = 'Maize'   AND district = 'Salima'   AND collection_date = (       SELECT MAX(collection_date)       FROM commodity_prices       WHERE commodity = 'Maize'   )


In [38]:
print("Exact Match:", exact_match(predicted_sql, example["sql_statement"]))
print("Component Match:", component_match(predicted_sql, example["sql_statement"]))

Exact Match: False
Component Match: 1.0


In [39]:
num_exact_matches = sum(em_scores)
print(f"Number of exact matches: {int(num_exact_matches)}")

Number of exact matches: 2


In [40]:
run_query(example["sql_statement"], DB_PATH)

((np.float64(798.41875),),)

In [41]:
run_query(predicted_sql, DB_PATH)

((None,),)

In [42]:
# Few shot evaluation on the test set (retrieved)
import time, torch, numpy as np
from tqdm import tqdm
import warnings
from transformers import logging
import random
from datasets import Dataset

random.seed(42)

logging.set_verbosity_error()
warnings.filterwarnings('ignore')

# choose the prompt mode here ─────────────────────────────────────────────
PROMPT_MODE = 2          # 0 = zero-shot, 1 = random-5, 2 = retrieved-5
builder = [build_prompt_zero,
           build_prompt_random5,
           build_prompt_retrieved5][PROMPT_MODE]
generate_sql = make_generator(builder, tokenizer, model)

# 1)
# Use test_data for evaluation
test_dataset = Dataset.from_list(test_data)
num_samples = len(test_dataset)
sample_test  = test_dataset.shuffle(seed=42).select(range(num_samples))


# 2) containers
em_scores, cm_scores, ex_scores, times = [], [], [], []

# 3) start fresh CUDA-peak tracking
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

# 4) main loop
for ex in tqdm(sample_test, desc="Evaluating"):
    # Use global schema_str
    schema  = schema_str

    t0      = time.perf_counter()
    pred_sql= generate_sql(ex["question_ny"], schema)
    times.append(time.perf_counter() - t0)

    gold_sql= ex["sql_statement"]

    # exact + component match
    em_scores.append( float(exact_match(pred_sql, gold_sql)) )
    cm_scores.append( component_match(pred_sql, gold_sql) )

    # execution accuracy
    gold_res= run_query(gold_sql,  DB_PATH) # Use global DB_PATH
    pred_res= run_query(pred_sql, DB_PATH) # Use global DB_PATH
    ex_scores.append( int(gold_res == pred_res) )

# 5) aggregate & report
print("\n\nExact Match        :", np.mean(em_scores))
print("Component Match    :", np.mean(cm_scores))
print("Execution Accuracy :", np.mean(ex_scores))
print("Avg. Latency  (s)  :", np.mean(times))
print("95% Latency  (s)   :", np.percentile(times, 95))
if torch.cuda.is_available():
    print("GPU Mem Peak       :", torch.cuda.max_memory_allocated() / 1e9, "GB")
else:
    print("GPU Mem Peak       : N/A (CPU)")


Evaluating:   0%|          | 0/60 [00:00<?, ?it/s][A
Evaluating:   2%|▏         | 1/60 [00:11<11:30, 11.70s/it][A
Evaluating:   3%|▎         | 2/60 [00:23<11:14, 11.63s/it][A
Evaluating:   5%|▌         | 3/60 [00:34<11:02, 11.62s/it][A
Evaluating:   7%|▋         | 4/60 [00:46<10:53, 11.66s/it][A
Evaluating:   8%|▊         | 5/60 [00:58<10:43, 11.70s/it][A
Evaluating:  10%|█         | 6/60 [01:09<10:27, 11.63s/it][A
Evaluating:  12%|█▏        | 7/60 [01:21<10:18, 11.68s/it][A
Evaluating:  13%|█▎        | 8/60 [01:33<10:06, 11.67s/it][A
Evaluating:  15%|█▌        | 9/60 [01:45<09:59, 11.75s/it][A
Evaluating:  17%|█▋        | 10/60 [01:56<09:44, 11.69s/it][A
Evaluating:  18%|█▊        | 11/60 [02:08<09:33, 11.70s/it][A
Evaluating:  20%|██        | 12/60 [02:20<09:21, 11.71s/it][A
Evaluating:  22%|██▏       | 13/60 [02:32<09:11, 11.73s/it][A
Evaluating:  23%|██▎       | 14/60 [02:43<08:58, 11.71s/it][A
Evaluating:  25%|██▌       | 15/60 [02:55<08:47, 11.73s/it][A
Evaluatin



Exact Match        : 0.4
Component Match    : 0.8814522144522144
Execution Accuracy : 0.43333333333333335
Avg. Latency  (s)  : 11.642408460200052
95% Latency  (s)   : 11.77878318234998
GPU Mem Peak       : 17.787153408 GB



