In [1]:
!pip install -U \
torch==2.9.0+cu128 torchvision==0.24.0+cu128 torchaudio==2.9.0+cu128 \
transformers==5.0.0 accelerate==1.12.0 datasets==4.5.0 evaluate==0.4.3 \
peft==0.18.1 trl==0.27.2 bitsandbytes==0.49.1 \
huggingface-hub==1.3.7 tokenizers==0.22.2 \
sqlite-utils==3.38 sqlalchemy==2.0.30

Collecting datasets==4.5.0
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate==0.4.3
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting trl==0.27.2
  Downloading trl-0.27.2-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes==0.49.1
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting huggingface-hub==1.3.7
  Downloading huggingface_hub-1.3.7-py3-none-any.whl.metadata (13 kB)
Collecting sqlite-utils==3.38
  Downloading sqlite_utils-3.38-py3-none-any.whl.metadata (7.5 kB)
Collecting sqlalchemy==2.0.30
  Downloading SQLAlchemy-2.0.30-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting pyarrow>=21.0.0 (from datasets==4.5.0)
  Downloading pyarrow-23.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting sqlite-fts4 (from sqlite-utils==3.38)
  Downloading sqlite_fts4-1.0.3-py3-none-any.whl.metadata (6.6 kB)
Collecting click-default-gro

In [2]:
!pip list | grep -E 'peft|trl|triton|bitsandbytes|transformers|accelerate|datasets|sqlalchemy|sqlite-utils|moz-sql-parser|evaluate|torch'

accelerate                               1.12.0
bitsandbytes                             0.49.1
datasets                                 4.5.0
evaluate                                 0.4.3
fastrlock                                0.8.3
peft                                     0.18.1
sentence-transformers                    5.2.2
sqlalchemy-spanner                       1.17.2
sqlite-utils                             3.38
tensorflow-datasets                      4.9.9
torch                                    2.9.0+cu128
torchao                                  0.10.0
torchaudio                               2.9.0+cu128
torchcodec                               0.8.0+cu128
torchdata                                0.11.0
torchsummary                             1.5.1
torchtune                                0.6.1
torchvision                              0.24.0+cu128
transformers                             5.0.0
triton                                   3.5.0
trl                           

In [3]:
%%bash
python - <<'PY'
import random, numpy as np, torch, os
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
print("Seeds fixed to", SEED)
PY


Seeds fixed to 42


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from huggingface_hub import login
from google.colab import userdata

# Retrieve the Hugging Face API key from Colab's secrets manager
hf_token = userdata.get('HF_API_KEY')

# Log in to Hugging Face Hub using the retrieved token
login(hf_token)

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:

from pathlib import Path
import json

DATA_DIR = Path("/content/drive/MyDrive/chichewa-text2sql/data")

train_path = DATA_DIR / "train.json"
dev_path   = DATA_DIR / "dev.json"
test_path  = DATA_DIR / "test.json"

print("Train exists:", train_path.exists())
print("Dev exists:", dev_path.exists())
print("Test exists:", test_path.exists())


Train exists: True
Dev exists: True
Test exists: True


In [7]:
def load_json(path: Path):
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)


train_data = load_json(train_path)
dev_data   = load_json(dev_path)
test_data  = load_json(test_path)

print(f"Train size: {len(train_data)}")
print(f"Dev size:   {len(dev_data)}")
print(f"Test size:  {len(test_data)}")

Train size: 280
Dev size:   60
Test size:  60


In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "meta-llama/Llama-3.1-8B"  # use preferred model
tokenizer  = AutoTokenizer.from_pretrained(model_name)
model      = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
  

In [10]:
# reflect every .sqlite file to create a plain-text schema description

import sqlalchemy as sa

DB_PATH = Path("/content/drive/MyDrive/chichewa-text2sql/data/database/chichewa_text2sql.db")

def get_schema_string() -> str:
    """Return a compact textual schema for the given database."""

    engine  = sa.create_engine(f"sqlite:///{DB_PATH}")
    insp    = sa.inspect(engine)

    parts   = []
    for tbl in sorted(insp.get_table_names()):
        cols = [c["name"] for c in insp.get_columns(tbl)]
        parts.append(f"{tbl}({', '.join(cols)})")

    schema_str = ", ".join(parts)
    return schema_str

In [11]:
schema_str = get_schema_string()
print(schema_str)

commodity_prices(id, add_name, epa_name, district, market, month_name, year, commodity, price, collection_date), food_insecurity(id, district, analyzed_population, time_period, percentage_population, insecurity_level, insecurity_desc_short, insecurity_desc_long), mse_daily(id, counter_id, ticker, trade_date, print_time, company_name, sector, high_price, low_price, bid_price, ask_price, previous_close_price, close_price, volume, dividend_mwk, dividend_yield_pct, earnings_yield_pct, pe_ratio, pbv_ratio, market_cap_mwk_mn, profit_after_tax_mwk_mn, shares_outstanding), population(id, region_name, region_code, admin_status, district_code, ea_number, ea_code, ta_code, ta_name, population_male, population_female, number_households, district_name, total_population), production(id, district, crop, yield, season)


In [12]:
from datasets import Dataset

# Assuming train_data is already loaded from your custom JSON file
train_dataset = Dataset.from_list(train_data)
dev_dataset   = Dataset.from_list(dev_data)

def make_prompt(example):
    # Using the global schema_str since it's a single database schema
    global schema_str

    return (
        "### Instruction:\n"
        "You are an expert SQL developer. Given a database schema and a natural-language\n"
        "question, write ONE syntactically correct SQL query that answers the question.\n"
        "Return **only** the SQL; do not repeat the schema or add explanations.\n\n"

        f"### Database Schema:\n{schema_str}\n\n"

        f"### Question:\n{example['question_en']}\n\n" # Use 'question_en' for the English question

        "### SQL:\n"
        f"{example['sql_statement'].strip()}" # Use 'sql_statement' for the SQL query
    )

train_dataset = train_dataset.map(lambda ex: {"text": make_prompt(ex)}) \
                           .remove_columns(set(train_dataset.column_names) - {"text"})

val_dataset = dev_dataset.map(lambda ex: {"text": make_prompt(ex)}) \
                                   .remove_columns(set(dev_dataset.column_names) - {"text"})

# sanity:
print(train_dataset[0]["text"][:500], "...")
print("Fine-tune set size:", len(train_dataset))
print("Validation set size:", len(val_dataset))

Map:   0%|          | 0/280 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

### Instruction:
You are an expert SQL developer. Given a database schema and a natural-language
question, write ONE syntactically correct SQL query that answers the question.
Return **only** the SQL; do not repeat the schema or add explanations.

### Database Schema:
commodity_prices(id, add_name, epa_name, district, market, month_name, year, commodity, price, collection_date), food_insecurity(id, district, analyzed_population, time_period, percentage_population, insecurity_level, insecurity_de ...
Fine-tune set size: 280
Validation set size: 60


In [13]:
from transformers import BitsAndBytesConfig, AutoModelForCausalLM

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit               = True,
    bnb_4bit_use_double_quant  = True,
    bnb_4bit_quant_type        = "nf4",
    bnb_4bit_compute_dtype     = torch.bfloat16,   # bf16 saves VRAM vs fp16
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_cfg,
    device_map="auto",
)
tokenizer.pad_token_id = tokenizer.eos_token_id


Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

In [14]:
from transformers import BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, TaskType, get_peft_model
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, TaskType

lora_cfg = LoraConfig(
    r                = 64,
    lora_alpha       = 128,
    target_modules   = ["q_proj","k_proj","v_proj","o_proj"],
    lora_dropout     = 0.05,
    bias             = "none",
    task_type        = TaskType.CAUSAL_LM,
)

peft_model = get_peft_model(base_model, lora_cfg)

# ↓ this halves activation memory at the cost of ~15 % speed
peft_model.gradient_checkpointing_enable()
peft_model.enable_input_require_grads()

In [15]:
from transformers import TrainingArguments
from trl import SFTTrainer           # same as before

args = TrainingArguments(
    output_dir                  = "/content/llama3_8b_spider_qlora",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 8,          # effective batch 16
    num_train_epochs            = 5,
    learning_rate               = 2e-4,
    fp16                        = False,      # bfloat16 already set
    bf16                        = True,
    save_strategy               = "epoch",
    save_total_limit            = 2,
    seed                        = 42,
    eval_strategy               = "epoch",
    logging_strategy            = "steps",
    logging_steps               = 20,
    load_best_model_at_end      = True,
    metric_for_best_model       = "eval_loss", # Changed to 'eval_loss'
    greater_is_better           = False
)

trainer = SFTTrainer(
    model              = peft_model,
    train_dataset      = train_dataset,
    eval_dataset       = val_dataset,
    args               = args,
)

trainer.train()
trainer.save_model(args.output_dir)     # LoRA adapters + config only (~350 MB)

Adding EOS to train dataset:   0%|          | 0/280 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/280 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/280 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/60 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/60 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/60 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,No log,0.124076
2,0.528101,0.097387
3,0.097384,0.076225
4,0.063788,0.072817
5,0.047056,0.074248


In [16]:
train_dataset

Dataset({
    features: ['text'],
    num_rows: 280
})

In [19]:
import torch, gc
from peft import PeftModel # Import PeftModel

# Ensure output_dir is defined for this cell, matching where the model was saved.
output_dir = "/content/llama3_8b_spider_qlora" # This value comes from args.output_dir in the training cell.

# 1️⃣  free trainer / old models
if 'trainer' in locals() and trainer is not None: del trainer
if 'peft_model' in locals() and peft_model is not None: del peft_model
if 'base_model' in locals() and base_model is not None: del base_model
gc.collect(); torch.cuda.empty_cache()

# 2️⃣  reload quantised base model (4-bit)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name, # Corrected: use the global model_name variable
    quantization_config=bnb_cfg,
    device_map="auto", # Align with previous loading
    # torch_dtype=torch.bfloat16, # bnb_cfg already sets compute_dtype
)

# 3️⃣  attach LoRA adapters
ft_model = PeftModel.from_pretrained(base_model, output_dir)  # no device_map needed
ft_model.eval()

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

# Introducing zero-shot prompting to the Qlora Adapters

In [20]:
def build_prompt(nl_question: str, target_schema: str) -> str:
    """
    Zero-shot prompt for the fine-tuned QLoRA adapters of Llama 3.1.

    Layout:
      1.  Instruction block            – tells the model to output ONE query only
      2.  Database schema (Spider)     – table and column names
      3.  Natural-language question
      4.  Sentinel “### SQL:”          – model must continue with a single SQL
    """
    return (
        "### Instruction:\n"
        "You are an expert SQL developer. Given the database schema and the "
        "natural-language question below, write ONE syntactically correct SQL "
        "statement that answers the question. Output *only* the SQL; do not "
        "repeat the schema or add explanations.\n\n"

        f"### Database Schema:\n{target_schema}\n\n"

        f"### Question:\n{nl_question}\n\n"

        "### SQL:\n"
    )

In [22]:
import re

def generate_sql(nl_question: str) -> str:
    global schema_str # Access the global schema string
    prompt     = build_prompt(nl_question, schema_str)
    inputs     = tokenizer(prompt, return_tensors="pt").to(ft_model.device)
    input_len  = inputs["input_ids"].shape[1]

    ft_model.config.pad_token_id = tokenizer.eos_token_id
    out = ft_model.generate(
        **inputs,
        max_new_tokens=128,
        num_beams=5,
        early_stopping=True,
        do_sample=False,
    )
    gen_text = tokenizer.decode(out[0, input_len:], skip_special_tokens=True)
    sql = re.split(r"(###|\n\s*\n|```)", gen_text, maxsplit=1)[0]
    sql = sql.split(";", 1)[0]
    return sql.strip()

In [27]:
sample = test_data[5] # Use a sample from your test_data
# The get_schema_string() is no longer needed here as generate_sql uses global schema_str

print(sample["question_en"]) # Use 'question_en' for the natural language question
print(sample['sql_statement']) # Use 'sql_statement' for the gold SQL
print(generate_sql(sample["question_en"])) # Call generate_sql with only the question

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Which crop had the highest yield in Nkhata Bay during 2023-2024?
SELECT Crop, MAX(Yield) AS Max_Yield FROM production WHERE District = 'Nkhata Bay' AND Season = '2023-2024';
SELECT crop, MAX(yield) AS max_yield FROM production WHERE district = 'Nkhata Bay' AND season = '2023-2024'


# Defining Evaluation Metrics

In [28]:
def exact_match(pred, gold):
    import re

    # More comprehensive whitespace and syntax normalization patterns
    _sql_norm_patterns = [
        (re.compile(r'\s+'), ' '),             # Collapse multiple spaces
        (re.compile(r'\s*,\s*'), ', '),        # Standardize commas
        (re.compile(r'\s*([()])\s*'), r'\1'),  # Remove space around parentheses
        (re.compile(r'\s*([=<>!+\-\*/])\s*'), r' \1 '), # Standardize space around operators
        (re.compile(r'\s+'), ' '),             # Re-collapse spaces after operator insertion
    ]

    def norm(s: str) -> str:
        s = s.strip().lower()
        for pattern, replacement in _sql_norm_patterns:
            s = pattern.sub(replacement, s)
        s = s.rstrip(';')
        return s

    return norm(pred) == norm(gold)




from sqlglot import parse_one, expressions

def flatten_ast(node):
    """
    Recursively collect all node‐type names and literal values as lowercase strings.
    """
    out = set()

    def walk(n):
        # record the AST node type
        out.add(type(n).__name__.lower())

        # record any literal (e.g. identifiers, strings, numbers)
        if hasattr(n, "this") and isinstance(n.this, (str, int, float)):
            out.add(str(n.this).lower())

        # recurse into child expressions
        for arg in n.args.values():
            if isinstance(arg, list):
                for child in arg:
                    if isinstance(child, expressions.Expression):
                        walk(child)
            elif isinstance(arg, expressions.Expression):
                walk(arg)

    walk(node)
    return out

def component_match(pred_sql, gold_sql):
    try:
        pred_ast = parse_one(pred_sql)
        gold_ast = parse_one(gold_sql)
    except Exception:
        return 0.0

    pred_set = flatten_ast(pred_ast)
    gold_set = flatten_ast(gold_ast)
    if not gold_set:
        return 0.0
    return len(pred_set & gold_set) / len(gold_set)




import sqlite3, pandas as pd, numpy as np
from tqdm import tqdm

def run_query(sql: str, db_path: Path):
    """Return query result as a sorted list of tuples (order-independent)."""
    try:
        with sqlite3.connect(db_path) as conn:
            df = pd.read_sql_query(sql, conn)
        # sort rows + cols for order-invariant comparison
        return tuple(map(tuple, df.sort_index(axis=1).sort_values(list(df.columns)).to_numpy()))
    except Exception as e:
        # any failure counts as wrong
        return f"ERROR-{e}"

def execution_accuracy(dataset):
    """Compute Execution Accuracy on a custom dataset."""
    correct = 0
    global schema_str # Access the global schema string
    global DB_PATH    # Access the global database path

    for ex in tqdm(dataset, desc="Evaluating"):
        pred_sql= generate_sql(ex["question_en"]) # Use 'question_en' and updated generate_sql

        gold    = run_query(ex["sql_statement"], DB_PATH) # Use 'sql_statement' and DB_PATH
        pred    = run_query(pred_sql,        DB_PATH)

        if gold == pred:
            correct += 1

    return correct / len(dataset)

# Evaluating QloRA on the test data

In [30]:
import time, torch, numpy as np
from tqdm import tqdm
import warnings
from transformers import logging
import random # Added import random

random.seed(42)

logging.set_verbosity_error()
warnings.filterwarnings('ignore')

# 1) Size using test_data which is 60 samples
num_samples = len(test_data) # Use all of test_data for evaluation

sample_val  = test_data # Directly use test_data as the validation set

# 2) containers
em_scores, cm_scores, ex_scores, times = [], [], [], []

# 3) start fresh CUDA-peak tracking
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

# 4) main loop
for ex in tqdm(sample_val, desc="Evaluating"):
    # db_id and schema are not needed here as we use a global schema_str and DB_PATH

    t0      = time.perf_counter()
    pred_sql= generate_sql(ex["question_en"]) # Use 'question_en' and generate_sql which uses global schema_str
    times.append(time.perf_counter() - t0)

    gold_sql= ex["sql_statement"] # Use 'sql_statement' for the gold SQL

    # exact + component match
    em_scores.append( float(exact_match(pred_sql, gold_sql)) )
    cm_scores.append( component_match(pred_sql, gold_sql) )

    # execution accuracy
    # Use the global DB_PATH for execution accuracy
    gold_res= run_query(gold_sql,  DB_PATH)
    pred_res= run_query(pred_sql, DB_PATH)
    ex_scores.append( int(gold_res == pred_res) )

# 5) aggregate & report
print("\n\nExact Match        :", np.mean(em_scores))
print("Component Match    :", np.mean(cm_scores))
print("Execution Accuracy :", np.mean(ex_scores))
print("Avg. Latency  (s)  :", np.mean(times))
print("95% Latency  (s)   :", np.percentile(times, 95))
if torch.cuda.is_available():
    print("GPU Mem Peak       :", torch.cuda.max_memory_allocated() / 1e9, "GB")
else:
    print("GPU Mem Peak       : N/A (CPU)")

Evaluating: 100%|██████████| 60/60 [03:54<00:00,  3.90s/it]



Exact Match        : 0.7
Component Match    : 0.965090341675868
Execution Accuracy : 0.7333333333333333
Avg. Latency  (s)  : 3.885797900166669
95% Latency  (s)   : 5.9726673004494355
GPU Mem Peak       : 22.417632768 GB



