In [1]:
!pip install -U bitsandbytes
!pip install -U triton==3.2.0
!pip install -U transformers peft accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.3->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.3->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.3->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.3->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.3->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [None]:
import os
os._exit(0)

In [None]:
from huggingface_hub import login
login(token="")

In [None]:
import os, json, random
import torch
import matplotlib.pyplot as plt
from datasets import Dataset
from google.colab import drive

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model

print("Torch version:", torch.__version__)
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    torch.backends.cuda.matmul.allow_tf32 = True
    
if not os.path.exists("/content/drive/MyDrive"):
    drive.mount("/content/drive")
else:
    print("Drive already mounted.")

BASE_DIR = "/content/drive/MyDrive/PROJECT"
DATA_DIR = f"{BASE_DIR}/data_new"

MODEL_DIR = f"{BASE_DIR}/models_phi3mini_A100_sdpa"
EVAL_DIR  = f"{MODEL_DIR}/eval"
PLOTS_DIR = f"{EVAL_DIR}/plots"

os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)

Torch version: 2.6.0+cu124
GPU: NVIDIA A100-SXM4-80GB
Mounted at /content/drive


In [None]:
MODEL_NAME       = "microsoft/Phi-3-mini-4k-instruct"
MAX_LENGTH       = 700
LR               = 2e-4
BATCH_SIZE       = 32
GRAD_ACCUM_STEPS = 1
EPOCHS           = 2

TRAIN_SUBSAMPLE  = 48000
VAL_SUBSAMPLE    = 9000
TEST_SUBSAMPLE   = 3000

random.seed(42)

In [None]:
def load_jsonl_folder(path):
    files = sorted([f for f in os.listdir(path) if f.endswith(".jsonl")])
    rows = []
    for fn in files:
        full = os.path.join(path, fn)
        with open(full, "r", encoding="utf-8") as f:
            for line in f:
                rows.append(json.loads(line))
    return Dataset.from_list(rows)

train_raw = load_jsonl_folder(f"{DATA_DIR}/train")
val_raw   = load_jsonl_folder(f"{DATA_DIR}/val")
test_raw  = load_jsonl_folder(f"{DATA_DIR}/test")

print("Raw sizes ->", len(train_raw), len(val_raw), len(test_raw))

Raw sizes -> 200000 37500 12500


In [None]:
def subsample(ds, n):
    n = min(n, len(ds))
    idx = list(range(len(ds)))
    random.shuffle(idx)
    return ds.select(idx[:n])

train_raw = subsample(train_raw, TRAIN_SUBSAMPLE)
val_raw   = subsample(val_raw,   VAL_SUBSAMPLE)
test_raw  = subsample(test_raw,  TEST_SUBSAMPLE)

print("After subsample ->")
print("Train:", len(train_raw))
print("Val:  ", len(val_raw))
print("Test: ", len(test_raw))

After subsample ->
Train: 48000
Val:   9000
Test:  3000


In [None]:
def format_row(e):
    return {
        "text": f"### Question:\n{e['question']}\n\n### SQL:\n{e['sql']}"
    }

train_raw = train_raw.map(format_row)
val_raw   = val_raw.map(format_row)
test_raw  = test_raw.map(format_row)

Map:   0%|          | 0/48000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.truncation_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

In [None]:
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_cfg,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",
)

lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

model.config.use_cache = False
model.gradient_checkpointing_disable()
model.enable_input_require_grads()

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

trainable params: 1,572,864 || all params: 3,822,652,416 || trainable%: 0.0411


In [None]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding=False,
    )

train_tok = train_raw.map(tokenize, batched=True, remove_columns=train_raw.column_names)
val_tok   = val_raw.map(tokenize,   batched=True, remove_columns=val_raw.column_names)
test_tok  = test_raw.map(tokenize,  batched=True, remove_columns=test_raw.column_names)

collator = DataCollatorForLanguageModeling(
    tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

Map:   0%|          | 0/48000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
args = TrainingArguments(
    output_dir=MODEL_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    bf16=True,
    fp16=False,
    warmup_ratio=0.03,
    logging_steps=200,

    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none",

    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=2,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=collator,
)

train_output = trainer.train()

trainer.model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

test_metrics = trainer.evaluate(test_tok)
with open(f"{EVAL_DIR}/test_metrics.json", "w") as f:
    json.dump(test_metrics, f, indent=2)

with open(f"{EVAL_DIR}/training_output.txt", "w") as f:
    f.write(str(train_output))

losses = [x["loss"] for x in trainer.state.log_history if "loss" in x]
if losses:
    plt.plot(losses)
    plt.title("Training Loss")
    plt.xlabel("Step")
    plt.ylabel("Loss")
    plt.savefig(f"{PLOTS_DIR}/training_loss.png", dpi=200)
    plt.close()

print("Done.")

Epoch,Training Loss,Validation Loss
1,0.0563,0.056558
2,0.0558,0.055597


Done.
