In [1]:
# !pip install torch torchvision torchaudio


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd
import evaluate
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
NOTEBOOK_DIR = Path("/".join(__vsc_ipynb_file__.split("/")[:-1]))
print(NOTEBOOK_DIR)

/home/kyre/repos/llm-fine-tuning


In [4]:
# https://huggingface.co/datasets/gretelai/synthetic_text_to_sql
dataset = load_dataset("gretelai/synthetic_text_to_sql")

print("Available data subsets:", dataset.keys())
print("Features: ")
for k, v in dataset["train"][0].items():
    print(f"---> {k:30}: {v}")


Available data subsets: dict_keys(['train', 'test'])
Features: 
---> id                            : 5097
---> domain                        : forestry
---> domain_description            : Comprehensive data on sustainable forest management, timber production, wildlife habitat, and carbon sequestration in forestry.
---> sql_complexity                : single join
---> sql_complexity_description    : only one join (specify inner, outer, cross)
---> sql_task_type                 : analytics and reporting
---> sql_task_type_description     : generating reports, dashboards, and analytical insights
---> sql_prompt                    : What is the total volume of timber sold by each salesperson, sorted by salesperson?
---> sql_context                   : CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_i

In [5]:
train_df = pd.DataFrame(dataset["train"])
test_df = pd.DataFrame(dataset["train"])

In [6]:
from tqdm import tqdm

def check_missing_data(dataset, columns):
    for split in dataset.keys():
        print(f"Checking '{split}' split...")
        for column in tqdm(columns):
            if column not in dataset[split].column_names:
                print(f"  Column '{column}' not found in the dataset!")
                continue
            missing_count = sum(1 for example in dataset[split] if not example[column] or example[column].strip() == "")
            print(f"  {column}: {missing_count} missing or empty entries")
        print()

print("Dataset columns:", dataset["train"].column_names)
check_missing_data(dataset, columns=["sql_prompt", "sql"])

Dataset columns: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation']
Checking 'train' split...


 50%|█████     | 1/2 [00:06<00:06,  6.48s/it]

  sql_prompt: 0 missing or empty entries


100%|██████████| 2/2 [00:12<00:00,  6.45s/it]


  sql: 0 missing or empty entries

Checking 'test' split...


 50%|█████     | 1/2 [00:00<00:00,  2.39it/s]

  sql_prompt: 0 missing or empty entries


100%|██████████| 2/2 [00:00<00:00,  2.51it/s]

  sql: 0 missing or empty entries






In [7]:
def normalize_text(text):
    text = text.lower().strip().replace("\n", " ")
    return text

tokenizer = AutoTokenizer.from_pretrained("t5-large")

In [8]:
def preprocess(example):
    input_text = "Translate to SQL: " + normalize_text(example["sql_prompt"])
    output_text = normalize_text(example["sql"])

    model_inputs = tokenizer(input_text, truncation=True, padding="max_length", max_length=128)
    labels = tokenizer(output_text, truncation=True, padding="max_length", max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_ds = dataset.map(preprocess, remove_columns=dataset["train"].column_names)

available_splits = dataset.keys()
train_ds = tokenized_ds["train"]
validation_ds = tokenized_ds["test"] if "test" in available_splits else None
test_ds = tokenized_ds["test"] if "test" in available_splits else None

Map: 100%|██████████| 100000/100000 [00:36<00:00, 2703.56 examples/s]
Map: 100%|██████████| 5851/5851 [00:02<00:00, 2711.67 examples/s]


In [9]:
# !pip install -r requirements/requirements.txt

In [10]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu


In [None]:
!pip install "accelerate>=0.26.0"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [12]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-large")

training_args = Seq2SeqTrainingArguments(
    output_dir="./sql_model",
    eval_strategy="epoch",  # Changed from evaluation_strategy
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=validation_ds,
    tokenizer=tokenizer
)

trainer.train()

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Currently logged in as: [33merykzarebski0[0m ([33mmind-flayers[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [17]:
import wandb
wandb.finish()
!pip install sacrebleu

Error in callback <bound method _WandbInit._pre_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x73c4ba882000>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 73c60c66e120, raw_cell="import wandb
wandb.finish()
!pip install sacrebleu" store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://wsl%2Bubuntu-24.04/home/kyre/repos/llm-fine-tuning/T5-Large.ipynb#X20sdnNjb2RlLXJlbW90ZQ%3D%3D>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

BrokenPipeError: [Errno 32] Broken pipe

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x73c4ba882000>> (for post_run_cell), with arguments args (<ExecutionResult object at 73c60c66c140, execution_count=17 error_before_exec=None error_in_exec=[Errno 32] Broken pipe info=<ExecutionInfo object at 73c60c66e120, raw_cell="import wandb
wandb.finish()
!pip install sacrebleu" store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://wsl%2Bubuntu-24.04/home/kyre/repos/llm-fine-tuning/T5-Large.ipynb#X20sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

In [15]:
def evaluate_model(test_dataset):
    metric = evaluate.load("sacrebleu")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_labels = [[label] for label in decoded_labels]
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    results = trainer.evaluate(eval_dataset=test_dataset, compute_metrics=compute_metrics)
    return results

if test_ds:
    test_results = evaluate_model(test_ds)
    print(test_results)

Error in callback <bound method _WandbInit._pre_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x73c4ba882000>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 73c4fef15400, raw_cell="def evaluate_model(test_dataset):
    metric = eva.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://wsl%2Bubuntu-24.04/home/kyre/repos/llm-fine-tuning/T5-Large.ipynb#X15sdnNjb2RlLXJlbW90ZQ%3D%3D>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

KeyboardInterrupt: 

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x73c4ba882000>> (for post_run_cell), with arguments args (<ExecutionResult object at 73c4fef156d0, execution_count=15 error_before_exec=None error_in_exec= info=<ExecutionInfo object at 73c4fef15400, raw_cell="def evaluate_model(test_dataset):
    metric = eva.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://wsl%2Bubuntu-24.04/home/kyre/repos/llm-fine-tuning/T5-Large.ipynb#X15sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

In [None]:
def driver(question):
    inputs = tokenizer("Translate to SQL: " + question, return_tensors="pt").input_ids
    outputs = model.generate(inputs, max_length=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(driver("Find all customers who ordered in 2023"))

Error in callback <bound method _WandbInit._pre_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x73c4ba882000>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 73c60c3c7680, raw_cell="def driver(question):
    inputs = tokenizer("Tran.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://wsl%2Bubuntu-24.04/home/kyre/repos/llm-fine-tuning/T5-Large.ipynb#X16sdnNjb2RlLXJlbW90ZQ%3D%3D>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x73c4ba882000>> (for post_run_cell), with arguments args (<ExecutionResult object at 73c60c3c58b0, execution_count=18 error_before_exec=None error_in_exec=Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select) info=<ExecutionInfo object at 73c60c3c7680, raw_cell="def driver(question):
    inputs = tokenizer("Tran.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://wsl%2Bubuntu-24.04/home/kyre/repos/llm-fine-tuning/T5-Large.ipynb#X16sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

: 