In [11]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd
import evaluate
from pathlib import Path

# Directory that contains this notebook; project paths below are built from it.
# `__vsc_ipynb_file__` is a global provided by the VS Code Jupyter extension, so
# it is looked up defensively: under other front ends (JupyterLab, nbconvert,
# papermill) the kernel's working directory is used instead of raising NameError.
# `Path(...).parent` replaces manual "/" splitting, which breaks on Windows
# separators and yields an empty path for a notebook located at the root.
_notebook_file = globals().get("__vsc_ipynb_file__")
NOTEBOOK_DIR = Path(_notebook_file).parent if _notebook_file else Path.cwd()
print(NOTEBOOK_DIR)

/home/kyre/repos/llm-fine-tuning


In [12]:
# Dataset card: https://huggingface.co/datasets/gretelai/synthetic_text_to_sql
dataset = load_dataset("gretelai/synthetic_text_to_sql")

# Summarise the download: list the splits, then show every field of the first
# training example with the field names left-aligned in a 30-character column.
print("Available data subsets:", dataset.keys())
print("Features: ")
first_train_row = dataset["train"][0]
for feature_name in first_train_row:
    print(f"---> {feature_name:30}: {first_train_row[feature_name]}")


Available data subsets: dict_keys(['train', 'test'])
Features: 
---> id                            : 5097
---> domain                        : forestry
---> domain_description            : Comprehensive data on sustainable forest management, timber production, wildlife habitat, and carbon sequestration in forestry.
---> sql_complexity                : single join
---> sql_complexity_description    : only one join (specify inner, outer, cross)
---> sql_task_type                 : analytics and reporting
---> sql_task_type_description     : generating reports, dashboards, and analytical insights
---> sql_prompt                    : What is the total volume of timber sold by each salesperson, sorted by salesperson?
---> sql_context                   : CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_i

In [13]:
# Materialise each split as its own DataFrame. The held-out frame has to come
# from the "test" split: building it from "train" makes `test_df` a copy of the
# training rows, so anything evaluated on it would be scored on seen data.
train_df = pd.DataFrame(dataset["train"])
test_df = pd.DataFrame(dataset["test"])

In [14]:
from tqdm import tqdm

def check_missing_data(dataset, columns):
    """Print, for every split, how many values of each column are missing.

    A value counts as missing when it is falsy (``None``, ``""``, ...) or is a
    string that contains only whitespace.

    Args:
        dataset: Mapping of split name to a split object that exposes
            ``column_names`` and returns a column's values when indexed with
            the column name.
        columns: Names of the columns to inspect in every split.

    Returns:
        None. Findings are reported with ``print``.
    """
    for split in dataset.keys():
        print(f"Checking '{split}' split...")
        split_data = dataset[split]
        for column in tqdm(columns):
            if column not in split_data.column_names:
                print(f"  Column '{column}' not found in the dataset!")
                continue
            # Read only the requested column: iterating the split row by row
            # builds every field of every example once per column checked.
            # The isinstance guard keeps truthy non-string values (ints,
            # lists, ...) from raising AttributeError on `.strip()`.
            missing_count = sum(
                1
                for value in split_data[column]
                if not value or (isinstance(value, str) and not value.strip())
            )
            print(f"  {column}: {missing_count} missing or empty entries")
        print()

# List the columns of the training split, then count empty values in the
# prompt and SQL columns of every split.
columns_to_audit = ["sql_prompt", "sql"]
print("Dataset columns:", dataset["train"].column_names)
check_missing_data(dataset, columns=columns_to_audit)

Dataset columns: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation']
Checking 'train' split...


 50%|█████     | 1/2 [00:07<00:07,  7.31s/it]

  sql_prompt: 0 missing or empty entries


100%|██████████| 2/2 [00:14<00:00,  7.19s/it]


  sql: 0 missing or empty entries

Checking 'test' split...


 50%|█████     | 1/2 [00:00<00:00,  2.06it/s]

  sql_prompt: 0 missing or empty entries


100%|██████████| 2/2 [00:00<00:00,  2.26it/s]

  sql: 0 missing or empty entries






In [None]:
from src.preprocessing import create_sharegpt_format
from pathlib import Path
import json

sharegpt_data = create_sharegpt_format(train_df)

output_filename = Path('data/dataset_train.json')
try:
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(sharegpt_data, f, indent=2, ensure_ascii=False)
    print(f"Successfully saved ShareGPT formatted data to '{output_filename}'")

    if sharegpt_data:
        print("\n--- First Record Example ---")
        print(json.dumps(sharegpt_data[0], indent=2, ensure_ascii=False))

except Exception as e:
    print(f"Error saving JSON file: {e}")

!mkdir $NOTEBOOK_DIR/LLaMA-Factory/data/
!cp -r $NOTEBOOK_DIR/data/ $NOTEBOOK_DIR/LLaMA-Factory/data/

Successfully saved ShareGPT formatted data to 'data/dataset_train.json'

--- First Record Example ---
{
  "conversations": [
    {
      "from": "user",
      "value": "Context:\n'CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_id INT, volume REAL, sale_date DATE); INSERT INTO timber_sales (sales_id, salesperson_id, volume, sale_date) VALUES (1, 1, 120, '2021-01-01'), (2, 1, 150, '2021-02-01'), (3, 2, 180, '2021-01-01');'\n                                    Question:\n'What is the total volume of timber sold by each salesperson, sorted by salesperson?'"
    },
    {
      "from": "assistant",
      "value": "Result: 'SELECT salesperson_id, name, SUM(volume) as total_volume FROM timber_sales JOIN salesperson ON timber_sales.salesperson_id = salesperson.salesperson_id GROUP BY salesperson_id, name 

In [17]:
# Character length of the longest first-turn message across all records
# (0 when there are no records).
max_len_str = max(
    (len(record['conversations'][0]['value']) for record in sharegpt_data),
    default=0,
)
print(max_len_str)

2202


In [None]:
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
%cd $NOTEBOOK_DIR/LLaMA-Factory
%ls


fatal: destination path 'LLaMA-Factory' already exists and is not an empty directory.
/home/kyre/repos/llm-fine-tuning/LLaMA-Factory
CITATION.cff    Makefile      [0m[01;34mcache[0m/       [01;34mexamples[0m/         setup.py
LICENSE         README.md     [01;34mdata[0m/        pyproject.toml    [01;34msrc[0m/
[01;34mLLaMA-Factory[0m/  README_zh.md  [01;34mdocker[0m/      requirements.txt  [01;34mtests[0m/
MANIFEST.in     [01;34massets[0m/       [01;34mevaluation[0m/  [01;34mscripts[0m/          train_qwen3.json
Obtaining file:///home/kyre/repos/llm-fine-tuning/LLaMA-Factory
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Checking if build backend supports build_editable ... [?25ldone
[?25hBuilding wheels for collected packages: llamafactory
  Building editable for

In [None]:
# !GRADIO_SHARE=1 llamafactory-cli webui

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Visit http://ip:port for Web UI, e.g., http://127.0.0.1:7860
* Running on local URL:  http://0.0.0.0:7860
* Running on public URL: https://11bfd50757f61dfa03.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
gio: https://11bfd50757f61dfa03.gradio.live: Operation not supported
Keyboard interruption in main thread... closing server.
^C
Traceback (most recent call last):
  File "/home/kyre/repos/llm-fine-tuning/.env/lib/python3.12/site-packages/gradio/blocks.py", line 2997, in block_thread
    time.sleep(0.1)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/kyre/repos/llm-fine-tuning/.env/bin/llamafactory-cli", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/home/kyre/repos/llm-fine-tuning/LLaMA-Factory/src/lla

In [None]:
# Settings for the LLaMA-Factory fine-tuning run. The dict is serialised
# unchanged to train_qwen3.json below, so key order and values are kept as-is.
training_args = {
    # "deepspeed": DS_CONFIG_PATH,
    "cutoff_len": 1024,
    "dataset": "train_sql_dataset",
    "ddp_timeout": 9000,
    "do_train": True,
    "finetuning_type": "lora",
    "use_dora": True,
    "fp16": True,
    "lora_rank": 16,
    "lora_alpha": 32,
    "flash_attn": "fa2",
    "gradient_accumulation_steps": 8,
    "learning_rate": 5e-5,
    "logging_steps": 8,
    "lora_target": "q_proj,v_proj",
    "lr_scheduler_type": "cosine",
    "model_name_or_path": "Qwen/Qwen3-0.6B",
    "num_train_epochs": 3,
    "output_dir": "out",
    "overwrite_cache": True,
    "overwrite_output_dir": True,
    "per_device_train_batch_size": 4,
    "plot_loss": True,
    # Alternative value: "wandb".
    # NOTE(review): None is written as JSON null; confirm that this disables
    # experiment reporting rather than selecting the trainer's default.
    "report_to": None,
    "save_steps": 1000,
    "gradient_checkpointing": True,
    "stage": "sft",
    "template": "qwen3",
    "warmup_steps": 100,
    "weight_decay": 0.1
}

# Write the config through a context manager so the file is flushed and closed
# deterministically; a bare `open(...)` passed to json.dump is never closed
# explicitly.
train_config_path = NOTEBOOK_DIR / "LLaMA-Factory" / "train_qwen3.json"
with open(train_config_path, "w", encoding="utf-8") as config_file:
    json.dump(training_args, config_file, indent=2)


In [None]:
!cd $NOTEBOOK_DIR/LLaMA-Factory
# !llamafactory-cli train train_qwen3.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[INFO|2025-05-04 03:01:44] llamafactory.hparams.parser:401 >> Process rank: 0, world size: 1, device: cuda:0, distributed training: False, compute dtype: torch.bfloat16
[INFO|tokenization_utils_base.py:2060] 2025-05-04 03:01:44,457 >> loading file vocab.json from cache at /home/kyre/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/6130ef31402718485ca4d80a6234f70d9a4cf362/vocab.json
[INFO|tokenization_utils_base.py:2060] 2025-05-04 03:01:44,457 >> loading file merges.txt from cache at /home/kyre/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/6130ef31402718485ca4d80a6234f70d9a4cf362/merges.txt
[INFO|tokenization_utils_base.py:2060] 2025-05-04 03:01:44,457 >> loading file tokenizer.json from cache at /home/kyre/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/6130ef31402718485ca4d80a6234f70d9a4cf362/tokenizer.json
[INFO|tokenization_utils_base.py:2060] 2025-05-04 03:01:44,457 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py

In [None]:
# def normalize_text(text):
#     text = text.lower().strip().replace("\n", " ")
#     return text

# tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [None]:
# def preprocess(example):
#     input_text = "Translate to SQL: " + normalize_text(example["sql_prompt"])
#     output_text = normalize_text(example["sql"])

#     model_inputs = tokenizer(input_text, truncation=True, padding="max_length", max_length=128)
#     labels = tokenizer(output_text, truncation=True, padding="max_length", max_length=128)
#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

# tokenized_ds = dataset.map(preprocess, remove_columns=dataset["train"].column_names)

# available_splits = dataset.keys()
# train_ds = tokenized_ds["train"]
# validation_ds = tokenized_ds["test"] if "test" in available_splits else None
# test_ds = tokenized_ds["test"] if "test" in available_splits else None

Map: 100%|██████████| 5851/5851 [00:02<00:00, 2664.39 examples/s]


In [None]:
# model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# training_args = Seq2SeqTrainingArguments(
#     output_dir="./sql_model",
#     eval_strategy="epoch",  # Changed from evaluation_strategy
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     learning_rate=2e-5,
#     num_train_epochs=5,
#     weight_decay=0.01,
#     save_total_limit=2,
#     predict_with_generate=True,
#     logging_dir="./logs"
# )

# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_ds,
#     eval_dataset=validation_ds,
#     tokenizer=tokenizer
# )

# trainer.train()

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# def evaluate_model(test_dataset):
#     metric = evaluate.load("sacrebleu")

#     def compute_metrics(eval_pred):
#         predictions, labels = eval_pred
#         decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#         decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#         decoded_labels = [[label] for label in decoded_labels]
#         result = metric.compute(predictions=decoded_preds, references=decoded_labels)
#         return {"bleu": result["score"]}

#     results = trainer.evaluate(eval_dataset=test_dataset, compute_metrics=compute_metrics)
#     return results

# if test_ds:
#     test_results = evaluate_model(test_ds)
#     print(test_results)

In [None]:
# def driver(question):
#     inputs = tokenizer("Translate to SQL: " + question, return_tensors="pt").input_ids
#     outputs = model.generate(inputs, max_length=128)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)

# print(driver("Find all customers who ordered in 2023"))