In [1]:
from datasets import load_dataset
import os 
import pandas as pd
import sys
    
# Directory the kernel was started in. A notebook has no usable __file__, so the
# project root is taken to be the current working directory at the time this
# cell runs (run it before any %cd).
NOTEBOOK_DIR = os.getcwd()
print(NOTEBOOK_DIR)

  from .autonotebook import tqdm as notebook_tqdm


/home/kyre/repos/llm-fine-tuning


In [2]:
# Dataset card: https://huggingface.co/datasets/gretelai/synthetic_text_to_sql
dataset = load_dataset("gretelai/synthetic_text_to_sql")

# List the available splits, then dump every field of the first training
# record so the schema can be read straight from the cell output.
print("Available data subsets:", dataset.keys())
print("Features: ")
first_train_record = dataset["train"][0]
for feature_name in first_train_record:
    print(f"---> {feature_name:30}: {first_train_record[feature_name]}")

Available data subsets: dict_keys(['train', 'test'])
Features: 
---> id                            : 5097
---> domain                        : forestry
---> domain_description            : Comprehensive data on sustainable forest management, timber production, wildlife habitat, and carbon sequestration in forestry.
---> sql_complexity                : single join
---> sql_complexity_description    : only one join (specify inner, outer, cross)
---> sql_task_type                 : analytics and reporting
---> sql_task_type_description     : generating reports, dashboards, and analytical insights
---> sql_prompt                    : What is the total volume of timber sold by each salesperson, sorted by salesperson?
---> sql_context                   : CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_i

In [3]:
# Materialise both splits as pandas DataFrames (train first, then test).
train_df, test_df = (
    pd.DataFrame(dataset[split_name]) for split_name in ("train", "test")
)

In [4]:
from src.preprocessing import create_sharegpt_format
from pathlib import Path
import json

# Convert the dataset to ShareGPT format
for df, df_name in [(train_df, "dataset_train"), (test_df, "dataset_test")]:
    sharegpt_data = create_sharegpt_format(df)

    output_filename = Path(f'data/{df_name}.json')
    try:
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(sharegpt_data, f, indent=2, ensure_ascii=False)
        print(f"Successfully saved ShareGPT formatted data to '{output_filename}'")

    except Exception as e:
        print(f"Error saving JSON file: {e}")
        
if sharegpt_data:
    print("\n--- First Record Example ---")
    print(json.dumps(sharegpt_data[0], indent=2, ensure_ascii=False))


!mkdir $NOTEBOOK_DIR/LLaMA-Factory/data/
!cp -r $NOTEBOOK_DIR/data/ $NOTEBOOK_DIR/LLaMA-Factory/data/

Successfully saved ShareGPT formatted data to 'data/dataset_train.json'
Successfully saved ShareGPT formatted data to 'data/dataset_test.json'

--- First Record Example ---
{
  "conversations": [
    {
      "from": "user",
      "value": "You are provided with the following database schema and context:\n\n--- SCHEMA START ---\nCREATE TABLE creative_ai (application_id INT, name TEXT, region TEXT, explainability_score FLOAT); INSERT INTO creative_ai (application_id, name, region, explainability_score) VALUES (1, 'ApplicationX', 'Europe', 0.87), (2, 'ApplicationY', 'North America', 0.91), (3, 'ApplicationZ', 'Europe', 0.84), (4, 'ApplicationAA', 'North America', 0.93), (5, 'ApplicationAB', 'Europe', 0.89);\n--- SCHEMA END ---\n\nUsing only this information, write an SQL query that answers the following question:\n\n\"What is the average explainability score of creative AI applications in 'Europe' and 'North America' in the 'creative_ai' table?\"\n\nOutput only the SQL query, with no addi

In [5]:
# Longest first-turn message across the records currently held in
# `sharegpt_data`, measured with len() on the message text (characters, not
# tokens). Falls back to 0 when there are no records.
first_turn_lengths = [
    len(rec['conversations'][0]['value']) for rec in sharegpt_data
]
max_len_str = max(first_turn_lengths, default=0)
print(max_len_str)

1587


In [6]:
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
%cd $NOTEBOOK_DIR/LLaMA-Factory
%ls

fatal: destination path 'LLaMA-Factory' already exists and is not an empty directory.
/home/kyre/repos/llm-fine-tuning/LLaMA-Factory
CITATION.cff    Makefile      [0m[01;34mcache[0m/       [01;34mexamples[0m/         setup.py
LICENSE         README.md     [01;34mdata[0m/        pyproject.toml    [01;34msrc[0m/
[01;34mLLaMA-Factory[0m/  README_zh.md  [01;34mdocker[0m/      requirements.txt  [01;34mtests[0m/
MANIFEST.in     [01;34massets[0m/       [01;34mevaluation[0m/  [01;34mscripts[0m/          train_qwen3.json


In [7]:
# !GRADIO_SHARE=1 llamafactory-cli webui

In [8]:
# LLaMA-Factory training configuration, written below to
# <NOTEBOOK_DIR>/LLaMA-Factory/train_qwen3.json.
# NOTE(review): "dataset" names an entry that presumably has to be registered in
# LLaMA-Factory's dataset_info.json - nothing in this notebook creates it; confirm.
# NOTE(review): in Hugging Face TrainingArguments a positive "max_steps"
# overrides "num_train_epochs" - confirm which of the two is meant to apply.
training_args = {
    # "deepspeed": DS_CONFIG_PATH,
    "cutoff_len": 1024,
    "dataset": "train_sql_dataset",
    "ddp_timeout": 9000,
    "do_train": True,
    "finetuning_type": "lora",
    "use_dora": True,
    "fp16": True,
    "lora_rank": 16,
    "lora_alpha": 32,
    "flash_attn": "fa2",
    "gradient_accumulation_steps": 8,
    "learning_rate": 1e-5,
    "logging_steps": 8,
    "lora_target": "q_proj,v_proj",
    "lr_scheduler_type": "cosine",
    "model_name_or_path": "Qwen/Qwen3-0.6B",
    "num_train_epochs": 1,
    "output_dir": "out",
    "overwrite_cache": True,
    "overwrite_output_dir": True,
    "per_device_train_batch_size": 2,
    "plot_loss": True,
    "report_to": "wandb",  # send training logs to Weights & Biases
    "save_steps": 250,
    "gradient_checkpointing": True,
    "stage": "sft",
    "template": "qwen3",
    "warmup_steps": 100,
    "weight_decay": 0.01,
    "max_steps": 5000
}

# Use a context manager so the config file is flushed and closed before any
# external tool reads it.
with open(f"{NOTEBOOK_DIR}/LLaMA-Factory/train_qwen3.json", "w", encoding="utf-8") as config_file:
    json.dump(training_args, config_file, indent=2)

In [9]:
!export $(cat $NOTEBOOK_DIR/.env | xargs)
!cd $NOTEBOOK_DIR/LLaMA-Factory

In [None]:
# Run the training command from the terminal
# !llamafactory-cli train LLaMA-Factory/train_qwen3.json
# !llamafactory-cli export llama-factory-configs/merge_lora.yaml
# !llamafactory-cli chat llama-factory-configs/infer_lora.yaml