In [1]:
from datasets import load_dataset
import pandas as pd

# Fetch the gretelai/synthetic_text_to_sql dataset via `datasets.load_dataset`.
dataset = load_dataset("gretelai/synthetic_text_to_sql")

# Materialise the "train" and "test" splits as separate pandas DataFrames.
train_df, test_df = (
    pd.DataFrame(dataset[split_name]) for split_name in ("train", "test")
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_t5_format(df):
    """Turn text-to-SQL rows into T5-style ``input``/``target`` records.

    Args:
        df: DataFrame with ``sql_context``, ``sql_prompt`` and ``sql``
            columns. Any other columns are ignored.

    Returns:
        A list with one ``{"input": ..., "target": ...}`` dict per row, in row
        order. ``input`` is the prompt assembled from the row's context and
        question; ``target`` is the row's ``sql`` value.

    Raises:
        KeyError: if the frame has rows but lacks one of the three columns.
    """
    # A frame without rows yields no records, whether or not the columns exist.
    if len(df) == 0:
        return []

    # Walk the three columns in lockstep rather than using `df.iterrows()`:
    # iterrows builds a Series per row (slow on large frames) and coerces each
    # row to a single common dtype, so values may lose their column's type.
    return [
        {
            "input": (
                f"Translate to SQL:\n"
                f"Context:\n{context}\n\n"
                f"Question:\n{question}"
            ),
            "target": query,
        }
        for context, question, query in zip(
            df["sql_context"], df["sql_prompt"], df["sql"]
        )
    ]

In [3]:
from pathlib import Path
import json

# Convert each split to T5 records and write it to data/<name>.json.
for df, df_name in [(train_df, "t5_train"), (test_df, "t5_test")]:
    t5_data = create_t5_format(df)
    output_filename = Path(f'data/{df_name}.json')
    # open(..., 'w') raises FileNotFoundError if data/ does not exist yet, so
    # create the directory first (a no-op when it is already there).
    output_filename.parent.mkdir(parents=True, exist_ok=True)
    with open(output_filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
        json.dump(t5_data, f, indent=2, ensure_ascii=False)
    print(f"Saved T5-formatted data to '{output_filename}'")

Saved T5-formatted data to 'data/t5_train.json'
Saved T5-formatted data to 'data/t5_test.json'


In [4]:
# Base directory used to build the paths in the following cells (the data copy
# and the LLaMA-Factory/train_t5.json config). "./" is the current working
# directory, so those paths resolve relative to where the kernel was started.
NOTEBOOK_DIR = "./"

In [5]:
!cp -r $NOTEBOOK_DIR/data/ $NOTEBOOK_DIR/LLaMA-Factory/data/

In [6]:
# Training configuration that is serialised to LLaMA-Factory/train_t5.json.
training_args_t5 = {
    "cutoff_len": 1024,
    "dataset": "t5_train",
    "ddp_timeout": 9000,
    "do_train": True,
    "finetuning_type": "lora",
    # NOTE(review): T5 checkpoints are reported to be numerically unstable in
    # fp16 — confirm the loss stays finite, or consider bf16 if supported.
    "fp16": True,
    "lora_rank": 8,
    "lora_alpha": 16,
    "gradient_accumulation_steps": 4,
    "learning_rate": 3e-4,
    "logging_steps": 10,
    "lr_scheduler_type": "cosine",
    "model_name_or_path": "t5-small",
    # NOTE(review): "max_steps" is also set below; in Hugging Face
    # TrainingArguments a positive max_steps presumably takes precedence over
    # num_train_epochs — confirm which limit is intended.
    "num_train_epochs": 3,
    "output_dir": "out_t5",
    "overwrite_cache": True,
    "overwrite_output_dir": True,
    "per_device_train_batch_size": 4,
    "plot_loss": True,
    "report_to": "none",
    "save_steps": 250,
    "gradient_checkpointing": True,
    "stage": "sft",
    "template": "default",
    "warmup_steps": 100,
    "weight_decay": 0.01,
    "max_steps": 1000
}

# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, instead of leaving an anonymous open() handle to be
# cleaned up by the garbage collector.
with open(f"{NOTEBOOK_DIR}/LLaMA-Factory/train_t5.json", "w", encoding="utf-8") as config_file:
    json.dump(training_args_t5, config_file, indent=2)