# SimpleSQL: Text-to-SQL Finetuning in Google Colab

This notebook provides a complete pipeline for finetuning text-to-SQL models in Google Colab.

## Setup Instructions

1. **Enable GPU**: Runtime → Change runtime type → GPU
2. **Run all cells** sequentially
3. **Save results** to Google Drive before the session ends (files on the Colab VM are lost when the runtime is recycled)


In [None]:
# Install dependencies
!pip install -q transformers torch peft accelerate bitsandbytes datasets \
    sqlparse sqlalchemy pymysql pyyaml tqdm python-dotenv wandb \
    sentencepiece protobuf sacrebleu rouge-score requests

# Verify GPU
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


In [None]:
# Clone repository (or upload files manually)
import os

# Option 1: Clone from GitHub
!git clone https://github.com/yourusername/simplesql.git
os.chdir('simplesql')

# Option 2: Upload files manually via Colab file browser
# Then uncomment: os.chdir('/content/your_uploaded_folder')

print(f"Current directory: {os.getcwd()}")


In [None]:
# Mount Google Drive for persistent storage (optional)
import os

from google.colab import drive
drive.mount('/content/drive')

# Create directories in Drive.
# Done in Python rather than `!mkdir -p .../{models,data,results}`: that form
# relies on bash brace expansion and on IPython *not* interpolating the braces
# as a Python expression (it would if `models`, `data` and `results` existed
# as variables in the notebook).
DRIVE_PROJECT_DIR = '/content/drive/MyDrive/simplesql'
for subdir in ('models', 'data', 'results'):
    os.makedirs(os.path.join(DRIVE_PROJECT_DIR, subdir), exist_ok=True)
print("Google Drive mounted and directories created")


In [None]:
# Download Spider benchmark dataset
!mkdir -p data/benchmarks/spider
!wget -q https://yale-lily.github.io/spider/dataset/spider.zip -O /tmp/spider.zip
!unzip -q /tmp/spider.zip -d data/benchmarks/spider/
!rm /tmp/spider.zip

# Verify download
import json
with open('data/benchmarks/spider/train_spider.json', 'r') as f:
    train_data = json.load(f)
print(f"Downloaded {len(train_data)} training examples")


In [None]:
# Configure project for Colab
import yaml

# Project configuration, one section per top-level key. It is serialised to
# YAML below so the project code can read it back from disk.
config = dict(
    model=dict(
        name='defog/sqlcoder-7b-2',
        ollama_name='sqlcoder-7b',
        max_tokens=512,
        temperature=0.1,
        top_p=0.9,
    ),
    benchmark=dict(
        name='spider',
        test_split='test',
        database_path='./data/benchmarks/spider/database',
        data_path='./data/benchmarks/spider',
    ),
    evaluation=dict(
        metrics=['exact_match', 'execution_accuracy'],
        timeout=30,
        max_examples=None,
    ),
    finetuning=dict(
        method='qlora',
        lora_r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        learning_rate=2e-4,
        batch_size=4,
        num_epochs=3,
        gradient_accumulation_steps=4,
        max_seq_length=2048,
        load_in_4bit=True,
        load_in_8bit=False,
        output_dir='./models/finetuned',
        save_steps=100,  # Save frequently for Colab
        eval_steps=500,
    ),
    ollama=dict(
        base_url='http://localhost:11434',
        timeout=300,
        context_window=4096,
    ),
    paths=dict(
        models_dir='./models',
        data_dir='./data',
        results_dir='./results',
    ),
    logging=dict(
        level='INFO',
        use_wandb=False,
        wandb_project='text-to-sql-finetuning',
    ),
)

# Write the configuration to disk
config_path = 'config-colab.yaml'
with open(config_path, 'w') as f:
    yaml.dump(config, stream=f)

print(f"Configuration saved to {config_path}")


In [None]:
# Run finetuning
from src.finetuner import TextToSQLFinetuner
from src.utils import load_config

# Read the configuration back from disk and build the finetuner from it
config = load_config('config-colab.yaml')
finetuner = TextToSQLFinetuner(config)

# Training split
print("Preparing training dataset...")
train_dataset = finetuner.prepare_spider_dataset(split='train')
print(f"Loaded {len(train_dataset)} training examples")

# Evaluation split
print("Preparing evaluation dataset...")
eval_dataset = finetuner.prepare_spider_dataset(split='dev')
print(f"Loaded {len(eval_dataset)} evaluation examples")

# Echo the key hyperparameters before the long-running step
print("\nStarting finetuning...")
finetuning_cfg = config['finetuning']
for label, key in (
    ("Method", 'method'),
    ("Batch size", 'batch_size'),
    ("Epochs", 'num_epochs'),
):
    print(f"{label}: {finetuning_cfg[key]}")

finetuner.finetune(train_dataset, eval_dataset)

print("\nFinetuning complete!")


In [None]:
# Save to Google Drive
import os
import shutil

DRIVE_ROOT = '/content/drive/MyDrive'
DRIVE_PROJECT_DIR = os.path.join(DRIVE_ROOT, 'simplesql')


def copy_tree_if_present(src, dst):
    """Copy directory ``src`` to ``dst``, merging into ``dst`` if it exists.

    Missing parent directories of ``dst`` are created.

    Returns:
        True if ``src`` existed and was copied, False if ``src`` is not a
        directory (nothing is copied in that case).
    """
    if not os.path.isdir(src):
        return False
    # dirs_exist_ok=True makes re-running the cell update the previous copy
    # instead of raising FileExistsError.
    shutil.copytree(src, dst, dirs_exist_ok=True)
    return True


# Distinguish "Drive not mounted" from "nothing to copy": the previous shell
# one-liners hid stderr and reported every failure as an unmounted Drive.
if not os.path.isdir(DRIVE_ROOT):
    print("Drive not mounted, skipping")
else:
    for local_dir, drive_dir in (
        ('models/finetuned', os.path.join(DRIVE_PROJECT_DIR, 'models', 'finetuned')),
        ('results', os.path.join(DRIVE_PROJECT_DIR, 'results')),
    ):
        if copy_tree_if_present(local_dir, drive_dir):
            print(f"Saved {local_dir} -> {drive_dir}")
        else:
            print(f"Nothing to save: {local_dir} does not exist")


In [None]:
# Download results
from google.colab import files
import zipfile

# Create zip of results
!zip -r results.zip results/ 2>/dev/null || echo "No results to zip"
!zip -r finetuned_model.zip models/finetuned/ 2>/dev/null || echo "No model to zip"

# Download
try:
    files.download('results.zip')
    print("Results downloaded!")
except:
    print("No results.zip to download")

try:
    files.download('finetuned_model.zip')
    print("Model downloaded!")
except:
    print("No model.zip to download")
