<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/deepseek_text2sql.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fine-tuning

In [None]:
# --- 1. Set Up Your Environment ---
!pip install scikit-learn -q # For potential evaluation metrics (optional)
!pip install -U transformers -q
!pip install -U datasets -q
!pip install -U accelerate -q
!pip install -U peft -q
!pip install -U trl -q # For SFTTrainer
!pip install -U bitsandbytes -q
!pip install unsloth -q # Recommended for speed and efficiency
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git # For latest Unsloth

In [2]:
# Delete /content/deepseek_r1_text2sql_finetuned before training.
# Presumably this is the output of an earlier run (the training cell writes to
# ./deepseek_r1_text2sql_finetuned) -- confirm the working directory is /content.
!rm -rf /content/deepseek_r1_text2sql_finetuned

In [None]:
# 0. Initial Setup
# Install necessary libraries if running in Colab/Jupyter (ensure these are installed first)
# !pip install -U transformers datasets accelerate peft trl bitsandbytes unsloth scikit-learn
import torch
import io
import pandas as pd
import json
from datasets import load_dataset, Dataset # Added Dataset for potential manual splits
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer, AutoTokenizer
from huggingface_hub import login # Optional: for pushing model to Hub

# Ensure you are logged into Hugging Face if you plan to push models or use private datasets
# login() # Uncomment and run if needed

# 1. Load the Model and Tokenizer
print("Loading DeepSeek-R1 model and tokenizer...")
# Loader settings are gathered in one mapping so they can be reviewed at a glance.
base_model_kwargs = {
    "model_name": "unsloth/DeepSeek-R1-Distill-Llama-8B",
    # Token budget for one training sequence (prompt + answer); raise it if
    # the formatted examples are longer.
    "max_seq_length": 2048,
    # None leaves the choice between bfloat16 and float16 to the loader.
    "dtype": None,
    # Load the weights 4-bit quantized to reduce GPU memory use.
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_kwargs)
print("Model and tokenizer loaded.")

# 2. Apply LoRA Adapters
print("Applying LoRA adapters...")
# Names of the projection layers that receive a LoRA adapter.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=16,            # rank of the LoRA matrices (common values: 8, 16, 32, 64)
    lora_alpha=16,   # scaling factor applied to the LoRA update
    lora_dropout=0,  # no dropout on the adapters during training
    bias="none",     # alternatives: "all", "lora_only"
    target_modules=lora_target_modules,
    use_gradient_checkpointing=True,  # trade compute for lower memory use
    use_rslora=False,
    loftq_config=None,
    random_state=3407,
)
print("LoRA adapters applied.")

# 3. Load Dataset
print("Loading and preparing b-mc2/sql-create-context dataset...")

# Fetch the dataset from the Hugging Face Hub.
dataset_name = "b-mc2/sql-create-context"
full_dataset = load_dataset(dataset_name)

# Only the 'train' split is read, so carve a 90/10 train/evaluation split out
# of it. The fixed seed keeps the split identical from run to run.
raw_dataset_split = full_dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset_raw, eval_dataset_raw = raw_dataset_split['train'], raw_dataset_split['test']

# Work on a subset: at most 2000 training rows and 200 evaluation rows,
# clamped to what each split actually contains.
num_train_samples = min(2000, len(train_dataset_raw))
num_eval_samples = min(200, len(eval_dataset_raw))

# select(range(k)) keeps the first k rows of each split.
train_dataset_selected = train_dataset_raw.select(range(num_train_samples))
eval_dataset_selected = eval_dataset_raw.select(range(num_eval_samples))

print(f"Raw dataset loaded from {dataset_name}. Training entries: {len(train_dataset_raw)}, Evaluation entries: {len(eval_dataset_raw)}")
print('\n')
print(f"Selected sample sizes: Training entries: {len(train_dataset_selected)}, Evaluation entries: {len(eval_dataset_selected)}")


# 4. Define the Formatting Function for Text-to-SQL
# This function converts entries from b-mc2/sql-create-context dataset
# into the chat format that the DeepSeek-R1 model will be trained on.

def format_sql_example(example):
    """
    Render one b-mc2/sql-create-context row as a chat-formatted training string.

    The user turn carries a fixed instruction, the schema and the question;
    the assistant turn carries the reference SQL query.

    Args:
        example (dict): A dictionary representing one row in the dataset.
                        Must contain the keys 'question', 'context', 'answer'.

    Returns:
        dict: The same dictionary object, with a 'text' key added that holds
              the string produced by the tokenizer's chat template.

    Raises:
        KeyError: If one of the expected keys is missing from `example`.
    """
    question = example["question"]
    context = example["context"]  # schema text placed under "Database Schema:"
    answer = example["answer"]    # reference SQL query the model should learn to emit

    instruction = "Given the database schema below, write a SQL query that answers the following question."
    full_user_prompt = f"{instruction}\n\nDatabase Schema:\n{context}\n\nQuestion: {question}"

    messages = [
        {"role": "user", "content": full_user_prompt},
        {"role": "assistant", "content": answer},
    ]

    # tokenize=False makes apply_chat_template return the rendered string.
    # `add_special_tokens` is not an apply_chat_template option: unknown
    # keyword arguments are only handed to the Jinja template as variables,
    # so the flag that used to be passed here did nothing. Any special tokens
    # in the rendered text come from the chat template itself.
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)

    return example

# Apply the formatting function to your selected datasets
print("Applying formatting function to datasets for Text-to-SQL...")

# Map over the size-capped subsets rather than the full raw splits.
# format_sql_example is applied row by row (batched=False) and adds a 'text'
# column holding the chat-formatted example.
train_dataset = train_dataset_selected.map(format_sql_example, batched=False)
eval_dataset = eval_dataset_selected.map(format_sql_example, batched=False)

# Report completion once (the message used to be printed twice).
print("Dataset preparation complete with Text-to-SQL formatting.")

# 5. Set Up and Configure the Trainer
print("Setting up SFTTrainer...")

# Decide the mixed-precision mode once: bf16 when the GPU supports it,
# otherwise fall back to fp16.
bf16_supported = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./deepseek_r1_text2sql_finetuned",
    # Optimisation schedule
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    warmup_steps=10,
    optim="adamw_8bit",
    seed=3407,
    # Precision
    bf16=bf16_supported,
    fp16=not bf16_supported,
    # Logging: report_to="none" turns off external experiment trackers.
    logging_steps=10,
    report_to="none",
    # Evaluate and checkpoint on the same 500-step cadence, keeping at most
    # two checkpoints, and reload the checkpoint with the lowest eval_loss
    # once training ends.
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",  # column written by the formatting step
    max_seq_length=2048,        # same limit the model was loaded with
    args=training_args,
)
print("SFTTrainer configured.")

In [None]:
# 6. Start Training
print('\n')
print("Starting training...")
from unsloth import unsloth_train
# unsloth_train(trainer) is used in place of trainer.train() because of the
# gradient accumulation issue described at https://unsloth.ai/blog/gradient
trainer_stats = unsloth_train(trainer)
print("Training complete.")
print('\n')


# 7. Save Your Fine-tuned Model
output_dir = "./deepseek_r1_text2sql_finetuned"
print(f"Saving fine-tuned model to {output_dir}...")
# save_pretrained() does not take a tokenizer: passing one as the second
# positional argument binds it to an unrelated option and leaves the
# tokenizer files unwritten. Save the adapter and the tokenizer explicitly.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("Model saved locally.")



Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,000 | Num Epochs = 3 | Total steps = 3,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Step,Training Loss,Validation Loss
500,0.9573,1.044319


Unsloth: Will smartly offload gradients to save VRAM!


## Model evaluation

In [None]:
!pip install colab-env -q
import colab_env

In [None]:
# Replace the copy of the fine-tuned model stored on Google Drive.
# The option must be spelled `-rf`: plain `rm rf <dir>` tries to delete a file
# named "rf" and refuses to remove the directory, so the old copy would stay
# in place and `cp -r` would nest the new model inside it.
!mkdir -p /content/gdrive/MyDrive/model
!rm -rf /content/gdrive/MyDrive/model/deepseek_r1_text2sql_finetuned
!cp -r /content/deepseek_r1_text2sql_finetuned /content/gdrive/MyDrive/model/deepseek_r1_text2sql_finetuned

In [None]:
!pip install rouge_score -q
!pip install sacrebleu -q

In [None]:
## Model evaluation Refactored for b-mc2/sql-create-context dataset
# --- Evaluation Code Block ---

import torch
import json
import pandas as pd # For saving results
from datasets import load_dataset # For loading evaluation data

# Import libraries for model loading and generation
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer # TextStreamer is optional
from unsloth import FastLanguageModel # Using Unsloth for efficient loading

# Import libraries for metrics calculation
from rouge_score import rouge_scorer
import sacrebleu
import numpy as np # For calculating averages

# --- Configuration ---
# Directory the fine-tuned model is loaded from in step 1 below. It must
# contain the files written when the model was saved after fine-tuning.
fine_tuned_model_path = "/content/gdrive/MyDrive/model/deepseek_r1_text2sql_finetuned"

# Hugging Face Hub id of the dataset; the evaluation rows are taken from it.
dataset_name = "b-mc2/sql-create-context"

# Upper bound on how many held-out examples are run through the model.
# Step 2 clamps it to the size of the evaluation split, so a large value
# evaluates the whole split.
num_examples_to_evaluate = 5

# Sequence-length limit handed to the model loader; keep it equal to the
# value used during fine-tuning.
max_seq_length = 2048

# --- 1. Load the Fine-tuned Model and Tokenizer ---
print(f"Loading fine-tuned model from {fine_tuned_model_path}...")
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=fine_tuned_model_path,  # directory of the saved fine-tuned model
        max_seq_length=max_seq_length,     # same limit as during fine-tuning
        dtype=None,                        # let the loader choose the dtype
        load_in_4bit=True,                 # 4-bit weights for memory efficiency
    )
    # Ensure the model is on GPU if available.
    # NOTE(review): the model is loaded 4-bit quantized; confirm that calling
    # .to("cuda") on it is supported by the installed bitsandbytes version.
    if torch.cuda.is_available():
        model.to("cuda")
        print("Model moved to GPU.")
    else:
        print("CUDA not available. Model loading on CPU (will be slower).")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please check that the model path is correct and the model files exist.")
    # Re-raise instead of calling exit(): exit() is meant for the interactive
    # interpreter and reports success when called without an argument, which
    # would hide the failure. Re-raising stops here with the real traceback.
    raise

# Put Unsloth into inference mode before any generate() call.
FastLanguageModel.for_inference(model)

print("Model and tokenizer loaded.")

# Optional: Set up TextStreamer for real-time output during generation
# streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# --- 2. Load the Evaluation Dataset ---
# The evaluation rows come from the dataset used for fine-tuning, split with
# the same test_size and seed so the held-out 'test' part is the same.
print(f"Loading evaluation dataset from {dataset_name}...")
try:
    # Load the dataset directly from the Hugging Face Hub
    full_dataset = load_dataset(dataset_name)

    # Only the 'train' split is read; split it 90/10 and keep the 10% part.
    raw_dataset_split_for_eval = full_dataset['train'].train_test_split(test_size=0.1, seed=42)
    eval_dataset = raw_dataset_split_for_eval['test']

    print(f"Evaluation dataset loaded with {len(eval_dataset)} entries.")

    # Never ask for more examples than the split contains, then keep the
    # first num_examples_to_evaluate rows.
    num_examples_to_evaluate = min(num_examples_to_evaluate, len(eval_dataset))
    eval_dataset_selected = eval_dataset.select(range(num_examples_to_evaluate))

    print(f"Evaluating on {len(eval_dataset_selected)} examples from the test set.")

except FileNotFoundError:
    print(f"Error: Dataset file not found at {dataset_name}")
    print("Please ensure you have access to the dataset on the Hugging Face Hub.")
    # Re-raise rather than exit(): a bare exit() signals success and is
    # intended for interactive use, so it would mask this failure.
    raise
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise


# --- 3. Define the PROMPT CONSTRUCTION function for inference ---
# This function should mirror the 'user' part of your formatting function used during training.
# It takes raw data and turns it into the prompt the model expects for SQL generation.

def construct_sql_inference_prompt(question, context):
    """Build the chat-templated, tokenized user prompt for one text-to-SQL request.

    The instruction text and layout are the same as the user turn built by
    format_sql_example for fine-tuning.

    Args:
        question: Natural-language question to answer with SQL.
        context: Database schema text placed under "Database Schema:".

    Returns:
        The value returned by tokenizer.apply_chat_template(...,
        return_tensors="pt"), moved to "cuda" when CUDA is available. The
        evaluation loop passes it to generate() as input_ids and reads
        .shape[1] as the prompt length.
    """
    instruction = "Given the database schema below, write a SQL query that answers the following question."
    full_user_prompt = f"{instruction}\n\nDatabase Schema:\n{context}\n\nQuestion: {question}"

    # A single user turn; the model is expected to continue with the SQL.
    messages = [{"role": "user", "content": full_user_prompt}]

    # `add_special_tokens` is not an apply_chat_template option (unknown
    # keyword arguments are only exposed to the Jinja template), so the flag
    # that used to be passed here had no effect and has been removed.
    # NOTE(review): add_generation_prompt is left at its default, so the
    # prompt ends right after the user turn. Confirm this is intended before
    # enabling it; the template's generation prompt may not match the format
    # the model was fine-tuned on.
    tokenized_input = tokenizer.apply_chat_template(messages, return_tensors="pt")

    # Move input to GPU if CUDA is available
    if torch.cuda.is_available():
        tokenized_input = tokenized_input.to("cuda")

    return tokenized_input


# --- 4. Evaluation Loop with Metrics ---
print("\n--- Starting Evaluation Loop ---")

# Accumulators filled by the loop: one record per example, plus the
# per-example ROUGE results and BLEU scores used for the averages.
# BLEU may not be the best metric for SQL; it is included for comparison.
results, rouge_scores, bleu_scores = [], [], []

# ROUGE-1, ROUGE-2 and ROUGE-L scorer with stemming enabled.
rouge_scorer_obj = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

# Settings forwarded unchanged to model.generate() for every example.
generation_kwargs = dict(
    max_new_tokens=256,   # SQL queries are expected to be short
    use_cache=True,
    temperature=0.1,      # low temperature keeps sampling close to greedy
    top_p=0.9,            # nucleus sampling threshold
    do_sample=True,       # sampling stays enabled
    pad_token_id=tokenizer.eos_token_id,  # the EOS token doubles as padding
)


# Run every selected example through the model, score it, and record it.
total_cases = len(eval_dataset_selected)
for i, example in enumerate(eval_dataset_selected):
    original_question = example["question"]
    database_context = example["context"]    # schema text
    ground_truth_answer = example["answer"]  # reference SQL query

    print(f"\n--- Evaluation Case {i+1}/{total_cases} ---")
    print(f"Original Question: {original_question}")
    print(f"Database Schema:\n{database_context}")

    print("\n--- Ground Truth SQL Query (from dataset) ---\n")
    print(ground_truth_answer)

    # Tokenized prompt for this question/schema pair.
    input_ids = construct_sql_inference_prompt(original_question, database_context)

    # Generate, then decode only the tokens that follow the prompt.
    try:
        outputs = model.generate(input_ids=input_ids, **generation_kwargs)
        prompt_len = input_ids.shape[1]
        generated_text = tokenizer.decode(
            outputs[0][prompt_len:],
            skip_special_tokens=True,
        ).strip()  # drop surrounding whitespace; no further parsing is done
    except Exception as e:
        print(f"Error during text generation for example {i+1}: {e}")
        generated_text = "Error generating query."  # marks a failed generation

    print("\n--- Generated SQL Query ---\n")
    print(generated_text)

    # --- Metrics for this example (stay None when they cannot be computed) ---
    current_rouge_scores = None
    current_bleu_score = None

    if generated_text == "Error generating query.":
        print("Skipping metric calculation due to generation error.")
    else:
        try:
            # ROUGE: reference first, candidate second.
            current_rouge_scores = rouge_scorer_obj.score(ground_truth_answer, generated_text)
            rouge_scores.append(current_rouge_scores)
            print(
                f"ROUGE Scores: ROUGE-1: {current_rouge_scores['rouge1'].fmeasure:.4f}, "
                f"ROUGE-2: {current_rouge_scores['rouge2'].fmeasure:.4f}, "
                f"ROUGE-L: {current_rouge_scores['rougeL'].fmeasure:.4f}"
            )

            # BLEU: a one-hypothesis corpus with a single reference stream.
            current_bleu = sacrebleu.corpus_bleu([generated_text], [[ground_truth_answer]])
            current_bleu_score = current_bleu.score
            bleu_scores.append(current_bleu_score)
            print(f"BLEU Score: {current_bleu_score:.4f}")
        except Exception as e:
            # Metrics not yet assigned for this example remain None.
            print(f"Error calculating metrics for example {i+1}: {e}")

    # Keep everything needed for later inspection of this example.
    results.append({
        "original_question": original_question,
        "database_context": database_context,
        "generated_query": generated_text,
        "ground_truth_query": ground_truth_answer,
        "rouge_scores": current_rouge_scores,
        "bleu_score": current_bleu_score,
    })

print("\n--- Evaluation Loop Finished ---")

# --- 5. Analysis and Metrics Summary ---
print("\n--- Overall Evaluation Summary ---")

# Drop any None entries before averaging.
valid_rouge_scores = [entry for entry in rouge_scores if entry is not None]
valid_bleu_scores = [entry for entry in bleu_scores if entry is not None]

if not valid_rouge_scores:
    print("No valid ROUGE scores were calculated.")
else:
    # Mean F-measure per ROUGE variant over the scored examples.
    avg_rouge1, avg_rouge2, avg_rougeL = (
        np.mean([entry[variant].fmeasure for entry in valid_rouge_scores])
        for variant in ('rouge1', 'rouge2', 'rougeL')
    )
    rouge_count = len(valid_rouge_scores)
    for label, value in (("ROUGE-1", avg_rouge1), ("ROUGE-2", avg_rouge2), ("ROUGE-L", avg_rougeL)):
        print(f"Average {label} F-measure (over {rouge_count} examples): {value:.4f}")


if not valid_bleu_scores:
    print("No valid BLEU scores were calculated.")
else:
    avg_bleu = np.mean(valid_bleu_scores)
    print(f"Average BLEU Score (over {len(valid_bleu_scores)} examples): {avg_bleu:.4f}")

# --- Save Results ---
# Write one JSON object per evaluated example (JSON Lines). A failure is
# reported but does not abort the cell.
try:
    output_filename = "sql_evaluation_results_with_metrics.jsonl"
    df_results = pd.DataFrame(results)
    df_results.to_json(
        output_filename,
        orient="records",
        lines=True,
    )
    print(f"\nEvaluation complete. Detailed results saved to '{output_filename}'.")
except Exception as e:
    print(f"Error saving results to JSONL: {e}")
    print("Consider saving fewer details (e.g., only text outputs and average metrics) if this persists.")