In [None]:
# Install necessary libraries
!pip install transformers datasets evaluate

# Import required modules
from datasets import load_dataset
from transformers import BartForConditionalGeneration, BartTokenizer, DataCollatorForSeq2Seq, TrainingArguments, Trainer
import evaluate
import numpy as np

dataset = load_dataset("wikisql")
print(dataset["train"][0])

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [3

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.80k [00:00<?, ?B/s]

wikisql.py:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

In [None]:


# -----------------------------------------------------------
# Step 1: Environment Setup
# -----------------------------------------------------------
# We install and import all libraries needed to load the dataset,
# initialize a pre-trained BART model, tokenize our data, and evaluate
# our results using BLEU and ROUGE metrics.

# -----------------------------------------------------------
# Step 2: Data Preparation
# -----------------------------------------------------------
# The SQL generation dataset (WikiSQL) was loaded from Hugging Face above as `dataset`.
# Each sample provides a 'question', a 'table' (including its column 'header') and an 'sql' record (including a 'human_readable' query string).


# Hold out 20% of the original training split for validation (fixed seed for a repeatable split).
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset, val_dataset = split_dataset["train"], split_dataset["test"]

# -----------------------------------------------------------
# Step 3: Preprocessing & Tokenization
# -----------------------------------------------------------
# Both the tokenizer and the seq2seq model come from the same pre-trained BART checkpoint.
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Define a preprocessing function that:
# 1. Builds the model input from the 'question' plus the table's column headers.
# 2. Uses the human-readable SQL string (sql["human_readable"]) as the target.
# 3. Tokenizes both the inputs and targets.
def preprocess_function(examples, max_input_length=512, max_target_length=128):
    """Turn a batch of raw samples into tokenized model inputs and labels.

    Args:
        examples: Batch mapping with the columns "question" (list of str),
            "table" (list of mappings that have a "header" list of str) and
            "sql" (list of mappings that have a "human_readable" str).
        max_input_length: Truncation limit, in tokens, for the encoder input.
        max_target_length: Truncation limit, in tokens, for the target SQL.

    Returns:
        The tokenizer output for the inputs with an extra "labels" entry that
        holds the token ids of the targets.
    """
    # Input text: "<question> | <col1> , <col2> , ..."
    inputs = [
        question + " | " + " , ".join(table["header"])
        for question, table in zip(examples["question"], examples["table"])
    ]

    # Target text: the human-readable SQL query.
    targets = [sql["human_readable"] for sql in examples["sql"]]

    # No padding here on purpose: sequences stay at their natural length and
    # the seq2seq data collator pads each batch later. Padding labels to a
    # fixed length at this point would leave real pad-token ids in the labels,
    # and those positions would then be counted in the training loss.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # text_target= is the supported replacement for the deprecated
    # tokenizer.as_target_tokenizer() context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



# Tokenize both splits with preprocess_function.
# batched=True hands the function a batch of samples per call instead of one sample at a time.
train_dataset, val_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
)

# -----------------------------------------------------------
# Step 4: Data Collator
# -----------------------------------------------------------
# DataCollatorForSeq2Seq assembles batches, padding inputs and labels up to the longest sequence in each batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)



In [None]:
# !pip install --upgrade transformers


In [None]:
# -----------------------------------------------------------
# Step 5: Fine-Tuning Setup
# -----------------------------------------------------------
# Define the training arguments for the Hugging Face Seq2SeqTrainer.
# These include output directory, learning rate, batch size, number of epochs, etc.
import dataclasses

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# The "evaluate every epoch" option is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Pick whichever
# name the installed version actually defines so this cell runs on both.
_training_arg_names = {field.name for field in dataclasses.fields(Seq2SeqTrainingArguments)}
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    save_strategy="epoch",              # checkpoint after every epoch
    save_total_limit=2,                 # keep only the two most recent checkpoints
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,         # evaluate on generated sequences, as needed for text metrics
    fp16=torch.cuda.is_available(),     # mixed precision only when a CUDA GPU is present
    **{_eval_strategy_key: "epoch"},    # evaluate after every epoch
)


In [None]:
!pip install rouge_score


In [None]:

# -----------------------------------------------------------
# Step 6: Evaluation Metrics Setup (BLEU and ROUGE)
# -----------------------------------------------------------
# Define a compute_metrics function to evaluate model predictions using BLEU and ROUGE.
def compute_metrics(eval_pred):
    """Score generated sequences against the references with BLEU and ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of integer token-id arrays.
            ``predictions`` may also be a tuple whose first element is that
            array.

    Returns:
        dict with the keys "bleu", "rouge1", "rouge2" and "rougeL".
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the ignore index used for padding. It is not a real token id, so
    # it has to be swapped for the pad token id before decoding. This applies to
    # the predictions as well as the labels.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode the predictions and labels into human-readable text.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Load the BLEU and ROUGE evaluation metrics.
    bleu_metric = evaluate.load("bleu")
    rouge_metric = evaluate.load("rouge")

    # BLEU expects a list of reference lists (one list of references per prediction).
    bleu = bleu_metric.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels],
    )

    # ROUGE takes one reference string per prediction.
    rouge = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "bleu": bleu["bleu"],
        "rouge1": rouge["rouge1"],
        "rouge2": rouge["rouge2"],
        "rougeL": rouge["rougeL"],
    }


In [None]:
import os
from transformers.trainer_utils import get_last_checkpoint


In [None]:
# Define a safe conversion function that filters out None tokens
def safe_convert_tokens_to_string(tokens):
    """Join tokens into text, ignoring ``None`` entries.

    After the ``None`` entries are dropped, the remaining tokens are handed to
    the tokenizer class's own ``convert_tokens_to_string``. A plain
    ``"".join`` would skip that step and leave the tokenizer's internal
    byte-level symbols in the decoded text.

    Args:
        tokens: Iterable of token strings; entries may be ``None``.

    Returns:
        The decoded string.
    """
    kept = [token for token in tokens if token is not None]
    # Call the implementation defined on the class rather than
    # tokenizer.convert_tokens_to_string: the instance attribute may have been
    # replaced by this very function, which would recurse forever.
    return type(tokenizer).convert_tokens_to_string(tokenizer, kept)

# Override the tokenizer's convert_tokens_to_string method with our safe version.
# The assignment sets an attribute on this one tokenizer object, so from here on
# tokenizer.convert_tokens_to_string resolves to safe_convert_tokens_to_string.
tokenizer.convert_tokens_to_string = safe_convert_tokens_to_string

# compute_metrics used for evaluation: this is the last definition of the name
# in the notebook, so it is the one handed to the trainer.
def compute_metrics(eval_preds):
    """Score generated sequences against the references with BLEU and ROUGE.

    Args:
        eval_preds: Pair ``(preds, labels)`` of integer token-id arrays.
            ``preds`` may also be a tuple whose first element is that array.

    Returns:
        dict with the keys "bleu", "rouge1", "rouge2" and "rougeL".
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a real token id; map it to
    # the pad token so decoding sees only valid ids.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU expects a list of reference lists; ROUGE one reference per prediction.
    bleu = evaluate.load("bleu").compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels],
    )
    rouge = evaluate.load("rouge").compute(
        predictions=decoded_preds,
        references=decoded_labels,
    )

    return {
        "bleu": bleu["bleu"],
        "rouge1": rouge["rouge1"],
        "rouge2": rouge["rouge2"],
        "rougeL": rouge["rougeL"],
    }

In [None]:

# -----------------------------------------------------------
# Step 7: Trainer Initialization and Model Fine-Tuning
# -----------------------------------------------------------
# Initialize the Trainer with our model, training arguments, datasets, data collator, and evaluation metrics.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the pre-trained BART model on our SQL generation task.
# If the output directory already holds a checkpoint (e.g. from an interrupted
# run), continue from it instead of starting over.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)

if last_checkpoint is not None:
    print(f"Resuming from checkpoint: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("No checkpoint found. Starting training from scratch.")
    trainer.train()

# SECURITY: a key that was pasted here as a comment has been removed. Never
# store credentials in the notebook; supply them at runtime (environment
# variable, getpass, or the platform's secrets manager). Any key that was
# committed here should be treated as leaked and revoked/rotated.

In [None]:
# -----------------------------------------------------------
# Step 8: Model Evaluation
# -----------------------------------------------------------
# Persist the final fine-tuned model next to the per-epoch checkpoints.
trainer.save_model("./results")  # or any path you prefer

# Run evaluation on the validation set and show whatever compute_metrics reports.
results = trainer.evaluate()
print("Evaluation Results:", results, sep="\n")


In [None]:
# -----------------------------------------------------------
# Step 9: Inference and Generation
# -----------------------------------------------------------
# Generate predictions on a small sample from the validation set to visually inspect the outputs.
# Up to 5 samples are taken (fewer if the validation set is smaller).
sample_dataset = val_dataset.select(range(min(5, len(val_dataset))))
predictions = trainer.predict(sample_dataset)

# -100 marks ignored/padded positions and is not a real token id. Replace it
# with the pad token id so decoding only ever sees valid ids, rather than
# relying on the tokenizer to tolerate them.
pad_id = tokenizer.pad_token_id
pred_ids = np.where(predictions.predictions != -100, predictions.predictions, pad_id)
label_ids = np.where(predictions.label_ids != -100, predictions.label_ids, pad_id)

# Decode the model predictions and the corresponding ground truth labels.
decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

# Display the results for comparison.
for i, (pred, label) in enumerate(zip(decoded_preds, decoded_labels), start=1):
    print(f"\nSample {i}:")
    print("Generated SQL Query:", pred)
    print("Ground Truth SQL Query:", label)