In [None]:
!pip install transformers datasets evaluate


Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [3

In [None]:
# Import required modules
import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import BartForConditionalGeneration, BartTokenizer, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
# Step 2: Data Preparation

# Pull the "b-mc2/sql-create-context" corpus from the Hugging Face Hub.
# Every record carries three text fields: 'context', 'question' and 'answer'.
dataset = load_dataset(path="b-mc2/sql-create-context")

# Hold out 20% of the "train" split for validation; the fixed seed keeps the
# partition identical from run to run.
split_dataset = dataset["train"].train_test_split(seed=42, test_size=0.2)
train_dataset, val_dataset = split_dataset["train"], split_dataset["test"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.43k [00:00<?, ?B/s]

sql_create_context_v4.json:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/78577 [00:00<?, ? examples/s]

In [None]:
# Step 3: Preprocessing & Tokenization

# The tokenizer and the seq2seq model are both restored from the same
# pretrained checkpoint name.
model_name = "facebook/bart-base"
tokenizer, model = (
    BartTokenizer.from_pretrained(model_name),
    BartForConditionalGeneration.from_pretrained(model_name),
)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq fine-tuning.

    Args:
        examples: A batch mapping with parallel lists under the keys
            "context", "question" and "answer".

    Returns:
        The tokenizer output for the combined "context question" inputs,
        with the tokenized "answer" ids stored under the key "labels".
    """
    # Combine the context and question into one input string.
    inputs = [c + " " + q for c, q in zip(examples["context"], examples["question"])]
    targets = examples["answer"]

    # Truncate the inputs to 512 tokens but do not pad them here: the
    # DataCollatorForSeq2Seq used by the trainer pads each batch to its own
    # longest sequence, so padding every sample to 512 up front is wasted work.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Tokenize the SQL targets through `text_target`, which replaces the
    # deprecated `as_target_tokenizer()` context manager.
    # The labels keep their fixed-length padding because the inference cell
    # decodes `label_ids` directly.
    # NOTE(review): these pad positions are filled with the pad token id, not
    # -100, so they presumably still count towards the loss — confirm.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding="max_length")

    # Attach the target ids under the key the model expects.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Tokenize both splits with the same function. batched=True hands the
# function a batch of samples per call instead of a single row.
train_dataset, val_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/62861 [00:00<?, ? examples/s]



Map:   0%|          | 0/15716 [00:00<?, ? examples/s]

In [None]:
# Step 4: Data Collator

# Batches are assembled by DataCollatorForSeq2Seq, which pads the inputs and
# the labels up to the longest sequence present in each batch.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
!pip install --upgrade transformers




In [None]:
# Step 5: Fine-Tuning Setup

# Define the training arguments using Seq2SeqTrainingArguments.
# These include parameters such as output directory, learning rate, batch size, number of epochs, etc.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",              # Directory for model checkpoints and logs
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`;
    # it exists in every release that also accepts `processing_class` (used by the trainer below).
    eval_strategy="epoch",               # Evaluate at the end of each epoch
    learning_rate=2e-5,                  # Learning rate for the optimizer
    per_device_train_batch_size=8,       # Training batch size per device
    per_device_eval_batch_size=8,        # Evaluation batch size per device
    num_train_epochs=3,                  # Total number of training epochs
    weight_decay=0.01,                   # Strength of weight decay
    save_total_limit=2,                  # Limit on total checkpoints saved
    predict_with_generate=True,          # Enable prediction with generation (needed for text generation tasks)
    # Let generation run as long as the 128-token label truncation used in
    # preprocessing. NOTE(review): without this, the length limit comes from
    # the model's generation config, which may be shorter than the targets — confirm.
    generation_max_length=128,
    fp16=torch.cuda.is_available(),      # Mixed precision only when a CUDA device is present
)



In [None]:
# Step 6: Evaluation Metrics Setup (BLEU and ROUGE)

# # Define a compute_metrics function to evaluate predictions using BLEU and ROUGE.
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred

#     # Replace -100 (default ignore index) with the tokenizer's pad token id.
#     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

#     # Decode predictions and labels into text.
#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     # Load BLEU and ROUGE metrics using the evaluate library.
#     bleu_metric = evaluate.load("bleu")
#     rouge_metric = evaluate.load("rouge")

#     # Compute BLEU score; note that references should be a list of lists.
#     bleu = bleu_metric.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])

#     # Compute ROUGE scores.
#     rouge = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

#     # Return BLEU and ROUGE metrics.
#     return {
#         "bleu": bleu["bleu"],
#         "rouge1": rouge["rouge1"].mid.fmeasure,
#         "rouge2": rouge["rouge2"].mid.fmeasure,
#         "rougeL": rouge["rougeL"].mid.fmeasure,
#     }



def compute_metrics(eval_pred):
    """Score generated sequences against the references with BLEU and ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays as
            handed over by the trainer. ``predictions`` may also be a tuple
            whose first element holds the generated ids.

    Returns:
        A dict with the keys "bleu", "rouge1", "rouge2" and "rougeL".
    """
    predictions, labels = eval_pred

    # Keep only the generated ids when the model returns extra outputs.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the ignore index and has no vocabulary entry, so it cannot be
    # decoded. Map it back to the pad token id in predictions and labels alike.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode predictions and labels into text.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Load BLEU and ROUGE metrics using the evaluate library.
    bleu_metric = evaluate.load("bleu")
    rouge_metric = evaluate.load("rouge")

    # Compute BLEU score. Note: the references need to be a list of lists.
    bleu = bleu_metric.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])

    # Compute ROUGE scores.
    rouge = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Return the computed metrics directly, using the float values for ROUGE scores.
    return {
        "bleu": bleu["bleu"],
        "rouge1": rouge["rouge1"],
        "rouge2": rouge["rouge2"],
        "rougeL": rouge["rougeL"],
    }


In [None]:
!pip install rouge_score




In [None]:
! pip install wandb




In [None]:
import os
from getpass import getpass

import wandb

# Credentials must never be stored in the notebook. Use WANDB_API_KEY from the
# environment when it is already set; otherwise prompt for it without echoing.
# NOTE(review): an API key was previously hardcoded in this cell — revoke and
# rotate that key, since it remains visible in the notebook's history.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("W&B API key: ")

# Authenticate this session with Weights & Biases.
wandb.login()

True

In [None]:
# Step 7: Trainer Initialization and Model Fine-Tuning

# Wire the model, its hyperparameters, both dataset splits, the batch collator,
# the tokenizer and the metric callback into a single Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop on the SQL generation task.
trainer.train()


Epoch,Training Loss,Validation Loss


In [None]:
# Step 8: Model Evaluation

# Score the fine-tuned model on the validation split and show the metrics
# dictionary under a heading line.
results = trainer.evaluate()
print("Evaluation Results:", results, sep="\n")

In [None]:
# Step 9: Inference and Generation

# Generate predictions on a small sample from the validation set to visually inspect outputs.
sample_dataset = val_dataset.select(range(5))
predictions = trainer.predict(sample_dataset)

# Keep only the generated ids when the model returns extra outputs.
pred_ids = predictions.predictions
if isinstance(pred_ids, tuple):
    pred_ids = pred_ids[0]

# -100 (the ignore index) has no vocabulary entry and cannot be decoded, so
# map it back to the pad token id first — the same treatment compute_metrics
# applies before decoding.
pred_ids = np.where(pred_ids != -100, pred_ids, tokenizer.pad_token_id)
label_ids = np.where(predictions.label_ids != -100, predictions.label_ids, tokenizer.pad_token_id)

# Decode the model predictions and the ground truth labels.
decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

# Display the generated SQL queries alongside the ground truth.
for i, (pred, label) in enumerate(zip(decoded_preds, decoded_labels), start=1):
    print(f"\nSample {i}:")
    print("Generated SQL Query:", pred)
    print("Ground Truth SQL Query:", label)

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [None]:
from datasets import load_dataset

# Fetch the text-to-SQL corpus from the Hugging Face Hub and show its splits.
dataset = load_dataset(path="b-mc2/sql-create-context")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.43k [00:00<?, ?B/s]

sql_create_context_v4.json:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/78577 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['answer', 'question', 'context'],
        num_rows: 78577
    })
})


In [None]:
import pandas as pd

# Materialize the train split as a pandas DataFrame for inspection.
train_df = dataset["train"].to_pandas()

# Peek at the first five rows.
print(train_df.head(n=5))

                                              answer  \
0           SELECT COUNT(*) FROM head WHERE age > 56   
1  SELECT name, born_state, age FROM head ORDER B...   
2  SELECT creation, name, budget_in_billions FROM...   
3  SELECT MAX(budget_in_billions), MIN(budget_in_...   
4  SELECT AVG(num_employees) FROM department WHER...   

                                            question  \
0  How many heads of the departments are older th...   
1  List the name, born state and age of the heads...   
2  List the creation year, name and budget of eac...   
3  What are the maximum and minimum budget of the...   
4  What is the average number of employees of the...   

                                             context  
0                    CREATE TABLE head (age INTEGER)  
1  CREATE TABLE head (name VARCHAR, born_state VA...  
2  CREATE TABLE department (creation VARCHAR, nam...  
3  CREATE TABLE department (budget_in_billions IN...  
4  CREATE TABLE department (num_employees INTEGER..

In [None]:
from datasets import DatasetDict

# Hold out 20% of the data for validation. The seed makes the partition
# reproducible across kernel restarts, matching the seeded split used for the
# BART pipeline earlier in this notebook.
train_valid_split = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_data = train_valid_split['train']
validation_data = train_valid_split['test']

In [None]:
def preprocess_data(data):
    """Build the model input texts and target SQL strings from raw records.

    Args:
        data: An iterable of mappings, each with the keys 'context',
            'question' and 'answer'. It is traversed exactly once, so
            one-shot iterators and generators are supported.

    Returns:
        A ``(questions, sql_queries)`` tuple of two equally long lists:
        ``questions`` holds "<context> <question>" strings and
        ``sql_queries`` holds the matching 'answer' values.
    """
    questions = []
    sql_queries = []
    # A single pass fills both lists; iterating twice would leave the second
    # list empty for exhaustible iterators and repeat the traversal cost.
    for item in data:
        # The context is prepended so it becomes part of the question text.
        questions.append(f"{item['context']} {item['question']}")
        sql_queries.append(item['answer'])
    return questions, sql_queries

In [None]:
# Turn each split into parallel lists of input texts and target SQL strings.
train_questions, train_sql_queries = preprocess_data(data=train_data)
validation_questions, validation_sql_queries = preprocess_data(data=validation_data)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Word-level tokenizer shared by the questions and the SQL targets; words
# beyond the num_words cap are mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')  # Adjust vocab size as needed
tokenizer.fit_on_texts(train_questions + train_sql_queries)

# NOTE(review): max_seq_len is not passed to pad_sequences below, so each call
# pads to the longest sequence in its own input — confirm this is intended.
max_seq_len = 50

# Tokenize and pad sequences.
# Each SQL split is tokenized and padded once, then sliced into the decoder
# input (all tokens but the last) and the decoder target (all tokens but the
# first) instead of repeating the tokenization for each slice.
train_questions_seq = pad_sequences(tokenizer.texts_to_sequences(train_questions), padding='post')
train_sql_queries_seq = pad_sequences(tokenizer.texts_to_sequences(train_sql_queries), padding='post')
train_decoder_input = train_sql_queries_seq[:, :-1]
train_decoder_output = train_sql_queries_seq[:, 1:]

validation_questions_seq = pad_sequences(tokenizer.texts_to_sequences(validation_questions), padding='post')
validation_sql_queries_seq = pad_sequences(tokenizer.texts_to_sequences(validation_sql_queries), padding='post')
validation_decoder_input = validation_sql_queries_seq[:, :-1]
validation_decoder_output = validation_sql_queries_seq[:, 1:]

In [None]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Attention, Concatenate
from tensorflow.keras.models import Model

# hyperparameters
# The Keras tokenizer never emits an index >= num_words, and never more than
# one index per distinct word (index 0 is left for padding), so the embedding
# and output layers need exactly this many entries rather than a guessed size.
vocab_size = min(tokenizer.num_words, len(tokenizer.word_index) + 1)
embedding_dim = 256
lstm_units = 256

# Encoder: embeds the question tokens and returns the per-step outputs
# together with the final hidden and cell states.
encoder_input = Input(shape=(None,), name='encoder_input')
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_input)
encoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder: embeds the target tokens and starts from the encoder's final states.
decoder_input = Input(shape=(None,), name='decoder_input')
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_input)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Attention: decoder outputs are the query, encoder outputs are the value;
# the attended result is concatenated onto the decoder outputs.
attention_layer = Attention(name='attention_layer')
attention_result = attention_layer([decoder_outputs, encoder_outputs])
concat = Concatenate(axis=-1)([decoder_outputs, attention_result])

# Output layer: a softmax over the vocabulary at every decoder step.
output_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = output_dense(concat)


# Sparse categorical cross-entropy lets the targets stay as integer token ids.
model = Model([encoder_input, decoder_input], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
import numpy as np

# Re-wrap every padded sequence matrix as a NumPy array before training.
train_questions_seq, train_decoder_input, train_decoder_output = map(
    np.array,
    (train_questions_seq, train_decoder_input, train_decoder_output),
)

validation_questions_seq, validation_decoder_input, validation_decoder_output = map(
    np.array,
    (validation_questions_seq, validation_decoder_input, validation_decoder_output),
)

In [None]:
# # Prepare the decoder input and output
# train_decoder_input = train_sql_queries_seq[:, :-1]
# train_decoder_output = train_sql_queries_seq[:, 1:]

# dev_decoder_input = pad_sequences(tokenizer.texts_to_sequences(dev_sql_queries), padding='post')[:, :-1]
# dev_decoder_output = pad_sequences(tokenizer.texts_to_sequences(dev_sql_queries), padding='post')[:, 1:]

In [None]:
# # import numpy as np

# # train_questions_seq = np.array(train_questions_seq).astype(int)
# # train_decoder_input = np.array(train_decoder_input).astype(int)
# # train_decoder_output = np.array(train_decoder_output).astype(int)

# # dev_questions = np.array(dev_questions).astype(int)
# # dev_decoder_input = np.array(dev_decoder_input).astype(int)
# # dev_decoder_output = np.array(dev_decoder_output).astype(int)

# import numpy as np

# train_questions_seq = np.array(train_questions_seq)
# train_decoder_input = np.array(train_decoder_input)
# train_decoder_output = np.array(train_decoder_output)

# dev_questions_seq = np.array(validation_questions_seq)
# dev_decoder_input = np.array(dev_decoder_input)
# dev_decoder_output = np.array(dev_decoder_output)

In [None]:
# Training: the encoder and decoder inputs are fed together, the shifted
# decoder sequence is the target, and the validation arrays are scored
# alongside training.
model.fit(
    x=[train_questions_seq, train_decoder_input],
    y=train_decoder_output,
    epochs=10,
    batch_size=64,
    validation_data=([validation_questions_seq, validation_decoder_input], validation_decoder_output),
)

Epoch 1/10
[1m 14/983[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:18:25[0m 16s/step - accuracy: 0.6878 - loss: 7.9410

In [None]:
# NOTE(review): this "test" data is built from validation_data, the same split
# passed as validation_data to model.fit, so the score below is not measured
# on an independent hold-out set.
test_questions, test_sql_queries = preprocess_data(validation_data)
test_questions_seq = pad_sequences(tokenizer.texts_to_sequences(test_questions), padding='post')

# Tokenize and pad the SQL targets once, then slice into the decoder input
# (all tokens but the last) and the decoder target (all tokens but the first).
test_sql_queries_seq = pad_sequences(tokenizer.texts_to_sequences(test_sql_queries), padding='post')
test_decoder_input = test_sql_queries_seq[:, :-1]
test_decoder_output = test_sql_queries_seq[:, 1:]

# Evaluate the model
test_loss, test_accuracy = model.evaluate([test_questions_seq, test_decoder_input], test_decoder_output)
print(f'Test Accuracy: {test_accuracy}')


[1m497/497[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 27ms/step - accuracy: 0.9320 - loss: 0.3667
Test Accuracy: 0.9334960579872131
