# Training Pipeline for Natural Language Question to SPARQL Query

In [4]:
!pip install transformers evaluate torch

Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Downloading torch-1.13.1-cp310-cp310-manylinux1_x86_64.whl (887.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
Collecting nvidia-cuda-runtime-cu11==11.7.99
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-cublas-cu11==11.10.3.66
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cudnn-cu11==8.5.0.96
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)
[2K     [90m━━━━━━━━━━━━

In [5]:
import os
import random
import re
import sys
import time
from collections import Counter

import evaluate
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GPTJModel, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

## Select Model

In [6]:
# Checkpoint to fine-tune; the tokenizer is always loaded from the same name.
selection = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(selection)

# Both supported T5 checkpoints are loaded under their own name as seq2seq LMs.
if selection in ("t5-small", "yazdipour/text-to-sparql-t5-base"):
    model = AutoModelForSeq2SeqLM.from_pretrained(selection)
else:
    # NOTE(review): this fallback pairs the tokenizer of `selection` with an
    # unrelated tiny GPT-J test checkpoint — confirm it is intended.
    model = GPTJModel.from_pretrained("hf-internal-testing/tiny-random-gptj")

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


ImportError: 
AutoModelForSeq2SeqLM requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.


## Configure Environment

In [None]:
# Seed every random number generator the pipeline touches so runs are repeatable:
# Python's `random`, NumPy's global RNG, and PyTorch (used during training).
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

## Import Data

In [None]:
!wget https://raw.githubusercontent.com/AskNowQA/LC-QuAD2.0/master/dataset/train.json
!wget https://raw.githubusercontent.com/AskNowQA/LC-QuAD2.0/master/dataset/test.json

In [None]:
# Location of the LC-QuAD 2.0 splits.
TRAIN_URL = 'https://raw.githubusercontent.com/AskNowQA/LC-QuAD2.0/master/dataset/train.json'
TEST_URL = 'https://raw.githubusercontent.com/AskNowQA/LC-QuAD2.0/master/dataset/test.json'

# Read both splits into DataFrames; the unsplit training frame keeps its own
# name so it stays available after the train/validation split below.
train_full = pd.read_json(TRAIN_URL)
test = pd.read_json(TEST_URL)

# Hold out 20% of the training data for validation. An explicit random_state
# makes the split independent of the global RNG state (and thus of cell order).
train, val = train_test_split(train_full, test_size=0.2, random_state=0)
train.head()

## Feature Selection

In [None]:
# Column holding the natural-language input (X) and the SPARQL query to predict (Y).
X, Y = "question", "sparql_wikidata"

## Helper Functions

In [None]:
# Field names of the source text and target query, plus the task prefix that
# is prepended to every input before tokenization.
source_lang, target_lang = X, Y
prefix = "translate English to SPARQL: "


def preprocess_function(examples, max_length=128):
    """Tokenize a batch of question/query pairs for seq2seq training.

    Args:
        examples: A batch in one of two layouts:
            * a mapping with a "translation" entry holding one dict per
              example, each keyed by ``source_lang`` and ``target_lang``; or
            * a columnar mapping whose ``source_lang`` and ``target_lang``
              entries are parallel sequences of strings.
        max_length: Token limit passed to the tokenizer (with truncation)
            for both the inputs and the targets.

    Returns:
        The tokenizer output for the prefixed inputs, with the targets
        supplied through ``text_target``.
    """
    if "translation" in examples:
        pairs = examples["translation"]
        sources = [pair[source_lang] for pair in pairs]
        targets = [pair[target_lang] for pair in pairs]
    else:
        # Flat layout: one column per field, as in the LC-QuAD frames.
        sources = list(examples[source_lang])
        targets = list(examples[target_lang])

    inputs = [prefix + source for source in sources]
    return tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True)

In [None]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Each reference is additionally wrapped in a one-element list, so the
    second return value is a list of lists.

    Returns:
        A ``(predictions, references)`` tuple.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def _get_bleu_metric():
    """Return the scorer used by compute_metrics, creating it on first use.

    A notebook-level ``metric`` is reused when one already exists; otherwise
    the sacreBLEU metric is loaded through ``evaluate`` and stored under that
    name. sacreBLEU matches how the result is consumed: references are passed
    as lists of strings and the result is read from its "score" entry.
    """
    global metric
    if "metric" not in globals():
        metric = evaluate.load("sacrebleu")
    return metric


def compute_metrics(eval_preds):
    """Compute BLEU and mean generation length for one evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may be a tuple, in which case its first element
            is used.

    Returns:
        A dict with "bleu" and "gen_len", both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and is not a valid token id, so swap it for
    # the pad token before decoding (predictions can carry it as padding too).
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = _get_bleu_metric().compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generation length = number of non-pad tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## Training Pipeline

In [None]:
# Pipeline switches
shuffle = True

# Optimisation hyperparameters
buffer_size, batch_size = 10000, 32
epochs = 2
learning_rate, weight_decay = 5e-5, 0.01

# Tag describing this run's hyperparameters
params = f"epochs={epochs}_lr={learning_rate}_wd={weight_decay}"

In [None]:
# Batch collator for seq2seq examples, built from the selected model and tokenizer.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

In [None]:
def _to_features(frame):
    """Tokenize a DataFrame of question/query pairs into per-example features.

    ``train`` and ``val`` are pandas DataFrames, which have no batched
    ``map`` that could apply ``preprocess_function``; the rows are therefore
    handed to it as a single "translation" batch and the tokenizer output is
    split back into one dict per example.

    Args:
        frame: DataFrame with the ``source_lang`` and ``target_lang`` columns.

    Returns:
        A list with one feature dict per usable row.
    """
    # Rows missing either field cannot be tokenized; the string cast keeps
    # the prefix concatenation in preprocess_function from failing on
    # non-string cells.
    pairs = frame[[source_lang, target_lang]].dropna().astype(str)
    encoded = preprocess_function({"translation": pairs.to_dict("records")})
    return [
        {name: values[i] for name, values in encoded.items()}
        for i in range(len(pairs))
    ]


training_args = Seq2SeqTrainingArguments(
    output_dir=f"results/{selection}_{params}",
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

# A plain list of feature dicts is a map-style dataset (__len__ + __getitem__);
# padding to a common length is left to the data collator.
train_features = _to_features(train)
val_features = _to_features(val)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_features,
    eval_dataset=val_features,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()