# Training Pipeline for Natural Language Question to SPARQL Query

In [None]:
!pip install transformers evaluate torch

In [15]:
!pip install sacrebleu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable
Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting tabulate>=0.8.9
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Collecting lxml
  Downloading lxml-4.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting portalocker
  Downloading portalocker-2.7.0-py2.py3

In [16]:
import sys
import os
import time
import re
import random
import evaluate

from collections import Counter
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GPTJModel, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from datasets import load_dataset, Dataset

## Select Model

In [2]:
selection = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(selection)
if selection=="t5-small":
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
elif selection=="yazdipour/text-to-sparql-t5-base":
    model = AutoModelForSeq2SeqLM.from_pretrained("yazdipour/text-to-sparql-t5-base")
else:
    model = GPTJModel.from_pretrained("hf-internal-testing/tiny-random-gptj")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


## Configure Environment

In [3]:
# set seed
random.seed(0)
np.random.seed(0)

## Import Data

In [6]:
dataset = load_dataset("lc_quad")

train = dataset["train"].to_pandas()
train = train.dropna(subset=['question', 'sparql_wikidata'])
train, exclude = train_test_split(train, test_size=0.8) # making dataset small for pipeline testing
train = Dataset.from_pandas(train)
train = train.train_test_split(test_size=0.2)


# test = dataset["test"].to_pandas()
# test = test.dropna(subset=['question', 'sparql_wikidata'])
# test = Dataset.from_pandas(test)

Found cached dataset lc_quad (/home/jstil/.cache/huggingface/datasets/lc_quad/default/2.0.0/139ee1f12aca006669dcc1f282ec02e126c69e7595453db443ab022643d54086)


  0%|          | 0/2 [00:00<?, ?it/s]

## Preprocess Data

In [17]:
X = "question"
Y = "sparql_wikidata"
prefix = "translate English to SPARQL: "
sacrebleu = evaluate.load("sacrebleu")

In [8]:
def preprocess_function(examples):
    inputs = [prefix + example[X] for example in examples]
    targets = [example[Y] for example in examples]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

training = preprocess_function(train["train"])
validation = preprocess_function(train["test"])
# testing = preprocess_function(test)

In [9]:
print(training)

{'input_ids': [[13959, 1566, 12, 6760, 4280, 2247, 434, 10, 2645, 19, 7291, 1138, 2565, 31, 7, 2472, 138, 1236, 58, 1], [13959, 1566, 12, 6760, 4280, 2247, 434, 10, 366, 410, 2318, 23, 172, 86, 4268, 32, 2318, 521, 836, 26551, 911, 8, 2760, 38, 2698, 4737, 13, 8, 5197, 13, 3068, 13, 17665, 58, 1], [13959, 1566, 12, 6760, 4280, 2247, 434, 10, 363, 19, 8, 2214, 564, 13, 11065, 8561, 189, 24, 65, 8, 7072, 13, 8177, 7, 58, 1], [13959, 1566, 12, 6760, 4280, 2247, 434, 10, 363, 33, 8, 4913, 690, 84, 456, 28, 8, 2068, 3, 776, 1635, 1], [13959, 1566, 12, 6760, 4280, 2247, 434, 10, 1129, 66, 8, 18631, 2602, 6, 149, 186, 33, 5571, 58, 1], [13959, 1566, 12, 6760, 4280, 2247, 434, 10, 2645, 33, 8, 4999, 2765, 13, 2549, 25182, 58, 1], [13959, 1566, 12, 6760, 4280, 2247, 434, 10, 363, 19, 8, 3244, 16, 8, 936, 18557, 358, 6, 24, 19, 8, 4818, 13, 46, 7353, 1722, 3735, 58, 1], [13959, 1566, 12, 6760, 4280, 2247, 434, 10, 3520, 8, 240, 1647, 3812, 13, 8, 350, 5, 4729, 382, 4081, 12, 314, 28212, 58, 1], 

## Helper Functions

In [10]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## Training Pipeline

In [11]:
# Configure Pipeline
shuffle = True

# hyperparams
buffer_size = 10000
batch_size = 32
epochs = 2
learning_rate = 5e-5
weight_decay=0.01
params = f"epochs={epochs}_lr={learning_rate}_wd={weight_decay}"

In [12]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [18]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"results/{selection}_{params}",
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=training,
    eval_dataset=validation,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
***** Running training *****
  Num examples = 3
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2


AttributeError: 'tokenizers.Encoding' object has no attribute 'keys'