# Training Pipeline for Natural Language Question to SPARQL Query

In [1]:
#!pip install transformers evaluate torch

In [2]:
#!pip install sacrebleu

In [3]:
import sys
import os
import time
import re
import random
import evaluate

from collections import Counter
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GPTJModel, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
import sklearn.metrics as metric

from datasets import load_dataset, Dataset



## Select Model

In [4]:
selection = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(selection)
if selection=="t5-small":
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
elif selection=="yazdipour/text-to-sparql-t5-base":
    model = AutoModelForSeq2SeqLM.from_pretrained("yazdipour/text-to-sparql-t5-base")
else:
    model = GPTJModel.from_pretrained("hf-internal-testing/tiny-random-gptj")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


## Configure Environment

In [5]:
# set seed
random.seed(0)
np.random.seed(0)

## Import Data

In [6]:
dataset = load_dataset("lc_quad")

train = dataset["train"].to_pandas()
train = train.dropna(subset=['question', 'sparql_wikidata'])
train = Dataset.from_pandas(train)
train = train.train_test_split(test_size=0.2)


# test = dataset["test"].to_pandas()
# test = test.dropna(subset=['question', 'sparql_wikidata'])
# test = Dataset.from_pandas(test)

Found cached dataset lc_quad (/home/jstil/.cache/huggingface/datasets/lc_quad/default/2.0.0/139ee1f12aca006669dcc1f282ec02e126c69e7595453db443ab022643d54086)


  0%|          | 0/2 [00:00<?, ?it/s]

## Preprocess Data

In [7]:
X = "question"
Y = "sparql_wikidata"
prefix = "translate English to SPARQL: "
sacrebleu = evaluate.load("sacrebleu")

In [8]:
def preprocess_function(examples):
    inputs = [prefix + example for example in examples[X]]
    targets = [example for example in examples[Y]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

tokenized_data = train.map(preprocess_function, batched=True)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

## Helper Functions

In [9]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## Training Pipeline

In [10]:
# Configure Pipeline
shuffle = True

# hyperparams
buffer_size = 10000
batch_size = 32
epochs = 5
learning_rate = 5e-5
weight_decay=0.01
params = f"epochs={epochs}_lr={learning_rate}_wd={weight_decay}"

In [11]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"results/{selection}_{params}",
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: subgraph, sparql_wikidata, sparql_dbpedia18, template_index, template, question, __index_level_0__, NNQT_question, paraphrased_question, uid. If subgraph, sparql_wikidata, sparql_dbpedia18, template_index, template, question, __index_level_0__, NNQT_question, paraphrased_question, uid are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15433
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2415
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pa

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,0.636715,2.4947,19.0
2,1.420100,0.582108,3.4409,19.0
3,0.690800,0.566394,3.6147,19.0
4,0.631200,0.558675,3.7347,19.0
5,0.607300,0.556748,3.7715,19.0


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: subgraph, sparql_wikidata, sparql_dbpedia18, template_index, template, question, __index_level_0__, NNQT_question, paraphrased_question, uid. If subgraph, sparql_wikidata, sparql_dbpedia18, template_index, template, question, __index_level_0__, NNQT_question, paraphrased_question, uid are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3859
  Batch size = 32
Saving model checkpoint to results/t5-small_epochs=5_lr=5e-05_wd=0.01/checkpoint-500
Configuration saved in results/t5-small_epochs=5_lr=5e-05_wd=0.01/checkpoint-500/config.json
Model weights saved in results/t5-small_epochs=5_lr=5e-05_wd=0.01/checkpoint-500/pytorch_model.bin
tokenizer config file saved in results/t5-small_epochs=5_lr=5e-05_wd=0.01/checkpoint-500/tokenizer_config.json
Special tokens fi

TrainOutput(global_step=2415, training_loss=0.7969982771152788, metrics={'train_runtime': 658.7982, 'train_samples_per_second': 117.13, 'train_steps_per_second': 3.666, 'total_flos': 933075456688128.0, 'train_loss': 0.7969982771152788, 'epoch': 5.0})

## Inference

In [13]:
text = "translate English to SPARQL: What are the most common types of liver infection"

In [19]:
import os

def all_subdirs_of(b='.'):
    result = []
    for d in os.listdir(b):
        bd = os.path.join(b, d)
        if os.path.isdir(bd): result.append(bd)
    return result

latest_dir = all_subdirs_of(f"results/{selection}_{params}/")[-1]
print(latest_dir)

results/t5-small_epochs=5_lr=5e-05_wd=0.01/checkpoint-2000


In [20]:
translator = pipeline("translation", model=f"{latest_dir}")

loading configuration file results/t5-small_epochs=5_lr=5e-05_wd=0.01/checkpoint-2000/config.json
Model config T5Config {
  "_name_or_path": "results/t5-small_epochs=5_lr=5e-05_wd=0.01/checkpoint-2000",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams"

In [21]:
translator(text)

[{'translation_text': "SELECT DISTINCT ?sbj ?sbj_label WHERE  ?sbj wdt:P31 wd:Q5 . ?sbj rdfs:label ?sbj_label . FILTER(CONTAINS(lcase(?sbj_label), 'en')) . FILTER (lang(?sbj_label) = 'en')  LIMIT 25"}]