In [29]:
import transformers
from datasets import load_dataset, load_metric

In [30]:
bird_processed_dataset = load_dataset("csv", data_files="datasets/BIRD/train/train_initial.csv")

bird_processed_dataset

Found cached dataset csv (/home/daniil/.cache/huggingface/datasets/csv/default-2a679b917dd5f579/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'sql'],
        num_rows: 9428
    })
})

In [32]:
datasets_train_test = bird_processed_dataset["train"].train_test_split(test_size=262)
datasets_train_validation = datasets_train_test["train"].train_test_split(test_size=700)

bird_processed_dataset["train"] = datasets_train_validation["train"]
bird_processed_dataset["validation"] = datasets_train_validation["test"]
bird_processed_dataset["test"] = datasets_train_test["test"]

bird_processed_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'sql'],
        num_rows: 7504
    })
    validation: Dataset({
        features: ['question', 'sql'],
        num_rows: 700
    })
    test: Dataset({
        features: ['question', 'sql'],
        num_rows: 262
    })
})

In [33]:
import nltk
nltk.download('punkt')
import string
from transformers import AutoTokenizer

model_pretrained = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_pretrained, model_max_length=512)

[nltk_data] Downloading package punkt to /home/daniil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [35]:
bird_processed_dataset["train"] = bird_processed_dataset["train"].shuffle()
bird_processed_dataset["validation"] = bird_processed_dataset["validation"].shuffle()
bird_processed_dataset["test"] = bird_processed_dataset["test"].shuffle()

bird_processed_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'sql'],
        num_rows: 7504
    })
    validation: Dataset({
        features: ['question', 'sql'],
        num_rows: 700
    })
    test: Dataset({
        features: ['question', 'sql'],
        num_rows: 262
    })
})

In [36]:
prefix = "translate English to SQL "
max_input_length = 512
max_target_length = 128

def prepare_data(examples):
    inputs = [prefix + text for text in examples["question"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["sql"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [37]:
tokenized_datasets = bird_processed_dataset.map(prepare_data, batched=True)

Map:   0%|          | 0/7504 [00:00<?, ? examples/s]



Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Map:   0%|          | 0/262 [00:00<?, ? examples/s]

In [38]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [39]:
batch_size = 8
model_name = "t5-small-english-to-sql-bird-translation"
model_dir = f"models/{model_name}"

args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=8e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=8,
    predict_with_generate=True,
    fp16=False,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    report_to="tensorboard"
)

In [40]:
import numpy as np

data_collator = DataCollatorForSeq2Seq(tokenizer)

metric = load_metric("bleu")

def compute_metrics(eval_pred):
    predictions, references = eval_pred
    predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    predictions = [pred.split() for pred in predictions]
    
    # Replace -100 in the labels as we can't decode them.
    references = np.where(references != -100, references, tokenizer.pad_token_id)
    references = tokenizer.batch_decode(references, skip_special_tokens=True)
    references = [ref.split() for ref in references]
    references = [[ref] for ref in references]
    
    
    # Compute BLEU scores
    result = metric.compute(predictions=predictions, references=references)
    
    
    return result

  metric = load_metric("bleu")


In [41]:
# Function that returns an untrained model to be trained
def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_pretrained)

trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [42]:
# Start TensorBoard before training to monitor it in progress
%load_ext tensorboard
%tensorboard --logdir '{model_dir}'/runs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 5676), started 0:04:29 ago. (Use '!kill 5676' to kill it.)

In [43]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Bleu,Precisions,Brevity Penalty,Length Ratio,Translation Length,Reference Length
100,3.58,2.236026,0.002941,"[0.3135593220338983, 0.06330087134802666, 0.024359775140537165, 0.012688342585249802]",0.059085,0.26118,4602,17620
200,2.3224,1.755156,0.039452,"[0.5778282746307417, 0.28090093389489107, 0.208779668136946, 0.12168792934249265]",0.155684,0.349659,6161,17620
300,2.0138,1.548743,0.035521,"[0.5925305643945737, 0.28059191804211725, 0.20126886895646467, 0.11654232055569848]",0.142142,0.338876,5971,17620
400,1.7886,1.43732,0.037385,"[0.5970149253731343, 0.2872420262664165, 0.20647948164146868, 0.12039624079248158]",0.146306,0.342225,6030,17620
500,1.6801,1.351371,0.035515,"[0.592462311557789, 0.2806451612903226, 0.20021881838074398, 0.11729827275070895]",0.142072,0.33882,5970,17620
600,1.5991,1.278411,0.035359,"[0.5928023992002666, 0.27706525839305923, 0.19556714471968709, 0.11216389244558259]",0.144325,0.340636,6002,17620
700,1.4946,1.230341,0.035957,"[0.5896460469732054, 0.2759072203516648, 0.1941455015066724, 0.11198378515328097]",0.147442,0.343133,6046,17620
800,1.4746,1.186313,0.035451,"[0.5947679324894515, 0.2872727272727273, 0.20508287292817678, 0.12101411395713539]",0.138922,0.336266,5925,17620
900,1.4255,1.145833,0.031192,"[0.5968870234347674, 0.28736548425667596, 0.20078740157480315, 0.11350455675227837]",0.124743,0.324518,5718,17620
1000,1.3576,1.11835,0.034597,"[0.5969257045260461, 0.2892337536372454, 0.20763187429854096, 0.12370311252992817]",0.13407,0.332293,5855,17620


Trainer is attempting to log a value of "[0.3135593220338983, 0.06330087134802666, 0.024359775140537165, 0.012688342585249802]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.5778282746307417, 0.28090093389489107, 0.208779668136946, 0.12168792934249265]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.5925305643945737, 0.28059191804211725, 0.20126886895646467, 0.11654232055569848]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.5970149253731343, 0.2872420262664165, 0.20647948164146868, 0.12039624079248158]" of type <class 'list'> 

Trainer is attempting to log a value of "[0.6112981668537224, 0.2993973310374516, 0.21211353269133298, 0.1337030191004313]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.6178451178451179, 0.3056392595781317, 0.21743537759756715, 0.13639162561576354]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.6183364839319471, 0.30915032679738563, 0.21825192802056556, 0.13713212273011896]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.6236258617477175, 0.31733447610884935, 0.22586337282581295, 0.144124847001224]" of type <class 'list'> for 

Trainer is attempting to log a value of "[0.6339437417406079, 0.334130954970633, 0.23915832691814215, 0.16041275797373358]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.6357075023741691, 0.33997809419496167, 0.24372574385510995, 0.16392924826279218]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.6362773029439696, 0.3417305585980285, 0.2463130659767141, 0.16803537586860393]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.6370692919348732, 0.3419903972064601, 0.24678001030396704, 0.16588124410933083]" of type <class 'list'> for 

TrainOutput(global_step=7504, training_loss=1.0775056307885185, metrics={'train_runtime': 1799.0405, 'train_samples_per_second': 33.369, 'train_steps_per_second': 4.171, 'total_flos': 556692550189056.0, 'train_loss': 1.0775056307885185, 'epoch': 8.0})

In [53]:
model_name = "t5-small-english-to-sql-bird-translation/checkpoint-10000"
model_dir = f"models/{model_name}"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

max_input_length = 512

In [54]:
text = "for movie_platform database: What is the name of the longest movie title? When was it released?"

inputs = ["translate English to SQL " + text]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=4, max_length=128)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_query = nltk.sent_tokenize(decoded_output.strip())[0]

print(predicted_query) # SELECT movie_title, movie_release_year FROM movies ORDER BY LENGTH(movie_popularity) DESC LIMIT 1

SELECT T2.title FROM movie AS T1 INNER JOIN movie_platform AS T2 ON T1.movie_id = T2.movie_id WHERE T1.movie_title = 'Last'
