In [1]:
import transformers
from datasets import load_dataset, load_metric

In [2]:
wikisql_processed_dataset = load_dataset("csv", data_files="datasets/eng2SQL.csv")

wikisql_processed_dataset

Found cached dataset csv (/home/daniil/.cache/huggingface/datasets/csv/default-6466c2472f5c33b7/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'human_sql'],
        num_rows: 56092
    })
})

In [3]:
datasets_train_test = wikisql_processed_dataset["train"].train_test_split(test_size=2000)
datasets_train_validation = datasets_train_test["train"].train_test_split(test_size=2000)

wikisql_processed_dataset["train"] = datasets_train_validation["train"]
wikisql_processed_dataset["validation"] = datasets_train_validation["test"]
wikisql_processed_dataset["test"] = datasets_train_test["test"]

wikisql_processed_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'human_sql'],
        num_rows: 52092
    })
    validation: Dataset({
        features: ['question', 'human_sql'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['question', 'human_sql'],
        num_rows: 2000
    })
})

In [17]:
import nltk
nltk.download('punkt')
import string
from transformers import AutoTokenizer

model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)
print(type(tokenizer))

[nltk_data] Downloading package punkt to /home/daniil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>


In [5]:
# keep only a subsample of the datasets
wikisql_processed_dataset["train"] = wikisql_processed_dataset["train"].shuffle().select(range(50000))
wikisql_processed_dataset["validation"] = wikisql_processed_dataset["validation"].shuffle().select(range(1000))
wikisql_processed_dataset["test"] = wikisql_processed_dataset["test"].shuffle().select(range(1000))

wikisql_processed_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'human_sql'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['question', 'human_sql'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['question', 'human_sql'],
        num_rows: 1000
    })
})

In [6]:
prefix = "translate English to SQL: "
max_input_length = 512
max_target_length = 128

def preprocess_data(examples):
    inputs = [prefix + text for text in examples["question"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["human_sql"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
tokenized_datasets = wikisql_processed_dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [9]:
batch_size = 8
model_name = "t5-small-english-to-sql-translation"
model_dir = f"models/{model_name}"

print(transformers.__version__)

args = Seq2SeqTrainingArguments(
    model_dir,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    report_to="tensorboard"
)

4.28.0


In [10]:
import numpy as np

data_collator = DataCollatorForSeq2Seq(tokenizer)

metric = load_metric("bleu")

def compute_metrics(eval_pred):
    predictions, references = eval_pred
    predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    predictions = [pred.split() for pred in predictions]
    
    # Replace -100 in the labels as we can't decode them.
    references = np.where(references != -100, references, tokenizer.pad_token_id)
    references = tokenizer.batch_decode(references, skip_special_tokens=True)
    references = [ref.split() for ref in references]
    references = [[ref] for ref in references]
    
    
    # Compute BLEU scores
    result = metric.compute(predictions=predictions, references=references)

    # Extract ROUGE f1 scores
    # result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length to metrics
    # prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    # result["gen_len"] = np.mean(prediction_lens)
    
    return result

  metric = load_metric("bleu")


In [11]:
# Function that returns an untrained model to be trained
def model_init():
    return AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [47]:
# Start TensorBoard before training to monitor it in progress
%load_ext tensorboard
%tensorboard --logdir '{model_dir}'/runs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 35473), started 0:47:28 ago. (Use '!kill 35473' to kill it.)

In [46]:
trainer.train()



Step,Training Loss,Validation Loss,Bleu,Precisions,Brevity Penalty,Length Ratio,Translation Length,Reference Length
100,1.7141,0.673105,0.246682,"[0.6257223656707243, 0.3915603532875368, 0.2809391814772542, 0.20580218068535824]",0.715035,0.748826,8133,10861
200,0.7439,0.464659,0.386579,"[0.7621794085163823, 0.5770037767519933, 0.48642055618799807, 0.39522237327636434]",0.716912,0.750299,8149,10861
300,0.5571,0.387855,0.415098,"[0.7820873608222195, 0.6128537571448487, 0.5258383282034668, 0.4390102455055094]",0.719724,0.752509,8173,10861
400,0.4689,0.334806,0.441577,"[0.8002186057809084, 0.6433508432402544, 0.557106191851139, 0.4749713412304165]",0.726844,0.758125,8234,10861
500,0.4247,0.300943,0.458524,"[0.8058475292980548, 0.6629105400577161, 0.5776644894057671, 0.49933674436232706]",0.731842,0.762085,8277,10861
600,0.4144,0.273629,0.470772,"[0.8129600579220466, 0.6780568135034993, 0.5947192619691427, 0.5190088897295253]",0.733002,0.763005,8287,10861
700,0.3806,0.257351,0.478449,"[0.8162429208338354, 0.6857103712837375, 0.6050166693125894, 0.5319871673900736]",0.734392,0.76411,8299,10861
800,0.3319,0.246412,0.486605,"[0.8209386281588448, 0.6964432284541724, 0.6160063391442155, 0.5435028248587571]",0.735666,0.765123,8310,10861
900,0.3316,0.233286,0.492325,"[0.8258095582039244, 0.7050773231148214, 0.6247027112731885, 0.5524778594309403]",0.735318,0.764847,8307,10861
1000,0.3119,0.221475,0.500129,"[0.8316914944738106, 0.713817586018569, 0.6337760910815939, 0.5627347858752817]",0.737285,0.766412,8324,10861


Trainer is attempting to log a value of "[0.6257223656707243, 0.3915603532875368, 0.2809391814772542, 0.20580218068535824]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.7621794085163823, 0.5770037767519933, 0.48642055618799807, 0.39522237327636434]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.7820873608222195, 0.6128537571448487, 0.5258383282034668, 0.4390102455055094]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.8002186057809084, 0.6433508432402544, 0.557106191851139, 0.4749713412304165]" of type <class 'list'> for key "

Trainer is attempting to log a value of "[0.8523506071900926, 0.7559108924422577, 0.6838689251226848, 0.6204626669174347]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.8510280149092221, 0.7549542162088287, 0.6826025011872724, 0.6185819070904646]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.8526366482061161, 0.7580071174377224, 0.6868062163019346, 0.6241990199773841]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.85252014916396, 0.7586489812662383, 0.68715349279265, 0.6245059288537549]" of type <class 'list'> for key "eval/p

Trainer is attempting to log a value of "[0.8553012048192771, 0.7635616438356164, 0.6944444444444444, 0.6326415094339622]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.8553012048192771, 0.7635616438356164, 0.6944444444444444, 0.6326415094339622]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


TrainOutput(global_step=6250, training_loss=0.2762901531982422, metrics={'train_runtime': 1560.7577, 'train_samples_per_second': 32.036, 'train_steps_per_second': 4.004, 'total_flos': 696972212699136.0, 'train_loss': 0.2762901531982422, 'epoch': 1.0})

In [12]:
model_name = "t5-small-english-to-sql-translation/checkpoint-6200"
model_dir = f"models/{model_name}"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

max_input_length = 512

In [13]:
text = "Who had the fastest lap on [table_30134667_2][Date]?"

inputs = ["translate English to SQL: " + text]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=4, max_length=128)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_query = nltk.sent_tokenize(decoded_output.strip())[0]

print(predicted_query)

SELECT Fastest Lap FROM table_30134667_2 WHERE Date = [table_30134667_2][Date]


In [14]:
text = "How many teams have [table_1969634_1][Nickname] as a nickname?"

inputs = ["translate English to SQL: " + text]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=4, max_length=128)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_query = nltk.sent_tokenize(decoded_output.strip())[0]

print(predicted_query)

SELECT COUNT Team FROM table_1969634_1 WHERE Nickname = [table_1969634_1][Nickname]


In [15]:
text = "What is [table_19744915_16][Couple]'s result?"

inputs = ["translate English to SQL: " + text]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=4, max_length=128)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_query = nltk.sent_tokenize(decoded_output.strip())[0]

print(predicted_query)

SELECT Result FROM table_19744915_16 WHERE Cup = [table_19744915_16][Couple]


In [16]:
text = "If the track is the [table_22670216_1][Track] and the winning driver is [table_22670216_1][Winning@Driver], what was the location?"

inputs = ["translate English to SQL: " + text]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=4, max_length=128)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
predicted_query = nltk.sent_tokenize(decoded_output.strip())[0]

print(predicted_query)

SELECT Location FROM table_22670216_1 WHERE Track = [table_22670216_1][Track] AND Winning Driver = [table_22670216_1][Winning@Driver]
