In [None]:
#!pip install transformers
#!pip install transformers==4.1.1
!pip install transformers
!pip install sentencepiece
!pip install -q datasets sacrebleu


import os
from os import listdir
from os.path import isfile, join
import shutil
from pathlib import Path
import json
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, precision_score, recall_score, f1_score
import numpy as np
from datasets import Dataset
from datasets import load_metric
import random

from google.colab import drive
# Mount Google Drive at /gdrive and make it the working directory; the relative
# data folders configured later (SENTENCES_FOLDER / SCHEMA_FOLDER) are resolved
# against this directory.
drive.mount('/gdrive')
%cd /gdrive

In [None]:
from transformers import AutoModelWithLMHead, AutoTokenizer

In [None]:
# Load tokenizer and weights of the public WikiSQL-finetuned T5 checkpoint.
# These globals are read by get_sql() and `model` is the object later handed to
# the first Seq2SeqTrainer.
# NOTE(review): AutoModelWithLMHead is deprecated in recent transformers
# releases (AutoModelForSeq2SeqLM is the T5 equivalent) — confirm against the
# installed version.
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")
model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")

In [None]:
def get_sql(query):
  """Translate an English sentence to SQL with the global `tokenizer`/`model`.

  The sentence is wrapped in the "translate English to SQL: ... </s>" prompt,
  encoded as a one-element batch of PyTorch tensors and passed to
  `model.generate` with its default generation settings. The first generated
  sequence is decoded without stripping special tokens.
  """
  prompt = "translate English to SQL: %s </s>" % query
  encoded = tokenizer([prompt], return_tensors='pt')
  generated = model.generate(
      input_ids=encoded['input_ids'],
      attention_mask=encoded['attention_mask'],
  )
  first_sequence = generated[0]
  return tokenizer.decode(first_sequence)

In [None]:
# Smoke-test the baseline on a sentence that is ambiguous on purpose
# ("cards" could refer to red or yellow cards).
query = "messi has 12 cards" ## red or yellow ?
get_sql(query)

# Load Pythia data

In [None]:
import re
# Shared seeded RNG used by the data helpers below (schema shuffling in
# loadDataFromFile and "none" subsampling in removeNone).
generator = random.Random(42) ## deterministic results

def cleanQuery(query, tableName):
  """Normalise a SQL query string so it can be used as a training target.

  Steps, in order:
    1. drop a leading ``id, `` / ``index, `` column right after ``SELECT``
       (case-insensitive);
    2. rewrite the first ``),`` of the query as ``) ,`` (kept so that queries
       without CONCAT are formatted exactly as before);
    3. if ``CONCAT(`` is present, remove everything from it up to and
       including the first ``)`` + ``,`` that follows it, plus any whitespace
       after the comma. Nested calls inside CONCAT are not handled: the first
       ``)`` followed by a comma is taken as the end of the call;
    4. replace every occurrence of ``tableName`` with ``dataframe``;
    5. turn newlines and tabs into spaces, collapse every run of spaces into
       a single space and strip the result.

  Args:
    query: raw SQL text.
    tableName: table name to rename; an empty or None name is skipped.

  Returns:
    The cleaned, single-line query.
  """
  query = re.sub("SELECT id, ", "SELECT ", query, flags=re.IGNORECASE)
  query = re.sub("SELECT index, ", "SELECT ", query, flags=re.IGNORECASE)
  query = query.replace("),", ") ,", 1)
  indexStart = query.find("CONCAT(")
  if indexStart != -1:
    # Look for the closing ")," only *after* the CONCAT call starts, and
    # consume the whitespace that actually follows the comma instead of
    # assuming exactly one character.
    closing = re.compile(r"\)\s*,\s*").search(query, indexStart)
    if closing is not None:
      query = query[:indexStart] + query[closing.end():]
  if tableName:
    # str.replace with an empty pattern would insert "dataframe" between
    # every character, so empty names are ignored.
    query = query.replace(tableName, "dataframe")
  query = query.replace("\n", " ").replace("\t", " ")
  # Collapse runs of any length (a single "  " -> " " pass leaves leftovers).
  query = re.sub(r" {2,}", " ", query).strip()
  return query

def loadSchema(fileSchema):
  """Read a dataset schema JSON file.

  The file must contain a ``datasetName`` entry and an ``attributes`` list
  whose items each carry a ``name``.

  Args:
    fileSchema: path of the schema JSON file.

  Returns:
    ``(attrNames, tableName)``: the attribute names in file order and the
    dataset name.
  """
  # The context manager closes the handle even if parsing fails.
  with open(fileSchema) as f:
    dataset = json.load(f)
  attrNames = [attr['name'] for attr in dataset['attributes']]
  return attrNames, dataset['datasetName']

#def loadDataFromFile(file, schemaString, dropDuplicates = True):
def loadDataFromFile(file, attrList, tableName, dropDuplicates = True, shuffleSchema = True):
  """Build (sentence, query) examples from a sentence JSON file.

  Each record of the file is expected to provide ``sentence``, ``matchType``,
  ``a_query`` and ``dataframe`` (a mapping whose keys are column names).
  Records whose ``matchType`` is ``"uniform_true"`` are skipped. For the rest:

  * the sentence gets a trailing ``.`` if missing, followed by
    ``schema:<col>|<col>|...``;
  * the target is the cleaned ``a_query`` when ``matchType`` is
    ``"NO_AMBIGUITY"`` and the literal ``"none"`` otherwise.

  Args:
    file: path of the sentence JSON file.
    attrList: attribute names of the full table, or None. When given, up to
      three of them that are not already columns of the record (``id`` and
      ``index`` excluded) are appended, the columns are shuffled with the
      module-level ``generator`` and ``id``/``index`` are dropped. When None,
      the schema is the record's ``dataframe`` keys in their stored order.
      The caller's list is never modified.
    tableName: table name replaced by ``dataframe`` in the query.
    dropDuplicates: keep only the first example for each distinct raw
      ``a_query``.
    shuffleSchema: reshuffle the attribute list before picking the extras.

  Returns:
    List of ``{"sentence": ..., "query": ...}`` dicts.
  """
  with open(file) as f:
    data = json.load(f)
  # Private copy: it is shuffled cumulatively across records, exactly like the
  # original list would be, but the caller's list keeps its order.
  attr = list(attrList) if attrList is not None else None
  examples = []
  querySet = set()
  for example in data:
    sentence = example['sentence']
    if not sentence.endswith("."):
      sentence += "."
    label = example['matchType']
    if label == "uniform_true":
      continue
    queryOrig = example['a_query']
    query = cleanQuery(queryOrig, tableName)
    if attr is None:
      schema = "|".join(example['dataframe'].keys())
    else:
      if shuffleSchema:
        generator.shuffle(attr)
      columns = list(example['dataframe'].keys())
      # Slicing copes with tables that have fewer than three attributes.
      for a in attr[:3]:
        if a not in columns and a.lower() not in ("id", "index"):
          columns.append(a)
      generator.shuffle(columns)
      if "id" in columns: columns.remove("id")
      if "index" in columns: columns.remove("index")
      schema = "|".join(columns)
    sentence = sentence + "schema:" + schema
    if label != "NO_AMBIGUITY":
      query = "none"
    if dropDuplicates:
      # De-duplicate on the raw query text, not on the cleaned one.
      if queryOrig in querySet:
        continue
      querySet.add(queryOrig)
    examples.append({"sentence": sentence, "query": query})
  return examples

def loadData(folder, attrList = None, tableName = None, dropDuplicates = True, shuffleSchema = True):
  """Load the examples of every regular file located directly in `folder`.

  Files are visited in sorted name order and read with `loadDataFromFile`;
  `attrList`, `tableName`, `dropDuplicates` and `shuffleSchema` are forwarded
  to it unchanged.

  Args:
    folder: directory containing the sentence JSON files (with or without a
      trailing separator).
    attrList: forwarded as the table's attribute list (None = use each
      record's own columns).
    tableName: forwarded as the table name to rename in the queries.
    dropDuplicates: forwarded.
    shuffleSchema: forwarded.

  Returns:
    The concatenated list of examples; empty when the folder holds no files.
  """
  examples = []
  for name in sorted(listdir(folder)):
    path = join(folder, name)
    if isfile(path):
      examples += loadDataFromFile(path, attrList, tableName, dropDuplicates, shuffleSchema)
  return examples

In [None]:
# NOTE(review): `mappings` is not defined in any visible earlier cell; it is
# expected to be a list of (sentenceFile, schemaFile) pairs — confirm where it
# comes from (the later cell defines mappingsTrain / mappingsTest instead).
examples = []
for sentenceFile, schemaFile in mappings:
  print("Sentence File:", sentenceFile)
  print("Schema File:", schemaFile)
  # loadSchema returns (attribute names, table name); loadDataFromFile needs
  # them as two separate arguments.
  attrNames, tableName = loadSchema(schemaFile)
  examplesInFile = loadDataFromFile(sentenceFile, attrNames, tableName)
  print(len(examplesInFile))
  examples += examplesInFile


In [None]:
# Print the baseline prediction next to the expected query for the first
# 20 loaded examples.
for ex in examples[:20]:
  sentence, expectedQ = ex['sentence'], ex['query']
  predictedQ = get_sql(sentence)
  print("Sentence:", sentence)
  print("Excpected:", expectedQ)
  print("Predicted:", predictedQ)
  print("*" * 50)

In [None]:
# Hold out 20% of `examples` as a test split (fixed seed, shuffled).
# Both names are reassigned by the mappings-based loading cell further down.
train, test = train_test_split(examples, test_size=0.2, random_state=42, shuffle=True)

In [None]:
def removeNone(dataset, percentageToRemove, rng = None):
  """Randomly drop examples whose query is "none", in place.

  Every example with ``example['query'] == "none"`` is removed with
  probability `percentageToRemove`; all other examples are kept and the
  relative order of the survivors is preserved.

  Args:
    dataset: list of example dicts; modified in place.
    percentageToRemove: fraction (0.0-1.0) of "none" examples to drop.
    rng: optional object providing ``uniform(a, b)`` (e.g. ``random.Random``);
      the module-level ``generator`` is used when omitted.

  Returns:
    None.
  """
  if rng is None:
    rng = generator
  # The `and` short-circuits, so one random draw is made per "none" example
  # only. Slice assignment keeps the caller's list object.
  dataset[:] = [
      example for example in dataset
      if not (example['query'] == "none" and rng.uniform(0, 1) < percentageToRemove)
  ]

In [None]:
# Data locations, relative to the current working directory.
SENTENCES_FOLDER = "MyDrive/nl2sql/" ## DATA FOLDER. Change it. Dataset used available at: https://drive.google.com/drive/folders/1EQ7C4PXhK2xinutK2ONcM9LjO9kB9J9T?usp=sharing 
SCHEMA_FOLDER = "MyDrive/nl2sql/schema/"

# Each mapping is (sentence file, schema file, percentage). The loading loop
# below computes `1.0 - percentage` and, when that is positive, passes it to
# removeNone to subsample the "none" examples of that file.
mappingsTrain = [
            (SENTENCES_FOLDER+"adults_sentence.json",                     SCHEMA_FOLDER+"adults.json", 1.0),
            (SENTENCES_FOLDER+"heart_2020_sentence.json",                 SCHEMA_FOLDER+"heart_2020.json", 1.0),
            (SENTENCES_FOLDER+"laptop_sentence.json",                     SCHEMA_FOLDER+"laptop.json", 1.0), ##90
            (SENTENCES_FOLDER+"soccer_small_sentence.json",               SCHEMA_FOLDER+"soccer_small.json", 1.0),
            (SENTENCES_FOLDER+"mushroom_sentence.json",                   SCHEMA_FOLDER+"mushroom.json", 1.0),
            (SENTENCES_FOLDER+"superstore_sentence.json",                 SCHEMA_FOLDER+"superstore.json", 1.0),
            ##### FROM TEMPLATES 
            (SENTENCES_FOLDER+"adults_template_sentence.json",                     SCHEMA_FOLDER+"adults.json", 1.0),
            (SENTENCES_FOLDER+"heart_2020_template_sentence.json",                 SCHEMA_FOLDER+"heart_2020.json", 1.0),
            (SENTENCES_FOLDER+"laptop_template_sentence.json",                     SCHEMA_FOLDER+"laptop.json", 1.0), ##95
            (SENTENCES_FOLDER+"soccer_small_template_sentence.json",               SCHEMA_FOLDER+"soccer_small.json", 1.0), ##95
            (SENTENCES_FOLDER+"mushroom_template_sentence.json",                   SCHEMA_FOLDER+"mushroom.json", 1.0),
            (SENTENCES_FOLDER+"superstore_template_sentence.json",                 SCHEMA_FOLDER+"superstore.json", 1.0),
            ]


# Test mappings use datasets that do not appear in mappingsTrain.
mappingsTest = [
            (SENTENCES_FOLDER+"abalone_sentence.json",                    SCHEMA_FOLDER+"abalone.json", 1.0),
            (SENTENCES_FOLDER+"basket_acronyms_sentence.json",            SCHEMA_FOLDER+"basket_acronyms.json", 0.8),
            (SENTENCES_FOLDER+"iris_sentence.json",                       SCHEMA_FOLDER+"iris.json", 1.0),
            (SENTENCES_FOLDER+"wineqt_sentence.json",                     SCHEMA_FOLDER+"wineqt.json", 1.0),
            ##### FROM TEMPLATES 
            (SENTENCES_FOLDER+"abalone_template_sentence.json",                    SCHEMA_FOLDER+"abalone.json", 1.0),
            (SENTENCES_FOLDER+"basket_acronyms_template_sentence.json",            SCHEMA_FOLDER+"basket_acronyms.json", 0.9),
            (SENTENCES_FOLDER+"iris_template_sentence.json",                       SCHEMA_FOLDER+"iris.json", 1.0),
            (SENTENCES_FOLDER+"basket_full_names_template_sentence.json",            SCHEMA_FOLDER+"basket_full_names.json", 0.9),
            ]

def _loadMappings(mappings):
  """Load and concatenate the examples of every mapping.

  Args:
    mappings: iterable of (sentenceFile, schemaFile, percentage) tuples.
      When ``1.0 - percentage`` is positive it is passed to removeNone to
      subsample the "none" examples of that file.

  Returns:
    The concatenated list of examples, in mapping order.
  """
  loaded = []
  for sentenceFile, schemaFile, percentage in mappings:
    print("Sentence File:", sentenceFile)
    print("Schema File:", schemaFile)
    attrNames, tableName = loadSchema(schemaFile)
    examplesInFile = loadDataFromFile(sentenceFile, attrNames, tableName)
    print(len(examplesInFile))
    percentageRemove = 1.0 - percentage
    if percentageRemove > 0:
      removeNone(examplesInFile, percentageRemove)
      print("Filtered dataset. Add: ", len(examplesInFile), "examples")
    loaded += examplesInFile
  return loaded

shuffle = True
# Train files are loaded first, then test files, so the shared RNG is consumed
# in a fixed order.
train = _loadMappings(mappingsTrain)
random.Random(42).shuffle(train)
print("*******************************")
test = _loadMappings(mappingsTest)

# Carve a 20% validation split out of the training examples.
train, validation = train_test_split(train, test_size=0.2, random_state=42, shuffle=True)

In [None]:
# Sanity check: split sizes and one sample example from each split.
print(len(train))
print(len(validation))
print(len(test))
print(train[0])
print(validation[2])
print(test[1])

In [None]:
## reduce the size of the train
# When `red` is True, keep only the first 300 training examples.
red = True
if red:
  train = train[0:300]
print(len(train))

In [None]:
percentageToRemove = 0.4
sampleTrain = False
sampleTest = False

# Optional subsampling of the "none" examples (disabled by default). The work
# is delegated to removeNone so the sampling rule lives in a single place.
if sampleTrain:
  removeNone(train, percentageToRemove)

if sampleTest:
  removeNone(test, percentageToRemove)

In [None]:
def toDict(dataset):
  """Convert a sequence of example dicts into a column-oriented dict.

  Returns ``{"sentence": [...], "query": [...]}`` where both lists follow the
  order of `dataset`.
  """
  # Single pass over the input, then split the pairs into two columns.
  rows = [(example['sentence'], example['query']) for example in dataset]
  return {
      "sentence": [sentence for sentence, _ in rows],
      "query": [query for _, query in rows],
  }

# The training set is train + validation (pass `train` alone if the validation
# split must stay held out). The concatenation is done inline so that
# re-running this cell does not append `validation` to `train` again.
trainDataset = Dataset.from_dict(toDict(train + validation))
validationDataset = Dataset.from_dict(toDict(validation))
testDataset = Dataset.from_dict(toDict(test))

In [None]:
def format_dataset(example):
  """Map one {'sentence', 'query'} example to the model's {'input', 'target'} pair.

  The input is the sentence prefixed with the task prompt; the target is the
  query, unchanged.
  """
  # Spelling of the prefix is kept as is: the same literal is used when
  # prompting the model at inference time.
  prompt = 'transalte to AMBSQL: '
  model_input = prompt + example['sentence']
  return {'input': model_input, 'target': example['query']}

In [None]:
# Replace the raw 'sentence'/'query' columns with the 'input'/'target' columns
# produced by format_dataset, for every split.
trainDataset = trainDataset.map(format_dataset, remove_columns=trainDataset.column_names)
validationDataset = validationDataset.map(format_dataset, remove_columns=validationDataset.column_names)
testDataset = testDataset.map(format_dataset, remove_columns=testDataset.column_names)

In [None]:
# Show the first formatted example of each split.
print(trainDataset[0])
print(validationDataset[0])
print(testDataset[0])

In [None]:
def distributionFromData(dataset):
  """Count examples whose target is "none" versus an actual query.

  `dataset` must support ``__len__`` and integer indexing, and each item must
  expose a ``'target'`` entry.

  Returns:
    ``{"query": <count of non-"none" targets>, "none": <count of "none">}``.
  """
  total = dataset.__len__()
  noneCount = 0
  for idx in range(total):
    if dataset[idx]['target'] == "none":
      noneCount += 1
  return {"query": total - noneCount, "none": noneCount}

# Class balance ("query" vs "none" targets) of each split.
freqTrain = distributionFromData(trainDataset)
freqTest = distributionFromData(testDataset)
freqValidation = distributionFromData(validationDataset)
print(freqTrain)
print(freqValidation)
print(freqTest)

In [None]:
# Token-length statistics used to choose the max input/output lengths.
def map_to_length(x):
  """Add tokenized lengths and over-threshold flags to example `x`.

  Sets `input_len` / `out_len` (number of token ids of `input` / `target`)
  and 0/1 flags telling whether each length exceeds 256, 128 or 64 tokens.
  """
  x["input_len"] = len(tokenizer(x["input"]).input_ids)
  x["input_longer_256"] = int(x["input_len"] > 256)
  x["input_longer_128"] = int(x["input_len"] > 128)
  x["input_longer_64"] = int(x["input_len"] > 64)
  x["out_len"] = len(tokenizer(x["target"]).input_ids)
  x["out_longer_256"] = int(x["out_len"] > 256)
  x["out_longer_128"] = int(x["out_len"] > 128)
  x["out_longer_64"] = int(x["out_len"] > 64)
  return x

# Clamp the sample to the dataset size: select() fails on indices past the end.
sample_size = min(300, len(trainDataset))
data_stats = trainDataset.select(range(sample_size)).map(map_to_length, num_proc=4)

def compute_and_print_stats(x):
  """Print mean lengths and the fraction of examples over each threshold.

  Only prints when the batch holds the whole sample (the map call below asks
  for a single batch with batch_size=-1). The values labelled "%-" are
  fractions in [0, 1], not percentages.
  """
  if sample_size > 0 and len(x["input_len"]) == sample_size:
    print(
        "Input Mean: {}, %-Input > 256:{},  %-Input > 128:{}, %-Input > 64:{} Output Mean:{}, %-Output > 256:{}, %-Output > 128:{}, %-Output > 64:{}".format(
            sum(x["input_len"]) / sample_size,
            sum(x["input_longer_256"]) / sample_size,
            sum(x["input_longer_128"]) / sample_size,
            sum(x["input_longer_64"]) / sample_size,
            sum(x["out_len"]) / sample_size,
            sum(x["out_longer_256"]) / sample_size,
            sum(x["out_longer_128"]) / sample_size,
            sum(x["out_longer_64"]) / sample_size,
        )
    )

output = data_stats.map(
  compute_and_print_stats, 
  batched=True,
  batch_size=-1,
)

In [None]:
# tokenize the examples

INPUT_LENGTH = 128
OUTPUT_LENGTH = 64

def convert_to_features(example_batch):
    """Tokenize a batch of {'input', 'target'} examples into model features.

    Inputs are padded/truncated to INPUT_LENGTH tokens and targets to
    OUTPUT_LENGTH tokens, so every row has the same length.

    Returns:
        Dict with `input_ids`, `attention_mask`, `labels` (target token ids)
        and `decoder_attention_mask`.
    """
    # padding='max_length' replaces the deprecated pad_to_max_length=True;
    # truncation=True is what actually enforces max_length on longer rows.
    input_encodings = tokenizer.batch_encode_plus(example_batch['input'], padding='max_length', truncation=True, max_length=INPUT_LENGTH)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target'], padding='max_length', truncation=True, max_length=OUTPUT_LENGTH)

    # NOTE(review): padded label positions keep the pad token id instead of
    # -100, so they presumably contribute to the training loss — confirm this
    # is intended.
    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

In [None]:
# Tokenize every split from its own dataset. The validation features must come
# from validationDataset, not from the training set.
train_data = trainDataset.map(convert_to_features, batched=True, remove_columns=trainDataset.column_names)
validation_data = validationDataset.map(convert_to_features, batched=True, remove_columns=validationDataset.column_names)
test_data = testDataset.map(convert_to_features, batched=True, remove_columns=testDataset.column_names)

columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']

# Return PyTorch tensors for the feature columns.
train_data.set_format(type='torch', columns=columns)
validation_data.set_format(type='torch', columns=columns)
test_data.set_format(type='torch', columns=columns)

In [None]:
# Inspect the first tokenized training example.
train_data[0]

In [None]:
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments

In [None]:
# set training arguments - Feel free to adapt it

# NOTE(review): the directory name says "t5-small", but the `model` loaded
# earlier in the notebook comes from a "t5-base" checkpoint — confirm naming.
MODEL_DIR = "/content/t5-small-finetuned-amb-sql"
EPOCHS = 20
BATCH_SIZE = 16

# Evaluate and save a checkpoint once per epoch; save_total_limit is set to
# EPOCHS + 1.
# NOTE(review): load_best_model_at_end is enabled without an explicit
# metric_for_best_model — presumably the evaluation loss is used; confirm.
training_args = Seq2SeqTrainingArguments(
    output_dir= MODEL_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    per_device_eval_batch_size=BATCH_SIZE,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    do_train=True,
    do_eval=True,
    logging_steps=500,
    save_strategy="epoch",
    #save_steps=1000,
    #eval_steps=1000,
    overwrite_output_dir=True,
    save_total_limit=EPOCHS + 1,
    load_best_model_at_end=True,
    push_to_hub=False
    #fp16=True, 
)

In [None]:
# sacreBLEU metric object used by compute_metrics and by the final BLEU cell.
# NOTE(review): datasets.load_metric is deprecated in recent `datasets`
# releases — confirm it is still available in the installed version.
metric = load_metric("sacrebleu")

In [None]:
from datasets.utils.file_utils import T
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and labels.

    Returns:
        ``(preds, labels)`` where `preds` is a list of stripped strings and
        each stripped label is wrapped in its own single-element list.
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())
    wrapped_labels = []
    for label in labels:
        wrapped_labels.append([label.strip()])
    return cleaned_preds, wrapped_labels
def compute_metrics(eval_preds):
    """Compute BLEU, generation length and "none"-detection scores.

    Args:
        eval_preds: ``(preds, labels)`` pair of token-id arrays; `preds` may
            be a tuple, in which case its first element is used.

    Returns:
        Dict with `bleu` and `gen_len` (both rounded to 4 decimals) plus
        `prec`, `rec` and `f1`, where the positive class is a decoded text
        equal to the literal "none".
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # -100 marks ignored label positions and cannot be decoded: swap in the pad id.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    gen_len = np.mean([np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds])
    result = {"bleu": round(bleu["score"], 4), "gen_len": round(gen_len, 4)}

    # Binary view of the task: is the reference / the prediction "none"?
    # Each reference is a one-element list after postprocess_text.
    binaryLabels = np.array([refs[0] == "none" for refs in decoded_labels], dtype=bool)
    binaryPred = np.array([pred == "none" for pred in decoded_preds], dtype=bool)
    result["prec"] = precision_score(binaryLabels, binaryPred)
    result["rec"] = recall_score(binaryLabels, binaryPred)
    result["f1"] = f1_score(binaryLabels, binaryPred)
    return result

In [None]:
# instantiate trainer
# NOTE(review): evaluation during training runs on test_data (validation_data
# is commented out) while training_args enables load_best_model_at_end, so the
# checkpoint selection appears to use the test set — confirm this is intended.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
    #eval_dataset=validation_data,
)

In [None]:
# Metrics of the not-yet-fine-tuned model on the first 100 test examples
# (requires the test set to hold at least 100 rows).
trainer.evaluate(eval_dataset=test_data.select(range(100)))

In [None]:
# Fine-tune on train_data with the arguments configured in training_args.
trainer.train()

In [None]:
# Quick check on the first 40 test examples after training.
trainer.evaluate(eval_dataset=test_data.select(range(40)))

In [None]:
# Metrics on the full test set after training.
trainer.evaluate(eval_dataset=test_data)

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
# Rebind the global `model` to weights reloaded from a saved checkpoint.
# NOTE(review): the step number 552 is hard-coded; it must match a checkpoint
# directory actually written under MODEL_DIR by the training run — confirm.
model = T5ForConditionalGeneration.from_pretrained(MODEL_DIR+"/checkpoint-552")

In [None]:
def translate(text):
    """Generate the model's output text for `text`.

    Uses the global `tokenizer` and `model`. The input is truncated to
    INPUT_LENGTH tokens and generation is capped at OUTPUT_LENGTH tokens.

    Args:
        text: full prompt, including the task prefix.

    Returns:
        The first generated sequence decoded with special tokens removed.
    """
    # truncation=True is required for max_length to take effect on the input.
    inputs = tokenizer(text, padding='longest', truncation=True, max_length=INPUT_LENGTH, return_tensors='pt')
    output = model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_length=OUTPUT_LENGTH)

    return tokenizer.decode(output[0], skip_special_tokens=True)


In [None]:
testDataset = Dataset.from_dict(toDict(test))
# Show predictions for at most the first 200 test examples; the bound is
# clamped so that a smaller test set does not raise an IndexError.
for i in range(min(200, len(testDataset))):
  print('translate to SQL: ' + testDataset[i]['sentence'])
  print('Predict.: ' + translate('transalte to AMBSQL: ' + testDataset[i]['sentence']))
  print('Expected: ' + testDataset[i]['query'])
  print('=================================\n')

In [None]:
#sentence = "curry has 12 rebounds"
#sentence = "curry has 12 field goals"
sentence = "player had 12 fouls"
#sentence = "player has 44 appearences"
#schema = "schema: name|3 point goals|1 point score|field goal|3 point field goal|reb|dreb|foul|appearences"
schema = "schema:field goal|3 point field goal|fouls"

#sentence = "messi has 12 cards"
#schema = "schema: player|goals|appearences|red cards|yellow cards|minutes"
# loadDataFromFile always ends the sentence with "." right before "schema:",
# so the hand-written prompt is put in the same format.
if not sentence.endswith("."):
  sentence += "."
predText = sentence + schema
print('Predict.: ' + translate('transalte to AMBSQL: ' + predText))

In [None]:
# Delete the whole training output directory (all checkpoints under MODEL_DIR).
# ignore_errors=True suppresses failures, so the message below is printed even
# if the directory did not exist or could not be removed.
dir_path = MODEL_DIR
shutil.rmtree(dir_path, ignore_errors=True)
print("Deleted '%s' directory successfully" % dir_path)

In [None]:
# Separate copies of the public WikiSQL checkpoint, kept under their own names
# so the fine-tuned `model` is not overwritten.
tokenizerBaseline = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")
modelBaseline = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")

In [None]:
def get_sql(query):
  """Translate an English sentence to SQL with the baseline checkpoint.

  Encoding, generation and decoding all use `tokenizerBaseline` /
  `modelBaseline`, so the result does not depend on whichever model the
  global `tokenizer` currently belongs to.

  Args:
    query: English sentence.

  Returns:
    The first generated sequence, decoded without stripping special tokens.
  """
  input_text = "translate English to SQL: %s </s>" % query
  features = tokenizerBaseline([input_text], return_tensors='pt')

  output = modelBaseline.generate(input_ids=features['input_ids'], 
               attention_mask=features['attention_mask'])
  
  # Decode with the tokenizer that produced the input ids.
  return tokenizerBaseline.decode(output[0])

In [None]:
# Confusion counts for "none" detection on the test set. The positive class is
# "none": tp = predicted and expected are both "none", tn = neither is.
testDataset = Dataset.from_dict(toDict(test))
tp = fp = tn = fn = 0
tpList, fpList, tnList, fnList = [], [], [], []
for i in range(len(testDataset)):
  query = testDataset[i]['sentence']
  predicted = translate('transalte to AMBSQL: ' + testDataset[i]['sentence'])
  expected = testDataset[i]['query']
  sentence = testDataset[i]['sentence']
  if predicted == "none":
    if expected == "none":
      tp += 1
      tpList.append((predicted, expected, sentence))
    else:
      fp += 1
      fpList.append((predicted, expected, sentence))
  else:
    if expected == "none":
      fn += 1
      fnList.append((predicted, expected, sentence))
    else:
      tn += 1
      tnList.append((predicted, expected, sentence))

In [None]:
# Precision / recall / F1 / accuracy from the confusion counts. Every ratio
# falls back to 0.0 when its denominator is zero instead of raising
# ZeroDivisionError.
if tp + fp == 0:
  precision = 0.0
else:
  precision = tp / (tp + fp)
if tp + fn == 0:
  recall = 0.0
else:
  recall = tp / (tp + fn)
if precision + recall == 0:
  f1 = 0
else:
  f1 = (2 * precision * recall)/(precision + recall)
total = tp + tn + fp + fn
if total == 0:
  accuracy = 0.0
else:
  accuracy = (tp + tn) / total

print(precision, recall, f1, accuracy,  sep="\t")
print("TP:", tp)
print("FP:", fp)
print("TN:", tn)
print("FN:", fn)

In [None]:
# BLEU restricted to the pairs in tnList, i.e. examples where neither the
# prediction nor the expected query is "none".
#eval_preds = []
preds = []
expecteds = []
for predicted, expected, sentence in tnList:
  # Drop a leftover "<pad> " marker from the prediction text, if present.
  pred = predicted.replace("<pad> ", "")
  #print("Pred:", pred)
  #print("Expe:", expected)
  #print("Sent:", sentence)
  #print('=================================\n')
  preds.append(pred)
  expecteds.append(expected)
  #eval_preds.append((pred, expected))

# postprocess_text strips whitespace and wraps each reference in a list, the
# shape passed to metric.compute below.
preds, expecteds = postprocess_text(preds, expecteds)
resultBleu = metric.compute(predictions=preds, references=expecteds)
print(resultBleu)

## FINE TUNE A BIGGER MODEL

In [None]:
! nvidia-smi

In [None]:
! pip install -q transformers datasets

In [None]:
# Second experiment: start from the plain 't5-large' checkpoint.
# This rebinds the globals `tokenizer` and `model`, which the helper functions
# defined earlier (translate, convert_to_features, compute_metrics) read at
# call time.
CKPT = 't5-large'
from transformers import AutoTokenizer, T5ForConditionalGeneration
tokenizer = AutoTokenizer.from_pretrained(CKPT)
model = T5ForConditionalGeneration.from_pretrained(CKPT)

In [None]:
from datasets import load_dataset

# Load the 'wikisql' dataset; the cells below use its train, validation and
# test splits.
ds_1 = load_dataset('wikisql')

In [None]:
def format_dataset_1(example):
  """Map one WikiSQL example to the model's {'input', 'target'} pair.

  The input is the question prefixed with the task prompt; the target is the
  human-readable SQL stored under ``example['sql']['human_readable']``.
  """
  prompt = 'translate to SQL: '
  model_input = prompt + example['question']
  target_sql = example['sql']['human_readable']
  return {'input': model_input, 'target': target_sql}

In [None]:
# Keep only the 'input'/'target' columns produced by format_dataset_1; the
# columns to drop are taken from the train split.
ds_1 = ds_1.map(format_dataset_1, remove_columns=ds_1['train'].column_names)

In [None]:
from datasets import concatenate_datasets

# Train on WikiSQL train + validation; evaluate on the test split. Both are
# shuffled with a fixed seed. These names replace the train_data/test_data of
# the first experiment.
train_data = concatenate_datasets([ds_1['train'], ds_1['validation']]).shuffle(seed=42)
test_data = concatenate_datasets([ds_1['test']]).shuffle(seed=42)

In [None]:
# Token-length statistics used to choose the max input/output lengths.
def map_to_length(x):
  """Add tokenized lengths and over-threshold flags to example `x`.

  Sets `input_len` / `out_len` (number of token ids of `input` / `target`)
  and 0/1 flags telling whether each length exceeds 256, 128 or 64 tokens.
  """
  x["input_len"] = len(tokenizer(x["input"]).input_ids)
  x["input_longer_256"] = int(x["input_len"] > 256)
  x["input_longer_128"] = int(x["input_len"] > 128)
  x["input_longer_64"] = int(x["input_len"] > 64)
  x["out_len"] = len(tokenizer(x["target"]).input_ids)
  x["out_longer_256"] = int(x["out_len"] > 256)
  x["out_longer_128"] = int(x["out_len"] > 128)
  x["out_longer_64"] = int(x["out_len"] > 64)
  return x

# Clamp the sample to the dataset size: select() fails on indices past the end.
sample_size = min(10000, len(train_data))
data_stats = train_data.select(range(sample_size)).map(map_to_length, num_proc=4)

def compute_and_print_stats(x):
  """Print mean lengths and the fraction of examples over each threshold.

  Only prints when the batch holds the whole sample (the map call below asks
  for a single batch with batch_size=-1). The values labelled "%-" are
  fractions in [0, 1], not percentages.
  """
  if sample_size > 0 and len(x["input_len"]) == sample_size:
    print(
        "Input Mean: {}, %-Input > 256:{},  %-Input > 128:{}, %-Input > 64:{} Output Mean:{}, %-Output > 256:{}, %-Output > 128:{}, %-Output > 64:{}".format(
            sum(x["input_len"]) / sample_size,
            sum(x["input_longer_256"]) / sample_size,
            sum(x["input_longer_128"]) / sample_size,
            sum(x["input_longer_64"]) / sample_size,
            sum(x["out_len"]) / sample_size,
            sum(x["out_longer_256"]) / sample_size,
            sum(x["out_longer_128"]) / sample_size,
            sum(x["out_longer_64"]) / sample_size,
        )
    )

output = data_stats.map(
  compute_and_print_stats, 
  batched=True,
  batch_size=-1,
)

In [None]:
# tokenize the examples
INPUT_ENCODINGS_SIZE_T5_LARGE = 64
TARGET_ENCODINGS_SIZE_T5_LARGE = 64
def convert_to_features(example_batch):
    """Tokenize a batch of {'input', 'target'} examples into model features.

    Inputs are padded/truncated to INPUT_ENCODINGS_SIZE_T5_LARGE tokens and
    targets to TARGET_ENCODINGS_SIZE_T5_LARGE tokens, so every row has the
    same length.

    Returns:
        Dict with `input_ids`, `attention_mask`, `labels` (target token ids)
        and `decoder_attention_mask`.
    """
    # padding='max_length' replaces the deprecated pad_to_max_length=True;
    # truncation=True is what actually enforces max_length on longer rows.
    input_encodings = tokenizer.batch_encode_plus(example_batch['input'], padding='max_length', truncation=True, max_length=INPUT_ENCODINGS_SIZE_T5_LARGE)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target'], padding='max_length', truncation=True, max_length=TARGET_ENCODINGS_SIZE_T5_LARGE)

    # NOTE(review): padded label positions keep the pad token id instead of
    # -100, so they presumably contribute to the training loss — confirm this
    # is intended.
    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings

In [None]:
# Tokenize both WikiSQL splits, dropping the text columns. The names are
# overwritten with their tokenized versions, so this cell is meant to be run
# once after the splits are built.
train_data = train_data.map(convert_to_features, batched=True, remove_columns=train_data.column_names)
test_data = test_data.map(convert_to_features, batched=True, remove_columns=test_data.column_names)

columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']

# Return PyTorch tensors for the feature columns.
train_data.set_format(type='torch', columns=columns)
test_data.set_format(type='torch', columns=columns)

In [None]:
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments

In [None]:
# set training arguments - Feel free to adapt it
# One epoch with batch size 8; evaluate and checkpoint at the end of the epoch,
# keeping at most 3 checkpoints in the output directory.
EPOCHS = 1
BATCH_SIZE = 8
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/t5-large-finetuned-wikisql-nl-sql",
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    per_device_eval_batch_size=BATCH_SIZE,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    do_train=True,
    do_eval=True,
    logging_steps=500,
    save_strategy="epoch",
    #save_steps=1000,
    #eval_steps=1000,
    overwrite_output_dir=True,
    save_total_limit=3,
    load_best_model_at_end=True,
    push_to_hub=False
    #fp16=True, 
)

In [None]:
! pip install -q sacrebleu

In [None]:
import numpy as np
from datasets import load_metric

# sacreBLEU metric object used by compute_metrics below.
# NOTE(review): datasets.load_metric is deprecated in recent `datasets`
# releases — confirm it is still available in the installed version.
metric = load_metric("sacrebleu")

In [None]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and labels.

    Returns:
        ``(preds, labels)`` where `preds` is a list of stripped strings and
        each stripped label is wrapped in its own single-element list.
    """
    stripped_preds = []
    wrapped_labels = []
    for pred in preds:
        stripped_preds.append(pred.strip())
    for label in labels:
        reference = label.strip()
        wrapped_labels.append([reference])
    return stripped_preds, wrapped_labels
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation pass.

    Args:
        eval_preds: ``(preds, labels)`` pair of token-id arrays; `preds` may
            be a tuple, in which case its first element is used.

    Returns:
        Dict with `bleu` and `gen_len` (mean count of non-pad prediction
        tokens), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    pad_id = tokenizer.pad_token_id
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # -100 marks ignored label positions and cannot be decoded: swap in the pad id.
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    gen_len = np.mean([np.count_nonzero(pred != pad_id) for pred in preds])
    return {"bleu": round(bleu["score"], 4), "gen_len": round(gen_len, 4)}

In [None]:
# instantiate trainer
# Trains the t5-large `model` on the tokenized WikiSQL train_data and evaluates
# on test_data with the BLEU-based compute_metrics.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
)

In [None]:
# Fine-tune t5-large on WikiSQL with the arguments configured in training_args.
trainer.train()

In [None]:
from google.colab import auth
# Authenticate the Colab user; the gsutil copy commands below access a
# Google Cloud Storage bucket.
auth.authenticate_user()

In [None]:
# Upload the training output directory (with its checkpoints) to the bucket.
!gsutil cp -r '/content/t5-large-finetuned-wikisql-nl-sql/' 'gs://pythia_t5/nl-sql/'

## Code to automatically click the Colab connect button (run it in the browser console, cmd+alt+i)
```
function KeepClicking(){
console.log("Clicking");
document.querySelector("colab-connect-button").click()
}
setInterval(KeepClicking,60000)
```



In [None]:
# Download one saved checkpoint from the bucket into /content/ (the commented
# line is the reverse, upload direction).
#!gsutil cp -r '/content/t5-large-finetuned-wikisql-nl-sql/' 'gs://pythia_t5/nl-sql/'
!gsutil cp -r 'gs://pythia_t5/nl-sql/t5-large-finetuned-wikisql-nl-sql/checkpoint-8097/' '/content/'

In [None]:
# Local path of the checkpoint downloaded by the previous cell; this rebinds
# the MODEL_DIR used by the first experiment.
MODEL_DIR = "/content/checkpoint-8097/"

In [None]:
# Load the downloaded checkpoint for inference.
#model = AutoModelWithLMHead.from_pretrained(MODEL_DIR)
from transformers import AutoTokenizer, T5ForConditionalGeneration
# NOTE(review): the tokenizer is taken from the t5-base WikiSQL checkpoint
# while the weights in MODEL_DIR appear to come from the 't5-large' training
# run above — presumably both use the same T5 vocabulary; confirm.
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")
model = T5ForConditionalGeneration.from_pretrained(MODEL_DIR)