In [None]:
# ===================== Training base T5 ==================== #
import sys
import random
import pandas as pd
import numpy as np
import torch
import tqdm
from datasets import load_dataset, DatasetDict
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

# Directory under which the model folders live (looks like the local
# Hugging Face hub cache -- confirm before reusing on another machine).
MODEL_BASE_DIR = "/Users/sree/.cache/huggingface/hub"
# Name of the pretrained checkpoint loaded as the starting point.
IN_MODEL_NAME = "t5-base"
# Name for the fine-tuned model; a templated suffix such as
# "-finetuned-{source_lang}-to-{target_lang}" was considered but is not used.
OUT_MODEL_NAME = "my--t5-base-finetuned-text-to-SQL"
# Target directory for the fine-tuned model, derived from the two names above.
OUTPUT_DIR = f"{MODEL_BASE_DIR}/models--{OUT_MODEL_NAME}"
# LOG_DIR = OUTPUT_DIR + "/logs"
# Instruction prefix prepended to every English question fed to the model.
TASK_PREFIX = "Translate English to SQL: "
# JSON file with the training records, passed to load_dataset.
DATA_PATH = "./data/my_flat_sql_data.json"

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed unchanged to each of the three generators.
    """
    # Same order as before: stdlib first, then NumPy, then PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)


# Fix the seed once at import time so runs are repeatable.
set_seed(42)

def initialize_optimizer(model, *, lr=3e-4, eps=1e-8, weight_decay=0.0):
    """Build an AdamW optimizer over two parameter groups of ``model``.

    Parameters whose name contains a bias or layer-norm-weight marker go into
    a group that is never weight-decayed; every other parameter goes into a
    group that uses ``weight_decay``.  With the defaults both groups end up
    with a decay of 0.0, which matches the previous hard-coded behaviour.

    Args:
        model: Any object exposing ``named_parameters()`` (a ``torch.nn.Module``).
        lr: Learning rate applied to both groups.
        eps: AdamW epsilon term.
        weight_decay: Decay applied to the first (decayed) group only.

    Returns:
        A ``torch.optim.AdamW`` instance with two parameter groups, in the
        order ``[decayed, not decayed]``.
    """
    # "LayerNorm.weight" is the BERT-style name; "layer_norm.weight" is added
    # because T5 appears to name its norms ``layer_norm``/``final_layer_norm``,
    # which the original marker would never match.
    no_decay = ("bias", "LayerNorm.weight", "layer_norm.weight")

    # Single pass over the parameters instead of scanning them twice.
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)

    grouped_parameters = [
        {"params": decay_params, "weight_decay": weight_decay},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]
    # torch.optim.AdamW replaces the deprecated transformers.AdamW; the decay
    # is always given per group, so torch's own default decay never applies.
    return torch.optim.AdamW(grouped_parameters, lr=lr, eps=eps)


def tokenize(tokenizer, text):
    """Encode ``text`` with ``tokenizer.encode_plus`` using fixed settings.

    Args:
        tokenizer: Object providing an ``encode_plus`` method.
        text: The string to encode.

    Returns:
        Whatever ``encode_plus`` returns when called with ``max_length=96``,
        ``padding=True``, ``truncation=True`` and ``return_tensors="pt"``.
    """
    encode_options = {
        "max_length": 96,
        "padding": True,
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer.encode_plus(text, **encode_options)

def process_data(record):
    """Run one training step on a single question/SQL record.

    Uses the module-level ``tokenizer``, ``model`` and ``optimizer``.

    Args:
        record: Mapping with a ``'question'`` entry (English text) and a
            ``'sql'`` entry (the target query).

    Returns:
        None; the function is called for its side effect on the model weights.
    """
    # Source side: task prefix + question (no explicit " </s>" is appended).
    source_text = TASK_PREFIX + record['question']
    target_text = record['sql']

    encoded_source = tokenize(tokenizer, source_text)
    encoded_target = tokenize(tokenizer, target_text)

    # The target token ids serve as labels; its mask is the decoder mask.
    result = model(
        input_ids=encoded_source["input_ids"],
        attention_mask=encoded_source["attention_mask"],
        labels=encoded_target["input_ids"],
        decoder_attention_mask=encoded_target["attention_mask"],
    )

    # The first element of the model output is taken as the loss.
    step_loss = result[0]
    step_loss.backward()
    optimizer.step()
    # Clear gradients so the next record starts from zero.
    optimizer.zero_grad()

# Initialize model, tokenizer & optimizer from the pretrained checkpoint.
tokenizer = T5Tokenizer.from_pretrained(IN_MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(IN_MODEL_NAME)
optimizer = initialize_optimizer(model)

# Load the JSON training data; without a split argument load_dataset returns
# a mapping of split name -> dataset.
my_ds = load_dataset('json', data_files = DATA_PATH)
print(my_ds)
epochs = 2

model.train(mode=True)
for epoch in range(epochs):
    print ("epoch ",epoch)
    # Iterate explicitly rather than through my_ds.map(process_data): map is a
    # dataset-transformation API, and process_data is run only for its
    # training side effect and returns nothing to store.  Every record of
    # every split is still visited once per epoch.
    for split_name, split in my_ds.items():
        for record in tqdm.tqdm(split, desc=split_name):
            process_data(record)

In [None]:
# Build the test prompt with the same task prefix used for training.
test_sent = TASK_PREFIX + "how many times did Lebron James win?"
test_tokenized = tokenizer.encode_plus(test_sent, return_tensors="pt")

test_input_ids  = test_tokenized["input_ids"]
test_attention_mask = test_tokenized["attention_mask"]

model.eval()
beam_outputs = model.generate(
    input_ids=test_input_ids,
    attention_mask=test_attention_mask,
    max_new_tokens=64,

    # ----- Top-k & top-p sampling ----- (noted as much faster than beam search)
    do_sample=True,
    temperature=0.95,
    top_k=5,
    # top_p is a cumulative probability mass and must lie in (0, 1]; the
    # previous value of 3 was out of range, so nucleus filtering could not
    # have been doing what was intended.
    top_p=0.95,
    num_return_sequences=1,

    # ----- Beam-search alternative (disable do_sample/top_k/top_p first) -----
    # early_stopping=True,
    # num_beams=10,
    # no_repeat_ngram_size=2,
    # num_return_sequences=5,  # must be <= num_beams
)

# Decode and print every returned sequence.
for beam_output in beam_outputs:
    output = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    print(output)