In [None]:
# ===================== Training T5 Model ======================#
# ==============================================================#

import sys, os
from os import environ
from dotenv import load_dotenv
import logging
from logging.config import fileConfig

import datasets
import numpy as np
import evaluate
from datasets import load_dataset, load_metric, DatasetDict
from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import Trainer, TrainingArguments, AdamW
from transformers import DataCollatorForSeq2Seq

# ==============================================================#
# ====================== GLOBAL VARIABLES ======================#

# Debug counter read and updated by check_interrupt(); 0 leaves that helper inert.
GlobalVar           = 0
# Batch sizes referenced only by the disabled Seq2SeqTrainingArguments sketch in
# get_training_args(); the active TrainingArguments there use a batch size of 1.
train_batch_size    = 8
eval_batch_size     = 8
# Truncation limits (in tokens) applied by preprocess_data() to questions and SQL targets.
max_input_length    = 512
max_target_length   = 64

# NOTE(review): MODEL_DIR is a machine-specific absolute path; in the visible code it is
# only mentioned inside the disabled configuration string of get_training_args().
MODEL_DIR           = "/Users/sree/.cache/huggingface/hub/models--mrm8488--t5-base-finetuned-wikiSQL/"
MODEL_NAME          = "mrm8488/t5-base-finetuned-wikiSQL"

# Dataset selection: either a local JSON file (under DATASET_PATH) or the hub dataset name.
DS_LOCAL            = 'my_sql_data.json'
DS_HUGGINGFACE      = 'wikisql'
USE_LOCAL_DATASET   = False
DATASET             = DS_LOCAL if USE_LOCAL_DATASET else DS_HUGGINGFACE
# When sampling is on, load_data() keeps only this many shuffled rows per split.
DATA_SAMPLING       = True
SAMPLING_SPLIT      = {'test':50, 'train':20, 'validation':20}

DATASET_PATH         = "../data/dataset/"
# Location where tokenize_dataset() saves / reloads the tokenized dataset;
# the "-sample" suffix keeps sampled and full encodings apart.
ENCODED_DATASET_PATH = "../data/encoded-" + DATASET
ENCODED_DATASET_PATH += "-sample" if DATA_SAMPLING else ""
# NOTE(review): this prefix has a space before the colon, whereas get_input_text() uses
# "translate English to SQL: " -- confirm which spelling is intended.
TASK_PREFIX         = "translate English to SQL : "

# Initialize the model and its tokenizer from the hub checkpoint.
# (use_fast=True can be passed to the tokenizer to speed up tokenization; it is not set here.)
tokenizer           = AutoTokenizer.from_pretrained(MODEL_NAME)
model               = AutoModelWithLMHead.from_pretrained(MODEL_NAME)

# Initialize the optimizer with two parameter groups, split on whether the parameter
# name contains one of the `no_decay` substrings.
# NOTE(review): both groups use weight_decay 0.0, so the split currently has no effect,
# and the visible get_trainer() does not pass this optimizer to the Trainer -- confirm intent.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-4, eps=1e-8)

# Placeholder arguments; the main program section reassigns training_args
# from get_training_args().
training_args       = TrainingArguments(output_dir='./results')

# ==============================================================#
# ====================== GLOBAL FUNCTIONS ======================#

def compute_metric(eval_pred):
    """Compute ROUGE F1 scores (as percentages) and the mean prediction length.

    Relies on the module-level ``tokenizer`` and ``metric`` objects.

    Args:
        eval_pred: a ``(predictions, labels)`` pair as handed over by the trainer.
            ``predictions`` may be token ids, raw logits, or a tuple whose first
            element is one of those; ``labels`` are token ids padded with -100.

    Returns:
        dict mapping each ROUGE key to its F1 score * 100, plus ``"gen_len"``
        (mean number of non-pad tokens per prediction), all rounded to 4 places.
    """
    predictions, labels = eval_pred

    # Some models return several outputs per batch; the first entry is the one to score.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # A 3-D array is assumed to be raw logits (what a trainer passes when it does
    # not generate sequences); reduce it to the most likely token id per position.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 marks ignored positions and cannot be decoded, so swap it for the pad id
    # in both predictions and labels.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Extract ROUGE F1 scores. Depending on the metric implementation the values are
    # either aggregate objects exposing ``.mid.fmeasure`` or already plain floats.
    result = {
        key: (value.mid.fmeasure if hasattr(value, "mid") else value) * 100
        for key, value in result.items()
    }

    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

def get_training_args():
    """Build the ``TrainingArguments`` used by the fine-tuning run.

    Output goes to ``../results``; evaluation and checkpointing both happen every
    1000 steps, at most two checkpoints are kept, and training runs for a single
    epoch with a per-device batch size of 1.

    Returns:
        TrainingArguments: the configured arguments object.
    """
    # A Seq2SeqTrainingArguments variant was previously sketched here as a disabled
    # string: output in MODEL_DIR, eval/logging every 100 steps, save every 200,
    # learning_rate=4e-5, train/eval batch sizes from the module constants,
    # weight_decay=0.01, save_total_limit=3, one epoch, predict_with_generate=True,
    # load_best_model_at_end=True, metric_for_best_model="rouge1".
    settings = {
        "output_dir": '../results',
        "evaluation_strategy": 'steps',
        "eval_steps": 1000,
        "per_device_train_batch_size": 1,
        "per_device_eval_batch_size": 1,
        "num_train_epochs": 1,
        "save_steps": 1000,
        "save_total_limit": 2,
        "metric_for_best_model": "rouge1",
    }
    return TrainingArguments(**settings)

def get_trainer(tokenized_dataset):
    """Create the ``Trainer`` for the given tokenized dataset.

    The model, training arguments, data collator, tokenizer and metric function
    are taken from the module-level names ``model``, ``training_args``,
    ``data_collator``, ``tokenizer`` and ``compute_metric``.

    Args:
        tokenized_dataset: mapping with ``"train"`` and ``"validation"`` splits.

    Returns:
        Trainer: trainer wired to the train and validation splits.
    """
    # A Seq2SeqTrainer with the same wiring was previously sketched here as a
    # disabled string.
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metric,
    )
    return Trainer(**trainer_kwargs)

def load_data():
    """Load the configured dataset, optionally reduced to a small sample.

    With ``USE_LOCAL_DATASET`` set, the JSON file ``DATASET_PATH + DATASET`` is
    loaded. Otherwise the hub dataset named ``DATASET`` is loaded and, when
    ``DATA_SAMPLING`` is on, each of the test/train/validation splits is shuffled
    (seed 20) and cut down to the row count given in ``SAMPLING_SPLIT``.

    The author recorded the full hub dataset layout as:
        test:       features ['phase', 'question', 'table', 'sql'], 15878 rows
        validation: features ['phase', 'question', 'table', 'sql'],  8421 rows
        train:      features ['phase', 'question', 'table', 'sql'], 56355 rows

    Returns:
        The loaded (and possibly sampled) dataset dictionary.
    """
    if USE_LOCAL_DATASET:
        return load_dataset('json', data_files=DATASET_PATH + DATASET)

    hub_ds = load_dataset(DATASET)
    if not DATA_SAMPLING:
        return hub_ds

    # Sampling applies to the hub dataset only; it keeps test runs fast.
    # shuffle(seed=20) gives a random but reproducible selection.
    return DatasetDict({
        split: hub_ds[split].shuffle(seed=20).select(range(SAMPLING_SPLIT[split]))
        for split in ("test", "train", "validation")
    })

def check_interrupt():
    """Debug helper that stops the program after a few calls once armed.

    Does nothing while the module-level ``GlobalVar`` is 0 or negative. When it
    is positive, each call increments it, and a call that finds it above 5
    exits via ``sys.exit()`` instead.
    """
    global GlobalVar

    # Counter not armed: leave everything untouched.
    if GlobalVar <= 0:
        return

    if GlobalVar > 5:
        sys.exit()

    GlobalVar += 1

def preprocess_data(record):
    """Tokenize one batch of question/SQL pairs for seq2seq training.

    Args:
        record: batch mapping with a ``'question'`` list of strings and a
            ``'sql'`` list of dicts that each carry a ``'human_readable'`` entry.

    Returns:
        The tokenizer output for the prefixed questions, with an added
        ``"labels"`` entry holding the token ids of the target SQL strings.
    """
    # Prepend the task prefix to every question and pull out the readable SQL.
    prefixed_questions = [TASK_PREFIX + question for question in record['question']]
    target_sqls = [sql_entry['human_readable'] for sql_entry in record['sql']]

    print(prefixed_questions)
    print(target_sqls)

    encoded = tokenizer(prefixed_questions, max_length=max_input_length, truncation=True)

    # Targets go through the same tokenizer; only their token ids are kept.
    target_encoding = tokenizer(target_sqls, max_length=max_target_length, truncation=True)
    encoded["labels"] = target_encoding["input_ids"]

    return encoded

def tokenize_dataset(dataset_to_tokenize):
    """Return the tokenized dataset, reusing the on-disk copy when present.

    If ``ENCODED_DATASET_PATH`` exists, the dataset saved there is loaded.
    Otherwise ``dataset_to_tokenize`` is mapped through ``preprocess_data`` in
    batches and the result is saved to that path before being returned.
    """
    if os.path.exists(ENCODED_DATASET_PATH):
        print("Loading tokenized dataset from cache")
        return datasets.load_from_disk(ENCODED_DATASET_PATH)

    print("Tokenizing dataset ... it will take some time")
    encoded = dataset_to_tokenize.map(preprocess_data, batched=True)
    encoded.save_to_disk(ENCODED_DATASET_PATH)
    return encoded


# ==============================================================#
# ======================== MAIN PROGRAM ========================#

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

full_dataset = load_data()
print("-"*20, "Dataset Loaded ")

tokenized_dataset = tokenize_dataset(full_dataset)
print("-"*20, "Dataset Tokenized ")
print("TKNZD: ", tokenized_dataset)

sys.exit(1)

training_args = get_training_args()
print("-"*20, "Training Args Ready ")

metric = evaluate.load("rouge")
print("-"*20, "Rouge Metric Ready ")

data_collator = DataCollatorForSeq2Seq(tokenizer)

trainer = get_trainer(tokenized_dataset)
print("-"*20, "Trainer Ready ")

# Start tensorboard
%load_ext tensorboard
%tensorboard --logdir '{model_dir}'/runs

trainer.train()
model.save_pretrained()

In [None]:
# ================================ Utility Methods ===============================#
# ================================================================================#

from transformers import (AutoTokenizer, AutoModelWithLMHead, 
                          T5Tokenizer, T5ForConditionalGeneration, 
                          GPT2Tokenizer, GPT2LMHeadModel, 
                          OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
# Registry of supported checkpoints: maps a model name to the
# (model class, tokenizer class) pair that get_model() uses to load it.
MODEL_CLASSES = {
    "mrm8488/t5-base-finetuned-wikiSQL": (AutoModelWithLMHead, AutoTokenizer),
    "anusha/t5-base-finetuned-wikiSQL-sql-to-en_15i": (AutoModelWithLMHead, AutoTokenizer),
    "t5-small": (T5ForConditionalGeneration, T5Tokenizer),
    "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "gpt2": (GPT2LMHeadModel, GPT2Tokenizer),
}

def get_model(model_args):
    """Load the tokenizer and model registered for ``model_args["model_name"]``.

    Args:
        model_args: mapping that must contain ``"model_name"``, a key of
            ``MODEL_CLASSES``.

    Returns:
        tuple: ``(tokenizer, model)`` loaded via ``from_pretrained``.

    Raises:
        KeyError: if ``"model_name"`` is missing from ``model_args`` or the
            named model is not listed in ``MODEL_CLASSES``.
    """
    model_name = model_args["model_name"]

    # Look up the classes registered for this checkpoint.
    try:
        model_class, tokenizer_class = MODEL_CLASSES[model_name]
    except KeyError:
        # Name the offending model in the message (the placeholder used to be left unfilled).
        raise KeyError(
            f"the model {model_name} you specified is not supported. "
            "You are welcome to add it and open a PR :)"
        ) from None

    tokenizer = tokenizer_class.from_pretrained(model_name)
    # NOTE(review): the EOS id is used as the pad id for every model, including
    # ones whose tokenizer already defines a pad token -- confirm this is intended.
    model = model_class.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)

    return tokenizer, model


def get_encoded_input(tokenizer, model_args, input_text):
    """Tokenize ``input_text`` into PyTorch tensors for generation.

    Args:
        tokenizer: callable tokenizer applied to the text.
        model_args: mapping with ``"max_input_length"``; when that key is absent
            ``"max_target_length"`` is used instead (the previous behaviour).
        input_text: text (or list of texts) to encode.

    Returns:
        The tokenizer output; per the original author it carries the
        ``input_ids`` and ``attention_mask`` tensors.
    """
    # Truncate the *input* to the input limit rather than the (shorter) target limit.
    max_length = model_args.get("max_input_length")
    if max_length is None:
        max_length = model_args["max_target_length"]

    encoded_input = tokenizer(
        input_text,
        padding=True,
        max_length=max_length,
        truncation=True,
        # The caller's text carries its own special tokens (see get_input_text()).
        add_special_tokens=False,
        return_tensors='pt')
    return encoded_input


def get_encoded_output(model, model_args, encoded_input):
    """Run ``model.generate`` on an encoded input and return its result.

    Args:
        model: object exposing ``generate``.
        model_args: mapping with ``"temperature"`` and ``"max_target_length"``
            (the latter is passed as ``max_new_tokens``).
        encoded_input: mapping with ``'input_ids'`` and ``'attention_mask'``.

    Returns:
        Whatever ``model.generate`` returns.
    """
    # Background on decoding strategies: https://huggingface.co/blog/how-to-generate
    # Options tried earlier and currently disabled:
    #   beam search -- num_beams=5, no_repeat_ngram_size=2, early_stopping=True,
    #                  num_return_sequences=5 (num_return_sequences <= num_beams)
    #   top-k / top-p sampling -- do_sample=True, top_k, top_p, num_return_sequences=3
    generation_kwargs = {
        "input_ids": encoded_input['input_ids'],
        "attention_mask": encoded_input['attention_mask'],
        "temperature": model_args["temperature"],
        "max_new_tokens": model_args["max_target_length"],
    }
    return model.generate(**generation_kwargs)

def get_input_text():
    """Build, print and return the hard-coded demo prompt.

    The prompt has the form ``<pad>`` + task prefix + space + query + ``</s>``.

    Returns:
        str: the assembled prompt text.
    """
    task_prefix = "translate English to SQL: "

    # Other sample queries used while experimenting:
    #   "What was North Melbourne's score as the home team?"
    #   "which customers ordered in 1997 but did not order in 1998. Use cust_id in order table"
    #   "which customers in order table ordered in 1997 but did &* ^ # not order in 1998"
    #   "which customers (order table) ordered in 1997 (order_data column) but did not order in 1998"
    query = "Cars built after 2020 and manufactured in Italy"

    # Earlier prompt layouts: "translate English to SQL: %s </s>" and
    # "translate English to SQL: %s" (both formatted with the query).
    input_text = "<pad>{} {}</s>".format(task_prefix, query)

    print("INPUT: ", input_text)
    return input_text

In [None]:
# ====================== Using Pretrained or RetrainedModel ======================#
# ================================================================================#

# Candidate checkpoints; only the last assignment takes effect, so reorder
# these lines to switch models. Each name must be a key of MODEL_CLASSES.
model_name = "gpt2"
model_name = "openai-gpt"
model_name ="t5-small" # https://huggingface.co/t5-base
model_name = "mrm8488/t5-base-finetuned-wikiSQL"

# Settings shared by the helper functions below.
model_args = {
    "model_name": model_name,
    "temperature": 0.95,
    "max_input_length": 512,
    "max_target_length": 64,
}

# Data Loading & Preprocessing
# ---------------------------
input_text = get_input_text()


# Get Tokenizer & Model
# ---------------------------
tokenizer, model = get_model(model_args)


# Input Data Tokenization
# ---------------------------
encoded_input = get_encoded_input(tokenizer, model_args, input_text)


# Output Generation
# ---------------------------
output = get_encoded_output(model, model_args, encoded_input)


# Model Training
# ---------------------------
# Not performed in this cell; for training, labels can be provided to the tokenizer.


# Translation Output Prediction
# ---------------------------
# Decode the first generated sequence back to text, dropping special tokens.
result = tokenizer.decode(output[0], skip_special_tokens = True, clean_up_tokenization_spaces=True)
print("OUTPUT: ", result)
