In [None]:
# ================================ Utility Methods ===============================#
# ================================================================================#

from transformers import (AutoTokenizer, AutoModelWithLMHead,
                          AutoModelForSeq2SeqLM,
                          T5Tokenizer, T5ForConditionalGeneration,
                          GPT2Tokenizer, GPT2LMHeadModel,
                          OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
# Registry of supported checkpoints: maps a model name to the
# (model class, tokenizer class) pair used to load it via ``from_pretrained``.
# Note the tuple order is (model, tokenizer).
#
# The fine-tuned T5 checkpoints are loaded through AutoModelForSeq2SeqLM
# rather than the deprecated AutoModelWithLMHead.
MODEL_CLASSES = {
    "mrm8488/t5-base-finetuned-wikiSQL": (AutoModelForSeq2SeqLM, AutoTokenizer),
    "anusha/t5-base-finetuned-wikiSQL-sql-to-en_15i": (AutoModelForSeq2SeqLM, AutoTokenizer),
    "t5-small": (T5ForConditionalGeneration, T5Tokenizer),
    "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    "gpt2": (GPT2LMHeadModel, GPT2Tokenizer),
}

def get_model(model_args):
    """Load the tokenizer and model registered for the requested checkpoint.

    Args:
        model_args: Mapping containing the key ``"model_name"``. The value is
            looked up in ``MODEL_CLASSES`` and is also passed to
            ``from_pretrained`` as the checkpoint identifier.

    Returns:
        A ``(tokenizer, model)`` tuple -- the tokenizer comes first.

    Raises:
        KeyError: If ``"model_name"`` is missing from ``model_args`` or the
            name is not a key of ``MODEL_CLASSES``.
    """
    model_name = model_args["model_name"]

    try:
        model_class, tokenizer_class = MODEL_CLASSES[model_name]
    except KeyError:
        # Include the offending name in the message; the original left the
        # "{}" placeholder unformatted.
        raise KeyError(
            "the model {} you specified is not supported. "
            "You are welcome to add it and open a PR :)".format(model_name)
        ) from None

    tokenizer = tokenizer_class.from_pretrained(model_name)
    # The tokenizer's EOS token id is handed to the model as its pad token id.
    model = model_class.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)

    return tokenizer, model


def get_encoded_input(tokenizer, model_args, input_text):
    """Tokenize ``input_text`` by calling ``tokenizer`` with fixed options.

    Args:
        tokenizer: Callable invoked as ``tokenizer(input_text, **options)``.
        model_args: Mapping of settings. ``"max_input_length"`` bounds the
            tokenized input; when that key is absent, ``"max_target_length"``
            is used instead (the previous behavior).
        input_text: Text (or whatever the tokenizer accepts) to encode.

    Returns:
        Whatever ``tokenizer`` returns. The pipeline in this file reads the
        ``"input_ids"`` and ``"attention_mask"`` entries from it.

    Raises:
        KeyError: If neither ``"max_input_length"`` nor
            ``"max_target_length"`` is present in ``model_args``.
    """
    # Truncate the *input* with the input-length limit; the target-length
    # limit only applies to generation and is kept as a fallback for callers
    # that do not configure "max_input_length".
    if "max_input_length" in model_args:
        max_length = model_args["max_input_length"]
    else:
        max_length = model_args["max_target_length"]

    encoded_input = tokenizer(
        input_text,
        padding=True,
        max_length=max_length,
        truncation=True,
        # Special tokens are not added by the tokenizer; the prompt built
        # elsewhere in this file already embeds "<pad>" / "</s>" markers.
        add_special_tokens=False,
        return_tensors='pt',
    )
    return encoded_input


def get_encoded_output(model, model_args, encoded_input):
    """Run ``model.generate`` on an encoded input and return its result.

    Args:
        model: Object exposing a ``generate(**kwargs)`` method.
        model_args: Mapping providing ``"temperature"`` and
            ``"max_target_length"`` (used as ``max_new_tokens``).
        encoded_input: Mapping providing ``"input_ids"`` and
            ``"attention_mask"``.

    Returns:
        Whatever ``model.generate`` returns.

    Background on decoding strategies:
    https://huggingface.co/blog/how-to-generate
    """
    generation_kwargs = {
        "input_ids": encoded_input['input_ids'],
        "attention_mask": encoded_input['attention_mask'],
        # NOTE(review): temperature presumably only matters when sampling is
        # enabled (do_sample=True), which is not set here -- confirm.
        "temperature": model_args["temperature"],
        "max_new_tokens": model_args["max_target_length"],
    }

    # Alternative decoding setups that were tried and are currently disabled:
    #   beam search : num_beams=5, no_repeat_ngram_size=2, early_stopping=True,
    #                 num_return_sequences=5  (num_return_sequences <= num_beams)
    #   top-k/top-p : do_sample=True, top_k=top_k, top_p=top_p,
    #                 num_return_sequences=3
    return model.generate(**generation_kwargs)

def get_input_text(query=None, task_prefix="translate English to SQL: "):
    """Build, print and return the prompt string fed to the tokenizer.

    The prompt has the form ``"<pad>{task_prefix} {query}</s>"`` with exactly
    one space between the prefix and the query.

    Args:
        query: Natural-language question to translate. Defaults to the sample
            query used by this notebook when omitted.
        task_prefix: Instruction placed before the query. Trailing whitespace
            is stripped so the separator is always a single space.

    Returns:
        The assembled prompt string (also printed with an ``INPUT:`` label).
    """
    if query is None:
        # Other sample queries that were tried:
        #   "What was North Melbourne's score as the home team?"
        #   "which customers ordered in 1997 but did not order in 1998. Use cust_id in order table"
        #   "which customers in order table ordered in 1997 but did &* ^ # not order in 1998"
        #   "which customers (order table) ordered in 1997 (order_data column) but did not order in 1998"
        query = "Cars built after 2020 and manufactured in Italy"

    # Strip the prefix's trailing space so the f-string's own separator does
    # not produce a double space.
    input_text = f"<pad>{task_prefix.rstrip()} {query}</s>"

    print("INPUT: ", input_text)
    return input_text

In [None]:
# ===================== Using Pretrained or Retrained Model ======================#
# ================================================================================#

# Checkpoint to run. Other registered options: "gpt2", "openai-gpt",
# "t5-small" (https://huggingface.co/t5-base).
model_name = "mrm8488/t5-base-finetuned-wikiSQL"

# Settings shared by the tokenization and generation helpers.
model_args = dict(
    model_name=model_name,
    temperature=0.95,
    max_input_length=512,
    max_target_length=64,
)

# Step 1: build the prompt text
# ---------------------------
input_text = get_input_text()


# Step 2: load the tokenizer and model for the configured checkpoint
# ---------------------------
tokenizer, model = get_model(model_args)


# Step 3: tokenize the prompt
# ---------------------------
encoded_input = get_encoded_input(tokenizer, model_args, input_text)


# Step 4: generate output token ids from the encoded prompt
# ---------------------------
output = get_encoded_output(model, model_args, encoded_input)


# (No training happens in this cell. For training, labels could be
#  tokenized and supplied as well.)


# Step 5: decode the first generated sequence and show the translation
# ---------------------------
result = tokenizer.decode(
    output[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
print("OUTPUT: ", result)
