### Fine-tune a code-generation model to transform `input_dict` into the `template_dict` format using Flan-T5-small and LoRA

The approach is to first build a training dataset in which each example pairs an input dict and a template dict with the correct Python code that transforms the input into the template format. We then use that dataset to fine-tune an LLM.

<a href="https://colab.research.google.com/github/gvenkat21/projects/blob/main/review_feedback_nudge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os

### Install Dependencies

In [None]:
# Quietly (-q) install bitsandbytes, datasets and accelerate.
!pip install -q bitsandbytes datasets accelerate
# Install transformers and peft directly from the `main` branch of their GitHub repositories.
# NOTE(review): `main` is unpinned, so the installed versions can differ from run to run.
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git@main

In [None]:
import torch
# Report whether PyTorch can see a CUDA device (the value is shown as the cell output).
torch.cuda.is_available()

### Getting the data ready

In [None]:
# Load the training and validation splits from CSV files in the working directory.
train_df, valid_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'valid.csv'))

In [None]:
from datasets import Dataset

# Wrap both pandas DataFrames as Hugging Face `Dataset` objects.
train_dataset, valid_dataset = map(Dataset.from_pandas, (train_df, valid_df))

In [None]:
# Collect the two splits in a plain dict keyed by split name.
dataset = {'train': train_dataset, 'validation': valid_dataset}

In [None]:
# data preprocessing
text_column = "prompt"  # dataset column tokenized as the model input
label_column = "python_code"  # dataset column tokenized as the target labels

# `preprocess_function` below reads a module-level `tokenizer`, but the notebook
# only creates one later, in the "Train the Model" cell. Running the cells
# top-to-bottom therefore failed with a NameError during `.map(...)`.
# Load the tokenizer here, before its first use.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

def preprocess_function(examples):
    """Tokenize a batch of prompt/code pairs for seq2seq training.

    Reads the ``text_column`` and ``label_column`` entries of ``examples`` and
    tokenizes both with the module-level ``tokenizer`` (padding set to
    ``"max_length"``, truncation enabled, PyTorch tensors returned).

    Returns the tokenized prompts with an extra ``"labels"`` entry holding the
    target token ids, where every padding token id has been replaced by -100.
    """
    prompts = examples[text_column]
    code_targets = examples[label_column]
    encode_kwargs = {"padding": "max_length", "truncation": True, "return_tensors": "pt"}

    encoded = tokenizer(prompts, **encode_kwargs)
    label_ids = tokenizer(code_targets, **encode_kwargs)["input_ids"]
    # Overwrite padding positions in place.
    # NOTE(review): -100 is presumably chosen so the loss skips padded positions — confirm.
    label_ids[label_ids == tokenizer.pad_token_id] = -100
    encoded["labels"] = label_ids
    return encoded


# Tokenize the training split in batches, dropping the original columns so that
# only the tokenizer outputs remain; the on-disk cache is bypassed.
train_processed_datasets = dataset["train"].map(
    function=preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    num_proc=1,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

# Tokenize the validation split in batches, bypassing the on-disk cache.
valid_processed_datasets = dataset['validation'].map(
    preprocess_function,
    batched=True,
    num_proc=1,
    # Drop the validation split's own columns. The original passed the *train*
    # split's column names here, which is only correct when both CSVs happen to
    # have identical columns.
    remove_columns=dataset["validation"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)
# Short aliases for the tokenized splits; these are the names handed to the Trainer.
train_dataset, eval_dataset = train_processed_datasets, valid_processed_datasets

### Train the Model

In [None]:
# Select CUDA device index
import os
import torch

# NOTE(review): CUDA_VISIBLE_DEVICES normally has to be set before CUDA is
# initialised; torch was already imported and queried in an earlier cell, so
# confirm that this assignment still takes effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig

model_name = "google/flan-t5-small"

# Request 8-bit weights through an explicit quantization config. Passing
# `load_in_8bit=True` straight to `from_pretrained` is deprecated in
# transformers in favour of `quantization_config`.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

try:
    # Current peft releases expose the generic k-bit helper.
    from peft import prepare_model_for_kbit_training as prepare_quantized_model
except ImportError:
    # Older peft releases only ship the int8-specific helper.
    from peft import prepare_model_for_int8_training as prepare_quantized_model

# Prepare the quantized base model for training.
model = prepare_quantized_model(model)

# Attach LoRA adapters. The notebook title promises LoRA, but the original cell
# never added any adapter, so the quantized model had no trainable parameters.
# Hyper-parameters follow the PEFT Flan-T5 int8 example; "q" and "v" are the
# query/value projection modules of the T5 attention layers.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q", "v"],
    bias="none",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# List every parameter of the model together with its dtype.
for param_name, tensor in model.named_parameters():
    print(f"Parameter: {param_name}, Type: {tensor.dtype}")

In [None]:
import inspect

from transformers import TrainingArguments, Trainer

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Since the notebook installs transformers from `main`, pick whichever keyword
# the installed version actually accepts instead of hard-coding the old name.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    "temp",  # output directory
    learning_rate=1e-3,
    gradient_accumulation_steps=1,
    auto_find_batch_size=True,
    num_train_epochs=1,
    save_steps=100,
    save_total_limit=8,
    **{eval_strategy_key: "epoch"},  # evaluate once per epoch
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Switch the model to evaluation mode before generating predictions.
model.eval()

In [None]:
# Example prompt: an `input_dict`, the desired `template_dict`, and a trailing
# "->" after which the model is expected to produce the transformation code.
# (The opening quote of 'PolicyDate' was a typographic ‘ in the original, which
# made the dict text malformed; it is now a plain ASCII quote.)
input_text = """input_dict = {'PolicyDate': ['2023-05-09', '2023-05-02', '2023-05-06'],
 'Name': ['Jackson', 'Smith', 'Martinez'],
 'PlanType': [' Frank', ' Jane', ' Carol'],
 'Policy_ID': ['BronzePackage', 'SilverPackage', 'BronzePackage'],
 'PremiumAmount': ['QR17171', 'CD67890', 'KL14141'],
 'Hobby': ['Writing', 'Reading', 'Swimming'],
 'MaritalStatus': ['Divorced', 'Single', 'Divorced'],
 'StartDate': ['2023-05-09', '2023-05-02', '2023-05-06'],
 'Employee_Name': ['Frank Jackson', 'Jane Smith', 'Carol Martinez'],
 'Plan_Name': ['Bronze', 'Silver', 'Bronze'],
 'PolicyID': ['QR17171', 'CD67890', 'KL14141'],
 'Cost': [50, 100, 50]}
 template_dict = {'Date': ['09-05-2023', '02-05-2023', '06-05-2023'],
 'EmployeeName': ['Frank Jackson', 'Jane Smith', 'Carol Martinez'],
 'Plan': ['Bronze', 'Silver', 'Bronze'],
 'PolicyNumber': ['QR17171', 'CD67890', 'KL14141'],
 'Premium': [50, 100, 50]}
->"""
# The tokenizer returns CPU tensors; move them to wherever the model lives so
# `generate` does not fail with a device mismatch when the model is on the GPU.
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=2048)

print("input: ", input_text)
# Bring the generated ids back to the CPU and decode them, dropping special tokens.
print(" output prediction: ", tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
notebook_login()

In [None]:
# Upload the model to the Hugging Face Hub using the stored login token.
# NOTE(review): "path/to/HF/" looks like a placeholder — replace it with a real repo id before running.
model.push_to_hub("path/to/HF/", use_auth_token=True)