# Importing the Libraries

In [4]:
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from peft import LoraConfig,get_peft_model
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from transformers import TrainerCallback


# Loading the model and tokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# Before Fine-Tuning result

In [7]:
input_text = "Write an email to my manager requesting a Performance Review Meeting"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids,max_length=512)
print(tokenizer.decode(outputs[0],skip_special_tokens=True))

I have a meeting scheduled for Friday, October 29th at 2:00 p.m. at the Houston Center for the Performing Arts. I will be able to attend. I will be able to provide you with a copy of the agenda and a copy of the presentation. I will also be able to provide you with a copy of the presentation. I will be able to provide you with a copy of the presentation. I will be able to provide you with a copy of the presentation. I will be able to provide you with a copy of the presentation. I will be able to provide you with a copy of the presentation. I will be able to provide you with a copy of the presentation. I will be able to provide you with a copy of the presentation. I will be able to provide you with a copy of the presentation. I will be able to provide you with a copy of the presentation. I will be able to provide you with a copy of the presentation. I will be able to provide you with a copy of the presentation. I will be able to provide you with a copy of the presentation. I will be abl

In [8]:
#We can clearly see that the model is repetitive and not able to udnerstand the instruction provided clearly

# Loading the Dataset

In [9]:
dataset_train = load_dataset('csv', data_files='emails.csv',split='train')
dataset_test=load_dataset('csv', data_files='emails_test.csv',split='train')

In [10]:
dataset_train

Dataset({
    features: ['Instruction', 'Response'],
    num_rows: 217
})

In [11]:
dataset_test

Dataset({
    features: ['Instruction', 'Response'],
    num_rows: 10
})

In [12]:
dataset_train= dataset_train.shuffle()

In [13]:
dataset_test= dataset_test.shuffle()

# Tokenizing the Dataset

In [14]:
def tokenize_function(email):
    instruction_tokenized = tokenizer(email['Instruction'], truncation=True, padding='max_length', max_length=512)
    response_tokenized = tokenizer(email['Response'], truncation=True, padding='max_length', max_length=512)

    tokenized_email = {
        'input_ids': instruction_tokenized['input_ids'],
        'attention_mask': instruction_tokenized['attention_mask'],
        'labels': response_tokenized['input_ids']
    }
    
    return tokenized_email


In [15]:
tokenized_dataset_train=dataset_train.map(tokenize_function,batched=True)
tokenized_dataset_test=dataset_test.map(tokenize_function,batched=True)

Map:   0%|          | 0/217 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

# Setting up the LoRA Configuration

In [16]:
peft_config=LoraConfig(task_type="CAUSAL_LM",
                       r=32,
                       lora_alpha=64,
                       lora_dropout=0.05,
                       bias='none'

)
                       

In [17]:
model=get_peft_model(model,peft_config)

In [18]:
model.print_trainable_parameters()

trainable params: 3,538,944 || all params: 251,116,800 || trainable%: 1.4093


# Setting up the Training Arguments

In [21]:
lr=1e-3
batch_size=1
num_pochs=5
training_args=TrainingArguments(
    output_dir="./chatbot",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_pochs,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=1,
)

    

# Training

In [22]:
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    )


trainer.train()
    
    

Epoch,Training Loss,Validation Loss
1,2.3062,0.272967
2,0.2372,0.228347
3,0.1845,0.199863
4,0.1626,0.188123
5,0.1474,0.183418




TrainOutput(global_step=1085, training_loss=0.6075741376744986, metrics={'train_runtime': 653.0356, 'train_samples_per_second': 1.661, 'train_steps_per_second': 1.661, 'total_flos': 754757452431360.0, 'train_loss': 0.6075741376744986, 'epoch': 5.0})

In [23]:
trainer.save_model("finetuned_chatbot")



In [24]:
fine_model = AutoModelForSeq2SeqLM.from_pretrained("finetuned_chatbot")

In [27]:
input_text = "Write an email to my for Performance Evaluation Discussion Request"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = fine_model.generate(input_ids,max_length=512)
print(tokenizer.decode(outputs[0],skip_special_tokens=True))


Subject: Performance Evaluation Discussion Request Dear [Manager's Name], I hope you are well. I am writing to request a discussion on my performance evaluation. I am currently working on my performance and I am looking forward to discussing my performance. Please let me know a suitable time for this discussion. Best regards, [Your Name]
