In [None]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import evaluate 
import torch 
import numpy as np 


In [None]:
torch.cuda.is_available()

In [None]:
#loading base model 
model_id = 'microsoft/DialoGPT-medium' 
device = 'mps' #should probably be changed to mps 
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side = 'right') #required by dialogpt to be right side
if tokenizer.pad_token is None: 
    tokenizer.pad_token = tokenizer.eos_token #perhaps use other token
model = AutoModelForCausalLM.from_pretrained(model_id)
model = model.to(device) 

In [None]:
#load dataset 

dataset = load_dataset('/Users/christoffer/Desktop/Aubay/Mini projects/chatbot_project/chatbot/') 
print(dataset) 

In [None]:
for i in range(1): 
    print(dataset['train'][i])

In [None]:
dict_list = dataset["train"][0]['questions'] 
print(dict_list)

In [None]:
for i in range(5): 
    print(dict_list[i])

In [None]:
len_dict_list = len(dict_list)
question_list = []
answer_list = []
for i in range(len_dict_list): 
    question, answer = dict_list[i]['question'], dict_list[i]['answer']
    #print(f"Question: {question} Answer: {answer}") 
    question_list.append(question)
    answer_list.append(answer)  

In [None]:
#preprocessing dataset 
import pandas as pd 

data = pd.DataFrame({
    'question' : question_list, 
    'answer' : answer_list
}) 

data.rename(columns={"question":"context", "answer":"response"}, inplace=True) 
data['input'] = data["context"] + "" + data["response"]

print(data.head(3))
print("Number of columns:", len(data.columns))
print("Column names:", data.columns.tolist())
print("Number of rows:", len(data))

In [None]:
#Convert the dataset back to a huggingface Dataset object
hf_dataset = Dataset.from_pandas(data)
print(hf_dataset)

In [None]:
#chat with the model 

def chat(inp=None):
    model_cpu = model.to("mps")  
    if inp is None:
        inp = []  # Use an empty list if no predefined inputs are provided
    
    for step in range(2):  # Limit the conversation to 5 exchanges
        if not inp:
            # If no predefined input, take user input
            new_user_input_ids = tokenizer.encode(input(">> User: ") + tokenizer.eos_token, return_tensors='pt')
        else:
            # Use predefined input for the current step
            print(">> User: ", inp[step])
            new_user_input_ids = tokenizer.encode(inp[step] + tokenizer.eos_token, return_tensors='pt')
        
        # Concatenate new user input with chat history (if exists)
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids
        
        # Generate a response from the model
        chat_history_ids = model_cpu.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
        
        # Decode and print the model's response
        print(">> DialogGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

# Example usage
chat(["Hello, how are you?", "What's your name?"])


In [None]:
#creating the training pipeline 

def encode(examples): 
    encoded = tokenizer(examples['input'], truncation=True, padding='max_length', max_length=40, return_tensors='pt') 
    encoded['labels'] = encoded['input_ids'][:] #supervised learning 
    return encoded 
    

In [None]:
#split into training and validation set 
split_datasets = hf_dataset.train_test_split(test_size = 0.15) 
encoded_data = split_datasets.map(encode, batched=True) 

#train_dataset = split_datasets['train'] 
#validation_dataset = split_datasets['test']  
#
#print("Training Dataset:")
#print(train_dataset)
#print("\nValidation Dataset:")
#print(validation_dataset) 

In [None]:
from transformers import TrainingArguments, Trainer 

In [None]:
training_args = TrainingArguments(
    output_dir="fine_tuned_dialogpt", 
    num_train_epochs=5, 
    per_device_train_batch_size=4, 
    per_device_eval_batch_size=2, 
    weight_decay=0.01, 
    learning_rate=2e-5, 
)

In [None]:
training_args.device

In [None]:
trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=encoded_data["train"], 
    eval_dataset=encoded_data["test"]
)

In [None]:
#performance measure before fine-tuning 
pre_eval_results = trainer.evaluate(encoded_data["test"])
pre_eval_predictions = trainer.predict(encoded_data["test"].select(range(10)))

In [None]:
print(pre_eval_results)

In [None]:
def generate_response(predictions): 
    responses = [] 
    for idx, pred in enumerate(predictions): 
        response = tokenizer.decode(np.argmax(pred, axis=-1), skip_special_tokens=True) 
        response 
        responses.append(response) 

    return responses

In [None]:
trainer.train()

In [None]:
# Save the fine-tuned model and tokenizer
model.save_pretrained('fine_tuned_dialogpt')
tokenizer.save_pretrained('fine_tuned_dialogpt') 

In [None]:
post_eval_results = trainer.evaluate(encoded_data["test"]) 
post_eval_predictions = trainer.predict(encoded_data["test"].select(range(10)))

In [None]:
post_eval_results

In [None]:
res = generate_response(pre_eval_predictions.predictions) 
res 

In [None]:
res = generate_response(post_eval_predictions.predictions) 
res

In [92]:
#Comparing responses from base model and fine-tuned model  

def chat(model, tokenizer, num_turns=4):
    model = model.to("cpu")  # Move model to CPU for demonstration
    chat_history_ids = None

    print("Start chatting (type 'quit' to stop):")
    for step in range(num_turns):
        user_input = input(">> User: ")
        if user_input.lower() == 'quit':
            break

        new_user_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt')
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if chat_history_ids is not None else new_user_input_ids

        chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
        response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
        print(">> DialogGPT: {}".format(response))

In [None]:
# Load the pre-trained model
pretrained_model = AutoModelForCausalLM.from_pretrained(model_id)
pretrained_model = pretrained_model.to(device)

# Load the fine-tuned model
fine_tuned_model = AutoModelForCausalLM.from_pretrained("fine_tuned_dialogpt")
fine_tuned_model = fine_tuned_model.to(device)

In [93]:
print("Chatting with base model:")
chat(pretrained_model, tokenizer)

Chatting with base model:
Start chatting (type 'quit' to stop):


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


>> DialogGPT: I'm good, how are you?


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


>> DialogGPT: I'm good at customer service, but I'm not good at customer service.


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


>> DialogGPT: I don't know, I'm not a customer service person.


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


>> DialogGPT: I don't know, I'm not a customer service person.


In [94]:
print("\nChatting with Fine-tuned Model:")
chat(fine_tuned_model, tokenizer)



Chatting with Fine-tuned Model:
Start chatting (type 'quit' to stop):


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


>> DialogGPT: I'm good, how are you?


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


>> DialogGPT: I'm good at customer service, yes.


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


>> DialogGPT: You can return your product by contacting our customer support team via our website. We will assist you with the return process once you have contacted our customer support team via our website.


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


>> DialogGPT: We offer a return policy of 1 year for all products purchased through our website. Please contact our customer support team with your questions or concerns.
