In [1]:
import numpy as np # to get the predictions
from datasets import load_dataset
from transformers import TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
empathetic_dialogues_ds = load_dataset("Estwld/empathetic_dialogues_llm")
print(empathetic_dialogues_ds)

def extract_conversation_pairs(example):
    pairs = []
    for i in range(len(example['conversations']) - 1):
        if example['conversations'][i]['role'] == 'user' and example['conversations'][i + 1]['role'] == 'assistant':
            user_input = example['conversations'][i]['content']
            assistant_reponse = example['conversations'][i + 1]['content']
            pairs.append((user_input, assistant_reponse))
    return pairs

def preprocess_dataset(dataset):
    all_pairs = []
    for example in dataset:
        all_pairs.extend(extract_conversation_pairs(example))
    return all_pairs

train_pairs = preprocess_dataset(empathetic_dialogues_ds['train'])
eval_pairs = preprocess_dataset(empathetic_dialogues_ds['valid'])
print(train_pairs[0])
print(eval_pairs[0])

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'situation', 'emotion', 'conversations'],
        num_rows: 19533
    })
    valid: Dataset({
        features: ['conv_id', 'situation', 'emotion', 'conversations'],
        num_rows: 2770
    })
    test: Dataset({
        features: ['conv_id', 'situation', 'emotion', 'conversations'],
        num_rows: 2547
    })
})
('I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people, we felt like the only people in the world.', 'Was this a friend you were in love with, or just a best friend?')
('Today,as i was leaving for work in the morning,i had a tire burst in the middle of a busy road. That scared the hell out of me!', 'Are you fine now?')


In [3]:
# load the tokenizer and model
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [4]:
import torch

tokenizer.pad_token = tokenizer.eos_token

# tokenize the dataset
def tokenize_pairs(pairs):
    inputs = []
    labels = []

    for user_input, assistant_response in pairs:
        encoded = tokenizer(
            user_input,
            assistant_response,
            truncation=True,
            padding='max_length',
            max_length=512,
            return_tensors='pt'
        )
        inputs.append(encoded['input_ids'])
        labels.append(encoded['attention_mask'])
    return {'input_ids': torch.cat(inputs), 'attention_mask': torch.cat(labels)}

train_data = tokenize_pairs(train_pairs)
eval_data = tokenize_pairs(eval_pairs)

In [5]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

# ensure the correct GPU is being used (if multiple are available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = torch.tensor([1.0, 2.0, 3.0]).to(device)
print(x.device) # should print 'cuda:0'

True
1
NVIDIA GeForce RTX 4090
cuda:0


In [6]:
from torch.utils.data import Dataset

# define the custom Dataset class for processing dialog data
class DialogDataset(Dataset):
    def __init__(self, encodings):
        """
        initializes the DialogDataset with the encoded text data.

        :param encodings: a dictionary containing the encoded input data.
                           it should contain keys like 'input_ids' and 'attention_mask'
                           after tokenizing the raw text.
        """
        # store the encodings (tokenized inputs) passed to the class
        self.encodings = encodings

    def __len__(self):
        """ 
        returns the length of the dataset.

        the length is determined by the number of input sequences in the 'input_ids'
        of the encodings dictionary. all items (input sequences) should have the same length.

        :return: the number of samples in the dataset.
        """
        # return the number of samples, which is the same as the length of the 'input_ids' list
        return len(self.encodings['input_ids'])
        
    def __getitem__(self, idx):
        """
        retrieves a sample from the dataset at the specified index.

        the method returns the tokenized input for a given index (sample)
        in the dataset. the sample consists of input IDs and other related features
        (e.g., attention mask_mask) wrapped as a tensor.

        :param idx: the index of the sample to retrieve.
        :return: a dictionary where each key is a feature (e.g., 'input_ids', 'attention_mask')
        """
        # create and return a dictionary with each encoding key and its corresponding value
        # at the specified index. the values are converted into PyTorch tensors.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = item['input_ids'].clone() # labels should be the same as input_ids
        return item
        
train_dataset = DialogDataset(train_data)
eval_dataset = DialogDataset(eval_data)

# define training arguments
training_args = TrainingArguments(
    output_dir="./results",             # directory to save model checkpoints
    num_train_epochs=3,                 # adjust based on convergence
    per_device_train_batch_size=8,      # increase to 16 if vram allows, or decrease to 8
    per_device_eval_batch_size=8, 
    gradient_accumulation_steps=2,      # helps if batch size is small
    evaluation_strategy="epoch",        # evaluate at the end of each epoch
    save_strategy="epoch",              # save model checkpoints each epoch
    logging_dir="./logs",               # logging directory
    logging_steps=100,                  # adjust based on dataset size
    learning_rate=5e-5,                 # starndard for transformer fine-tuning
    warmup_steps=500,                   # helps stabilize training
    weight_decay=0.01,                  # regularization
    bf16=True,                          # enable mixed precision for speedup
    save_total_limit=3,                 # keep last 3 checkpoints
    eval_accumulation_steps=4,          # accumuluate loss over multiple steps
)

# create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [7]:
# evaluate before fine-tuning
pre_eval_results = trainer.evaluate(eval_dataset)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


In [8]:
torch.cuda.empty_cache()

In [9]:
print(pre_eval_results)

{'eval_loss': 7.78450345993042, 'eval_model_preparation_time': 0.003, 'eval_runtime': 61.6063, 'eval_samples_per_second': 93.124, 'eval_steps_per_second': 11.655}


In [17]:
# fine-tune the model :)
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Model Preparation Time
1,0.1868,0.20234,0.0042
2,0.1803,0.199421,0.0042
3,0.1625,0.200169,0.0042


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=7548, training_loss=0.1953196435866687, metrics={'train_runtime': 3934.4505, 'train_samples_per_second': 30.693, 'train_steps_per_second': 1.918, 'total_flos': 1.1214896717050675e+17, 'train_loss': 0.1953196435866687, 'epoch': 3.0})

In [18]:
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

('./trained_model\\tokenizer_config.json',
 './trained_model\\special_tokens_map.json',
 './trained_model\\vocab.json',
 './trained_model\\merges.txt',
 './trained_model\\added_tokens.json',
 './trained_model\\tokenizer.json')

In [10]:
metrics = trainer.evaluate()
print(metrics)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 7.78450345993042, 'eval_model_preparation_time': 0.003, 'eval_runtime': 63.0127, 'eval_samples_per_second': 91.045, 'eval_steps_per_second': 11.395}


In [11]:
from transformers import pipeline

# for generating predictions
chatbot = pipeline("text-generation", model="./trained_model", truncation=True, tokenizer=tokenizer)
response = chatbot("I'm feeling overwhelmed with everything going on in my life. Can you help me figure out how to deal with it? ", max_length=100)
print(response)

Device set to use cuda:0


[{'generated_text': "I'm feeling overwhelmed with everything going on in my life. Can you help me figure out how to deal with it? I can try. What are you having trouble with?"}]


In [12]:
tokenizer = AutoTokenizer.from_pretrained("./trained_model")
model = AutoModelForCausalLM.from_pretrained("./trained_model")

# let's chat for 5 lines
for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens,
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # pretty print last output tokens from bot
    print("EmpathAI: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


EmpathAI: I am so happy that my son is starting school.That is great news. I hope he does well.
EmpathAI: Me too. He is going to be a great student.
EmpathAI: I hope so too. I hope he does well.
EmpathAI: I hope so too. He is going to be a great student.
EmpathAI: I hope so too. I hope he does well.


### Human-Like-DPO-Dataset Fine-tuning

In [43]:
# load the previously trained model
model_name = "./trained_model"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [44]:
import pandas as pd

# load the new dataset
human_like_dpo_ds = load_dataset("HumanLLMs/Human-Like-DPO-Dataset")
# convert the train split to a pandas DataFrame for easier manipulation
train_df = pd.DataFrame(human_like_dpo_ds['train'])
print(train_df.head())

                                              prompt  \
0   Oh, I just saw the best meme - have you seen it?   
1                   Do you have a go-to karaoke jam?   
2  **Crafty corner** Are you good at any DIY proj...   
3  What's your favorite type of cuisine to cook o...   
4              Do you have a secret talent or skill?   

                                              chosen  \
0  ðŸ˜‚ Ah, no I haven't! I'm dying to know, what's ...   
1  Oh, totally! ðŸ˜„ I'm a sucker for a good ol' roc...   
2  ðŸ˜Š I'm actually a big fan of DIY projects! I'm ...   
3  Oh, man! I'm a total sucker for Italian food! ...   
4  You know, I've always been fascinated by music...   

                                            rejected  
0  I'm an artificial intelligence language model,...  
1  As a professional AI language model, I don't h...  
2  Good day. As a continuously evolving artificia...  
3  In accordance with my programming, I must emph...  
4  Good day. As a professional AI language

In [56]:
# data preprocessing
# train_df["input_text"] = train_df["prompt"] + " " + train_df["chosen"]
# train_df["target_text"] = train_df["chosen"]
# train_df.drop(train_df.columns[2], axis=1, inplace=True) # remove the "rejected" column

# print(train_df[["input_text", "target_text"]].head())

def preprocess_new_dataset(dataset):
    all_pairs = [] # initializes an empty list that will store all conversation pairs

    for example in dataset:
        all_pairs.extend(example)

    return all_pairs

new_train_data = preprocess_new_dataset(train_df)
print(new_train_data[1])

r


In [49]:
from datasets import Dataset

# convert the pandas DataFrame back to a Hugging Face dataset
human_like_train_data = Dataset.from_pandas(train_df)
print(human_like_train_data[0])

{'prompt': 'Oh, I just saw the best meme - have you seen it?', 'chosen': "ðŸ˜‚ Ah, no I haven't! I'm dying to know, what's the meme about? Is it a funny cat or a ridiculous situation? Spill the beans! ðŸ¤£", 'input_text': "Oh, I just saw the best meme - have you seen it? ðŸ˜‚ Ah, no I haven't! I'm dying to know, what's the meme about? Is it a funny cat or a ridiculous situation? Spill the beans! ðŸ¤£", 'target_text': "ðŸ˜‚ Ah, no I haven't! I'm dying to know, what's the meme about? Is it a funny cat or a ridiculous situation? Spill the beans! ðŸ¤£"}


In [None]:
tokenizer.pad_token = tokenizer.eos_token

# tokenize the dataset
def tokenize_pairs(pairs):
    inputs = []
    labels = []

    for user_input, assistant_response in pairs:
        encoded = tokenizer(
            user_input,
            assistant_response,
            truncation=True,
            padding='max_length',
            max_length=512,
            return_tensors='pt'
        )
        inputs.append(encoded['input_ids'])
        labels.append(encoded['attention_mask'])
    return {'input_ids': torch.cat(inputs), 'attention_mask': torch.cat(labels)}

train_data = tokenize_pairs(train_pairs)
eval_data = tokenize_pairs(eval_pairs)