In [1]:
import pandas as pd
import re
from tqdm.notebook import tqdm
import tensorflow as tf

### **Read the Reddit dataset**

In [2]:
# Download reddit_conversation.csv from Google Drive into the working directory.
# NOTE(review): newer gdown releases flag `--id` as deprecated in favour of
# passing the file id positionally — confirm against the installed version.
!gdown --id 1OrtWVYzMEcCauJgP06kbaFuqRAL4UVEd

Downloading...
From: https://drive.google.com/uc?id=1OrtWVYzMEcCauJgP06kbaFuqRAL4UVEd
To: /content/reddit_conversation.csv
100% 7.96M/7.96M [00:00<00:00, 32.9MB/s]


In [3]:
# Load the downloaded Reddit conversation CSV into a DataFrame.
df = pd.read_csv('reddit_conversation.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2
0,0,What kind of phone(s) do you guys have?,I have a pixel. It's pretty great. Much better...,Does it really charge all the way in 15 min?
1,1,I have a pixel. It's pretty great. Much better...,Does it really charge all the way in 15 min?,"Pretty fast. I've never timed it, but it's und..."
2,2,Does it really charge all the way in 15 min?,"Pretty fast. I've never timed it, but it's und...","cool. I've been thinking of getting one, my ph..."
3,3,What kind of phone(s) do you guys have?,Samsung Galaxy J1. It's my first cell phone an...,What do you think of it? Anything you don't like?
4,4,Samsung Galaxy J1. It's my first cell phone an...,What do you think of it? Anything you don't like?,I love it. I can't think of anything I don't l...


In [5]:
# Punctuation stripped from every utterance. The hyphen is escaped on purpose:
# the previous pattern contained ';-_', which the regex engine reads as the
# character *range* from ';' to '_'. That silently removed '=', '[', ']' and
# the backslash while leaving '-' itself untouched.
PUNCTUATION_RE = re.compile(r'[/:;\-_+@&!?$()<>.,#%^*"]')


def clean_text(text):
  """Lower-case `text` and delete every character matched by PUNCTUATION_RE."""
  return PUNCTUATION_RE.sub("", text.lower())


Questions = list()
Answers = list()
# Column '0' is used as the question and column '1' as the answer.
# Iterating the two columns together avoids a chained df[col][i] lookup per row.
for Q, A in tqdm(zip(df['0'], df['1']), total=len(df)):
  Questions.append(clean_text(Q))
  # startseq / endseq wrap the cleaned answer text.
  Answers.append("startseq " + clean_text(A) + " endseq")

  0%|          | 0/56297 [00:00<?, ?it/s]

In [6]:
len(Questions)

56297

In [7]:
len(Answers)

56297

In [8]:
Questions[:10]

['what kind of phones do you guys have',
 "i have a pixel it's pretty great much better than what i had before ",
 'does it really charge all the way in 15 min',
 'what kind of phones do you guys have',
 "samsung galaxy j1 it's my first cell phone and i've had it for 7 months",
 "what do you think of it anything you don't like",
 'what kind of phones do you guys have',
 "lg optimus v i know it's old",
 'my friend told me to kill myself ',
 "don't kill yourself op"]

In [9]:
Answers[:10]

["startseq i have a pixel it's pretty great much better than what i had before  endseq",
 'startseq does it really charge all the way in 15 min endseq',
 "startseq pretty fast i've never timed it but it's under half an hour  endseq",
 "startseq samsung galaxy j1 it's my first cell phone and i've had it for 7 months endseq",
 "startseq what do you think of it anything you don't like endseq",
 "startseq i love it i can't think of anything i don't like about it endseq",
 "startseq lg optimus v i know it's old endseq",
 "startseq if it does it's job it's good enough endseq",
 "startseq don't kill yourself op endseq",
 "startseq i won't give them the satisfaction  endseq"]

In [10]:
# Pair each cleaned question with its startseq/endseq-wrapped answer, one row per pair.
data = pd.DataFrame({'question': Questions, 'answer': Answers})

In [11]:
data.shape

(56297, 2)

In [12]:
data.head()

Unnamed: 0,question,answer
0,what kind of phones do you guys have,startseq i have a pixel it's pretty great much...
1,i have a pixel it's pretty great much better t...,startseq does it really charge all the way in ...
2,does it really charge all the way in 15 min,startseq pretty fast i've never timed it but i...
3,what kind of phones do you guys have,startseq samsung galaxy j1 it's my first cell ...
4,samsung galaxy j1 it's my first cell phone and...,startseq what do you think of it anything you ...


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 30% of the pairs for evaluation; random_state is fixed so the split is repeatable.
train, test = train_test_split(data, test_size=0.3, random_state=42)

In [15]:
print(train.shape)
print(test.shape)

(39407, 2)
(16890, 2)


**Convert to a Hugging Face Dataset**

In [16]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.7,>=0.3.0
  D

In [17]:
from datasets import Dataset, DatasetDict

In [18]:
# Build the Hugging Face datasets from the pandas splits with the dedicated
# constructor instead of passing a DataFrame to from_dict.
# preserve_index=False keeps the shuffled row labels left by train_test_split
# from being stored as an extra '__index_level_0__' column.
train_dataset = Dataset.from_pandas(train, preserve_index=False)
test_dataset = Dataset.from_pandas(test, preserve_index=False)
raw_dataset = DatasetDict({'train': train_dataset, 'test': test_dataset})

In [19]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 39407
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 16890
    })
})

In [20]:
type(raw_dataset)

datasets.dataset_dict.DatasetDict

In [21]:
type(raw_dataset['train'])

datasets.arrow_dataset.Dataset

In [22]:
for sample in raw_dataset['train']:
  print(sample)
  break

{'question': 'my sunday is over\n\ne', 'answer': 'startseq rip endseq'}


In [23]:
print(raw_dataset['train']['question'][5])
print(raw_dataset['train']['answer'][5])

happy early birthday
startseq thank you happy bday for you today endseq


### **Preprocessing the data**

In [24]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m98.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.13.3 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.0 MB/s

In [25]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollator, T5ForConditionalGeneration, T5TokenizerFast

from transformers import (
    T5ForConditionalGeneration, 
    T5Tokenizer, 
    EvalPrediction,
    DataCollator,
    Trainer,
    TrainingArguments)

In [26]:
# Load the pretrained model and its fast tokenizer from the same checkpoint name.
checkpoint = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = T5TokenizerFast.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [27]:
# Fixed token lengths that questions and answers are padded/truncated to.
max_input_length = 512
max_target_length = 64


# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of question/answer pairs into fixed-length encodings.

    Args:
        example_batch: mapping with 'question' and 'answer' entries, each a
            list of strings (the cell below calls this through
            `raw_dataset.map(..., batched=True)`).

    Returns:
        dict with four lists:
        'input_ids' / 'attention_mask' from the questions, padded or truncated
        to `max_input_length`, and 'decoder_input_ids' /
        'decoder_attention_mask' from the answers, padded or truncated to
        `max_target_length`.
    """
    # padding='max_length' is the current spelling of the deprecated
    # pad_to_max_length=True; calling the tokenizer directly replaces
    # batch_encode_plus for a list of strings.
    input_encodings = tokenizer(example_batch['question'],
                                max_length=max_input_length,
                                add_special_tokens=True,
                                truncation=True,
                                padding='max_length')

    target_encodings = tokenizer(example_batch['answer'],
                                 max_length=max_target_length,
                                 add_special_tokens=True,
                                 truncation=True,
                                 padding='max_length')

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'decoder_input_ids': target_encodings['input_ids'],
        'decoder_attention_mask': target_encodings['attention_mask']
    }

    return encodings


def add_eos_examples(example):
  """Append the literal ' </s>' marker to the 'question' and 'answer' fields.

  The mapping is updated in place and the same object is returned.
  Not called by any cell visible in this notebook.
  """
  for field in ('question', 'answer'):
    example[field] = example[field] + " </s>"
  return example

In [28]:
# Tokenize both splits in batches; the encoded columns are added next to the original text columns.
tokenized_dataset = raw_dataset.map(convert_to_features, batched=True)

Map:   0%|          | 0/39407 [00:00<?, ? examples/s]



Map:   0%|          | 0/16890 [00:00<?, ? examples/s]

In [29]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'],
        num_rows: 39407
    })
    test: Dataset({
        features: ['question', 'answer', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'],
        num_rows: 16890
    })
})

In [30]:
tokenized_dataset['train']['question'][5]

'happy early birthday'

In [31]:
# Drop the raw text columns so only the encoded columns remain.
tokenized_dataset = tokenized_dataset.remove_columns(["question", "answer"])

train_dataset, valid_dataset = tokenized_dataset["train"], tokenized_dataset["test"]

# Have both splits hand back torch tensors for the encoded columns.
columns = ['input_ids', 'decoder_input_ids', 'attention_mask', 'decoder_attention_mask']
for split in (train_dataset, valid_dataset):
  split.set_format(type='torch', columns=columns)

In [32]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'],
        num_rows: 39407
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'],
        num_rows: 16890
    })
})

### **Fine-tuning the T5 model**

In [33]:
from dataclasses import dataclass
from typing import List, Dict

In [34]:
@dataclass
class T2TDataCollator():
  """Collate tokenized question/answer examples into one batch of tensors."""

  # Token id treated as padding in 'decoder_input_ids'. Defaults to 0, the
  # value that was previously hard-coded, so T2TDataCollator() behaves as before.
  pad_token_id: int = 0

  def __call__(self, batch: List) -> Dict[str, torch.Tensor]:
    """
    Take a list of samples from a Dataset and collate them into a batch.

    Args:
      batch: list of examples, each a mapping with tensor entries
        'input_ids', 'attention_mask', 'decoder_input_ids' and
        'decoder_attention_mask'. All examples must share the same lengths
        because the tensors are stacked.

    Returns:
      A dictionary of tensors with keys 'input_ids', 'attention_mask',
      'labels' and 'decoder_attention_mask'. 'labels' is a copy of the stacked
      'decoder_input_ids' in which every `pad_token_id` position is replaced
      by -100.
    """
    input_ids = torch.stack([example['input_ids'] for example in batch])
    attention_mask = torch.stack([example['attention_mask'] for example in batch])
    decoder_attention_mask = torch.stack([example['decoder_attention_mask'] for example in batch])

    target_ids = torch.stack([example['decoder_input_ids'] for example in batch])
    # -100 is the default ignore_index of torch.nn.CrossEntropyLoss, so padded
    # target positions do not contribute to the loss.
    lm_labels = target_ids.masked_fill(target_ids == self.pad_token_id, -100)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': lm_labels,
        'decoder_attention_mask': decoder_attention_mask
    }

In [35]:
from huggingface_hub import notebook_login

# Authenticate interactively through the login prompt. Never paste an access
# token into the notebook: the token that used to sit in a comment here was
# readable by anyone with the file and should be revoked.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [36]:
# Training configuration.
# - Each optimizer step covers 4 * 16 = 64 examples per device
#   (per_device_train_batch_size * gradient_accumulation_steps).
# - Logging, evaluation and checkpointing all happen every 100 steps;
#   eval_steps is spelled out instead of being left to its default.
# - hub_model_id replaces push_to_hub_model_id, which transformers marks as
#   deprecated; a bare repository name is created under the logged-in user's
#   namespace.
# NOTE(review): no visible cell mounts Google Drive, so output_dir is a plain
# relative path — confirm that is intended.
training_args = TrainingArguments(output_dir="./gdrive/My Drive/models",
                                  per_device_train_batch_size=4,
                                  per_device_eval_batch_size=4,
                                  gradient_accumulation_steps=16,
                                  learning_rate=1e-4,
                                  num_train_epochs=1,
                                  logging_steps=100,
                                  run_name="t5-base-end2end-chatbot-generative",
                                  evaluation_strategy="steps",
                                  eval_steps=100,
                                  save_steps=100,
                                  push_to_hub=True,
                                  hub_model_id="t5-base-end2end-chatbot-generative")



In [37]:
# Initialize our Trainer: it fine-tunes `model` on the training split,
# evaluates on the held-out split, and batches examples with T2TDataCollator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=T2TDataCollator()
)

Cloning https://huggingface.co/danfarh2000/t5-base-end2end-chatbot-generative into local empty directory.


Download file pytorch_model.bin:   0%|          | 8.00k/850M [00:00<?, ?B/s]

Download file runs/Apr12_06-54-22_509dd628a49c/1681282475.8975651/events.out.tfevents.1681282475.509dd628a49c.…

Clean file runs/Apr12_06-54-22_509dd628a49c/1681282475.8975651/events.out.tfevents.1681282475.509dd628a49c.717…

Download file training_args.bin: 100%|##########| 3.56k/3.56k [00:00<?, ?B/s]

Clean file training_args.bin:  28%|##8       | 1.00k/3.56k [00:00<?, ?B/s]

Download file runs/Apr12_06-54-22_509dd628a49c/events.out.tfevents.1681282475.509dd628a49c.717.0: 100%|#######…

Clean file runs/Apr12_06-54-22_509dd628a49c/events.out.tfevents.1681282475.509dd628a49c.717.0:  14%|#3        …

Clean file pytorch_model.bin:   0%|          | 1.00k/850M [00:00<?, ?B/s]

In [38]:
# Training: run the fine-tuning loop configured by training_args.
trainer.train()



Step,Training Loss,Validation Loss
100,2.9718,2.395115
200,2.4855,2.336802
300,2.4712,2.311251
400,2.4346,2.29719
500,2.4126,2.288805
600,2.4219,2.285661


TrainOutput(global_step=615, training_loss=2.529748621994887, metrics={'train_runtime': 10358.5847, 'train_samples_per_second': 3.804, 'train_steps_per_second': 0.059, 'total_flos': 2.39685825724416e+16, 'train_loss': 2.529748621994887, 'epoch': 1.0})

In [39]:
# Write the fine-tuned model to a local directory. The upload log in this
# cell's output shows the call also pushes to the Hub repository.
trainer.save_model('/content/t5-base-end2end-chatbot-generative-model/')

Upload file pytorch_model.bin:   0%|          | 1.00/850M [00:00<?, ?B/s]

Upload file runs/Apr13_07-29-23_a6fa1e48beac/events.out.tfevents.1681371185.a6fa1e48beac.256.0:   0%|         …

To https://huggingface.co/danfarh2000/t5-base-end2end-chatbot-generative
   80901c4..001bc29  main -> main

   80901c4..001bc29  main -> main

To https://huggingface.co/danfarh2000/t5-base-end2end-chatbot-generative
   001bc29..4d776c9  main -> main

   001bc29..4d776c9  main -> main



In [40]:
# When training is done, we push the fine-tuned model to the Hub.
# The first positional parameter of Trainer.push_to_hub is the commit message,
# not a repository name; the target repository comes from training_args.
trainer.push_to_hub(commit_message="End of training")

### **Testing the Model**

In [44]:
from transformers import T5ForConditionalGeneration, T5TokenizerFast

# Reload the fine-tuned weights from the Hub repository the Trainer pushed to.
chatbot_model = T5ForConditionalGeneration.from_pretrained("danfarh2000/t5-base-end2end-chatbot-generative")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]