In [18]:
!pip install transformers
!pip install datasets



In [28]:
from pathlib import Path

import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import TrainingArguments, pipeline
from transformers.trainer import Trainer

In [3]:
!wget https://dl.fbaipublicfiles.com/parlai/empatheticdialogues/empatheticdialogues.tar.gz && tar -xf empatheticdialogues.tar.gz

--2022-03-23 23:41:09--  https://dl.fbaipublicfiles.com/parlai/empatheticdialogues/empatheticdialogues.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28022709 (27M) [application/gzip]
Saving to: ‘empatheticdialogues.tar.gz’


2022-03-23 23:41:09 (56.8 MB/s) - ‘empatheticdialogues.tar.gz’ saved [28022709/28022709]



In [21]:
# Loading base model architectures and tokenizer
pline = pipeline('text-generation', 'microsoft/DialoGPT-medium', max_length=1000)

model = pline.model 
tokenizer = pline.tokenizer

In [None]:
def _load_state_dict_in_model(model, state_dict):
    load_result = model.load_state_dict(state_dict, strict=False)

    if len(load_result.missing_keys) != 0:
        if model._keys_to_ignore_on_save is not None and set(load_result.missing_keys) == set(
            model._keys_to_ignore_on_save
        ):
            model.tie_weights()
        else:
            print(f"There were missing keys in the checkpoint model loaded: {load_result.missing_keys}.")
    if len(load_result.unexpected_keys) != 0:
        print(
            f"There were unexpected keys in the checkpoint model loaded: {load_result.unexpected_keys}."
        )


def load_weights(model, weights_path):
    state_dict = torch.load(weights_path, map_location="cpu")

    _load_state_dict_in_model(model, state_dict)


In [None]:
# Use baseline trained on Reddit data
load_weights(pline.model, "pytorch_model.bin")

In [5]:
def clean_text(txt):
    """Remove unnecessary spaces."""
    return ' '.join(txt.strip().split())

In [7]:
text = "Hey! How are you doing?"
text = clean_text(text) + tokenizer.eos_token
text

'Hey! How are you doing?<|endoftext|>'

In [8]:
outputs = pline(text)
outputs

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hey! How are you doing?<|endoftext|>I'm doing well, how are you?"}]

In [None]:
def read_dataset(csv_path: Path):
    with open(csv_path) as f:
        df = f.readlines()

    max_hist_len = 1

    data = {
        "context": [],
        "response": []
    }
    history = []
    for i in range(1, len(df)):
        cparts = df[i - 1].strip().split(",")
        sparts = df[i].strip().split(",")
        if cparts[0] == sparts[0]:
            prevsent = cparts[5].replace("_comma_", ",")
            history.append(prevsent)

            prev_str = " <SOC> ".join(history[-max_hist_len :])
            context = prev_str

            sent = sparts[5].replace("_comma_", ",")
            label = sent

            # data.append((context, label))

            data["context"].append(context)
            data["response"].append(label)
        else:
            history = []

    return data


In [11]:
ds_dir = Path("/content/empatheticdialogues")

In [12]:
splits = list(ds_dir.glob("*.csv"))

for split in splits:
    dataset = read_dataset(split)
    df = pd.DataFrame(dataset)
    df.to_csv(str(split).replace(".csv", "_rebuild.csv"), index=False)

In [13]:
df

Unnamed: 0,context,response
0,"Today,as i was leaving for work in the morning...",Are you fine now?
1,Are you fine now?,"Yeah,i'm doing alright now, but with minor inj..."
2,"Yeah,i'm doing alright now, but with minor inj...",Cool :) Is your car damaged a lot?
3,Cool :) Is your car damaged a lot?,"The car was badly damaged,i veered outside the..."
4,"A few weeks ago, I was walking through my hall...","That's funny, hope he didn't give you a heart ..."
...,...,...
9303,whatwas the exam for?,It was for Organic Chemistry
9304,It was for Organic Chemistry,tha is really cool what was your grade
9305,One of my coworkers has been arguing with his ...,What are they arguing about?
9306,What are they arguing about?,"Everything and anything. It's annoying, though..."


In [11]:
df_train = pd.read_csv("empatheticdialogues/train_rebuild.csv")
df_train

Unnamed: 0,context,response
0,I remember going to see the fireworks with my ...,"Was this a friend you were in love with, or ju..."
1,"Was this a friend you were in love with, or ju...",This was a best friend. I miss her.
2,This was a best friend. I miss her.,Where has she gone?
3,Where has she gone?,We no longer talk.
4,We no longer talk.,Oh was this something that happened because of...
...,...,...
64631,Yeah I found some old pictures of when us kids...,What a wonderful memory.
64632,What a wonderful memory.,Yeah reminds me of the good old days. I miss ...
64633,I woke up this morning to my wife telling me s...,Oh hey that's awesome! That is awesome right?
64634,Oh hey that's awesome! That is awesome right?,It is soooo awesome. We have been wanting a b...


In [12]:
df_train["num_context"] = df_train["context"].apply(lambda x: len(x.split(" ")))
df_train["num_response"] = df_train["response"].apply(lambda x: len(x.split(" ")))
df_train

Unnamed: 0,context,response,num_context,num_response
0,I remember going to see the fireworks with my ...,"Was this a friend you were in love with, or ju...",38,14
1,"Was this a friend you were in love with, or ju...",This was a best friend. I miss her.,14,8
2,This was a best friend. I miss her.,Where has she gone?,8,4
3,Where has she gone?,We no longer talk.,4,4
4,We no longer talk.,Oh was this something that happened because of...,4,10
...,...,...,...,...
64631,Yeah I found some old pictures of when us kids...,What a wonderful memory.,19,6
64632,What a wonderful memory.,Yeah reminds me of the good old days. I miss ...,6,22
64633,I woke up this morning to my wife telling me s...,Oh hey that's awesome! That is awesome right?,12,9
64634,Oh hey that's awesome! That is awesome right?,It is soooo awesome. We have been wanting a b...,9,27


In [13]:
df_train[["num_context", "num_response"]].describe(percentiles=[.90, .95, .97, .99])

Unnamed: 0,num_context,num_response
count,64636.0,64636.0
mean,13.970048,12.80483
std,9.016605,8.322922
min,1.0,1.0
50%,12.0,11.0
90%,24.0,23.0
95%,30.0,28.0
97%,35.0,32.0
99%,45.0,43.0
max,615.0,110.0


In [8]:
tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'})

0

In [None]:
def tokenize_func(examples, max_length=80):
    inputs = tokenizer(examples["context"], padding="max_length", truncation=True, max_length=max_length)
    labels = tokenizer(examples["response"], truncation=True, max_length=max_length)["input_ids"]
    labels = labels + [-100] * (max_length - len(labels))    

    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels
    }


data_files_train = ["empatheticdialogues/train_rebuild.csv"]
data_files_val = ["empatheticdialogues/valid_rebuild.csv"]

dataset_train = load_dataset("csv", data_files=data_files_train, split="train")
dataset_val = load_dataset("csv", data_files=data_files_val, split="train")

dataset_train = dataset_train.map(tokenize_func, batched=False)
dataset_val = dataset_val.map(tokenize_func, batched=False)

In [9]:
len(dataset_train)

64636

In [10]:
training_args = TrainingArguments(
    output_dir="lr_5e-5_gpt2_medium", 
    evaluation_strategy="epoch", 
    learning_rate=5e-5,
    num_train_epochs=5,
    save_steps=2000,
    per_device_train_batch_size=32,
    fp16=True
  )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
)

Using amp half precision backend


In [None]:
trainer.train()

In [16]:
model.to("cpu")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout)

In [6]:
text = "Hi! How is it going?"
text = clean_text(text) + tokenizer.eos_token
text

'Hi! How is it going?<|endoftext|>'

In [9]:
output = model.generate(**tokenizer(text, padding=True, truncation=True, return_tensors="pt"))
output

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[17250,     0,  1374,   318,   340,  1016,    30, 50256,  1026,   338,
          1016,   880,  5145,  1374,   546,   345,  5633, 50256]])

In [10]:
tokenizer.decode(output[0], skip_special_tokens=True)

"Hi! How is it going?It's going well! How about you?"

In [17]:
def get_string_tokens_from_ids(indices):
    return tokenizer.decode(
        indices, 
        skip_special_tokens=True, 
        clean_up_tokenization_spaces=True
    ).split()


def get_string_tokens_from_text(text):
    remove_symbols = ".,:;?!*@#$%^&()"

    for ch in remove_symbols:
        text = text.replace(ch, "")
    return text.lower().split()


def inference(text, pline, full_text=False):
    def clean_text(txt):
        return ' '.join(txt.strip().split())

    inputs = clean_text(text) + tokenizer.eos_token
    out = pline(inputs)
    outputs = out[0]["generated_text"]

    if not full_text:
        return outputs[len(inputs):]

    return outputs


In [15]:
df_test = pd.read_csv("empatheticdialogues/test_rebuild.csv")
df_test

Unnamed: 0,context,response
0,Yeah about 10 years ago I had a horrifying exp...,Did you suffer any injuries?
1,Did you suffer any injuries?,No I wasn't hit. It turned out they were drunk...
2,No I wasn't hit. It turned out they were drunk...,Why did you feel guilty? People really shouldn...
3,Why did you feel guilty? People really shouldn...,I don't know I was new to driving and hadn't e...
4,"Well, can you tell me about your experience? I...",Yeah i wanted to tell you about the time i was...
...,...,...
8421,did you call the exterminator?,Not yet since it's the weekend. We live in Tex...
8422,Not yet since it's the weekend. We live in Tex...,I live in Texas to so i know those feels
8423,"I have a big test on Monday, I am so nervous.",What is the test on?
8424,What is the test on?,It's for my Chemistry class. I haven't slept m...


In [16]:
from datasets import load_metric

In [None]:
meteor = load_metric("meteor")

In [None]:
meteor_values = []
for idx, (context, response) in df_test.iterrows():
    gt = get_string_tokens_from_text(response)
    pred = get_string_tokens_from_text(inference(context, pline))

    results = meteor.compute(predictions=[pred], references=[[gt]])
    meteor_values.append(results["meteor"])


In [29]:
np.mean(meteor_values)

0.3362979276623957