In [None]:
"""
Created on Thu Jun  1 11:34:23 2023
@author: DSJoshi
"""

#!pip install transformers

In [None]:
# Inputs and outputs share a single project folder under the Google Drive mount point.
DATA_PATH = OUTPUT_PATH = '/content/gdrive/MyDrive/q3_245_proj/'

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive/; DATA_PATH and OUTPUT_PATH point inside this mount.
# NOTE(review): force_remount=True presumably re-mounts even if the drive is already
# mounted -- confirm against the Colab documentation.
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [None]:
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from datasets import load_dataset

class AHDataset(Dataset):
    """Consecutive dialogue turns formatted as GPT-2 training strings.

    Item ``i`` pairs ``texts[i]`` (prompt) with ``texts[i + 1]`` (reply) as
    ``"<startofstring> {prompt} <bot>: {reply} <endofstring>"``. The last text
    has no successor, so it is paired with the literal placeholder ``'dummy'``;
    the dataset therefore holds exactly ``len(texts)`` items.

    Args:
        texts: Utterance strings in conversation order. The sequence is copied
            and never modified.
        tokenizer: Callable invoked as ``tokenizer(list_of_str, max_length=...,
            truncation=True, padding="max_length", return_tensors="pt")``; its
            result must be indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length passed to the tokenizer for truncation and padding.
    """

    def __init__(self, texts, tokenizer, max_length=40):
        # Build the padded sequence on a copy; appending the placeholder to
        # `texts` itself would silently grow the caller's list.
        turns = list(texts) + ['dummy']

        self.X = [
            "<startofstring> " + turns[i] + " <bot>: " + turns[i + 1] + " <endofstring>"
            for i in range(len(turns) - 1)
        ]

        self.X_encoded = tokenizer(
            self.X,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        """Return the number of formatted prompt/reply strings."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(input_ids[idx], attention_mask[idx])`` for one example."""
        return (self.input_ids[idx], self.attention_mask[idx])

def _split_frame(ds, split):
  """Return split `split` of `ds` as a DataFrame whose 'text' column has newlines replaced by spaces."""
  frame = pd.DataFrame(ds[split])
  frame['text'] = frame['text'].str.replace('\n', ' ')
  return frame


def load_data():
  """Download the dialogue-turn dataset and prepare the three splits.

  Relies on the module-level `tokenizer` being defined before the call.

  Returns:
      (train_dl, val_dl, test_df): a DataLoader over the 'train' split
      (batch size 32, shuffled), a DataLoader over the 'validation' split
      (batch size 4, original order), and the cleaned 'test' split as a
      DataFrame.
  """
  ds = load_dataset('Deojoandco/reddit-ah-dialogturns-annotations')

  train_df = _split_frame(ds, 'train')
  val_df = _split_frame(ds, 'validation')
  test_df = _split_frame(ds, 'test')

  train_ds = AHDataset(train_df['text'].tolist(), tokenizer)
  # Prompt/reply pairs are formed inside AHDataset, so shuffling the examples
  # here cannot break a pair; it only stops every epoch from seeing the
  # batches in the same order.
  train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)

  val_ds = AHDataset(val_df['text'].tolist(), tokenizer)
  val_dl = DataLoader(val_ds, batch_size=4)

  return train_dl, val_dl, test_df


In [None]:

from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm_notebook
import torch
import os

def epoch_train(epoch, model, optim, train_dl):
  """Run one optimisation pass over `train_dl` and return the mean batch loss.

  Args:
      epoch: Epoch index, used only in the progress-bar label.
      model: Model called as ``model(ids, attention_mask=..., labels=ids)``;
          the returned object must expose ``.loss``.
      optim: Optimizer stepped once per batch.
      train_dl: Iterable of ``(input_ids, attention_mask)`` batches.

  Returns:
      Sum of the per-batch losses divided by ``len(train_dl)``.
  """
  model.train()
  progress = tqdm_notebook(train_dl, desc=f'Training Epoch: {epoch}')
  batch_losses = []

  for input_ids, attn_mask in progress:
    # Batches are moved to the module-level `device` before the forward pass.
    input_ids, attn_mask = input_ids.to(device), attn_mask.to(device)

    optim.zero_grad()
    # The inputs double as the labels (language-modelling objective).
    step_loss = model(input_ids, attention_mask=attn_mask, labels=input_ids).loss
    step_loss.backward()

    current = step_loss.item()
    progress.set_postfix_str(f"Batch Loss: {current}")
    batch_losses.append(current)

    optim.step()

  return sum(batch_losses) / len(train_dl)

def epoch_evaluate(epoch, model, optim, val_dl):
  """Compute the mean batch loss of `model` over `val_dl` without updating it.

  Args:
      epoch: Epoch index, used only in the progress-bar label.
      model: Model called as ``model(ids, attention_mask=..., labels=ids)``;
          the returned object must expose ``.loss``.
      optim: Unused; kept so the signature matches existing callers.
      val_dl: Iterable of ``(input_ids, attention_mask)`` batches.

  Returns:
      Sum of the per-batch losses divided by ``len(val_dl)``.
  """
  model.eval()
  epoch_loss = 0
  pbar = tqdm_notebook(val_dl, desc=f'Validating Epoch: {epoch}')

  # No backward pass happens here, so disable autograd: otherwise every
  # validation batch builds a computation graph that is never used.
  with torch.no_grad():
    for X, a in pbar:
      X = X.to(device)
      a = a.to(device)

      loss = model(X, attention_mask=a, labels=X).loss
      batch_loss = loss.item()
      pbar.set_postfix_str(f"Batch Loss: {batch_loss}")

      epoch_loss += batch_loss

  epoch_loss = epoch_loss / len(val_dl)
  return epoch_loss

def train(train_dl, val_dl, model, optim, epochs=12):
    """Train `model`, report losses each epoch, and checkpoint to OUTPUT_PATH.

    After every epoch the train/validation losses and one sample reply from
    `infer` are printed. Checkpointing:

    * with validation data, the weights are written to ``ahGPT-model-v4.pt``
      only when the validation loss improves on the best seen so far;
    * without validation data (``val_dl is None``), the weights are written to
      ``ahGPT-model.pt`` after every epoch.

    Args:
        train_dl: Training batches, passed to `epoch_train`.
        val_dl: Validation batches passed to `epoch_evaluate`, or None to skip
            validation.
        model: Model to optimise.
        optim: Optimizer over the model's parameters.
        epochs: Number of passes over `train_dl` (default 12).
    """
    best_valid_loss = float('inf')
    # Track "was validation run" explicitly instead of using a loss of 0 as
    # the sentinel for "no validation".
    has_validation = val_dl is not None

    for epoch in tqdm_notebook(range(epochs), desc="Epochs"):
        train_loss = epoch_train(epoch, model, optim, train_dl)

        val_loss = 0
        if has_validation:
            val_loss = epoch_evaluate(epoch, model, optim, val_dl)

        print(f'Epoch: {epoch}')
        print(f'\tTrain Loss: {train_loss}')
        print(f'\t Val. Loss: {val_loss}')
        # Without validation the model would still be in training mode here;
        # switch to eval mode for the sample. epoch_train() switches it back
        # to training mode at the start of the next epoch.
        model.eval()
        print(infer("Hey, have you listened to The Reeve’s Tale podcast?"))

        if not has_validation:
            torch.save(model.state_dict(), os.path.join(OUTPUT_PATH, 'ahGPT-model.pt'))
        elif val_loss < best_valid_loss:
            best_valid_loss = val_loss
            torch.save(model.state_dict(), os.path.join(OUTPUT_PATH, 'ahGPT-model-v4.pt'))


def infer(inp, max_new_tokens=40):
    """Generate the bot's reply to `inp` with the module-level model and tokenizer.

    The prompt uses the same ``"<startofstring> ... <bot>: "`` layout as the
    training strings.

    Args:
        inp: The user's utterance.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The text after ``<bot>:``, cut at the tokenizer's end-of-string token
        and stripped. If the decoded output does not split into exactly two
        parts on ``<bot>:``, the full decoded string is returned unchanged.
    """
    prompt = "<startofstring> " + inp + " <bot>: "
    encoded = tokenizer(prompt, return_tensors="pt")
    X = encoded["input_ids"].to(device)
    a = encoded["attention_mask"].to(device)

    # Pass the tokenizer's own pad/eos ids and an explicit budget of new tokens.
    # Left to its defaults, generate() fell back to token id 50256 for padding
    # (see the logged warning) and the sample replies came back truncated.
    generated = model.generate(
        X,
        attention_mask=a,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    output = tokenizer.decode(generated[0])
    parts = output.split('<bot>:')
    if len(parts) == 2:
        reply = parts[1]
        eos_text = tokenizer.eos_token
        if eos_text:
            # Drop the end-of-string marker and anything after it.
            reply = reply.split(eos_text)[0]
        output = reply.strip()
    return output


# Prefer CUDA, then Apple MPS, then fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

# Extend the GPT-2 vocabulary with the markers used in the training strings.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>",
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

model = GPT2LMHeadModel.from_pretrained("gpt2")
# The embedding matrix must grow to cover the tokens added above.
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)
optim = Adam(model.parameters(), lr=1e-3)

train_dl, val_dl, test_df = load_data()

print("training .... ")
train(train_dl, val_dl, model, optim)

# train() leaves the last-epoch weights in memory, but it checkpoints the
# weights with the lowest validation loss to this file. Publish those rather
# than whatever the final epoch produced.
best_checkpoint = os.path.join(OUTPUT_PATH, 'ahGPT-model-v4.pt')
if os.path.exists(best_checkpoint):
    model.load_state_dict(torch.load(best_checkpoint, map_location=device))

# Never hard-code the access token in the notebook. Read it from the
# environment; when HF_TOKEN is unset, None is passed and push_to_hub is left
# to use any locally stored login.
# SECURITY: the token that used to be written here was exposed and should be revoked.
hf_token = os.environ.get("HF_TOKEN")
model.push_to_hub('ah-GPT2-v4', token=hf_token)
tokenizer.push_to_hub('ah-GPT2-v4', token=hf_token)



training .... 


Epochs:   0%|          | 0/12 [00:00<?, ?it/s]

Training Epoch: 0:   0%|          | 0/502 [00:00<?, ?it/s]

Validating Epoch: 0:   0%|          | 0/411 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 0
	Train Loss: 4.610677677796656
	 Val. Loss: 3.1276809024222105




Yes, I


Training Epoch: 1:   0%|          | 0/502 [00:00<?, ?it/s]

Validating Epoch: 1:   0%|          | 0/411 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 1
	Train Loss: 2.865936024730424
	 Val. Loss: 3.031212199470143
Yes, I


Training Epoch: 2:   0%|          | 0/502 [00:00<?, ?it/s]

Validating Epoch: 2:   0%|          | 0/411 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 2
	Train Loss: 2.312357352786805
	 Val. Loss: 3.1069458737785434
Yes, I


Training Epoch: 3:   0%|          | 0/502 [00:00<?, ?it/s]

Validating Epoch: 3:   0%|          | 0/411 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 3
	Train Loss: 1.884679198027607
	 Val. Loss: 3.310837021286105
Yes, I


Training Epoch: 4:   0%|          | 0/502 [00:00<?, ?it/s]

Validating Epoch: 4:   0%|          | 0/411 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 4
	Train Loss: 1.5769765032715057
	 Val. Loss: 3.495178687719651
Yes, I


Training Epoch: 5:   0%|          | 0/502 [00:00<?, ?it/s]

Validating Epoch: 5:   0%|          | 0/411 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 5
	Train Loss: 1.3438041806221008
	 Val. Loss: 3.7300411386254395
Yes, I


Training Epoch: 6:   0%|          | 0/502 [00:00<?, ?it/s]

Validating Epoch: 6:   0%|          | 0/411 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 6
	Train Loss: 1.158742352429614
	 Val. Loss: 3.9208151278672396
Yes, I


Training Epoch: 7:   0%|          | 0/502 [00:00<?, ?it/s]

Validating Epoch: 7:   0%|          | 0/411 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 7
	Train Loss: 1.0151431546268235
	 Val. Loss: 4.072017649368004
Yes, I


Training Epoch: 8:   0%|          | 0/502 [00:00<?, ?it/s]

Validating Epoch: 8:   0%|          | 0/411 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 8
	Train Loss: 0.903124200751582
	 Val. Loss: 4.259828417389481
Yes, I


Training Epoch: 9:   0%|          | 0/502 [00:00<?, ?it/s]

Validating Epoch: 9:   0%|          | 0/411 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 9
	Train Loss: 0.8143956478373463
	 Val. Loss: 4.4088833788294854
Yes, I


Training Epoch: 10:   0%|          | 0/502 [00:00<?, ?it/s]

Validating Epoch: 10:   0%|          | 0/411 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 10
	Train Loss: 0.7432756581866884
	 Val. Loss: 4.679633877895497
Yes, I


Training Epoch: 11:   0%|          | 0/502 [00:00<?, ?it/s]

Validating Epoch: 11:   0%|          | 0/411 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Epoch: 11
	Train Loss: 0.6887479145450895
	 Val. Loss: 4.702435954706169
Yes, I


In [None]:
import statistics
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def test_infer(inp, gold, max_length=40):
    """Return the language-modelling loss of one (query, gold reply) pair.

    The pair is formatted exactly like a training example
    (``"<startofstring> {inp} <bot>: {gold} <endofstring>"``) and scored with
    the inputs as labels, which is how the training and validation losses are
    computed. Previously the labels were the tokens of `gold` alone while the
    logits came from `inp` alone, so the two sequences had unrelated lengths
    and positions.

    Args:
        inp: The query utterance.
        gold: The reference reply.
        max_length: Token limit the formatted example is truncated to
            (40 matches the training examples).

    Returns:
        The loss as a Python float.
    """
    text = "<startofstring> " + inp + " <bot>: " + gold + " <endofstring>"
    encoded = tokenizer(text, max_length=max_length, truncation=True, return_tensors="pt")

    # Keep the tensors on whatever device the model currently lives on.
    X = encoded["input_ids"].to(model.device)
    a = encoded["attention_mask"].to(model.device)

    # Scoring only: no gradients are needed.
    with torch.no_grad():
        loss = model(X, attention_mask=a, labels=X).loss
    return loss.item()

# Score the published model on the held-out test split.
model = GPT2LMHeadModel.from_pretrained('Deojoandco/ah-GPT2-v4')
tokenizer = GPT2Tokenizer.from_pretrained('Deojoandco/ah-GPT2-v4')

records = test_df.to_dict('records')

# Walk the records in non-overlapping (query, gold) pairs. The stop value is
# len(records) - 1 so that the final complete pair is included; stopping at
# len(records) - 2 skipped it whenever the record count was even.
losses = []
for i in range(0, len(records) - 1, 2):
    query = records[i]['text']
    gold = records[i+1]['text']
    item_loss = test_infer(query, gold)
    losses.append(item_loss)

# Fewer than two records yields no pairs; report NaN instead of dividing by zero.
test_loss = sum(losses)/len(losses) if losses else float('nan')
print(f'test loss: {test_loss}')

test loss: 2.946188620679701
