<a href="https://colab.research.google.com/github/clam004/case/blob/main/ACES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%capture
# %pip (rather than `! pip`) installs into the environment of the running kernel.
# This notebook was last executed with transformers 4.20.1 (see the version print below).
%pip install transformers[sentencepiece] datasets

In [6]:
# Mount Google Drive and switch into the project folder.
from google.colab import drive
drive.mount('/content/drive')
# Absolute path so the cell can be re-run: the relative `drive/MyDrive/case`
# only resolves while the working directory is still /content.
%cd /content/drive/MyDrive/case
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/case
ACES.ipynb	     modelstates  T0_3B_dialog_summarization.ipynb
empatheticdialogues  __pycache__  utils.py


In [8]:
#sys libs
import os
import sys
import random
import time
import re
import json

#string manipulation libs
import re
import string

#data manipulation libs
import numpy as np

#plotting tools
from matplotlib import pyplot as plt 

#torch libs
import torch
print('torch.__version__', torch.__version__)
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

print('torch.cuda.device_count()', torch.cuda.device_count())
print('torch.cuda.empty_cache()', torch.cuda.empty_cache())

#huggingface transformers
import transformers
print('transformers.__version__', transformers.__version__)
from transformers import set_seed
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# seeds
# NOTE(review): set_seed(42) is the transformers helper and presumably seeds
# the python / numpy / torch generators itself — confirm. The three calls
# below run afterwards and re-seed those generators with 0, so for them 0 is
# the value that takes effect. Decide which seed is intended.
set_seed(42)
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)

%load_ext autoreload
%autoreload 2
%matplotlib inline

torch.__version__ 1.11.0+cu113
torch.cuda.device_count() 1
torch.cuda.empty_cache() None
transformers.__version__ 4.20.1
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
class BaseAgent(torch.nn.Module):
    """Dialog agent built on a pretrained causal language model.

    Bundles a GPT-2 tokenizer, a GPT-2 LM-head model and an Adam optimizer,
    places the model on the available GPU(s), and exposes two operations:
    `get_response` (generate a continuation of a prompt) and `memorize`
    (fine-tune the model on a single text sequence).

    Args:
        pretrained_model (str): name of the pretrained checkpoint to load;
            only the names in `SUPPORTED_MODELS` are accepted.

    Raises:
        ValueError: if `pretrained_model` is not a supported checkpoint.
    """

    # checkpoints this wrapper knows how to load
    SUPPORTED_MODELS = ('gpt2',)

    def __init__(self, pretrained_model = 'gpt2'):

        super().__init__()

        # Fail early: without this check an unsupported name would leave
        # self.model undefined and crash further down with an unrelated
        # AttributeError.
        if pretrained_model not in self.SUPPORTED_MODELS:
            raise ValueError(
                "unsupported pretrained_model {!r}; expected one of {}".format(
                    pretrained_model, self.SUPPORTED_MODELS,
                )
            )

        # where the downloaded pretrained weights are cached
        self.cache_dir = os.path.join(
            "./modelstates/hugface_models/",
            pretrained_model,
        )
        print("cache_dir=", self.cache_dir)

        # where finetuned weights are meant to be stored; kept as an
        # attribute so that saving code can reuse it
        self.model_save_path = os.path.join(
            "./modelstates/finetuned_models",
            pretrained_model,
        )
        print("model_save_path=", self.model_save_path)

        # GPT-2 has no dedicated padding token, so the end-of-text token is
        # reused for padding. The tokenizer shares the model's cache_dir so
        # both are cached in the same place.
        self.tokenizer = GPT2Tokenizer.from_pretrained(
            pretrained_model,
            cache_dir=self.cache_dir,
            pad_token='<|endoftext|>',
        )

        self.model = GPT2LMHeadModel.from_pretrained(
            pretrained_model,
            cache_dir=self.cache_dir,
        )

        self.num_gpus = torch.cuda.device_count()

        # Move the model to its final device(s) before building the optimizer.
        if self.num_gpus > 1:
            self.model.parallelize()
        elif self.num_gpus == 1:
            self.gpu0 = torch.device('cuda:0')
            # equivalent to self.model.cuda(); tensors can be moved with
            # .to(self.gpu0) as well
            self.model = self.model.to(self.gpu0)

        self.optimizer = torch.optim.Adam(
            self.model.parameters(),
            lr=0.00005,
            betas=(0.9, 0.98),
            eps=1e-9,
        )

        # device of the first parameter; inputs are sent here
        self.model_device = next(self.model.parameters()).device
        print('model_device', self.model_device)

        self.num_params = \
          sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        print("num_params", self.num_params)

    def _encode_prompt(self, prompt):
        """Tokenize `prompt` and return (input_ids, attention_mask) on the model device."""

        prompt_dic = self.tokenizer(prompt, return_tensors="pt")
        prompt_ids = prompt_dic.input_ids.to(self.model_device)
        prompt_mask = prompt_dic.attention_mask.to(self.model_device)

        return prompt_ids, prompt_mask

    def get_response(self, prompt, max_len = 32):
        """Generate a continuation of `prompt`.

        Args:
            prompt (string): the text to be continued
            max_len (int): maximum number of tokens generated after the prompt
        Returns:
            generated_text (string): the decoded prompt followed by the
                generated continuation
        """

        prompt_ids, prompt_mask = self._encode_prompt(prompt)
        prompt_len = prompt_ids.shape[1]

        # Inference must not run with dropout active; the model may still be
        # in training mode if the caller switched it there.
        self.model.eval()

        output_ids = self.model.generate(
            prompt_ids,
            attention_mask = prompt_mask,
            max_length=prompt_len+max_len,
            # passed explicitly so generate() does not have to fall back to
            # the eos token and warn about it on every call
            pad_token_id=self.tokenizer.pad_token_id,
        )

        generated_text = self.tokenizer.batch_decode(output_ids)[0]

        return generated_text

    def memorize(self, prompt, num_epochs = 3):
        """ This is a rudimentary training loop
        that will train the agent to learn one
        sequence, the prompt. With enough epochs, this should
        result in memorizing the sequence, which is why this
        class method was named memorize.
        There is nothing returned because the model attribute is modified inplace.
        Args:
            prompt (string): the text to be learned
            num_epochs (int): the number of times we cycle through the training data, only 1 sample in this case
        Raises:
            ValueError: if the prompt tokenizes to fewer than 2 tokens, since
                next-token prediction needs at least one (input, target) pair
        """

        print('start training loop')

        prompt_ids, prompt_mask = self._encode_prompt(prompt)
        prompt_len = prompt_ids.shape[1]

        if prompt_len < 2:
            raise ValueError(
                "prompt must contain at least 2 tokens to be learned, got {}".format(prompt_len)
            )

        # next-token prediction: position i of the source predicts position i+1
        source_ids = prompt_ids[:,:-1]
        target_ids = prompt_ids[:,1:]
        source_mask = prompt_mask[:,:-1]

        # allow params to be updated
        self.model.train()

        for e in range(num_epochs):

            # Forward Pass To Loss
            output = self.model(
                input_ids = source_ids,
                attention_mask = source_mask,
            )

            # used logits and target tokens to calculate the loss
            logits = output.logits

            loss = cross_entropy_loss(
                logits,
                target_ids,
            )

            # Equally valid way to do Forward Pass To Loss
            # labels are automatically shifted into targets
            #
            # outputs = self.model(
            #     input_ids = prompt_ids,
            #     labels = prompt_ids,
            #     attention_mask = prompt_mask,
            #     token_type_ids=None,
            # )
            # loss = outputs.loss
            # logits = outputs.logits

            # backward pass
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()

            print("epoch", e, "loss", loss.item())

        # leave the model in inference mode so that later generation is not
        # affected by dropout
        self.model.eval()

        torch.cuda.empty_cache()


def cross_entropy_loss(logits, target_ids):

    """
    Token-level cross entropy between predicted logits and target token ids.

    For F.cross_entropy the Input is shape (N, C), where N = batch_size x sequence_length
    and C is the number of classes, in our case C is the number of tokens in the vocabulary
    Target is shape (N).
    https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
    We flatten the batch dimension together with the sequence dimension for the
    loss function, so afterwards there is no batch dimension, just a vector of
    C dimensions for each token.
    If there had been 2 samples with a batch size of 2, with 3 tokens in each sample
    then the predictions.shape would be torch.Size([6, 50257])
    Args:
        logits (torch.tensor, float): shape [batch_size, sequence_length, vocab_size]
        target_ids (torch.tensor, int): shape [batch_size, sequence_length]
    Returns:
        scalar_loss (torch.tensor, scalar float, grad_fn=<NllLossBackward0>)): no shape
            this is a loss you can backpropagate using:
            optimizer.zero_grad()
            scalar_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
    """

    # reshape instead of view: targets are typically a slice such as
    # ids[:, 1:], which is non-contiguous for batch_size > 1 and makes
    # .view(-1) raise a RuntimeError
    predictions = logits.reshape(-1, logits.size(-1))
    target = target_ids.reshape(-1)

    scalar_loss = F.cross_entropy(
        predictions,
        target,
    )

    return scalar_loss

# build the dialog agent: this loads the tokenizer and the pretrained model

agent = BaseAgent(pretrained_model='gpt2')

## Agent Conversationnel Emotionnel Social (ACES)

Large causal language models, also known as autoregressive models, make excellent chatbots because they are not only trained to predict the next tokens in dialog text but have also been pretrained on the much larger body of data on the internet aside from conversation. 

But useful agents cannot simply reply with the most likely response from their training data. There needs to be self-awareness or self-monitoring, and the human controller should be able to update the behavior of the agent.


Some strategies might include: training and evaluating on few-shot monitoring goals, GANs to maintain conversational realism, planning and explaining.


In [16]:


# dialog prefix shared by both generations: the agent continues speaker B's turn
dialog_prompt = "A: Hello.\nB: Hi.\nA: How was your day?\nB:"

# baseline: how the pretrained model extends the dialog on its own
generated_text = agent.get_response(prompt=dialog_prompt, max_len=16)
print(generated_text)

# directed learning: fine-tune on the same dialog with the desired reply appended
agent.memorize(
    dialog_prompt + " First, you tell me about your day.",
    num_epochs=5,
)

# same prompt again, to see how the fine-tuning changed the continuation
generated_text = agent.get_response(prompt=dialog_prompt, max_len=16)
print(generated_text)

cache_dir= ./modelstates/hugface_models/gpt2
model_save_path= ./modelstates/finetuned_models/gpt2


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


model_device cuda:0
num_params 124439808
A: Hello.
B: Hi.
A: How was your day?
B: I was in the hospital.
A: What was your name?
B
start training loop
epoch 0 loss 2.6564619541168213
epoch 1 loss 2.4013028144836426


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


epoch 2 loss 1.9447654485702515
epoch 3 loss 1.4950447082519531
epoch 4 loss 0.9667388200759888
A: Hello.
B: Hi.
A: How was your day?
B: Well, I was just sitting in my room, and I was thinking about the


In [17]:
# One-time download of the EmpatheticDialogues dataset (uncomment to fetch):
# !wget https://dl.fbaipublicfiles.com/parlai/empatheticdialogues/empatheticdialogues.tar.gz
# !tar -xvf empatheticdialogues.tar.gz
# !rm empatheticdialogues.tar.gz
path_to_empatheticdialogues = 'empatheticdialogues/'
# os.listdir returns entries in arbitrary order; sort so the list of split
# files is the same on every run and filesystem
data_split_list = sorted(os.listdir(path_to_empatheticdialogues))
print(data_split_list)

['test.csv', 'train.csv', 'valid.csv']
