In [0]:
import csv
import torch
import numpy as np
from torchtext.data.utils import get_tokenizer
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer
import math
from tqdm import tqdm
import random
import time

In [0]:
# if this cell prints "Running on cpu", you must switch runtime environments
# go to Runtime > Change runtime type > Hardware accelerator > GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on {}".format(device))

Running on cuda


# Data processing

In [0]:
#This gets the EmpatheticDialogues corpus
!wget https://dl.fbaipublicfiles.com/parlai/empatheticdialogues/empatheticdialogues.tar.gz
!tar -xvf empatheticdialogues.tar.gz
#This gets the GLoVe embeddings, which we will use to bootstrap our model
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2020-05-03 02:48:29--  https://dl.fbaipublicfiles.com/parlai/empatheticdialogues/empatheticdialogues.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 2606:4700:10::6816:4a8e, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28022709 (27M) [application/gzip]
Saving to: ‘empatheticdialogues.tar.gz’


2020-05-03 02:48:34 (7.63 MB/s) - ‘empatheticdialogues.tar.gz’ saved [28022709/28022709]

empatheticdialogues/
empatheticdialogues/test.csv
empatheticdialogues/train.csv
empatheticdialogues/valid.csv
--2020-05-03 02:48:42--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-05-03 02:4

In [0]:
PAD_INDEX = 0             # reserved for padding words
UNKNOWN_INDEX = 1         # reserved for unknown words
START_DECODE = 2          # special symbol to denote decoding should start
END_DECODE = 3            # special symbol to indicate decoding is ending. This is how the model indicates the sequence is done.

def read_embeddings(filename, vocab_size=10000):
  """
  Utility function, loads in the `vocab_size` most common embeddings from `filename`
  
  Arguments:
  - filename:     path to file
                  automatically infers correct embedding dimension from filename
  - vocab_size:   maximum number of embeddings to load

  Returns 
  - embeddings:   torch.FloatTensor matrix of size (vocab_size x word_embedding_dim)
  - vocab:        dictionary mapping word (str) to index (int) in embedding matrix
  """

  # get the embedding size from the first embedding
  with open(filename, encoding="utf-8") as file:
    word_embedding_dim = len(file.readline().split(" ")) - 1

  vocab = {}
  vocab["PAD_INDEX"] = 0
  vocab["UNKNOWN_INDEX"] = 1
  vocab["START_DECODE"] = 2
  vocab["END_DECODE"] = 3

  embeddings = np.zeros((vocab_size, word_embedding_dim))

  with open(filename, encoding="utf-8") as file:
    for idx, line in enumerate(file):
      if idx + 4 >= vocab_size:
        break

      cols = line.rstrip().split(" ")
      val = np.array(cols[1:])
      word = cols[0]
      embeddings[idx + 4] = val
      vocab[word] = idx + 4
  
  # a FloatTensor is a multidimensional matrix
  # that contains 32-bit floats in every entry
  # https://pytorch.org/docs/stable/tensors.html
  return torch.FloatTensor(embeddings), vocab

In [0]:
# Let's load in a spacy tokenizer to process our conversation data
tokenizer = get_tokenizer("spacy")

class Dataset():
  '''
  This is a Dataset object, similar to the one used in HW4.

  It serves two purposes- reading data and creating batches.

  read_data():
    Inputs: 
      filename 
      emotions_list- list of emotions to include in this dataset
    Outputs: 
      emotions-     list of emotions associated with the dataset
      past_turns-   list of past turns associated with the dataset.  
                    This is input to our model.
      responses-    list of responses associated with the dataset.  This is what 
                    we will train our model to generate.

  get_batches():
    Inputs: batch_size- size of batches we want to create 
            vocab-      our vocabulary, used to replace unknown words
            emotset-    set of emotions to id, used to create emotion IDs
    Outputs:  
      batched_past_turn_idx:      indices of the words in the past turn
      batched_past_lengths:       lengths of the past turns (since we are padding these)
      batched_response_idx:       indices of words in the response
      batched_past_resp_lengths:  lengths of the response (since we pad those)
      batched_emotions:           emotions associated with the conversations

  '''
  def __init__(self, filename, emotions_list=None):
    if emotions_list is not None:
      self.emotions_list = ['context'] + emotions_list
    else:
      self.emotions_list = None
    self.emotions, self.past_turns, self.responses = self.read_data(filename)

  def read_data(self, filename):
    past_turns = []
    responses = []
    emotions = []

    raw_data = {}

    with open(filename, encoding='utf8') as f:
      csvreader = csv.reader(f, delimiter=',')
      for row in csvreader:
        convo_num = row[0]
        emotion = row[2]  
        utterance = row[5]
        #if we receive an emotions_list, we need to make sure the emotion is relevant to the Dataset we create.
        if self.emotions_list is None or emotion in self.emotions_list:
          if convo_num not in raw_data:
            raw_data[convo_num] = {}
            raw_data[convo_num]["emotion"] = emotion
            raw_data[convo_num]["convo"] = []
          raw_data[convo_num]["convo"].append(utterance.replace("_comma_", ","))

      for key in raw_data:
        for i,turn in enumerate(raw_data[key]["convo"]):
          #we want to grab every other response
          if i % 2 == 0 and i != 0:
            emotions.append(raw_data[key]["emotion"])
            past_turns.append(tokenizer(raw_data[key]["convo"][i-1].lower()))
            responses.append(tokenizer(raw_data[key]["convo"][i].lower()))
    
    return emotions, past_turns, responses

  def get_batches(self, batch_size, vocab, emotset):
    # randomly shuffle the data
    np.random.seed(159) # don't change this, for reproducibility
    shuffle = np.random.permutation(range(len(self.past_turns)))
    
    #grabs the relevant data from the random permutation
    past_turns = [self.past_turns[i] for i in shuffle]
    emotions = [self.emotions[i] for i in shuffle]
    responses = [self.responses[i] for i in shuffle]

    #stores the id's of past_turn words
    batched_past_turn_idx = []
    #stores the id's of response words
    batched_response_idx = []
    #stores the lengths of past_turns for masking
    batched_past_lengths = []
    #stores the lengths of responses for masking
    batched_past_resp_lengths = []
    #stores the emotions associated with a batch
    batched_emotions = []

    #creates batches
    N = len(past_turns)
    if N % batch_size == 0:
      num_batches = N // batch_size
    else:
      num_batches = N // batch_size + 1

    for b in range(num_batches):
      start = b * batch_size
      stop = min((b+1) * batch_size, len(past_turns))
      #calculates the max lengths of response and past turn sequences for this batch
      max_resp_seq_len = max([len(s) for s in responses[start:stop]])
      max_past_seq_len = max([len(s) for s in past_turns[start:stop]])

      #creates the vectors for the past_turn and responses
      past_turn_idx = np.zeros((stop-start, max_past_seq_len))
      response_idx = np.zeros((stop-start, max_resp_seq_len + 2))
      emotion_idx = np.empty((stop-start, 1))
      past_lengths = np.zeros((stop-start))
      resp_lengths = np.zeros((stop-start))
      for i in range(start, stop):
        #gathers the corresponding data
        past_turn = past_turns[i]
        response = responses[i]
        emotion = emotions[i]
        #gets ID for corresponding emotion
        emotion_idx[i - start] = emotset[emotion]

        #We start the response with START_DECODE to indicate to the model that decoding should start
        response_idx[i - start][0] = START_DECODE

        #this captures the lengths 
        past_lengths[i - start] = len(past_turn)
        resp_lengths[i - start] = len(response)

        #this gets the vocabulary IDs for each word in the past_turn and response
        #UNKNOWN_INDEX is used if the word is out of vocabulary
        for j in range(len(past_turn)):
          if past_turn[j] in vocab:
            past_turn_idx[i - start][j] = vocab[past_turn[j]]
          else:
            past_turn_idx[i - start][j] = UNKNOWN_INDEX      
        for j in range(len(response)):
          if response[j] in vocab:
            response_idx[i - start][j + 1] = vocab[response[j]]
          else:
            response_idx[i - start][j + 1] = UNKNOWN_INDEX
            
        #we want to end the response with END_DECODE so the model learns to predict the end of an utterance
        response_idx[i - start][len(response)] = END_DECODE
      batched_past_turn_idx.append(past_turn_idx)
      batched_response_idx.append(response_idx)
      batched_past_lengths.append(past_lengths)
      batched_past_resp_lengths.append(resp_lengths)
      batched_emotions.append(emotion_idx)
    return batched_past_turn_idx, batched_past_lengths, batched_response_idx, batched_past_resp_lengths, batched_emotions

In [0]:
'''
Helper function to extract a set of emotions from a dataset and associate them 
with an ID.

Arguments-
  emotion_file:   Data file we want to extract emotions from

Returns:
  emotset:        Dictionary of emotions
'''
def read_emotions(emotion_file):
  emotset = {}
  with open(emotion_file, encoding='utf8') as f:
    csvreader = csv.reader(f, delimiter=',')
    counter = 0
    for row in csvreader:
      emotion = row[2]
      if emotion not in emotset:
        emotset[emotion] = counter
        counter += 1
  
  return emotset

In [0]:
# this loads the 10,000 most common word 300-dimensional embeddings
vocab_size = 10000
embeddings, vocab = read_embeddings('glove.6B.300d.txt', vocab_size)

# read the files
emotset = read_emotions('empatheticdialogues/train.csv')
train_dataset = Dataset('empatheticdialogues/train.csv')
dev_dataset = Dataset('empatheticdialogues/valid.csv')
test_dataset = Dataset('empatheticdialogues/test.csv')

BATCH_SIZE = 32

train_batched_past_turn_idx, train_batched_past_lengths, train_batched_response_idx, train_batched_past_resp_lengths, train_batched_emotions = train_dataset.get_batches(BATCH_SIZE, vocab, emotset)
dev_batched_past_turn_idx, dev_batched_past_lengths, dev_batched_response_idx, dev_batched_past_resp_lengths, dev_batched_emotions= dev_dataset.get_batches(BATCH_SIZE, vocab, emotset)
test_batched_past_turn_idx, test_batched_past_lengths, test_batched_response_idx, test_batched_past_resp_lengths, test_batched_emotions= test_dataset.get_batches(BATCH_SIZE, vocab, emotset)

# Transformer Model

In [0]:
'''
This class is used to create transformer-style positional encodings.  
Reference: https://github.com/pytorch/pytorch/issues/24826
Note: these are different than the categorical positional encodings discussed in 
class for Information Extraction.
'''
class TransformerPositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=160):
        super(TransformerPositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        #import pdb; pdb.set_trace()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

'''
This is the TransformerGenerator class, where the generation model is set up 
and the model structure is defined.

Please fill in your solution where you see "..." 
'''
class TransformerGenerator(nn.Module):
  def __init__(self, embeddings, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
    super().__init__()
    self.ninp = ninp # 300
    self.dropout = dropout
    self.ntoken = ntoken # 10,000
    self.embed = nn.Embedding.from_pretrained(embeddings, freeze=False)
    self.model = nn.Transformer(d_model=ninp, nhead=nhead,num_encoder_layers=nlayers,
                                num_decoder_layers=nlayers, 
                                dropout=dropout)
    #self.out = math.sqrt #for all elems of a vector? dot product?
    self.out = nn.Linear(ninp,ntoken)
    self.pos_encoder = TransformerPositionalEncoding(d_model=ninp,dropout=dropout)
    self.pos_decoder = TransformerPositionalEncoding(d_model=ninp,dropout=dropout)

  def forward(self, past_turn, past_turn_lengths, response, response_lengths):
    '''
    print('past turn',type(past_turn))
    print('past turn',past_turn.shape)
    print('past turn transposed',past_turn.transpose(1,0))
    print('past turn transposed shape',past_turn.transpose(1,0).shape)
    print('response',type(response))
    print('response',response.shape)
    print('response transposed',response.transpose(1,0))
    print('response transposed shape',response.transpose(1,0).shape)
    print('past lengths',past_turn_lengths)
    print('resp lens',response_lengths)
    print('past turn len',type(past_turn_lengths))
    print('past turn len', past_turn_lengths.shape)
    print('response len',type(response_lengths))
    print('response len',response_lengths.shape)
    '''
    
    past_turn = torch.LongTensor(past_turn).to(device) #tensor([26,32]), batch of sentences each of length 26
    response = torch.LongTensor(response).to(device) #tensor([49,32]), batch of responses, each of length 49
    past_turn_lengths = torch.LongTensor(past_turn_lengths)  #tensor([32]), lens of each past_turn sample in batch size 32
    response_lengths = torch.LongTensor(response_lengths) #tensor([32]), lens of each response sample in batch size 32
    # create embedding for past turn and response before giving them a positional embedding
    #trans_past_turn = past_turn.transpose(1,0)
    #trans_response = response.transpose(1,0)
    mask_src = [] # BATCH SIZE X MAX SRC LENGTH
    mask_tgt = [] # BATCH SIZE X MAX TGT LENGTH
    # each index will correspond to that sentence's length in past_turn/response lengths
    # TRUE = padded indices
    # FALSE = non-padded indices aka != 0
    #for sentence_idx in range(len(trans_past_turn)): # BATCH SIZE
    batch_size = len(past_turn_lengths)
    src_max_len = past_turn.shape[0]
    tgt_max_len = response.shape[0]
    for sentence_idx in range(batch_size): # BATCH SIZE
      sentence_values = []
      for word_idx in range(src_max_len):
        if word_idx >= past_turn_lengths[sentence_idx]:
        #if past_turn[sentence_idx][word_idx] >= past_turn_lengths[sentence_idx]:
          sentence_values.append(True)
        else:
          sentence_values.append(False)
      mask_src.append(sentence_values) # BATCH SIZE times
    
    for sentence_idx in range(batch_size):
      sentence_vals = []
      for word_idx in range(tgt_max_len):
        if word_idx >= response_lengths[sentence_idx]:
        #if response[sentence_idx][word_idx] >= response_lengths[sentence_idx]:
          sentence_vals.append(True)
        else:
          sentence_vals.append(False)
      mask_tgt.append(sentence_vals) # BATCH SIZE times

    # CONVERT MASKS TO BOOLEAN TENSORS
    src_masks = torch.BoolTensor(mask_src) # tensor([32,26])
    tgt_masks = torch.BoolTensor(mask_tgt) # tensor([32,49])

    # CONVERT PAST_TURN AND RESPONSE TO EMBEDDINGS
    past_turn_embeddings = self.embed(past_turn) # tensor([26,32,300])
    response_embeddings = self.embed(response) # tensor([49,32,300])

    # CREATE POSITIONAL EMBEDDINGS
    src = self.pos_encoder.forward(past_turn_embeddings) #input to transformer encoder, tensor([26,32,300])
    tgt = self.pos_decoder.forward(response_embeddings) #input to transformer decoder, tensor([49,32,300])

    #Ensures decoder doesn't peek at the future tokens
    cheater_mask = self.model.generate_square_subsequent_mask(sz = len(tgt)).to(device)

    output = self.model(src, tgt, tgt_mask = cheater_mask, src_key_padding_mask=src_masks.to(device), tgt_key_padding_mask = tgt_masks.to(device))
    output = self.out(output)
    #print("output shape", output.shape)
    return output

'''
This is a function which is used to evaluate a model on a development dataset.
This method does not update the model; rather, it is used to evaluate a model's responses
on a dataset.

Arguments:
  model:        Model to evaluate
  dev_dataset:  Dataset we want to evaluate with
  batch_size:   batch size for dev dataset
  vocab:        Vocabulary for the dataset
  emotset:      Set of emotions for the dataset

Returns:
  avg_loss:     The average loss for the model on the dev_dataset.
'''
def evaluate_on_data(model, dev_dataset, batch_size, vocab,  emotset):
    loss_function = nn.CrossEntropyLoss(ignore_index=vocab["PAD_INDEX"])
    dev_batched_past_turn_idx, dev_batched_past_lengths, dev_batched_response_idx, dev_batched_resp_lengths, dev_batched_emotions= dev_dataset.get_batches(BATCH_SIZE, vocab, emotset)
    model.eval()
    with torch.no_grad():
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
      total_loss = 0
      for b in range(len(dev_batched_past_turn_idx)):
          # have to transpose since model expects them in a certain format
          src = dev_batched_past_turn_idx[b].transpose([1, 0])
          tgt = dev_batched_response_idx[b].transpose([1, 0])

          #calls the model on the current batch's input
          logits = model.forward(src, dev_batched_past_lengths[b], tgt[:-1,:], dev_batched_resp_lengths[b])
          # move labels to GPU memory
          labels = torch.LongTensor(dev_batched_response_idx[b].transpose([1,0])).to(device)
          # compute the loss with respect to true words
          loss = loss_function(logits.view(-1, 10000), labels[1:,:].reshape(-1))
          total_loss += loss
      avg_loss = total_loss / float(len(dev_batched_past_turn_idx))
      avg_loss = float(avg_loss.detach().cpu().numpy())
      return avg_loss

'''
This is the function used to train a model.

Arguments:
  model:          model we want to train
  train_dataset:  dataset we want to train the model with
  dev_dataset:    dataset we want to evaluate model with during training
  batch_size:     batch size for training
  vocab:          vocabulary for the dataset
  emotset:        emotion set for the dataset
  lr:             learning rate we want to use
  num_epochs:     epochs we want to train our model for
  eval_every:     how often we want to evaluate on the dev dataset
'''
def run_training(model, train_dataset, dev_dataset, batch_size, vocab,  emotset,
                         lr=1e-4, num_epochs=100, eval_every=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if str(device) == 'cpu':
      print("Training only supported in GPU environment")
      return


    # clear unreferenced data/models from GPU memory 
    torch.cuda.empty_cache()
    # move model to GPU memory
    model.to(device)

    # set the optimizer (Adam) and loss function (CrossEnt)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_function = nn.CrossEntropyLoss(ignore_index=vocab["PAD_INDEX"])

    # batch training and dev data
    train_batched_past_turn_idx, train_batched_past_lengths, train_batched_response_idx, train_batched_resp_lengths, train_batched_emotions = train_dataset.get_batches(BATCH_SIZE, vocab, emotset)
    #dev_batched_past_turn_idx, dev_batched_past_lengths, dev_batched_response_idx, dev_batched_resp_lengths, dev_batched_emotions= dev_dataset.get_batches(BATCH_SIZE, vocab, emotset)

    t0 = time.time()
    print("**** TRAINING *****")
    for i in range(num_epochs):
      if i % eval_every == 0:
      #  # Run on Dev data
         dev_loss = evaluate_on_data(model, dev_dataset, batch_size, vocab,  emotset)
         print("-------------------------------")
         print("Dev Loss: {}".format(dev_loss))
         print("-------------------------------")

      # sets the model in train mode
      model.train()
      total_loss = 0
      for b in range(len(train_batched_past_turn_idx)):

        # have to transpose since model expects them in a certain format
        src = train_batched_past_turn_idx[b].transpose([1, 0])
        tgt = train_batched_response_idx[b].transpose([1, 0])

        #calls the model on the current batch's input
        logits = model.forward(src, train_batched_past_lengths[b], tgt[:-1,:], train_batched_resp_lengths[b])
        # move labels to GPU memory
        labels = torch.LongTensor(train_batched_response_idx[b].transpose([1,0])).to(device)
        # compute the loss with respect to true words
        loss = loss_function(logits.view(-1, 10000), labels[1:,:].reshape(-1))
        total_loss += loss

        # propagate gradients backward
        loss.backward()
        optimizer.step()
        # set model gradients to zero before performing next forward pass
        model.zero_grad()

      seconds_elapsed = time.time()-t0
      mins = int(np.floor(seconds_elapsed/60))
      secs = int(seconds_elapsed - (60*mins))
      print("Epoch {} | Train Loss: {} | Time: {} mins, {} secs".format(i, total_loss / float(len(train_batched_past_turn_idx)),mins,secs))


In [0]:
def set_seed(seed):
  """
  Sets random seeds and sets model in deterministic
  training mode. Ensures reproducible results
  """
  torch.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  np.random.seed(seed)

In [0]:
# sets the random seed – DO NOT change this
# this ensures deterministic results that are comparable with the staff values
set_seed(159)

'''Do NOT change these parameters'''
#Number of vocabulary words we have
VOCAB_SIZE = 10000
#Size of our word embeddings.  We embed each word before passing into the transformer layer,
#so the transformer needs to know how large these embeddings will be
NINP = 300
#The number of heads we want our transformer model to have
NHEAD = 6
#The size of hidden dimensions we want our transformer to have
NHIDDEN = 200
#The number of layers we want our transformer to have
NLAYERS = 1
#Dropout rate
DROPOUT = 0.4

In [0]:
np_ar = np.array([1,2,3])
np_ar = np.expand_dims(np_ar,axis=1)
print(np_ar.shape)

(3, 1)


In [0]:
'''
NOTE: do NOT run this cell if you are loading a pre-trained model.
'''

#This is the call which initializes the model
model = TransformerGenerator(embeddings, VOCAB_SIZE, NINP, NHEAD, NHIDDEN, NLAYERS, DROPOUT)

# This call trains the model.  If you have implemented Q1 correctly, the loss should decrease from ~5.6 to ~3.8
# Sanity check: if this function fails, your Q1 code is probably incorrect. 
run_training(model, train_dataset, dev_dataset, BATCH_SIZE, vocab, emotset, 
                   lr=1e-4, num_epochs=25, eval_every=5)

**** TRAINING *****
-------------------------------
Dev Loss: 9.349087715148926
-------------------------------
Epoch 0 | Train Loss: 5.785675525665283 | Time: 0 mins, 38 secs
Epoch 1 | Train Loss: 5.154396057128906 | Time: 1 mins, 13 secs
Epoch 2 | Train Loss: 4.895057201385498 | Time: 1 mins, 47 secs
Epoch 3 | Train Loss: 4.742707252502441 | Time: 2 mins, 22 secs
Epoch 4 | Train Loss: 4.638689994812012 | Time: 2 mins, 56 secs
-------------------------------
Dev Loss: 4.63595724105835
-------------------------------
Epoch 5 | Train Loss: 4.562170028686523 | Time: 3 mins, 34 secs
Epoch 6 | Train Loss: 4.5001444816589355 | Time: 4 mins, 9 secs
Epoch 7 | Train Loss: 4.451632976531982 | Time: 4 mins, 44 secs
Epoch 8 | Train Loss: 4.406965255737305 | Time: 5 mins, 19 secs
Epoch 9 | Train Loss: 4.368797302246094 | Time: 5 mins, 53 secs
-------------------------------
Dev Loss: 4.444859027862549
-------------------------------
Epoch 10 | Train Loss: 4.335965156555176 | Time: 6 mins, 32 secs


**If you want to save your trained model so you don't have to train it again for #2, please run the following cell.  You will need to download the model file and import this to Colab the next time you'd like to load it.**

In [0]:
#Now, let's save this model so you won't have to run it again for #2.
torch.save(model.state_dict(), "./model")

In [0]:
#NOTE: if you have a saved version of the model, un-comment and run this code to load your model back in.
model = TransformerGenerator(embeddings, VOCAB_SIZE, NINP, NHEAD, NHIDDEN, NLAYERS, DROPOUT)
model.load_state_dict(torch.load("./model"))
model.eval()
model.to(device)

TransformerGenerator(
  (embed): Embedding(10000, 300)
  (model): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=300, out_features=300, bias=True)
          )
          (linear1): Linear(in_features=300, out_features=2048, bias=True)
          (dropout): Dropout(p=0.4, inplace=False)
          (linear2): Linear(in_features=2048, out_features=300, bias=True)
          (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.4, inplace=False)
          (dropout2): Dropout(p=0.4, inplace=False)
        )
      )
      (norm): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0): TransformerDecoderLayer(
          (self_attn): MultiheadAttention(
 

# Decoding

In [0]:
def id2string(vocab, response):

    """
    id2string function, takes a vocabulary set and response and translates the response to a list of strings using the vocab.
    
    Arguments:
    - vocab:        vocabulary, keys are strings of each word and values are the word's IDs in the dictionary
    - response:     list of word IDs we want to translate

    Returns 
    - str_response:   list of strings containing the words represented by the response's IDs
    """

    str_response = []
    #print(vocab)
    #print(response)
    keys = list(vocab.keys())
    values = list(vocab.values())
    for word_idx in response:
      val_idx = values.index(word_idx)
      word = keys[val_idx]
      str_response.append(word)
    return str_response

    
    
def decode(model, prev_turn, prev_lengths, vocab, max_len, random_top_k=False):#, batched_resps, batched_resp_lens, vocab, max_len):
    
    """
    Decode function, takes a trained model and past_turn and returns the model's generated response
    Arguments:
    - model:        trained model that we want to evaluate
    - prev_turn:    The previous turn we want to generate a model response for
    - prev_lengths: The length of the prev_turn
    - max_len:      The maximum decoded sequence length
    - random_top_k: Flag specifying whether to use topK decoding

    Returns 
    - id2string(vocab, prediction):   list of strings indicating the words produced 
                                      by the model, calculated using id2string helper function. 
    """
    '''
    past turn <class 'numpy.ndarray'>
    past turn (26, 32)
    response <class 'numpy.ndarray'>
    response (49, 32)
    past turn len <class 'numpy.ndarray'>
    past turn len (32,)
    response len <class 'numpy.ndarray'>
    response len (32,) [12,3,2] length of each response in the batch of 32 sentences

    output shape torch.Size([37, 32, 10000]) # (TARGET SEQ LEN, BATCH SIZE, VOCAB SIZE)
    '''
    
    prediction = [START_DECODE]
    prev = list(np.transpose(prev_turn))
    seq = list(np.transpose([np.array(prediction)]))
    seq_len = np.array([1])
    tok = START_DECODE
    if random_top_k == True:
      for i in range(max_len):
        output = model.forward(prev,prev_lengths,seq,seq_len).squeeze(1)
        sorted_output,indices = torch.sort(output[i],descending=True)
        five_max_probs = list(sorted_output)[:5]
        max_prob_idx = []
        for max_prob in five_max_probs:
          max_prob_idx.append(list(output[i]).index(max_prob))
        randInt = random.randint(0,4)
        token = max_prob_idx[randInt]
        prediction.append(token)
        seq = list(np.transpose([np.array(prediction)]))
        seq_len[0] += 1
        if token == END_DECODE:
          break
    
    else:
      for i in range(max_len):
        output = model.forward(prev,prev_lengths,seq,seq_len).squeeze(1)
        max_prob = torch.max(output[i]) # max prob
        max_prob_idx = list(output[i]).index(max_prob) # index of chosen word
        token = max_prob_idx
        #token = id2string(vocab,[max_prob_idx]) # chosen word
        prediction.append(token)
        seq = list(np.transpose([np.array(prediction)]))
        seq_len[0] += 1
        if token == END_DECODE:
          break
    
    return id2string(vocab,prediction)


### Let's test the base decode() function by evaluating it with some model responses to dev set data.

In [0]:
'''
This function runs a trained model to respond to random examples from a dataset

Arguments:
  dataset:      Dataset we want to evaluate the model with
  model:        The model we want to evaluate
  random_top_k: Whether we want to use topk decoding
'''
def generate_5_responses(dataset, model, random_top_k=False):
  dev_batched_past_turn_idx, dev_batched_past_lengths, dev_batched_response_idx, dev_batched_resp_lengths, dev_batched_emotions= dataset.get_batches(BATCH_SIZE, vocab, emotset)
  for _ in range(5):
    rand_batch = random.randint(0, len(dev_batched_past_turn_idx)-1)
    rand_item = random.randint(0, len(dev_batched_past_turn_idx[0])-1)
    print("Past response: ")
    print(id2string(vocab, [x for x in dev_batched_past_turn_idx[rand_batch][rand_item] if x != PAD_INDEX]))
    print("Model Response: ")
    model_resp = decode(model=model, prev_turn=[dev_batched_past_turn_idx[rand_batch][rand_item]], prev_lengths=[dev_batched_past_lengths[rand_batch][rand_item]], vocab=vocab, max_len=20, random_top_k=random_top_k)
    print(model_resp)
    print("Gold Response: ")
    print(id2string(vocab, [x for x in dev_batched_response_idx[rand_batch][rand_item][1:] if x != PAD_INDEX]))
    print()
    print("---------------------------------")
    print()

In [0]:
generate_5_responses(dev_dataset, model)

Past response: 
['oh', 'i', "'m", 'so', 'sorry', 'but', 'i', 'bet', 'that', 'was', 'very', 'special', 'of', 'you', 'to', 'find', '.']
Model Response: 
['START_DECODE', 'i', 'was', 'so', 'UNKNOWN_INDEX', '.', 'i', 'was', 'so', 'UNKNOWN_INDEX', 'END_DECODE']
Gold Response: 
['yeah', 'UNKNOWN_INDEX', 'we', 'do', "n't", 'have', 'many', 'videos', 'of', 'her', 'but', 'the', 'few', 'we', 'have', 'we', 'keep', 'them', 'safe', 'and', 'stored', 'END_DECODE']

---------------------------------

Past response: 
['that', "'s", 'terrible', '!', 'how', 'does', 'that', 'make', 'you', 'feel', '?']
Model Response: 
['START_DECODE', 'i', "'m", 'not', 'sure', '.', 'i', "'m", 'not', 'sure', 'i', "'m", 'not', 'sure', 'i', "'m", 'not', 'sure', 'i', "'m", 'not']
Gold Response: 
['it', 'makes', 'me', 'feel', 'old', '.', 'i', 'mean', 'i', 'am', 'not', 'that', 'old', ',', 'but', 'to', 'see', 'people', 'my', 'age', 'passing', 'away', 'really', 'makes', 'me', 'think', 'about', 'my', 'own', 'UNKNOWN_INDEX', 'END_DE

### Now, let's compare the base decode() method to the Random-Top5 decoding method. 

In [0]:
generate_5_responses(dev_dataset, model, random_top_k=True)

Past response: 
['maybe', 'she', 'was', 'around', ',', 'but', 'it', 'was', "n't", 'time', 'for', 'the', 'ball', 'yet', '.']
Model Response: 
['START_DECODE', 'i', 'was', 'a', 'UNKNOWN_INDEX', '.', 'i', 'was', 'so', 'happy', 'with', 'it', ',', 'so', 'she', 'is', 'a', 'little', 'UNKNOWN_INDEX', 'and', 'it']
Gold Response: 
['UNKNOWN_INDEX', ',', 'maybe', 'i', 'did', "n't", 'think', 'of', 'that', 'END_DECODE']

---------------------------------

Past response: 
['are', 'you', 'having', 'any', 'specific', 'problem', 'areas', '?']
Model Response: 
['START_DECODE', 'i', 'do', '.', 'i', 'am', 'not', 'really', 'sure', 'that', "'s", 'UNKNOWN_INDEX', ',', 'but', 'it', 'makes', 'it', 'END_DECODE']
Gold Response: 
['not', 'yet', ',', 'but', 'i', 'have', "n't", 'started', 'END_DECODE']

---------------------------------

Past response: 
['UNKNOWN_INDEX', ',', 'that', 'must', 'have', 'brought', 'up', 'a', 'lot', 'of', 'UNKNOWN_INDEX', 'for', 'you', '.']
Model Response: 
['START_DECODE', 'yes', ',', 

# Effect of Emotions

In [0]:
#Here is where we divide up the emotions in the emotset into positive and negative emotions, of equal sizes.
positive_emotions = ['anticipating', 'caring', 'confident', 'content', 'excited', 'faithful', 'grateful', 'hopeful', 'impressed', 'joyful', 'nostalgic', 'prepared', 'proud', 'sentimental','surprised','trusting']
negative_emotions = ['afraid', 'angry', 'annoyed', 'anxious', 'apprehensive', 'ashamed','devastated','disappointed','disgusted', 'embarrassed','furious','guilty','jealous','lonely','sad','terrified']
len(positive_emotions), len(negative_emotions)

(16, 16)

In [0]:
# creates the positive and negative datasets, specifying the relevant emotions_list
train_dataset_positive = Dataset('empatheticdialogues/train.csv', emotions_list=positive_emotions)
dev_dataset_positive = Dataset('empatheticdialogues/valid.csv', emotions_list=positive_emotions)
test_dataset_positive = Dataset('empatheticdialogues/test.csv', emotions_list=positive_emotions)

train_dataset_negative = Dataset('empatheticdialogues/train.csv', emotions_list=negative_emotions)
dev_dataset_negative = Dataset('empatheticdialogues/valid.csv', emotions_list=negative_emotions)
test_dataset_negative = Dataset('empatheticdialogues/test.csv', emotions_list=negative_emotions)

In [0]:
'''
NOTE: these models train quicker than the Q1 model, as they are trained on 1/2 of the data.
'''

#we need to reset these embeddings or they will be shared among all 3 models.
positive_embeddings, _ = read_embeddings('glove.6B.300d.txt', vocab_size)
negative_embeddings, _ = read_embeddings('glove.6B.300d.txt', vocab_size)

#This is the call which initializes the model
positive_model = TransformerGenerator(positive_embeddings, VOCAB_SIZE, NINP, NHEAD, NHIDDEN, NLAYERS, DROPOUT)
negative_model = TransformerGenerator(negative_embeddings, VOCAB_SIZE, NINP, NHEAD, NHIDDEN, NLAYERS, DROPOUT)

print("Positive model")
run_training(positive_model, train_dataset_positive, dev_dataset_positive, BATCH_SIZE, vocab, emotset, 
                   lr=1e-4, num_epochs=25, eval_every=5)
print("Negative model")
run_training(negative_model, train_dataset_negative, dev_dataset_negative, BATCH_SIZE, vocab, emotset, 
                   lr=1e-4, num_epochs=25, eval_every=5)

Positive model
**** TRAINING *****
-------------------------------
Dev Loss: 9.48857307434082
-------------------------------
Epoch 0 | Train Loss: 6.146296977996826 | Time: 0 mins, 17 secs
Epoch 1 | Train Loss: 5.481767654418945 | Time: 0 mins, 34 secs
Epoch 2 | Train Loss: 5.272855758666992 | Time: 0 mins, 51 secs
Epoch 3 | Train Loss: 5.090243339538574 | Time: 1 mins, 7 secs
Epoch 4 | Train Loss: 4.965153217315674 | Time: 1 mins, 23 secs
-------------------------------
Dev Loss: 4.959866523742676
-------------------------------
Epoch 5 | Train Loss: 4.865509986877441 | Time: 1 mins, 40 secs
Epoch 6 | Train Loss: 4.7839741706848145 | Time: 1 mins, 56 secs
Epoch 7 | Train Loss: 4.717936038970947 | Time: 2 mins, 12 secs
Epoch 8 | Train Loss: 4.65809440612793 | Time: 2 mins, 28 secs
Epoch 9 | Train Loss: 4.608442783355713 | Time: 2 mins, 44 secs
-------------------------------
Dev Loss: 4.678633213043213
-------------------------------
Epoch 10 | Train Loss: 4.564590930938721 | Time: 3 

### Let's decode the *positive* model on some *positive* data to see the types of responses it produces.

In [0]:
generate_5_responses(dataset=dev_dataset_positive, model=positive_model, random_top_k=True)

Past response: 
['moving', ',', 'is', 'always', 'such', 'a', 'pain', '.', 'did', 'it', 'go', 'UNKNOWN_INDEX', 'at', 'least', '?']
Model Response: 
['START_DECODE', 'yes', 'it', '.', 'UNKNOWN_INDEX', 'i', 'was', 'so', 'nice', 'for', 'the', 'same', ',', 'and', 'it', 'is', "n't", 'expecting', 'END_DECODE']
Gold Response: 
['yeah', '.', 'UNKNOWN_INDEX', 'he', 'kept', 'up', 'with', 'me', 'pretty', 'well', 'END_DECODE']

---------------------------------

Past response: 
['that', 'is', 'pretty', 'UNKNOWN_INDEX', 'actually', '.', 'i', 'wish', 'i', 'had', 'kids', 'to', 'bring', 'to', 'stuff', 'like', 'that', '.']
Model Response: 
['START_DECODE', 'i', 'was', 'so', 'UNKNOWN_INDEX', '.', 'we', 'are', 'a', 'good', 'UNKNOWN_INDEX', ',', 'i', 'hope', 'so', 'that', 'i', 'am', 'not', 'UNKNOWN_INDEX', 'to']
Gold Response: 
['you', 'should', 'check', 'your', 'local', 'library', '.', 'the', 'ones', 'in', 'my', 'area', 'have', 'tons', 'of', 'things', ',', 'like', 'a', 'UNKNOWN_INDEX', 'club', ',', 'for',

### Let's decode the *negative* model on some *negative* data to see the types of responses it produces.

In [0]:
generate_5_responses(dataset=dev_dataset_negative, model=negative_model, random_top_k=True)

Past response: 
['i', 'hope', 'it', "'s", 'nice', 'for', 'you', '.', 'do', 'you', 'have', 'any', 'alternate', 'plans', '?']
Model Response: 
['START_DECODE', 'UNKNOWN_INDEX', '.', 'UNKNOWN_INDEX', 'UNKNOWN_INDEX', '!', 'i', "'m", 'not', 'going', 'on', 'END_DECODE']
Gold Response: 
['we', 'do', ',', 'but', 'i', 'think', 'my', 'friends', 'want', 'to', 'do', 'the', 'UNKNOWN_INDEX', 'park', 'as', 'well', 'as', 'something', 'else', '.', 'i', 'do', "n't", 'know', 'if', 'i', 'have', 'money', 'for', 'both', 'END_DECODE']

---------------------------------

Past response: 
['UNKNOWN_INDEX', ',', 'that', "'s", 'no', 'good', '.', 'i', 'hope', 'they', 'get', 'it', 'worked', 'out', 'soon', '.']
Model Response: 
['START_DECODE', 'i', 'think', 'they', 'have', 'been', 'UNKNOWN_INDEX', 'END_DECODE']
Gold Response: 
['me', 'too', '.', 'END_DECODE']

---------------------------------

Past response: 
['oh', 'you', 'must', 'love', 'getting', 'cancer', 'results', '!']
Model Response: 
['START_DECODE', 'it'

### Let's decode *both* models on some *positive* data to see the types of responses it produces.

In [0]:
dev_batched_past_turn_idx, dev_batched_past_lengths, dev_batched_response_idx, dev_batched_resp_lengths, dev_batched_emotions= dev_dataset_positive.get_batches(BATCH_SIZE, vocab, emotset)
for _ in range(5):
  rand_batch = random.randint(0, len(dev_batched_past_turn_idx)-1)
  rand_item = random.randint(0, len(dev_batched_past_turn_idx[0])-1)
  print("Past response: ")
  print(id2string(vocab, [x for x in dev_batched_past_turn_idx[rand_batch][rand_item] if x != PAD_INDEX]))
  print("POSITIVE model Response: ")
  model_resp = decode(model=positive_model, prev_turn=[dev_batched_past_turn_idx[rand_batch][rand_item]], prev_lengths=[dev_batched_past_lengths[rand_batch][rand_item]], vocab=vocab, max_len=20, random_top_k=True)
  print(model_resp)
  print("NEGATIVE model Response: ")
  model_resp = decode(model=negative_model, prev_turn=[dev_batched_past_turn_idx[rand_batch][rand_item]], prev_lengths=[dev_batched_past_lengths[rand_batch][rand_item]], vocab=vocab, max_len=20, random_top_k=True)
  print(model_resp)
  print("Gold Response: ")
  print(id2string(vocab, [x for x in dev_batched_response_idx[rand_batch][rand_item][1:] if x != PAD_INDEX]))
  print()
  print("---------------------------------")
  print()

Past response: 
['yeah', ',', 'i', 'bet', ',', 'it', 'looked', 'like', 'a', 'fun', 'place', 'to', 'work', '.', 'UNKNOWN_INDEX', 'especially', 'for', 'a', 'college', 'job', '.']
POSITIVE model Response: 
['START_DECODE', 'yes', 'END_DECODE']
NEGATIVE model Response: 
['START_DECODE', 'yeah', 'i', 'think', 'it', "'s", 'a', 'little', 'time', 'i', 'have', 'to', 'be', 'a', 'UNKNOWN_INDEX', 'and', 'i', "'m", 'just', 'really', 'good']
Gold Response: 
['it', 'really', 'was', '.', 'UNKNOWN_INDEX', 'it', 'was', 'so', 'UNKNOWN_INDEX', 'laid', 'back', 'and', 'UNKNOWN_INDEX', '.', 'UNKNOWN_INDEX', 'i', 'also', 'made', 'a', 'ton', 'of', 'great', 'friends', 'there', 'END_DECODE']

---------------------------------

Past response: 
['that', "'s", 'so', 'true', '.', 'i', 'm', '30', 'and', 'just', 'now', 'having', 'the', 'time', 'to', 'work', 'on', 'my', 'UNKNOWN_INDEX', '.', 'but', 'i', 'have', 'a', 'lot', 'of', 'work', 'experience', '.']
POSITIVE model Response: 
['START_DECODE', 'yeah', '.', 'they', 

### Let's decode *both* models on some *negative* data to see the types of responses it produces.

In [0]:
dev_batched_past_turn_idx, dev_batched_past_lengths, dev_batched_response_idx, dev_batched_resp_lengths, dev_batched_emotions= dev_dataset_negative.get_batches(BATCH_SIZE, vocab, emotset)
for _ in range(5):
  rand_batch = random.randint(0, len(dev_batched_past_turn_idx)-1)
  rand_item = random.randint(0, len(dev_batched_past_turn_idx[0])-1)
  print("Past response: ")
  print(id2string(vocab, [x for x in dev_batched_past_turn_idx[rand_batch][rand_item] if x != PAD_INDEX]))
  print("POSITIVE model Response: ")
  model_resp = decode(model=positive_model, prev_turn=[dev_batched_past_turn_idx[rand_batch][rand_item]], prev_lengths=[dev_batched_past_lengths[rand_batch][rand_item]], vocab=vocab, max_len=20, random_top_k=True)
  print(model_resp)
  print("NEGATIVE model Response: ")
  model_resp = decode(model=negative_model, prev_turn=[dev_batched_past_turn_idx[rand_batch][rand_item]], prev_lengths=[dev_batched_past_lengths[rand_batch][rand_item]], vocab=vocab, max_len=20, random_top_k=True)
  print(model_resp)
  print("Gold Response: ")
  print(id2string(vocab, [x for x in dev_batched_response_idx[rand_batch][rand_item][1:] if x != PAD_INDEX]))
  print()
  print("---------------------------------")
  print()

Past response: 
['are', 'you', 'getting', 'laid', 'off', '?']
POSITIVE model Response: 
['START_DECODE', 'yes', 'END_DECODE']
NEGATIVE model Response: 
['START_DECODE', 'i', "'m", 'going', '.', 'it', 'was', 'a', 'bit', 'END_DECODE']
Gold Response: 
['yes', ',', 'unfortunately', '.', 'we', 'are', 'having', 'a', 'huge', 'company', 'UNKNOWN_INDEX', 'and', 'being', 'a', 'remote', 'employee', 'did', "n't", 'help', 'my', 'case', 'END_DECODE']

---------------------------------

Past response: 
['you', 'must', 'get', 'a', 'good', 'sleep', 'now', 'to', 'enjoy', 'the', 'new', 'york', 'later']
POSITIVE model Response: 
['START_DECODE', 'yes', 'i', 'was', 'so', 'i', 'have', 'been', 'the', 'UNKNOWN_INDEX', '.', 'i', 'have', 'never', 'have', 'UNKNOWN_INDEX', 'the', 'END_DECODE']
NEGATIVE model Response: 
['START_DECODE', 'i', 'am', ',', 'but', 'that', 'was', 'a', 'lot', '.', 'i', 'do', 'END_DECODE']
Gold Response: 
['yeah', 'i', 'm', 'going', 'to', 'try', ',', 'but', 'its', 'so', 'exciting', 'and',

### Now, let's see which model does better when evaluated on the other's development set

In [0]:
def compare_positive_and_negative_data(positive_model, negative_model, dev_dataset_positive, dev_dataset_negative):
  '''
  This method compares the positive model on the negative dataset and the negative model on the positive dataset.
  You should use evaluate_on_data to get the training loss for each model.

  Arguments:
  positive_model:       model trained on positive data
  negative_model:       model trained on negative data
  dev_dataset_positive: dev dataset for positive data
  dev_dataset_negative: dev dataset for negative data

  Returns:
  positive_model_negative_data:   result of evaluating the positive model on the negative dev dataset
  negative_model_positive_data:   result of evaluating the negative model on the positive dev dataset
  '''
  positive_model_negative_data = evaluate_on_data(positive_model,dev_dataset_negative,BATCH_SIZE,vocab,emotset)
  negative_model_positive_data = evaluate_on_data(negative_model,dev_dataset_positive,BATCH_SIZE,vocab,emotset)
  
  return positive_model_negative_data, negative_model_positive_data

In [0]:
positive_model_negative_data, negative_model_positive_data = compare_positive_and_negative_data(positive_model,  
                                                                                                negative_model, 
                                                                                                dev_dataset_positive, 
                                                                                                dev_dataset_negative)
print("Positive model loss on negative data:", positive_model_negative_data)
print("Negative model loss on positive data:", negative_model_positive_data)

Positive model loss on negative data: 4.528554916381836
Negative model loss on positive data: 4.652871608734131
