In [None]:
# Install Necessary libraries
!pip install transformers # HuggingFace

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 6.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 22.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 17.7MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [None]:
# Test Hugging Face
# from transformers import pipeline;
# print(pipeline('sentiment-analysis')('we love you'))

In [None]:
# Mount Google Drive at /content/drive (Colab-only module); the dataset path used
# further down in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# AI Libraries
from transformers import BertTokenizerFast, BertModel
import torch

# Data Libraries
import pandas as pd
import numpy as np

# Python Libraries
import math
import re
import pprint

# Global variables for project info
# Maps a short member identifier to the member's full name.
MEMBERS = {
    'jm4495': "Joan Martinez",
    'dr2948': "David Rosado",
    'jp3742': "Joshua Polanco Calderon"
}

# Global variables for the program
# Default directory and CSV file name; used as the default arguments of JokesBERTDataset.
DATA_DIR = 'data'
KAGGLE_DATA_FILENAME = 'train.csv'

In [None]:
class Norm(torch.nn.Module):
    """Normalise the last dimension of a tensor, then apply a learnable gain and bias.

    Args:
        d_model: size of the last (feature) dimension being normalised.
        eps: small constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, d_model, eps = 1e-6):
        super().__init__()
        self.size = d_model
        self.eps = eps
        # Learnable calibration: per-feature gain starts at 1, per-feature offset at 0.
        self.alpha = torch.nn.Parameter(torch.ones(d_model))
        self.bias = torch.nn.Parameter(torch.zeros(d_model))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` computed over the last dim."""
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

In [None]:
class HumorNet(torch.nn.Module):
    """Binary humor classifier: frozen BERT -> Norm -> multi-head self-attention -> mean -> linear -> sigmoid.

    Args:
        num_heads: number of heads of the extra ``torch.nn.MultiheadAttention`` layer.
        dim_model: embedding width of the encoder output (768 for BERT base).
        pretrained_model: encoder whose output exposes ``last_hidden_state``. When ``None``
            (the default) ``bert-base-uncased`` is loaded at construction time, so the
            download no longer happens when the class is merely defined and instances
            no longer share one default encoder object.
    """

    def __init__(self, num_heads, dim_model=768, pretrained_model=None):
        super(HumorNet, self).__init__()
        if pretrained_model is None:
            pretrained_model = BertModel.from_pretrained('bert-base-uncased')
        self.pretrained = pretrained_model

        # Freeze the BERT layer: only the layers added below are trained.
        # NOTE(review): freezing does not switch off BERT's dropout; calling .train() on
        # this module also puts the encoder in training mode.
        for param in self.pretrained.parameters():
            param.requires_grad = False

        # Extra layers: normalization layer and multi-head attention layer.
        self.norm = Norm(dim_model)
        self.attention_layer = torch.nn.MultiheadAttention(
                dim_model, # Hidden size of BERT base is 768
                num_heads)
        # Sized from dim_model (was hard-coded to 768) so a non-768 encoder also works.
        self.linear = torch.nn.Linear(dim_model, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Return one humor probability per sample.

        Args:
            x: mapping of tokenizer outputs; it is unpacked into the encoder and must
                contain ``'attention_mask'`` (batch size x seq length, 1 = real token).
                The mapping is not modified.

        Returns:
            Tensor of shape (batch size,) with values in (0, 1).
        """
        hidden_states = self.pretrained(**x).last_hidden_state

        # MultiheadAttention expects seq length x batch size x embedding.
        x_norm_1 = self.norm(hidden_states).transpose(0, 1)

        # key_padding_mask marks the positions to ignore, i.e. the inverse of the
        # tokenizer's attention mask. Built as a new tensor so the caller's dict is untouched.
        inverted_attn_mask = (x['attention_mask'] == 0) # Shape: batch size x seq length

        attn_output, _ = self.attention_layer(
            x_norm_1, x_norm_1, x_norm_1, key_padding_mask=inverted_attn_mask)

        # Back to batch size x seq length x embedding, then average over the sequence.
        # NOTE(review): the mean still includes the outputs at padded positions.
        attn_output = attn_output.transpose(0, 1)
        averaged_attn_output = attn_output.mean(dim=1)

        linear_output = self.linear(averaged_attn_output)
        # squeeze(-1) drops only the feature axis, so a batch of one keeps shape (1,).
        proba = self.sigmoid(linear_output).squeeze(-1)
        return proba

# Instantiate the model with the added attention layer (a single attention head)
bertModel_extended = HumorNet(num_heads=1, pretrained_model=BertModel.from_pretrained('bert-base-uncased'))

# Set optimizer: Adam with the AMSGrad variant and the default learning rate.
# NOTE(review): all parameters are handed to Adam, including the BERT weights that
# HumorNet marks requires_grad=False.
optimizer = torch.optim.Adam(bertModel_extended.parameters(), amsgrad=True)
# Last expression of the cell: displays the module tree.
bertModel_extended

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




HumorNet(
  (pretrained): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [None]:
# Test BertTokenizerFast & BertModel
# Loads a second, stand-alone BERT encoder and its matching tokenizer; both are
# reused by later cells in this notebook.
bertModel_test = BertModel.from_pretrained('bert-base-uncased')
tokenizer_test = BertTokenizerFast.from_pretrained('bert-base-uncased')
# return_tensors="pt" requests PyTorch tensors from the tokenizer.
inputs = tokenizer_test("Hello, my dog is cute", return_tensors="pt")
pprint.pprint(inputs)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…


{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]])}


In [None]:
# PyTorch Dataset creator
class JokesBERTDataset(torch.utils.data.Dataset):
    """Map-style dataset of ``(text, label)`` pairs read from a jokes CSV file.

    The CSV must provide a ``text`` column and a ``humor`` column; ``humor`` is cast to
    int32 and exposed as the label ``1`` (humorous) or ``0``.

    Args:
        csv_file: file name of the CSV inside ``root_dir``.
        root_dir: directory that contains ``csv_file``.
    """

    def __init__(self, csv_file=KAGGLE_DATA_FILENAME, root_dir=DATA_DIR):
        super(JokesBERTDataset, self).__init__()
        self.jokes = pd.read_csv(f'{root_dir}/{csv_file}', sep=',')
        self.jokes['humor'] = self.jokes['humor'].astype(np.int32)

        # Kept as a public attribute for callers; not used by the methods below.
        self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

    def __getitem__(self, idx):
        """Return ``(text, label)`` for an integer index, or ``(texts, labels)`` lists
        for a list / tuple / multi-element tensor of indices.

        Positional ``.iloc`` access is used so that an out-of-range index raises
        ``IndexError``; that is what lets a plain ``for sample in dataset`` loop stop
        at the end instead of failing with ``KeyError``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if isinstance(idx, (list, tuple)):
            positions = list(idx)
            texts = self.jokes['text'].iloc[positions].tolist()
            labels = [1 if value == 1 else 0
                      for value in self.jokes['humor'].iloc[positions]]
            return (texts, labels)

        label = 1 if self.jokes['humor'].iloc[idx] == 1 else 0
        return (self.jokes['text'].iloc[idx], label)

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.jokes)

In [None]:
# NOTE(review): absolute Google Drive path; only resolves once the drive is mounted
# at /content/drive.
jokes_dataset = JokesBERTDataset(root_dir='/content/drive/MyDrive/NLP/data')

# Peek at the first five (text, label) samples of the original dataset
for i, sample in enumerate(jokes_dataset):
  if i == 5:
    break
  pprint.pprint(sample)

("Joe biden rules out 2020 bid: 'guys, i'm not running'", 0)
('Watch: darvish gave hitter whiplash with slow pitch', 0)
('What do you call a turtle without its shell? dead.', 1)
('5 reasons the 2016 election feels so personal', 0)
('Pasco police shot mexican migrant from behind, new autopsy shows', 0)


In [None]:
# Understand HuggingFace's tokenizer with batches:
# padding=True pads the shorter sentence to the longest one in the batch (visible as
# trailing 0s in input_ids and attention_mask of the output below), truncation=True with
# max_length=512 caps the sequence length, and return_tensors="pt" yields PyTorch tensors.
pt_batch = tokenizer_test(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."], 
                          padding=True, 
                          truncation=True, 
                          max_length=512, 
                          return_tensors="pt")
pt_batch

{'input_ids': tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996,   100,
         19081,  3075,  1012,   102],
        [  101,  2057,  3246,  2017,  2123,  1005,  1056,  5223,  2009,  1012,
           102,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

In [None]:
def create_data_samples(tokenizer, size=3, dataset=None):
  """Tokenize the first ``size`` texts of a dataset as one padded batch.

  Args:
    tokenizer: callable taking a list of strings plus the keyword arguments
      ``padding``, ``truncation`` and ``return_tensors``.
    size: maximum number of samples to take; values larger than the dataset are
      clamped to its length, values <= 0 give an empty batch.
    dataset: indexable, sized collection of ``(text, label)`` pairs. Defaults to the
      notebook-level ``jokes_dataset`` so existing calls keep working.

  Returns:
    Whatever ``tokenizer`` returns for the collected texts.
  """
  if dataset is None:
    dataset = jokes_dataset

  # Index explicitly instead of iterating so running past the end is impossible.
  count = max(0, min(size, len(dataset)))
  batch_samples = [dataset[i][0] for i in range(count)]
  return tokenizer(batch_samples, padding=True, truncation=True, return_tensors="pt")

# Show the padded batch produced for the first three jokes.
create_data_samples(tokenizer_test, size=3)

{'input_ids': tensor([[  101,  3533,  7226,  2368,  3513,  2041, 12609,  7226,  1024,  1005,
          4364,  1010,  1045,  1005,  1049,  2025,  2770,  1005,   102],
        [  101,  3422,  1024, 18243, 24968,  2435, 18694, 11473, 27067,  2007,
          4030,  6510,   102,     0,     0,     0,     0,     0,     0],
        [  101,  2054,  2079,  2017,  2655,  1037, 13170,  2302,  2049,  5806,
          1029,  2757,  1012,   102,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}

In [None]:
# Run the plain BERT encoder on 50 samples and inspect the output shapes.
x = create_data_samples(tokenizer_test, size=50)
# Inference only: no_grad avoids building the autograd graph for this inspection.
with torch.no_grad():
  result = bertModel_test(**x)
print('x shape:', x['input_ids'].shape) # samples x number of tokens
# Print the shape, not the whole output object (which dumps every tensor value).
print('y shape:', result.last_hidden_state.shape) # samples x number of tokens x vector rep of each token
print('pooled shape:', result.pooler_output.shape) # output of CLS label. Layer of interest to pass to next model OR Actually last_hidden_state

x shape: torch.Size([50, 31])
y shape: BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0421, -0.4534, -0.0352,  ...,  0.1663,  0.9036,  0.8926],
         [ 0.3187, -0.3222, -0.8232,  ...,  0.2860,  0.2238, -0.0755],
         [ 1.0832, -0.4889,  0.6589,  ..., -0.2441,  0.2941,  1.1338],
         ...,
         [ 0.1421, -0.5138,  0.1938,  ...,  0.1579,  0.2763, -0.2877],
         [ 0.3069, -0.2269,  0.3253,  ...,  0.0110,  0.0747,  0.1035],
         [ 0.1412, -0.2676,  0.2691,  ...,  0.0426,  0.1317,  0.2656]],

        [[-0.1829, -0.2635, -0.1415,  ..., -0.3593,  0.2070,  0.3730],
         [ 0.7855,  0.4108,  0.0396,  ..., -0.2089,  0.3636, -0.6942],
         [-0.1265,  0.2116, -0.4572,  ...,  0.2430, -0.3600, -0.3042],
         ...,
         [-0.0632, -0.1486,  0.3709,  ...,  0.2164, -0.1285, -0.2370],
         [-0.1680, -0.1723,  0.3250,  ...,  0.1792, -0.0582, -0.2402],
         [-0.2515, -0.2339,  0.2381,  ...,  0.2296, -0.0155, -0.2229]],

        [[-0.1

In [None]:
def create_dataloaders(dataset, batch_size=1000, test_cut=.2, shuffle=True, random_seed=42):
  """Split a dataset into train / validation / test DataLoaders.

  ``test_cut`` is the total held-out fraction; it is shared equally between the
  validation and the test split (0.2 -> 80% train, 10% validation, 10% test).

  Args:
    dataset: sized, map-style dataset.
    batch_size: batch size used by all three loaders.
    test_cut: held-out fraction in [0, 1].
    shuffle: shuffle the indices before splitting.
    random_seed: seed of the shuffle. A private ``RandomState`` is used, which gives
      the same permutation as seeding the global NumPy generator with this value but
      leaves the global generator untouched.

  Returns:
    ``(train_loader, validation_loader, test_loader)``; each loader draws its subset
    through a ``SubsetRandomSampler``.

  Raises:
    ValueError: if ``test_cut`` is outside [0, 1].
  """
  if not 0 <= test_cut <= 1:
    raise ValueError(f"test_cut must be within [0, 1], got {test_cut!r}")

  # Creating data indices for training, validation, and test splits:
  data_size = len(dataset)
  indices = list(range(data_size))
  test_split = int(np.floor(data_size * (test_cut/2))) # Index to split at
  val_split = int(np.floor(data_size * test_cut))

  if shuffle:
    np.random.RandomState(random_seed).shuffle(indices)

  split_indices = (
      indices[val_split:],            # train
      indices[test_split:val_split],  # validation
      indices[:test_split],           # test
  )

  # Creating PT data samplers and loaders:
  train_loader, validation_loader, test_loader = (
      torch.utils.data.DataLoader(
          dataset,
          batch_size=batch_size,
          sampler=torch.utils.data.SubsetRandomSampler(subset))
      for subset in split_indices
  )
  return train_loader, validation_loader, test_loader

# Build the three loaders with the function defaults (batch_size=1000, test_cut=.2).
train_dataloader, validation_dataloader, test_dataloader = create_dataloaders(jokes_dataset)

In [None]:
# Print only the first training batch, then stop iterating.
for s in train_dataloader:
  print(s)
  break

        1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
        1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
        1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
        0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0,
        1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
        0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
        0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,

In [None]:
# Model running code for both train and test
import timeit
def run_one_epoch(train_flag, dataloader, model, tokenizer, optimizer, device="cpu"):
    """Run the model once over ``dataloader``, training it when ``train_flag`` is true.

    Args:
        train_flag: True to train (gradients on, optimizer steps), False to evaluate.
        dataloader: iterable of ``(jokes, labels)`` batches; ``jokes`` is a sequence of
            strings and ``labels`` a tensor of 0/1 values.
        model: module called with the tokenized batch and returning one probability
            per sample.
        tokenizer: callable turning a list of strings into a mapping of tensors.
        optimizer: optimizer stepped after every batch; only used when training.
        device: device the model and each batch are moved to. A CUDA device falls back
            to the CPU when CUDA is not available.

    Returns:
        ``(mean batch loss, mean batch accuracy)``. Both are plain means over batches,
        so a smaller final batch weighs as much as a full one.
    """
    device = torch.device(device)
    if device.type == "cuda" and not torch.cuda.is_available():
        device = torch.device("cpu")
    model.to(device)

    if train_flag:
        model.train()
    else:
        model.eval()

    loss_func = torch.nn.BCELoss()  # stateless, so built once instead of per batch
    losses = []
    accuracies = []

    # Context-manager form: the caller's grad mode is restored afterwards instead of
    # staying disabled globally after an evaluation pass.
    with torch.set_grad_enabled(train_flag):
        for jokes, labels in dataloader:
            labels = labels.to(device=device, dtype=torch.float32)

            # Tokenize the batch of sentences and move every tensor to the target device.
            tokenized_jokes = tokenizer(list(jokes), padding=True, truncation=True, return_tensors="pt")
            for key in list(tokenized_jokes.keys()):
                tokenized_jokes[key] = tokenized_jokes[key].to(device)

            # reshape guards against a model that squeezes a batch of one down to a scalar.
            label_predicted = model(tokenized_jokes).reshape(labels.shape)

            # Calculate loss
            batch_loss = loss_func(label_predicted, labels)
            losses.append(batch_loss.item())

            # Backprop and train
            if train_flag:
                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
                optimizer.zero_grad()

            # Calculate accuracy: threshold the probability at 0.5 before comparing.
            # Comparing the raw probabilities with the 0/1 labels is practically never equal.
            predicted_classes = (label_predicted >= 0.5).to(labels.dtype)
            accuracies.append((predicted_classes == labels).float().mean().item())

    return( np.mean(losses), np.mean(accuracies) )

def train_loop(model,
               tokenizer,
               optimizer,
               train_dataloader, 
               validation_dataloader,
               check_point_filename = '/content/drive/MyDrive/Academics/2020-2021/Spring 2021/NLP/models/test_checkpoint.pt',
               max_epochs = 10, 
               patience = 10,
               device = "cuda"):
    """Train with per-epoch validation, checkpointing and patience-based early stopping.

    After every epoch the validation loss is compared with the best one so far. An
    improvement saves ``model.state_dict()`` to ``check_point_filename`` and resets the
    patience counter; otherwise the counter is decremented and, once it reaches zero,
    the best checkpoint is loaded back into ``model`` and training stops.

    NOTE(review): when training ends because ``max_epochs`` is reached, the model keeps
    its last-epoch weights; the best weights are only on disk. With the defaults
    (patience == max_epochs) the early-stop branch cannot be reached.

    Args:
        model, tokenizer, optimizer: forwarded to ``run_one_epoch``.
        train_dataloader: batches used for training.
        validation_dataloader: batches used for validation.
        check_point_filename: where the best ``state_dict`` is written.
        max_epochs: upper bound on the number of epochs.
        patience: consecutive non-improving epochs tolerated before stopping.
        device: forwarded to ``run_one_epoch``.

    Returns:
        ``(train_accs, val_accs)``: per-epoch training and validation accuracies.
    """
    import os

    # Create the checkpoint directory up front; otherwise the first torch.save fails
    # only after a full epoch has already been computed.
    if isinstance(check_point_filename, (str, os.PathLike)):
        checkpoint_dir = os.path.dirname(check_point_filename)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)

    train_accs = []
    val_accs = []
    patience_counter = patience
    best_val_loss = np.inf
    checkpoint_saved = False  # True once THIS run has written a checkpoint

    for epoch in range(max_epochs):
        start_time = timeit.default_timer()
        train_loss, train_acc = run_one_epoch(True, train_dataloader, model, tokenizer, optimizer, device)
        val_loss, val_acc = run_one_epoch(False, validation_dataloader, model, tokenizer, optimizer, device)
        train_accs.append(train_acc)
        val_accs.append(val_acc)

        elapsed = float(timeit.default_timer() - start_time)
        print("Epoch %i took %.2fs. Train loss: %.4f acc: %.4f. Val loss: %.4f acc: %.4f. Patience left: %i" % 
            (epoch+1, elapsed, train_loss, train_acc, val_loss, val_acc, patience_counter ))
        
        if val_loss < best_val_loss: 
            torch.save(model.state_dict(), check_point_filename)
            checkpoint_saved = True
            best_val_loss = val_loss
            patience_counter = patience
        else: 
            patience_counter -= 1
            if patience_counter <= 0: 
                # Recover the best model so far. Skipped when nothing was saved in this
                # run (e.g. the validation loss was NaN from the start), so a missing or
                # stale file from an earlier run is never loaded.
                if checkpoint_saved:
                    model.load_state_dict(torch.load(check_point_filename))
                break
    
    return(train_accs, val_accs)

In [None]:
# Test run_one_epoch - had a break to just test out one iteration
# run_one_epoch(train_flag=True, tokenizer=tokenizer_test, dataloader=train_dataloader, model=bertModel_extended, optimizer=optimizer)

KeyboardInterrupt: ignored

In [None]:
# Sanity check: confirm the object is the HumorNet defined in this notebook.
type(bertModel_extended)

__main__.HumorNet

In [None]:
# Train the extended model; per-epoch accuracies are kept for later inspection.
train_accs, val_accs = train_loop(
    model=bertModel_extended,
    tokenizer=tokenizer_test,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    validation_dataloader=validation_dataloader,
    check_point_filename='/content/drive/MyDrive/Academics/2020-2021/Spring 2021/NLP/models/test_checkpoint.pt',
    max_epochs=10,
    patience=10,
    device="cuda",
)