In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install the Hugging Face transformers package into the runtime (no version is pinned).
# NOTE(review): this notebook imports AdamW from transformers and passes
# pad_to_max_length to tokenizer.encode; presumably that needs an older
# transformers release - confirm the version and pin it here.
!pip install transformers




In [0]:
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import (
    AdamW,
    BertConfig,
    BertTokenizer,
    BertForSequenceClassification,
    XLNetConfig, 
    XLNetForSequenceClassification, 
    XLNetTokenizer,
    XLMConfig, 
    XLMForSequenceClassification, 
    XLMTokenizer,
    RobertaConfig, 
    RobertaForSequenceClassification, 
    RobertaTokenizer,
    DistilBertConfig, 
    DistilBertForSequenceClassification, 
    DistilBertTokenizer,
    AlbertConfig, 
    AlbertForSequenceClassification, 
    AlbertTokenizer,
    XLMRobertaConfig, 
    XLMRobertaForSequenceClassification, 
    XLMRobertaTokenizer,
    get_linear_schedule_with_warmup
)


In [0]:
# Maps a short model-type key to its (config class, sequence-classification
# model class, tokenizer class) triple. The model-setup cell unpacks one entry
# in exactly this order.
MODEL_CLASSES = {
    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
    "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
    "xlmroberta": (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
} 

In [0]:
## Model setup
# model_type selects the class triple from MODEL_CLASSES; model_name is the
# pretrained checkpoint identifier passed to from_pretrained.
model_type = 'bert'
model_name = 'bert-large-cased'
config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]

# Lower-casing is disabled to match the cased checkpoint named above.
tokenizer = tokenizer_class.from_pretrained(model_name, do_lower_case=False)


In [0]:
# PARAMETERS
# Passed as max_length to tokenizer.encode in get_encodings and embedded in the
# filename of the saved model at the end of the notebook.
MAX_LEN = 256

In [2]:
# Filepath global variables
# All paths are built by string concatenation, so every directory constant
# keeps its trailing slash.
# NOTE(review): ROOT is relative to the working directory while Drive is
# mounted at /content/drive - confirm the notebook is run from the expected folder.
ROOT = "../"
DATA_PATH = ROOT + 'data/'
OUTPUT_PATH = ROOT + 'output/'
DATASETS_PATH = DATA_PATH + 'datasets/'
MODELS_PATH = OUTPUT_PATH + 'models/'

In [0]:
def get_data(subset='train'):
    """
    Load one subset of the C50 corpus into a DataFrame.

    Walks ``DATASETS_PATH + 'C50/C50<subset>'`` and reads every ``.txt`` file
    found. The name of the directory that directly contains a file is used as
    the author of that text.

    Parameters
    ----------
    subset : str
        Suffix of the corpus directory, e.g. ``'train'`` reads ``C50train``.

    Returns
    -------
    pandas.DataFrame
        One row per text with columns ``author``, ``text`` and ``author_id``.
        ``author_id`` is the position of the author in the sorted list of the
        authors present in this subset.
    """
    records = []
    for root, folders, files in os.walk(DATASETS_PATH + 'C50/C50{}'.format(subset)):
        # Sorting in place fixes the order in which os.walk descends, so the
        # resulting row order is reproducible between runs.
        folders.sort()
        if len(files) == 0:
            continue

        # basename works with whichever path separator os.walk produced,
        # unlike splitting on a literal '/'.
        author = os.path.basename(root)
        for filename in sorted(files):
            if filename.endswith('.txt'):
                with open(os.path.join(root, filename), 'r') as f:
                    records.append({
                        'author': author,
                        'text': f.read(),
                    })

    df = pd.DataFrame(records)
    unique_authors = sorted(df['author'].unique())
    author_to_id = {author: idx for idx, author in enumerate(unique_authors)}
    df = df.assign(author_id=df['author'].map(author_to_id))
    return df

In [0]:
def get_encodings(texts):
    """
    Encode every text with the module-level ``tokenizer``.

    Each text is passed to ``tokenizer.encode`` with special tokens enabled,
    ``max_length=MAX_LEN`` and ``pad_to_max_length=True``.

    Parameters
    ----------
    texts : iterable of str
        Raw texts to encode.

    Returns
    -------
    list
        One encoded token-id sequence per input text, in input order.
    """
    def encode_one(text):
        # Same keyword arguments for every text; only the text varies.
        return tokenizer.encode(text,
                                add_special_tokens=True,
                                max_length=MAX_LEN,
                                pad_to_max_length=True)

    return [encode_one(text) for text in texts]



def get_attention_masks(padded_encodings, pad_token_id=0):
    """
    Build attention masks for already-padded token-id sequences.

    Parameters
    ----------
    padded_encodings : iterable of sequences of int
        Token-id sequences, e.g. the output of ``get_encodings``.
    pad_token_id : int, optional
        Id that marks padding. Defaults to 0, which reproduces the previous
        hard-coded behaviour for non-negative ids; pass the tokenizer's own
        pad id for models whose padding id is not 0.

    Returns
    -------
    list of list of int
        For every sequence a list of the same length holding 1 for real
        tokens and 0 for padding positions.
    """
    return [
        [int(token_id != pad_token_id) for token_id in encoding]
        for encoding in padded_encodings
    ]

In [0]:
# Load the pickled train/test DataFrames. Later cells read their `text` and
# `author_id` columns.
# NOTE(review): presumably these were produced by get_data() and pickled
# beforehand - confirm. Only unpickle files from a trusted source.
train_df = pd.read_pickle(DATASETS_PATH + 'reuters50_train.pkl')
test_df = pd.read_pickle(DATASETS_PATH + 'reuters50_test.pkl')

In [0]:
# Tokenize every text, then derive the attention masks from the padded ids.
train_encodings = get_encodings(train_df.text.values)
train_attention_masks = get_attention_masks(train_encodings)

test_encodings = get_encodings(test_df.text.values)
test_attention_masks = get_attention_masks(test_encodings)


In [0]:
# Convert token ids, attention masks and author ids to tensors. They are
# created without a device argument; the training loop moves each batch to
# `device`.
train_input_ids = torch.tensor(train_encodings)
train_masks = torch.tensor(train_attention_masks)
train_labels = torch.tensor(train_df.author_id.values)


test_input_ids = torch.tensor(test_encodings)
test_masks = torch.tensor(test_attention_masks)
test_labels = torch.tensor(test_df.author_id.values)

In [0]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [0]:
import math

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """
    Fraction of rows whose highest-scoring column equals the label.

    Parameters
    ----------
    preds : numpy.ndarray
        2-D score array; the predicted class of a row is its argmax over axis 1.
    labels : numpy.ndarray
        True class ids; flattened before the comparison.

    Returns
    -------
    Number of matching positions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

def get_confusion_matrix(preds, labels, num_classes=50):
    """
    Build a confusion matrix from raw scores and true labels.

    Rows = true labels
    Columns = classified labels

    Parameters
    ----------
    preds : numpy.ndarray
        2-D score array; the predicted class of a row is its argmax over axis 1.
    labels : numpy.ndarray
        True class ids, one per row of ``preds``; flattened before use.
    num_classes : int, optional
        Side length of the returned matrix. Defaults to 50, the value that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        ``(num_classes, num_classes)`` float array in which entry ``[t][p]``
        counts the samples with true label ``t`` that were classified as ``p``.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    # Cast so the labels are valid indices even when the array is empty or
    # stored with a non-index integer dtype.
    actual = np.asarray(labels).flatten().astype(np.intp)

    confusion_matrix = np.zeros((num_classes, num_classes))
    # np.add.at accumulates correctly when the same (label, pred) pair occurs
    # more than once, which plain fancy-index "+=" would not.
    np.add.at(confusion_matrix, (actual, predicted), 1)
    return confusion_matrix

def parse_confusion_matrix(confusion_matrix):
    """
    Reduce a square confusion matrix to per-class tp/fp/fn/tn counts.

    The input is read with rows as true labels and columns as classified
    labels.

    Rows = labels
    Col0 = tp
    Col1 = fp
    Col2 = fn
    Col3 = tn

    Parameters
    ----------
    confusion_matrix : array-like
        Square matrix of counts; anything ``np.asarray`` can convert.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_classes, 4)`` where ``n_classes`` is the
        number of rows of the input (previously this was fixed at 50).
    """
    confusion_matrix = np.asarray(confusion_matrix, dtype=float)

    tp = np.diag(confusion_matrix)
    # Column total minus the diagonal: classified as the class but wrong.
    fp = confusion_matrix.sum(axis=0) - tp
    # Row total minus the diagonal: belongs to the class but missed.
    fn = confusion_matrix.sum(axis=1) - tp
    tn = confusion_matrix.sum() - tp - fp - fn

    return np.column_stack((tp, fp, fn, tn))

def calculate_avg_precision(parsed_confusion_matrix):
    """
    Calculates macro average precision

    Parameters
    ----------
    parsed_confusion_matrix : numpy.ndarray
        One row per class with columns (tp, fp, fn, tn), as returned by
        ``parse_confusion_matrix``.

    Returns
    -------
    Sum of the per-class precisions divided by the number of classes. A class
    with ``tp + fp == 0`` has no defined precision and contributes 0 to the
    sum while still counting in the denominator.
    """
    total_precision = 0
    num_classes = parsed_confusion_matrix.shape[0]
    for i in range(num_classes):
        tp, fp, _, _ = parsed_confusion_matrix[i]
        predicted_positives = tp + fp
        # Explicit guard instead of dividing 0/0 and filtering the nan.
        if predicted_positives > 0:
            total_precision += tp / predicted_positives
    return total_precision / num_classes

def calculate_avg_recall(parsed_confusion_matrix):
    """
    Calculates macro average recall

    Parameters
    ----------
    parsed_confusion_matrix : numpy.ndarray
        One row per class with columns (tp, fp, fn, tn), as returned by
        ``parse_confusion_matrix``.

    Returns
    -------
    Sum of the per-class recalls divided by the number of classes. A class
    with ``tp + fn == 0`` has no defined recall and contributes 0 to the sum
    while still counting in the denominator.
    """
    total_recall = 0
    num_classes = parsed_confusion_matrix.shape[0]
    for i in range(num_classes):
        tp, _, fn, _ = parsed_confusion_matrix[i]
        actual_positives = tp + fn
        # Explicit guard instead of dividing 0/0 and filtering the nan.
        if actual_positives > 0:
            total_recall += tp / actual_positives
    return total_recall / num_classes

def calculate_avg_f1(parsed_confusion_matrix):
    """
    Calculates macro average f1 score

    Parameters
    ----------
    parsed_confusion_matrix : numpy.ndarray
        One row per class with columns (tp, fp, fn, tn), as returned by
        ``parse_confusion_matrix``.

    Returns
    -------
    Sum of the per-class F1 scores divided by the number of classes. A class
    with no true positives contributes 0 to the sum (its F1 is either 0 or
    undefined) while still counting in the denominator.
    """
    total_f1 = 0
    num_classes = parsed_confusion_matrix.shape[0]
    for i in range(num_classes):
        tp, fp, fn, _ = parsed_confusion_matrix[i]
        # With tp > 0 precision, recall and their sum are all strictly
        # positive, so none of the divisions below can hit zero.
        if tp > 0:
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            total_f1 += 2*((precision * recall) / (precision + recall))
    return total_f1 / num_classes

def calculate_avg_mcc(parsed_confusion_matrix):
    """
    Calculates macro average Matthews correlation coefficient

    Parameters
    ----------
    parsed_confusion_matrix : numpy.ndarray
        One row per class with columns (tp, fp, fn, tn), as returned by
        ``parse_confusion_matrix``.

    Returns
    -------
    Sum of the per-class MCC values divided by the number of classes. A class
    whose MCC denominator is 0 contributes 0 to the sum while still counting
    in the denominator.
    """
    total_mcc = 0
    num_classes = parsed_confusion_matrix.shape[0]
    for i in range(num_classes):
        tp, fp, fn, tn = parsed_confusion_matrix[i]
        denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        # Explicit guard instead of dividing by zero and filtering the nan.
        if denominator > 0:
            total_mcc += (tp * tn - fp * fn) / denominator
    return total_mcc / num_classes

import time
import datetime

def format_time(elapsed):
    '''
    Render a duration given in seconds as text.

    The value is rounded to a whole number of seconds and formatted through
    ``datetime.timedelta``, e.g. 3661.4 -> '1:01:01'.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [3]:
import pickle
# Paths of the pickled bag-of-words feature matrices.
train_xbow_p = DATA_PATH + "text_bow_train.p"
test_xbow_p = DATA_PATH + "text_bow_test.p"

# NOTE(review): pickle.load can execute arbitrary code while loading; only
# use it on files from a trusted source.
# The context managers close the file handles, which the bare open(...) calls
# previously left open.
with open(train_xbow_p, "rb") as train_bow_file:
    train_bow = pickle.load(train_bow_file)
with open(test_xbow_p, "rb") as test_bow_file:
    test_bow = pickle.load(test_bow_file)


In [0]:
# Densify the bag-of-words matrices and move them to `device` as float64
# tensors. The is_tensor guards make the cell safe to run twice: after the
# first run the names already hold tensors, which have no .todense().
if not torch.is_tensor(train_bow):
    train_bow = torch.tensor(train_bow.todense(), dtype = torch.double, device = device)
if not torch.is_tensor(test_bow):
    test_bow = torch.tensor(test_bow.todense(), dtype = torch.double, device = device)

In [0]:
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

class NeuralNet(nn.Module):
    """
    End to end neural net that combines with BERT model

    The classifier head is Linear -> BatchNorm1d -> Dropout -> ReLU -> Linear.
    The BERT model is stored as the ``bert`` sub-module so that its parameters
    are part of ``parameters()`` and ``state_dict()``; ``forward`` itself does
    not call it. Callers run ``self.bert`` separately and pass the combined
    feature vector to ``forward``.

    Parameters
    ----------
    bert_model : nn.Module
        Model kept as ``self.bert``.
    input_dim : int, optional
        Size of the combined feature vector fed to ``forward`` (default 1071).
    hidden_dim : int, optional
        Width of the hidden layer (default 700).
    num_classes : int, optional
        Number of output logits (default 50).
    dropout : float, optional
        Dropout probability applied after batch normalisation (default 0.5).
    """
    def __init__(self, bert_model, input_dim=1071, hidden_dim=700, num_classes=50, dropout=0.5):
        super().__init__()
        # Attribute names and creation order are unchanged so existing
        # state_dict keys and seeded initialisation stay the same.
        self.hidden1 = nn.Linear(input_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, num_classes)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.batchnorm1 = nn.BatchNorm1d(hidden_dim)
        self.bert = bert_model

    
    def forward(self, x):
        """
        Map a batch of combined feature vectors to class logits.

        ``x`` must have ``input_dim`` features in its last dimension; the
        result has ``num_classes`` columns. No softmax is applied.
        """
        x = self.hidden1(x)
        x = self.batchnorm1(x)
        x = self.dropout(x)
        x = self.relu(x)

        x = self.output(x)
        return x



In [17]:
## Instantiate a brand new bert model

# num_labels=50 sizes the classification head; output_hidden_states=True makes
# the model return its hidden states, which the training loop reads via
# index [1][0] of the model output.
config = config_class.from_pretrained(model_name, num_labels=50, output_hidden_states=True)
bert = model_class.from_pretrained(model_name, config=config)

# Optional: restore previously saved BERT weights instead of the pretrained ones.
# NOTE(review): the snippet below defines bert_filename but loads
# model_filename - fix the name before re-enabling it.
# bert_filename = 'bert_bert-base-cased_max-length=128.pth'
# bert.load_state_dict(torch.load(ROOT + '/models/' + model_filename))
bert.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1

In [0]:
# Parameters
# batch_size is used by both DataLoaders; lr, eps and weight_decay are passed
# to optim.AdamW; epochs bounds the training loop.
batch_size = 4
lr = 1e-3
eps = 1e-8
epochs = 20
weight_decay = 2e-2

In [19]:
# Wrap BERT in the classifier. Because bert is stored as a sub-module of nnet,
# nnet.parameters() also yields BERT's parameters, so the optimizer updates
# them with the same learning rate as the classifier head.
nnet = NeuralNet(bert)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.AdamW(nnet.parameters(), lr=lr, eps=eps, weight_decay=weight_decay)
# scheduler.step() is called once per batch in the training loop, so the
# learning rate is multiplied by gamma every 500 batches, not every 500 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=500, gamma=0.7)

# Cast the whole module (including the BERT sub-module) to float64 and move it
# to the selected device; the returned module is displayed as the cell output.
nnet.double().to(device)

NeuralNet(
  (hidden1): Linear(in_features=1071, out_features=700, bias=True)
  (output): Linear(in_features=700, out_features=50, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (batchnorm1): BatchNorm1d(700, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(28996, 1024, padding_idx=0)
        (position_embeddings): Embedding(512, 1024)
        (token_type_embeddings): Embedding(2, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_f

In [0]:
# Each sample is (input_ids, attention_mask, bag-of-words vector, label); the
# training loop unpacks batches in this order.
# Training batches are drawn in random order.
train_dataset = TensorDataset(train_input_ids, train_masks, train_bow, train_labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# The test set is iterated in its stored order; the training loop uses it as
# the per-epoch validation set.
test_dataset = TensorDataset(test_input_ids, test_masks, test_bow, test_labels)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

In [0]:
# ========================================
#               Training
# ========================================
# Every epoch makes one pass over train_dataloader and then evaluates on
# test_dataloader, printing loss, accuracy and macro-averaged metrics.

# Per-epoch mean losses, stored as plain Python floats.
train_loss_values = []
test_loss_values = []

for epoch in range(1, epochs+1):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch, epochs))
    print('Training...')

    nnet.train()
    t0 = time.time()

    # Running sums over batches; divided by the number of batches below.
    train_loss = 0.0
    train_acc = 0.0

    for step, batch in enumerate(train_dataloader):

        # Progress report every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        input_ids, input_masks, bow, labels = tuple(t.to(device) for t in batch)

        # Run input IDs and masks through BERT first to get embeddings
        # then combine with BOW to run through rest of nnet model

        optimizer.zero_grad()
        # NOTE(review): with output_hidden_states=True, index [1] should be the
        # tuple of hidden states and [0] its first entry, i.e. the embedding
        # layer output rather than the last encoder layer - confirm intended.
        embeddings = nnet.bert(input_ids, attention_mask=input_masks)[1][0]
        # The mean is taken over dim 1 without applying the attention mask, so
        # padded positions are averaged in as well (assumes dim 1 is the token
        # axis - TODO confirm).
        embeddings = embeddings.mean(dim=1).double()

        features_comb = torch.cat((embeddings, bow), dim=1)

        outputs = nnet(features_comb)
        _, inds = torch.max(outputs, dim=1)

        loss = loss_function(outputs, labels)
        # .item() extracts a Python float. Adding the loss tensor itself would
        # keep every batch's autograd history referenced until the epoch ends.
        train_loss += loss.item()

        acc = torch.eq(inds, labels).sum().item() / labels.shape[0]
        train_acc += acc

        # torch.nn.utils.clip_grad_norm_(nnet.parameters(), 1.0)

        loss.backward()
        optimizer.step()
        # The scheduler is stepped per batch, not per epoch.
        scheduler.step()

    elapsed = format_time(time.time() - t0)
    avg_train_loss = train_loss / len(train_dataloader)
    avg_train_acc = train_acc / len(train_dataloader)
    train_loss_values.append(avg_train_loss)
    print(f"Epoch: [{epoch}/{epochs}], elapsed: {elapsed}, loss: {avg_train_loss}, acc: {avg_train_acc}")

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    nnet.eval()
    t0 = time.time()

    test_loss = 0.0
    test_acc = 0.0
    # NumPy accumulator, matching the array type get_confusion_matrix returns,
    # so the in-place addition below never mixes torch and NumPy operands.
    confusion_matrix = np.zeros((50, 50))

    for batch in test_dataloader:

        input_ids, input_masks, bow, labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            embeddings = nnet.bert(input_ids, attention_mask=input_masks)[1][0]
            embeddings = embeddings.mean(dim=1).double()

            features_comb = torch.cat((embeddings, bow), dim=1)

            outputs = nnet(features_comb)
            _, inds = torch.max(outputs, dim=1)

            loss = loss_function(outputs, labels)
            test_loss += loss.item()

            acc = torch.eq(inds, labels).sum().item() / labels.shape[0]
            test_acc += acc

            preds = outputs.detach().cpu().numpy()
            labels = labels.detach().cpu().numpy()

            tmp_confusion_matrix = get_confusion_matrix(preds, labels)
            confusion_matrix += tmp_confusion_matrix

    elapsed = format_time(time.time() - t0)
    avg_test_loss = test_loss / len(test_dataloader)
    avg_test_acc = test_acc / len(test_dataloader)
    test_loss_values.append(avg_test_loss)

    # Macro-averaged metrics derived from the accumulated confusion matrix.
    parsed_confusion_matrix = parse_confusion_matrix(confusion_matrix)
    avg_precision = calculate_avg_precision(parsed_confusion_matrix)
    avg_recall = calculate_avg_recall(parsed_confusion_matrix)
    avg_f1 = calculate_avg_f1(parsed_confusion_matrix)
    avg_mcc = calculate_avg_mcc(parsed_confusion_matrix)

    print(f"Test :- elapsed: {elapsed}, loss: {avg_test_loss}, acc: {avg_test_acc}")
    print(f"Precision: {avg_precision}, Recall: {avg_recall}, F1: {avg_f1}, MCC: {avg_mcc}")




Training...
  Batch    40  of  1,125.    Elapsed: 0:00:09.
  Batch    80  of  1,125.    Elapsed: 0:00:18.
  Batch   120  of  1,125.    Elapsed: 0:00:27.
  Batch   160  of  1,125.    Elapsed: 0:00:36.
  Batch   200  of  1,125.    Elapsed: 0:00:45.
  Batch   240  of  1,125.    Elapsed: 0:00:54.
  Batch   280  of  1,125.    Elapsed: 0:01:02.
  Batch   320  of  1,125.    Elapsed: 0:01:11.
  Batch   360  of  1,125.    Elapsed: 0:01:20.
  Batch   400  of  1,125.    Elapsed: 0:01:29.
  Batch   440  of  1,125.    Elapsed: 0:01:38.
  Batch   480  of  1,125.    Elapsed: 0:01:47.
  Batch   520  of  1,125.    Elapsed: 0:01:56.
  Batch   560  of  1,125.    Elapsed: 0:02:05.
  Batch   600  of  1,125.    Elapsed: 0:02:14.
  Batch   640  of  1,125.    Elapsed: 0:02:23.
  Batch   680  of  1,125.    Elapsed: 0:02:32.
  Batch   720  of  1,125.    Elapsed: 0:02:41.
  Batch   760  of  1,125.    Elapsed: 0:02:50.
  Batch   800  of  1,125.    Elapsed: 0:02:59.
  Batch   840  of  1,125.    Elapsed: 0:03:07.




Test :- elapsed: 0:00:25, loss: 1.4959390758968705, acc: 0.672
Precision: 0.7111091188242216, Recall: 0.6719999999999999, F1: 0.6612868577877257, MCC: 0.6703841613742401

Training...
  Batch    40  of  1,125.    Elapsed: 0:00:09.
  Batch    80  of  1,125.    Elapsed: 0:00:18.
  Batch   120  of  1,125.    Elapsed: 0:00:27.
  Batch   160  of  1,125.    Elapsed: 0:00:36.
  Batch   200  of  1,125.    Elapsed: 0:00:45.
  Batch   240  of  1,125.    Elapsed: 0:00:54.
  Batch   280  of  1,125.    Elapsed: 0:01:02.
  Batch   320  of  1,125.    Elapsed: 0:01:11.
  Batch   360  of  1,125.    Elapsed: 0:01:20.
  Batch   400  of  1,125.    Elapsed: 0:01:29.
  Batch   440  of  1,125.    Elapsed: 0:01:38.
  Batch   480  of  1,125.    Elapsed: 0:01:47.
  Batch   520  of  1,125.    Elapsed: 0:01:56.
  Batch   560  of  1,125.    Elapsed: 0:02:05.
  Batch   600  of  1,125.    Elapsed: 0:02:14.
  Batch   640  of  1,125.    Elapsed: 0:02:23.
  Batch   680  of  1,125.    Elapsed: 0:02:32.
  Batch   720  of 

In [0]:
# Save the weights of the combined model (classifier head plus the wrapped
# BERT sub-module) under MODELS_PATH; the filename records the model type,
# checkpoint name and MAX_LEN.
# NOTE(review): torch.save does not create missing directories - presumably
# MODELS_PATH already exists; confirm.
model_filename = f'END_TO_END_{model_type}_{model_name}_max-length={MAX_LEN}.pth'
torch.save(nnet.state_dict(), MODELS_PATH + model_filename)