In [0]:
# Install the third-party packages this notebook needs into the current runtime.
# NOTE(review): versions are unpinned; later cells use `pad_to_max_length` and
# import `AdamW` from transformers, which presumably ties the notebook to an
# older transformers release — confirm and pin.
!pip install transformers
# NOTE(review): gensim is not imported in any cell visible here — confirm it is still needed.
!pip install gensim



In [0]:
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import (
    AdamW,
    BertConfig,
    BertTokenizer,
    BertForSequenceClassification,
    XLNetConfig, 
    XLNetForSequenceClassification, 
    XLNetTokenizer,
    XLMConfig, 
    XLMForSequenceClassification, 
    XLMTokenizer,
    RobertaConfig, 
    RobertaForSequenceClassification, 
    RobertaTokenizer,
    DistilBertConfig, 
    DistilBertForSequenceClassification, 
    DistilBertTokenizer,
    AlbertConfig, 
    AlbertForSequenceClassification, 
    AlbertTokenizer,
    XLMRobertaConfig, 
    XLMRobertaForSequenceClassification, 
    XLMRobertaTokenizer,
    get_linear_schedule_with_warmup
)


In [0]:
# Registry mapping a short model-family key to its
# (config class, sequence-classification model class, tokenizer class) triple,
# so the model-setup cell can switch architecture by changing a single string.
MODEL_CLASSES = {
    "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
    "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
    "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
    "xlmroberta": (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
} 

In [0]:
## Model setup
# Pick the model family (a key of MODEL_CLASSES) and the pretrained checkpoint name.
model_type = 'bert'
model_name = 'bert-base-cased'
config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]

# num_labels=50 sizes the classification head for 50 classes;
# output_hidden_states=True makes the model return its hidden states, which the
# embedding-extraction cells read via outputs[2].
config = config_class.from_pretrained(model_name, num_labels=50, output_hidden_states=True)
model = model_class.from_pretrained(model_name, config=config)
# do_lower_case=False keeps the input casing, matching the cased checkpoint name.
tokenizer = tokenizer_class.from_pretrained(model_name, do_lower_case=False)


In [0]:
# PARAMETERS
# MAX_LEN: token length passed as max_length when encoding texts.
MAX_LEN = 128
# batch_size: used by the first pair of DataLoaders (the ones fed to the BERT model).
batch_size = 1
# NOTE(review): lr and eps are not referenced by any cell visible in this notebook
# (the Adam optimizer below uses a literal lr=0.001).
lr = 4e-5
eps = 1e-8
# NOTE(review): overwritten with 100 in the training cell before it is used.
epochs = 4

In [0]:
ROOT = ""
DATA_PATH = ROOT + 'datasets/'
def get_data(subset='train'):
    """Load one split of the C50 corpus into a DataFrame.

    Walks ``<DATA_PATH>/C50/C50<subset>/`` and reads every ``.txt`` file. The
    name of the directory that directly contains a file is used as its author.

    Args:
        subset: Suffix of the split directory (e.g. ``'train'`` -> ``C50train``).

    Returns:
        DataFrame with columns ``author`` (directory name), ``text`` (file
        contents) and ``author_id`` (index of the author in the sorted list of
        distinct authors). Rows are ordered by directory, then by file name.
        If no ``.txt`` file is found the frame is empty but keeps the columns.
    """
    corpus_dir = os.path.join(DATA_PATH, 'C50', 'C50{}'.format(subset))
    texts = []
    for root, folders, files in os.walk(corpus_dir):
        # Sorting in place makes os.walk descend in a deterministic order.
        folders.sort()
        if len(files) == 0:
            continue

        # basename/normpath is separator-agnostic and tolerates a trailing slash.
        author = os.path.basename(os.path.normpath(root))
        for file in sorted(files):
            if file.endswith('.txt'):
                # Explicit encoding so decoding does not depend on the locale.
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    texts.append({
                        'author': author,
                        'text': f.read(),
                    })
    # Naming the columns keeps the frame well-formed when nothing was read.
    df = pd.DataFrame(texts, columns=['author', 'text'])
    unique_authors = sorted(df['author'].unique())
    author_to_id = {author: idx for idx, author in enumerate(unique_authors)}
    df = df.assign(author_id=df['author'].map(author_to_id))
    return df

In [0]:
def get_encodings(texts):
    """Encode each text into a list of token ids.

    Uses the notebook-level ``tokenizer`` and ``MAX_LEN``: special tokens are
    added, ``max_length=MAX_LEN`` is passed, and padding to that length is
    requested.

    Args:
        texts: Iterable of strings to encode.

    Returns:
        List with one token-id list per input text, in input order.
    """
    # NOTE(review): truncation is not requested explicitly; whether texts longer
    # than MAX_LEN are cut depends on the installed transformers version — confirm.
    return [
        tokenizer.encode(text,
                         add_special_tokens=True,
                         max_length=MAX_LEN,
                         pad_to_max_length=True)
        for text in texts
    ]



def get_attention_masks(padded_encodings, pad_token_id=0):
    """Build 0/1 attention masks for already-padded token-id sequences.

    Args:
        padded_encodings: Iterable of token-id sequences (padding included).
        pad_token_id: Id used for padding. Defaults to 0, which reproduces the
            previous hard-coded behaviour; pass ``tokenizer.pad_token_id`` for
            tokenizers whose padding id is not 0.

    Returns:
        List of lists of ints, the same shape as the input: 1 where the token
        is not padding, 0 where it is.
    """
    return [
        [int(token_id != pad_token_id) for token_id in encoding]
        for encoding in padded_encodings
    ]

In [0]:
# Load the pre-built train/test DataFrames from pickles under DATA_PATH.
# Later cells read their `text` and `author_id` columns.
# NOTE(review): presumably these were produced by get_data() and saved earlier —
# that step is not visible here. Unpickling can execute arbitrary code, so only
# load files from a trusted source.
train_df = pd.read_pickle(DATA_PATH + 'reuters50_train.pkl')
test_df = pd.read_pickle(DATA_PATH + 'reuters50_test.pkl')

In [0]:
# Tokenize the training texts and derive the matching attention masks.
train_encodings = get_encodings(train_df.text.values)
train_attention_masks = get_attention_masks(train_encodings)

# Same for the test texts.
test_encodings = get_encodings(test_df.text.values)
test_attention_masks = get_attention_masks(test_encodings)


In [0]:
# Wrap token ids, attention masks and author ids as tensors (training split).
train_input_ids = torch.tensor(train_encodings)
train_masks = torch.tensor(train_attention_masks)
train_labels = torch.tensor(train_df.author_id.values)


# Same for the test split.
test_input_ids = torch.tensor(test_encodings)
test_masks = torch.tensor(test_attention_masks)
test_labels = torch.tensor(test_df.author_id.values)

# Create the DataLoader for our training set.
# A SequentialSampler keeps the rows in dataset order, so the features extracted
# from these batches stay aligned with train_labels.
train_data = TensorDataset(train_input_ids, train_masks, train_labels)
train_sampler = SequentialSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# The "validation" loader is built from the test split.
validation_data = TensorDataset(test_input_ids, test_masks, test_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [0]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The extraction loops below move every batch to `device`, so the model has to
# live on the same device or the forward pass fails on a CUDA machine.
# As the last expression of the cell this also displays the model structure.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [0]:
import math 

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of samples whose arg-max prediction equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: Array of integer class ids (flattened before comparison).

    Returns:
        Number of matches divided by the number of labels.
    """
    predicted_ids = np.argmax(preds, axis=1).flatten()
    gold_ids = labels.flatten()
    n_correct = np.sum(predicted_ids == gold_ids)
    return n_correct / len(gold_ids)

def get_confusion_matrix(preds, labels, num_classes=50):
    """Build a confusion matrix from per-class scores and true labels.

    Rows = true labels
    Columns = classified labels

    Args:
        preds: 2-D array of per-class scores; the arg-max of each row is the
            predicted class.
        labels: Array of integer true class ids (flattened).
        num_classes: Number of classes, i.e. the side length of the matrix.
            Defaults to 50, the previously hard-coded size.

    Returns:
        ``(num_classes, num_classes)`` float array of counts.
    """
    confusion_matrix = np.zeros((num_classes, num_classes))
    pred_ids = np.argmax(preds, axis=1).flatten()
    label_ids = np.asarray(labels).flatten()
    # np.add.at accumulates correctly when the same (row, col) pair repeats,
    # unlike plain fancy-index `+=`, which would count each pair only once.
    np.add.at(confusion_matrix, (label_ids, pred_ids), 1)
    return confusion_matrix

def parse_confusion_matrix(confusion_matrix):
    """Derive per-class TP/FP/FN/TN counts from a square confusion matrix.

    The input is read as rows = true labels, columns = predicted labels.

    Rows = labels
    Col0 = tp
    Col1 = fp
    Col2 = fn
    Col3 = tn

    Args:
        confusion_matrix: ``(n, n)`` array of counts.

    Returns:
        ``(n, 4)`` float array. The number of rows follows the input instead of
        being fixed at 50, so matrices of any class count are handled.
    """
    confusion_matrix = np.asarray(confusion_matrix)
    tp = np.diag(confusion_matrix)
    fp = confusion_matrix.sum(axis=0) - tp  # predicted as the class, but wrong
    fn = confusion_matrix.sum(axis=1) - tp  # belong to the class, but missed
    tn = confusion_matrix.sum() - tp - fp - fn
    return np.stack([tp, fp, fn, tn], axis=1).astype(float)

def calculate_avg_precision(parsed_confusion_matrix):
    """
    Macro-averaged precision.

    Each row of the input is (tp, fp, fn, tn) for one class. A class whose
    precision is NaN (tp + fp == 0) adds nothing to the sum but is still
    counted in the denominator.
    """
    class_count = parsed_confusion_matrix.shape[0]
    running_total = 0
    for true_pos, false_pos, _fn, _tn in parsed_confusion_matrix:
        class_precision = true_pos / (true_pos + false_pos)
        if np.isnan(class_precision):
            continue
        running_total += class_precision
    return running_total / class_count

def calculate_avg_recall(parsed_confusion_matrix):
    """
    Macro-averaged recall.

    Each row of the input is (tp, fp, fn, tn) for one class. A class whose
    recall is NaN (tp + fn == 0) adds nothing to the sum but is still counted
    in the denominator.
    """
    class_count = parsed_confusion_matrix.shape[0]
    running_total = 0
    for true_pos, _fp, false_neg, _tn in parsed_confusion_matrix:
        class_recall = true_pos / (true_pos + false_neg)
        if np.isnan(class_recall):
            continue
        running_total += class_recall
    return running_total / class_count

def calculate_avg_f1(parsed_confusion_matrix):
    """
    Macro-averaged F1 score.

    Each row of the input is (tp, fp, fn, tn) for one class. A class whose F1
    is NaN (undefined precision or recall, or both equal to zero) adds nothing
    to the sum but is still counted in the denominator.
    """
    class_count = parsed_confusion_matrix.shape[0]
    running_total = 0
    for true_pos, false_pos, false_neg, _tn in parsed_confusion_matrix:
        precision = true_pos / (true_pos + false_pos)
        recall = true_pos / (true_pos + false_neg)
        class_f1 = 2*((precision * recall) / (precision + recall))
        if np.isnan(class_f1):
            continue
        running_total += class_f1
    return running_total / class_count

def calculate_avg_mcc(parsed_confusion_matrix):
    """
    Macro-averaged Matthews correlation coefficient.

    Each row of the input is (tp, fp, fn, tn) for one class, scored
    one-vs-rest. A class whose coefficient is NaN adds nothing to the sum but
    is still counted in the denominator.
    """
    class_count = parsed_confusion_matrix.shape[0]
    running_total = 0
    for true_pos, false_pos, false_neg, true_neg in parsed_confusion_matrix:
        numerator = true_pos * true_neg - false_pos * false_neg
        denominator = math.sqrt(
            (true_pos + false_pos) * (true_pos + false_neg) * (true_neg + false_pos) * (true_neg + false_neg)
        )
        class_mcc = numerator / denominator
        if np.isnan(class_mcc):
            continue
        running_total += class_mcc
    return running_total / class_count

import time
import datetime

def format_time(elapsed):
    '''
    Render a duration given in seconds as the string form of a timedelta,
    e.g. 3661.4 -> "1:01:01" (hours are not zero-padded; durations of a day or
    more get a "N day(s), " prefix).
    '''
    # Whole seconds only: round first, then convert to int.
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [0]:
# ========================================
#         Get Training Embeddings
# ========================================
# Run the model in inference mode over the training loader and collect one
# hidden-state tensor per batch. The chunks are concatenated once at the end;
# calling torch.cat inside the loop re-copies everything gathered so far on
# every iteration, which is quadratic in the number of batches.

model.eval()
train_feature_chunks = []
for batch in train_dataloader:

    b_texts = batch[0].to(device)
    b_attention_masks = batch[1].to(device)
    b_authors = batch[2].to(device)

    with torch.no_grad():
        outputs = model(b_texts,
                        attention_mask=b_attention_masks,
                        labels=b_authors)

    # With labels supplied the outputs are indexed as (loss, logits, hidden_states).
    # NOTE(review): per the transformers docs, hidden_states[0] is the output of
    # the embedding layer, not of the last encoder layer — confirm this is intended.
    hidden_states = outputs[2][0]
    train_feature_chunks.append(hidden_states)

# Stays None for an empty loader, as before.
train_features = torch.cat(train_feature_chunks) if train_feature_chunks else None

In [0]:

# ========================================
#         Get Validation Embeddings
# ========================================
# Same extraction as for the training split, over the loader built from the
# test data. Chunks are gathered in a list and concatenated once, which avoids
# the quadratic copying of a torch.cat inside the loop.

model.eval()
test_feature_chunks = []

for batch in validation_dataloader:

    b_texts = batch[0].to(device)
    b_attention_masks = batch[1].to(device)
    b_authors = batch[2].to(device)

    with torch.no_grad():
        outputs = model(b_texts,
                        attention_mask=b_attention_masks,
                        labels=b_authors)

    # With labels supplied the outputs are indexed as (loss, logits, hidden_states).
    # NOTE(review): per the transformers docs, hidden_states[0] is the output of
    # the embedding layer, not of the last encoder layer — confirm this is intended.
    hidden_states = outputs[2][0]
    test_feature_chunks.append(hidden_states)

# Stays None for an empty loader, as before.
test_features = torch.cat(test_feature_chunks) if test_feature_chunks else None


In [0]:
# Sanity check: averaging over dim 1 should leave one vector per training document.
train_features.mean(dim=1).shape

torch.Size([4500, 768])

In [0]:
# Average each document's hidden states over dim 1 (presumably the token axis),
# cast to float64 and keep the result on `device`.
# NOTE(review): the mean runs over every position, padding included — the
# attention mask is not applied here.
# NOTE: this cell is not idempotent; running it twice averages a second time
# over what is by then the feature axis.
train_features = train_features.mean(dim=1).double().to(device)
test_features = test_features.mean(dim=1).double().to(device)

In [0]:
import pickle
# Locations of the pickled bag-of-words feature matrices for each split.
train_xbow_p = DATA_PATH + "text_bow_train.p"
test_xbow_p = DATA_PATH + "text_bow_test.p"
# Context managers close the file handles deterministically.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open(train_xbow_p, "rb") as train_bow_file:
    train_bow = pickle.load(train_bow_file)
with open(test_xbow_p, "rb") as test_bow_file:
    test_bow = pickle.load(test_bow_file)


In [0]:
# Convert the bag-of-words matrices to dense float64 tensors on `device`, so
# they match the dtype and device of the pooled BERT features.
# NOTE(review): `.todense()` implies the pickles hold sparse matrices
# (presumably scipy.sparse) — verify.
# NOTE: not idempotent; after one run the names hold tensors, which have no `.todense()`.
# train_bow = torch.from_numpy(train_bow)
# test_bow = torch.from_numpy(test_bow)
train_bow = torch.tensor(train_bow.todense(), dtype = torch.double, device = device)
test_bow = torch.tensor(test_bow.todense(), dtype = torch.double, device = device)

In [0]:
# Concatenate pooled BERT features and bag-of-words features along the feature
# axis (dim=1), giving one combined vector per document.
# NOTE(review): NeuralNet's first layer expects 815 inputs, so the two parts
# must add up to 815 columns — confirm the bag-of-words width.
train_features_comb = torch.cat((train_features, train_bow), dim=1)
test_features_comb = torch.cat((test_features, test_bow), dim=1)

In [0]:
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

class NeuralNet(nn.Module):
    """Feed-forward classifier with one hidden layer.

    Architecture: Linear(input_dim, hidden_dim) -> ReLU -> Linear(hidden_dim,
    num_classes). The output is raw logits, as expected by CrossEntropyLoss.

    Args:
        input_dim: Width of each input feature vector. Defaults to 815.
        hidden_dim: Width of the hidden layer. Defaults to 700.
        num_classes: Number of output logits. Defaults to 50.

    The defaults reproduce the previously hard-coded layer sizes, so
    ``NeuralNet()`` builds the same network as before.
    """

    def __init__(self, input_dim=815, hidden_dim=700, num_classes=50):
        super().__init__()
        self.hidden1 = nn.Linear(input_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, num_classes)

        self.relu = nn.ReLU()
        # Registered but not applied in forward(); kept so the module's
        # attributes and repr stay as they were.
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, num_classes)`` logits."""
        x = self.hidden1(x)
        x = self.relu(x)
        x = self.output(x)
        return x



In [0]:
# Cast the classifier to float64 (the dtype of the combined features) and move
# it to `device` BEFORE the optimizer is built, so the optimizer is constructed
# over the parameters in their final dtype and location.
nnet = NeuralNet().double().to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(nnet.parameters(), lr=0.001)

# Last expression of the cell: display the module structure.
nnet

NeuralNet(
  (hidden1): Linear(in_features=815, out_features=700, bias=True)
  (output): Linear(in_features=700, out_features=50, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
)

In [0]:
# Create the DataLoader for our training set.
# These loaders serve the combined (BERT + bag-of-words) feature vectors to the
# NeuralNet classifier; training batches are shuffled via RandomSampler.
# NOTE: train_data / train_dataloader / validation_* are re-bound here and
# replace the token-id loaders defined earlier, so the embedding-extraction
# cells cannot be re-run after this cell without first rebuilding those loaders.
train_data = TensorDataset(train_features_comb, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

# Evaluation keeps dataset order (SequentialSampler).
validation_data = TensorDataset(test_features_comb, test_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=32)

In [0]:

# Train the classifier on the combined features.
# NOTE: this re-binds `epochs`, replacing the value set in the parameters cell.
epochs = 100

for epoch in range(1, epochs+1):
    nnet.train()
    epoch_loss = 0.0
    epoch_acc = 0
    for batch in train_dataloader:
        features = batch[0].to(device)
        labels = batch[1].to(device)
        optimizer.zero_grad()

        outputs = nnet(features)
        vals, inds = torch.max(outputs, dim=1)

        loss = loss_function(outputs, labels)
        # Accumulate a plain float: adding the loss tensor itself would keep
        # every batch's autograd history alive for the whole epoch.
        epoch_loss += loss.item()

        acc = torch.eq(inds, labels).sum().item() / labels.shape[0]
        epoch_acc += acc

        loss.backward()
        optimizer.step()

    # Both figures are means over batches, not over individual samples.
    avg_epoch_loss = epoch_loss / len(train_dataloader)
    avg_epoch_acc = epoch_acc / len(train_dataloader)
    print(f"Epoch: [{epoch}/{epochs}], loss: {avg_epoch_loss}, acc: {avg_epoch_acc}")


In [0]:
# Evaluate the classifier on the validation loader, collecting the per-sample
# predictions (`preds`) and true labels (`trues`) for the metric cells below.
nnet.eval()
total_correct = 0
total_seen = 0
preds = []
trues = []
for batch in validation_dataloader:
    features = batch[0].to(device)
    labels = batch[1].to(device)
    trues.extend(labels)

    with torch.no_grad():
        outputs = nnet(features)
        vals, inds = torch.max(outputs, dim=1)
        preds.extend(inds)
        loss = loss_function(outputs, labels)
        print(f"Loss: {loss}")
    # Count samples rather than averaging per-batch accuracies: a smaller final
    # batch would otherwise get the same weight as a full one.
    total_correct += torch.eq(inds, labels).sum().item()
    total_seen += labels.shape[0]
avg_acc = total_correct / total_seen if total_seen else float("nan")

print(f'Acc: {avg_acc}')


Loss: 0.12200262460178901
Loss: 1.1559330537118295
Loss: 1.9726935365004608
Loss: 0.7785658032196777
Loss: 0.24049036764612924
Loss: 1.5836146557194524
Loss: 0.5428488692304398
Loss: 0.5594733800538633
Loss: 0.9071051551107013
Loss: 0.7174421776457237
Loss: 0.9718123500830679
Loss: 1.0838673740193756
Loss: 1.0315812517406668
Loss: 0.4610786831195107
Loss: 0.5711651786702145
Loss: 0.9163085083728522
Acc: 0.821484375


In [0]:
# Turn the per-sample tensors gathered during evaluation into integer NumPy
# vectors: `pred_array` holds predicted class ids, `gold` the true class ids.
sample_count = len(preds)
pred_array = np.zeros(sample_count)
gold = np.zeros(sample_count)
for idx, true_label in enumerate(trues):
    pred_array[idx] = int(preds[idx].cpu().numpy())
    gold[idx] = true_label.cpu().numpy()
pred_array, gold = pred_array.astype(int), gold.astype(int)

In [0]:
# Confusion matrix over the 50 classes: rows = true label, columns = predicted label.
confmatrix = np.zeros((50, 50))
# Index with the integer NumPy predictions (`pred_array`), not the raw torch
# tensors in `preds`, so indexing never depends on tensor-to-index conversion.
for label, pred in zip(gold, pred_array):
    confmatrix[label, pred] += 1
# Per-class (tp, fp, fn, tn) rows consumed by the macro-average metric functions.
parsed_confmatrix = parse_confusion_matrix(confmatrix)

In [0]:
# Macro-averaged F1 over the classes, from the evaluation confusion matrix.
calculate_avg_f1(parsed_confmatrix)

0.8201109393067259

In [0]:
# Macro-averaged Matthews correlation coefficient, from the evaluation confusion matrix.
calculate_avg_mcc(parsed_confmatrix)

0.821169843706492

In [0]:
# Macro-averaged precision, from the evaluation confusion matrix.
calculate_avg_precision(parsed_confmatrix)

0.836338873544756

In [0]:
# Macro-averaged recall, from the evaluation confusion matrix.
calculate_avg_recall(parsed_confmatrix)

0.8219999999999997