In [75]:
!pip install transformers



In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer
import nltk
import torch
import torch.nn as nn
import numpy as np
import random
from scipy.stats import norm
import math

In [77]:
# #  directory with our data
directory="/content/drive/MyDrive/splits/"

In [78]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on {}".format(device))

Running on cpu


In [79]:
data = pd.read_csv('adjudicated.csv')
data['original text'] = data['original text'].str.replace('\t', '').str.replace('\n', '')
data.head()

Unnamed: 0,data point ID,adjudicated,label,original text
0,0,adjudicated,TTA,my boyfriend and I live with my parents. I pay...
1,1,adjudicated,TTA,Hey Reddit! My original post got taken down fo...
2,2,adjudicated,TTA,"To provide context, our daughter is 19 months ..."
3,3,adjudicated,ESH,My husband sprung a very sudden trip on us to ...
4,4,adjudicated,YTA,I'm a little on the fence about this although ...


In [80]:
# randomizing adjudicated data
randomized_data = data.sample(frac=1, random_state=42).reset_index(drop=True)
randomized_data

Unnamed: 0,data point ID,adjudicated,label,original text
0,361,adjudicated,YTA,"I found out I had a huge brain tumor in June, ..."
1,73,adjudicated,YTA,"To make a long story very short, I (38M) met m..."
2,374,adjudicated,TTA,I always wondered how people ended up in the s...
3,155,adjudicated,NAH,"My daughter, Rachel 17, has always been a trac..."
4,104,adjudicated,YTA,"My brother (37 M), who is two years older than..."
...,...,...,...,...
495,106,adjudicated,NAH,Here me out first as the title seems to indica...
496,270,adjudicated,YTA,"me (f15) and my sister (f18) are, or were, ver..."
497,348,adjudicated,TTA,My wife (27F) and I (31M) were playing Super M...
498,435,adjudicated,TTA,"For starters, M(22) and my girlfriend F(23) ha..."


In [81]:
train_dev, test = train_test_split(randomized_data, test_size=0.2, random_state=42)
train, dev = train_test_split(train_dev, test_size=0.25, random_state=42)

print(f"size of train: {train.shape}")
print(f"size of test: {test.shape}")
print(f"size of dev: {dev.shape}")

size of train: (300, 4)
size of test: (100, 4)
size of dev: (100, 4)


In [82]:
# saving train, test, and dev files as txt files ONLY RUN ONCE
train.to_csv("splits/train.txt", sep='\t',  index=False)

test.to_csv("splits/dev.txt", sep='\t', index=False)

dev.to_csv("splits/test.txt",  sep='\t', index=False)

In [83]:
def read_labels(filename):
    labels={}
    with open(filename) as file:
        for line in file:
            cols = line.split("\t")
            label = cols[2]
            if label not in labels:
                labels[label]=len(labels)
    return labels

In [84]:
def read_data(filename, labels, max_data_points=1000):

    data = []
    data_labels = []
    with open(filename) as file:
        for line in file:
            cols = line.split("\t")
            label = cols[2]
            text = cols[3]

            data.append(text)
            data_labels.append(labels[label])


    # shuffle the data
    tmp = list(zip(data, data_labels))
    random.shuffle(tmp)
    data, data_labels = zip(*tmp)

    if max_data_points is None:
        return data, data_labels

    return data[:max_data_points], data_labels[:max_data_points]

In [85]:
labels=read_labels("splits/train.txt")

In [86]:
train_x, train_y=read_data("splits/train.txt", labels, max_data_points=None)

In [87]:
dev_x, dev_y=read_data("splits/dev.txt", labels, max_data_points=None)

In [88]:
test_x, test_y=read_data("splits/test.txt", labels, max_data_points=None)

In [89]:
def evaluate(model, x, y):
    model.eval()
    corr = 0.
    total = 0.
    with torch.no_grad():
        for x, y in zip(x, y):
            y_preds=model.forward(x)
            for idx, y_pred in enumerate(y_preds):
                prediction=torch.argmax(y_pred)
                if prediction == y[idx]:
                    corr += 1.
                total+=1
    return corr/total, total

In [90]:
class BERTClassifier(nn.Module):

    def __init__(self, bert_model_name, params):
        super().__init__()

        self.model_name=bert_model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name, do_lower_case=params["doLowerCase"], do_basic_tokenize=False)
        self.bert = BertModel.from_pretrained(self.model_name)

        self.num_labels = params["label_length"]

        self.fc = nn.Linear(params["embedding_size"], self.num_labels)

    def get_batches(self, all_x, all_y, batch_size=15, max_toks=510):

        """ Get batches for input x, y data, with data tokenized according to the BERT tokenizer
      (and limited to a maximum number of WordPiece tokens """

        batches_x=[]
        batches_y=[]

        for i in range(0, len(all_x), batch_size):

            current_batch=[]

            x=all_x[i:i+batch_size]

            batch_x = self.tokenizer(x, padding=True, truncation=True, return_tensors="pt", max_length=max_toks)
            batch_y=all_y[i:i+batch_size]

            batches_x.append(batch_x.to(device))
            batches_y.append(torch.LongTensor(batch_y).to(device))

        return batches_x, batches_y


    def forward(self, batch_x):

        bert_output = self.bert(input_ids=batch_x["input_ids"],
                         attention_mask=batch_x["attention_mask"],
                         token_type_ids=batch_x["token_type_ids"],
                         output_hidden_states=True)

      # We're going to represent an entire document just by its [CLS] embedding (at position 0)
      # And use the *last* layer output (layer -1)
      # as a result of this choice, this embedding will be optimized for this purpose during the training process.

        bert_hidden_states = bert_output['hidden_states']

        # out = bert_hidden_states[-1][:,0,:]

        # out = self.fc(out)

        # return out.squeeze()

         # Pooling strategy: Mean pooling
        pooled_output = torch.mean(bert_hidden_states[-1], dim=1)  # Mean pooling over token embeddings

        # Classification head
        logits = self.fc(pooled_output)

        return logits

In [91]:
class CombinedBERTClassifier(BERTClassifier):
    def __init__(self, bert_model_name, params):
        super().__init__(bert_model_name, params)

        # Additional components specific to HierarchicalBERTClassifier
        self.sentence_attention = nn.Linear(params["embedding_size"], 1)
        self.dropout = nn.Dropout(.1)

    def forward(self, batch_x):
      input_ids = batch_x["input_ids"]
      attention_mask = batch_x["attention_mask"]
      token_type_ids = batch_x["token_type_ids"]

      # BERT encoding
      bert_output = self.bert(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              output_hidden_states=True)
      bert_hidden_states = bert_output['hidden_states']  # Shape: (batch_size, seq_length, hidden_size)

      # Sentence-level attention
      sentence_att_scores = self.sentence_attention(bert_hidden_states[-1]).squeeze(-1)
      sentence_att_weights = F.softmax(sentence_att_scores, dim=1)  # Shape: (batch_size, seq_length)
      sentence_att_embeddings = torch.bmm(sentence_att_weights.unsqueeze(1), bert_hidden_states[-1]).squeeze(1)


      cls_embedding = bert_hidden_states[-1][:, 0, :]
      combined_embeddings = sentence_att_embeddings +  cls_embedding



      combined_embeddings = self.dropout(combined_embeddings)
      combined_embeddings = self.fc(combined_embeddings)

      return combined_embeddings


In [92]:
def confidence_intervals(accuracy, n, significance_level):
    critical_value=(1-significance_level)/2
    z_alpha=-1*norm.ppf(critical_value)
    se=math.sqrt((accuracy*(1-accuracy))/n)
    return accuracy-(se*z_alpha), accuracy+(se*z_alpha)

In [93]:
import torch.nn as nn
import torch.nn.functional as F


def calculate_accuracy(logits, targets):
    _, predicted = torch.max(logits, 1)
    correct = (predicted == targets).float()
    accuracy = correct.sum() / len(correct)
    return accuracy.item()

def train(bert_model_name, model_filename, train_x, train_y, dev_x, dev_y, labels, embedding_size=768, doLowerCase=None):

    bert_model = CombinedBERTClassifier(bert_model_name, params={"label_length": len(labels), "doLowerCase":doLowerCase, "embedding_size":embedding_size})
    bert_model.to(device)

    # # Freezing layers
    # for param in bert_model.bert.parameters():
    #     param.requires_grad = False

    batch_x, batch_y = bert_model.get_batches(train_x, train_y)
    dev_batch_x, dev_batch_y = bert_model.get_batches(dev_x, dev_y)

    # Parameters for optimization
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, bert_model.parameters()), lr=1e-4)
    cross_entropy = nn.CrossEntropyLoss()

    num_epochs = 30
    best_dev_acc = 0.
    patience = 5
    best_epoch = 0

    for epoch in range(num_epochs):
        bert_model.train()
        total_correct = 0
        total_samples = 0

        # Train
        for x, y in zip(batch_x, batch_y):
            y_pred = bert_model.forward(x)
            loss = cross_entropy(y_pred.view(-1, bert_model.num_labels), y.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Calculate training accuracy
            batch_accuracy = calculate_accuracy(y_pred, y)
            total_correct += batch_accuracy * len(y)
            total_samples += len(y)

        train_accuracy = total_correct / total_samples




        # Evaluate
        dev_accuracy, _ = evaluate(bert_model, dev_batch_x, dev_batch_y)
        if epoch % 1 == 0:
            print("Epoch %s, dev accuracy: %.3f" % (epoch, dev_accuracy))
            if dev_accuracy > best_dev_acc:
                torch.save(bert_model.state_dict(), model_filename)
                best_dev_acc = dev_accuracy
                best_epoch = epoch
        if epoch - best_epoch > patience:
            print("No improvement in dev accuracy over %s epochs; stopping training" % patience)
            break

    bert_model.load_state_dict(torch.load(model_filename))

    print("\nBest Performing Model achieves dev accuracy of : %.3f" % (best_dev_acc))
    print("\nTraining accuracy of : %.3f" % (train_accuracy))

    return bert_model

In [94]:
# small BERT -- can run on laptop
bert_model_name="google/bert_uncased_L-2_H-128_A-2"
model_filename="mybert.model"
embedding_size=128
doLowerCase=True

# bert-base -- slow on laptop; better on Colab
# bert_model_name="google/bert_uncased_L-2_H-128_A-2"
# model_filename="mybert.model"
# embedding_size=768
# doLowerCase=False

model=train(bert_model_name, model_filename, train_x, train_y, dev_x, dev_y, labels, embedding_size=embedding_size, doLowerCase=doLowerCase)

Epoch 0, dev accuracy: 0.416
Epoch 1, dev accuracy: 0.406
Epoch 2, dev accuracy: 0.406
Epoch 3, dev accuracy: 0.386
Epoch 4, dev accuracy: 0.396
Epoch 5, dev accuracy: 0.406
Epoch 6, dev accuracy: 0.366
No improvement in dev accuracy over 5 epochs; stopping training

Best Performing Model achieves dev accuracy of : 0.416

Training accuracy of : 0.615


In [95]:
test_batch_x, test_batch_y = model.get_batches(test_x, test_y)
accuracy, test_n=evaluate(model, test_batch_x, test_batch_y)

lower, upper=confidence_intervals(accuracy, test_n, .95)
print("Test accuracy for best dev model: %.3f, 95%% CIs: [%.3f %.3f]\n" % (accuracy, lower, upper))

Test accuracy for best dev model: 0.396, 95% CIs: [0.301 0.491]




### Tuning Process

When initially running the existing bert model on Colab, bert-base, we would get exceed the memory usage. Consequently, we decided to switch to the small BERT model instead. With that we began our fine tuning process:

Test 1: Changing batch size

Initially, we thought increasing batch size would allow for better model accuracy, but after running the model from batch sizes ranging from 15 to 50, although the difference in accuracy wasn't that much of a difference, have a small batch size of 15 seemed to reduce overfitting as the difference between the training accruacy and test accuracy was smaller. Training with small batches introduces noise into the optimization process, which acts as a form of regularization. Furthermore, smaller batach sizes seemed to converge faster.

Test 2: Freezing layers

Next, we decided to try to freeze the parameters of the of the BERT model. This means that the pretrained layers will remain fixed throughout the training process. We decided to try to do this as you can retain the pre-trained representations and avoid overwriting them with gradients computed from our specific task. This could've been benfifical for our smaller training data, allowing us to leverage the rich representations of the pretrained model. However, this made our results significantly worse so we commented it out.

Test 3: Changing learning rate

We tried both increasing and decreasing the learning weight and found that the accuracy is best from 1e-5 to 1e-4. We picked 1e-5 as it converged faster, with fewer epoches required.

Test 4: Mean pooling
We wanted to experiment with another method of obtaining the fixed-size representations from the outputs of BERT, so we tried mean pooling, which involves taking the mean of the output embeddings across all tokens in a sequence. However, we found that CLS is more effective with a higher accruacy for tasks where the sequence-level information is crucial, such as, for our purposes, text classification, so we decided to not do mean pooling.  

Test 5: Adding layers

We added two additional layers to the BertClassifier (CombinedBERTClassifier), a linear layer for sentence attention and a drouput layer. We decided to include the  sentence attention layer as our data consists of paragrpaghs of text. Within a paragraph, certain sentences may carry more significance than others in conveying the main idea or providing crucial details. Sentence attention enables the model to identify and prioritize these key sentences, enhancing its understanding of the paragraph's content. Furthermore, dropout layers are a regularization technique used during training to prevent overfitting by randomly deactivating neuronsThus, adding these additional layers helped increase our accuracy.









