<a href="https://colab.research.google.com/github/fawazshah/Reddit-Analysis/blob/main/5_direct_transfer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
import numpy as np
import pandas as pd
import random
import requests
from sklearn.metrics import f1_score, accuracy_score, classification_report
import time
import transformers
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler
import transformers

In [3]:
# Fix every random number generator used by the notebook, then pick the device.
SEED = 42

# Python, NumPy and PyTorch (CPU + current CUDA device) each keep their own RNG.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Ask cuDNN for deterministic kernels.
torch.backends.cudnn.deterministic = True

# Use the first GPU when CUDA is available, otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")

### BERT setup

In [4]:
# Cross-entropy criterion, moved to the selected `device`.
loss_fn = nn.CrossEntropyLoss().to(device)

In [5]:
# Shared tokenizer; the helper functions defined below read it as a global.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

In [6]:
def compute_max_length(df, bert_input_func):
    """Return the longest tokenized length among the texts extracted from ``df``.

    Args:
        df: Data handed unchanged to ``bert_input_func``.
        bert_input_func: Callable that returns an iterable of texts for ``df``.

    Returns:
        The largest number of token ids produced by the global ``tokenizer``
        for any single text (``[CLS]`` and ``[SEP]`` included), or 0 when
        there are no texts.
    """
    token_counts = (
        # add_special_tokens=True so the count includes `[CLS]` and `[SEP]`.
        len(tokenizer.encode(text, add_special_tokens=True))
        for text in bert_input_func(df)
    )
    return max(token_counts, default=0)

In [7]:
def compute_sentences_article_body(df):
    """Return the values of the ``'article body'`` column of ``df`` as a list."""
    article_column = df['article body']
    return article_column.tolist()

def compute_sentences_comment_body(df):
    """Return the values of the ``'comment body'`` column of ``df`` as a list."""
    comment_column = df['comment body']
    return comment_column.tolist()

In [8]:
def create_bert_dataset(df, bert_input_func, max_sequence_len):
    """Tokenize the texts of ``df`` and bundle them with their labels.

    Args:
        df: Frame holding the texts and a ``'bias'`` column with the labels.
        bert_input_func: Callable returning the texts to encode for ``df``.
        max_sequence_len: Length every sequence is padded or truncated to.

    Returns:
        A ``TensorDataset`` of (token ids, token type ids, attention masks,
        labels), in that order.
    """
    # Each text is encoded on its own; with return_tensors='pt' every field
    # of the encoding is a tensor, concatenated along dim 0 below.
    encodings = [
        tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_sequence_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for text in bert_input_func(df)
    ]

    def _gather(field):
        # Concatenate one field of all encodings along the first dimension.
        return torch.cat([encoding[field] for encoding in encodings], dim=0)

    all_token_ids = _gather('input_ids')
    all_segment_ids = _gather('token_type_ids')
    all_attention_masks = _gather('attention_mask')
    bias_labels = torch.tensor(df['bias'].values)

    return TensorDataset(all_token_ids, all_segment_ids, all_attention_masks, bias_labels)

In [9]:
def train_BERT(train_dataloader, val_dataloader, model, number_epoch):
    """Fine-tune ``model`` and report training/validation metrics per epoch.

    Args:
        train_dataloader: Sized iterable of batches laid out as
            ``(token_ids, token_type_ids, attention_masks, labels)``.
        val_dataloader: Batches in the same layout; passed to
            ``evaluate_BERT`` after every training epoch.
        model: Called as ``model(token_ids, token_type_ids=...,
            attention_mask=..., labels=...)``; its output must expose
            ``.loss`` and ``.logits``.
        number_epoch: Number of passes over ``train_dataloader``.

    Returns:
        ``(train_loss, valid_loss)``: two lists holding one mean loss per
        epoch (the training loss is weighted by batch size).

    Raises:
        ValueError: If an epoch sees no training observations.
    """
    train_loss = []
    valid_loss = []

    optimizer = AdamW(model.parameters(),
                      lr=2e-5,
                      eps=1e-8)

    # Create the learning rate scheduler (one scheduler step per batch).
    total_steps = len(train_dataloader) * number_epoch
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                num_training_steps=total_steps)

    for epoch in range(1, number_epoch + 1):

        # TRAINING

        time0 = time.time()

        model.train()

        epoch_train_loss = 0
        no_observations = 0
        epoch_train_predictions = []
        epoch_train_labels = []

        for batch in train_dataloader:

            # Each batch contains token ids, token type ids, attention masks and labels
            b_token_ids = batch[0].to(device)
            b_token_type_ids = batch[1].to(device)
            b_attention_masks = batch[2].to(device)
            b_labels = batch[3].to(device)

            batch_size = b_labels.shape[0]
            no_observations += batch_size

            # Clear gradients left over from the previous step before
            # computing new ones.
            model.zero_grad()

            output = model(b_token_ids,
                           token_type_ids=b_token_type_ids,
                           attention_mask=b_attention_masks,
                           labels=b_labels)

            loss = output.loss
            logits = output.logits

            predictions = np.argmax(logits.detach().cpu().numpy(), axis=1)
            labels = b_labels.detach().cpu().numpy()
            epoch_train_predictions.extend(predictions)
            epoch_train_labels.extend(labels)

            loss.backward()
            # Clip the norm of the gradients to 1 to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # Update the learning rate using the scheduler
            scheduler.step()

            # Weight by batch size so a smaller final batch does not skew the mean.
            epoch_train_loss += loss.item() * batch_size

        if no_observations == 0:
            # Fail with a clear message instead of a bare ZeroDivisionError.
            raise ValueError("train_dataloader produced no observations; cannot compute the training loss")

        epoch_train_loss = epoch_train_loss / no_observations
        epoch_train_acc = accuracy_score(epoch_train_labels, epoch_train_predictions)

        # VALIDATION

        epoch_valid_loss, epoch_val_predictions, epoch_val_labels = evaluate_BERT(val_dataloader, model)
        epoch_valid_acc = accuracy_score(epoch_val_labels, epoch_val_predictions)

        # FINALLY

        print(f"Epoch took: {time.time() - time0}")

        # Adjacent f-strings are concatenated; a backslash continuation inside
        # one literal would embed the source indentation in the log line.
        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_train_loss:.2f} | '
              f'Train Accuracy: {epoch_train_acc:.2f} | '
              f'Val. Loss: {epoch_valid_loss:.2f} | Val. Accuracy: {epoch_valid_acc:.2f} |')

        train_loss.append(epoch_train_loss)
        valid_loss.append(epoch_valid_loss)

    return train_loss, valid_loss

In [10]:
def evaluate_BERT(test_dataloader, model):
    """Run ``model`` over ``test_dataloader`` without gradient tracking.

    Args:
        test_dataloader: Iterable of batches laid out as
            ``(token_ids, token_type_ids, attention_masks, labels)``.
        model: Called as ``model(token_ids, token_type_ids=...,
            attention_mask=...)``; its output must expose ``.logits``.

    Returns:
        ``(mean_loss, predictions, labels)``: the loss from the global
        ``loss_fn`` averaged over all observations, the arg-max class of
        every observation, and the matching true labels.
    """
    model.eval()

    weighted_loss_sum = 0
    n_seen = 0
    all_predictions, all_labels = [], []

    with torch.no_grad():
        for batch in test_dataloader:
            # Move the four tensors of the batch to the active device.
            b_token_ids, b_token_type_ids, b_attention_masks, b_labels = (
                batch[idx].to(device) for idx in range(4)
            )
            batch_size = b_labels.shape[0]
            n_seen += batch_size

            logits = model(b_token_ids,
                           token_type_ids=b_token_type_ids,
                           attention_mask=b_attention_masks).logits
            batch_loss = loss_fn(logits, b_labels).item()

            all_predictions.extend(np.argmax(logits.detach().cpu().numpy(), axis=1))
            all_labels.extend(b_labels.detach().cpu().numpy())

            # Weight each batch by its size so the final value is a
            # per-observation mean.
            weighted_loss_sum += batch_loss * batch_size

    return weighted_loss_sum / n_seen, all_predictions, all_labels

### Article body -> comments

#### Loading data

In [11]:
# Locations of the preprocessed, tab-separated data files.
submissions_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/collated-data/submissions_preprocessed.tsv'
comments_url = 'https://raw.githubusercontent.com/fawazshah/Reddit-Analysis/master/data/collated-data/comments_preprocessed.tsv'

# Download each file and parse it into a DataFrame.
submissions_df = pd.read_csv(submissions_url, delimiter='\t')
comments_df = pd.read_csv(comments_url, delimiter='\t')

In [12]:
# Report how many rows each table contains.
print(
    f"No. submissions: {len(submissions_df)}",
    f"No. comments: {len(comments_df)}",
    sep="\n",
)

No. submissions: 806
No. comments: 11923


#### Encode labels as integers

In [13]:
# left == 0
# right == 1

def encode_labels(label):
    """Map a bias label to its integer class id.

    ``"left"`` becomes 0 and every other string becomes 1. A value that is
    already an encoded id (0 or 1) is returned unchanged, so re-running the
    encoding cell on an already-encoded column does not turn every label
    into 1.

    Args:
        label: Raw label string, or an already-encoded 0/1 id.

    Returns:
        0 or 1.
    """
    # Pass encoded ids straight through so the function is idempotent.
    if not isinstance(label, str) and label in (0, 1):
        return int(label)
    if label == "left":
        return 0
    else:
        return 1

# Replace the 'bias' column of both tables with its encoded form.
submissions_df['bias'] = submissions_df['bias'].map(encode_labels)
comments_df['bias'] = comments_df['bias'].map(encode_labels)

#### Split data into train/val/test

In [14]:
# Fractions of the data used for training, validation and testing.
TRAIN, VAL, TEST = 0.7, 0.1, 0.2

In [15]:
# Contiguous split in row order: the first TRAIN share is the training set,
# the next VAL share the validation set, and the remainder the test set.
# NOTE(review): rows are not shuffled first - confirm the file order is not
# sorted by label or source.
n_submissions = len(submissions_df)
split_point_1 = int(TRAIN * n_submissions)
split_point_2 = int((TRAIN + VAL) * n_submissions)

# .copy() gives each split its own frame instead of a slice of the original.
submissions_train_df = submissions_df.iloc[:split_point_1].copy()
submissions_val_df = submissions_df.iloc[split_point_1:split_point_2].copy()
submissions_test_df = submissions_df.iloc[split_point_2:].copy()

In [16]:
# Report the number of rows in each split.
print(
    f"Size of training set: {len(submissions_train_df)}",
    f"Size of validation set: {len(submissions_val_df)}",
    f"Size of test set: {len(submissions_test_df)}",
    sep="\n",
)

Size of training set: 564
Size of validation set: 80
Size of test set: 162


#### Create BERT dataloaders

In [17]:
# Number of sequences per batch for every DataLoader created below.
BATCH_SIZE = 10

# Registry of the DataLoaders, keyed by the kind of text they serve.
dataloaders = dict()

In [18]:
# Using no folds
# 512 is the max_sequence_len given to the tokenizer: every article body is
# padded or truncated to that many tokens.
dataloaders['article bodies'] = {}

# Only the training data is shuffled. Validation and test batches are served
# in dataset order so evaluation is repeatable, predictions line up with the
# rows of the source frame, and evaluating does not consume the global RNG.
train_dataset = create_bert_dataset(submissions_train_df, compute_sentences_article_body, 512)
dataloaders['article bodies']['train'] = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=BATCH_SIZE)

val_dataset = create_bert_dataset(submissions_val_df, compute_sentences_article_body, 512)
dataloaders['article bodies']['val'] = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=BATCH_SIZE)

test_dataset = create_bert_dataset(submissions_test_df, compute_sentences_article_body, 512)
dataloaders['article bodies']['test'] = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=BATCH_SIZE)

In [19]:
# Encode every comment body (padded/truncated to 512 tokens) and wrap the
# result in a DataLoader.
# NOTE(review): batches are drawn in random order - if this loader is only
# used for evaluation, a SequentialSampler would make runs repeatable; confirm
# against the cells that consume it.
comment_dataset = create_bert_dataset(
    comments_df,
    compute_sentences_comment_body,
    512,
)
dataloaders['comment bodies'] = DataLoader(
    comment_dataset,
    sampler=RandomSampler(comment_dataset),
    batch_size=BATCH_SIZE,
)