# Overview
- The following code is a BERT implementation that has been fine-tuned on the FNC training dataset for stance detection
- Input: competition_test_stances_unlabeled.csv and competition_test_bodies.csv
- Output: competition_test_stances.csv (after our model has labelled the input data)


# Step 0: Install and import required packages

In [None]:
!pip install -q transformers==2.8.0

[K     |████████████████████████████████| 563 kB 9.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 14.3 MB/s 
[K     |████████████████████████████████| 132 kB 39.9 MB/s 
[K     |████████████████████████████████| 895 kB 40.0 MB/s 
[K     |████████████████████████████████| 5.6 MB 8.5 MB/s 
[K     |████████████████████████████████| 79 kB 4.4 MB/s 
[K     |████████████████████████████████| 8.7 MB 37.4 MB/s 
[K     |████████████████████████████████| 138 kB 24.4 MB/s 
[K     |████████████████████████████████| 127 kB 62.6 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
[?25h

In [None]:
!pip install -q -U watermark

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
markdown 3.3.6 requires importlib-metadata>=4.4; python_version < "3.10", but you have importlib-metadata 2.1.3 which is incompatible.[0m


In [None]:
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

Python implementation: CPython
Python version       : 3.7.13
IPython version      : 5.5.0

numpy       : 1.21.5
pandas      : 1.3.5
torch       : 1.10.0+cu111
transformers: 2.8.0



In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import transformers
import torch
import torch.nn.functional as F

from collections import defaultdict
from matplotlib import rc
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, DistilBertModel, DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
from textwrap import wrap
from torch import nn, optim
# handles things like batching
from torch.utils.data import Dataset, DataLoader

# lets us use csv data on our drive
from google.colab import drive
drive.mount('/content/drive')

# Fix the NumPy and PyTorch RNG seeds so runs are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

# Stance name for each integer label id. The position of a name in this list
# MUST equal the id assigned to that stance when `stance_label` is built during
# preprocessing (unrelated=0, discuss=1, agree=2, disagree=3); otherwise the
# model's predicted ids are decoded to the wrong stance names.
class_names = ['unrelated', 'discuss', 'agree', 'disagree']
torch.cuda.empty_cache()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Step 1: Data Exploration
In this section we load in the training and test dataset that have been provided by the FNC Github.

In [None]:
# Read the FNC bodies and stances for both the training and the competition
# test set. dtype=str keeps every column (including 'Body ID') as text.
df_train_bodies = pd.read_csv(
    "drive/My Drive/MSCI_598_FNC_DATA/train_bodies.csv", dtype=str
)
df_train_stances = pd.read_csv(
    "drive/My Drive/MSCI_598_FNC_DATA/train_stances.csv", dtype=str
)
df_test_bodies = pd.read_csv(
    "drive/My Drive/MSCI_598_FNC_DATA/competition_test_bodies.csv", dtype=str
)
df_test_stances_unlabeled = pd.read_csv(
    "drive/My Drive/MSCI_598_FNC_DATA/competition_test_stances_unlabeled.csv", dtype=str
)

# Training set: pair every article body with its headline/stance rows.
df_train = df_train_bodies.merge(df_train_stances, on='Body ID', how='inner')

# Test set: a left join keeps every unlabeled headline row, in its original
# order, and attaches the matching article body.
df_test = df_test_stances_unlabeled.merge(df_test_bodies, on='Body ID', how='left')


In [None]:
# # show table of training dataset stances
# # majority of stances are unrelated
# # TODO: say something about this in the report
# sns.countplot(df_train.Stance)
# plt.xlabel('Stance')

In [None]:
# print(df_train.dtypes)

In [None]:
# # translate dataset into unrelated and related stances
# def to_related(stance):
#  if stance == 'agree' or stance == 'disagree' or stance == 'discuss':
#    return 'related'
#  else:
#    return 'unrelated'

# df_train['relation'] = df_train.Stance.apply(to_related)
# sns.countplot(df_train.relation)
# plt.xlabel('relation')

## Step 2: Data Preprocessing
In this section we convert the headline and body data to be acceptable in the BERT model.


In [None]:
# Initialize desired BERT model
# TODO: test out both cased and uncased to see what performs better
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

# Load the pre-trained BertTokenizer. (`return_dict` is an option of the model's
# forward pass, not of the tokenizer, so it is not passed here.)
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# [SEP] tokens mark the end of a sentence. In our case it separates the headline from the body.
tokenizer.sep_token, tokenizer.sep_token_id

# [CLS] tokens must be prepended to the start of each sequence
# this lets BERT know we're doing classification
tokenizer.cls_token, tokenizer.cls_token_id

# [UNK] marks any token in our data that is NOT in the tokenizer's vocabulary
tokenizer.unk_token, tokenizer.unk_token_id

# [PAD] is the padding token that lets us pass sequences of constant length
# (only this last expression is displayed as the cell's output)
tokenizer.pad_token, tokenizer.pad_token_id

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

('[PAD]', 0)

In [None]:
# merge headline and article text separated by a [SEP] token
df_train["headlineWithArticle"] = df_train["Headline"] + ' ' + tokenizer.sep_token + ' ' + df_train["articleBody"]
# Encode each stance string as its position in `class_names`, the same list that
# is used to turn predicted ids back into stance names, so the encoding used for
# training and the decoding used for prediction can never disagree.
# An unknown stance raises KeyError instead of being silently mislabelled.
stance_to_label = {name: label for label, name in enumerate(class_names)}
df_train['stance_label'] = df_train['Stance'].apply(lambda x: stance_to_label[x])
df_train.head()

Unnamed: 0,Body ID,articleBody,Headline,Stance,headlineWithArticle,stance_label
0,0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...",unrelated,"Soldier shot, Parliament locked down after gun...",0
1,0,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,unrelated,Tourist dubbed ‘Spider Man’ after spider burro...,0
2,0,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,unrelated,Luke Somers 'killed in failed rescue attempt i...,0
3,0,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Ottawa,unrelated,BREAKING: Soldier shot at War Memorial in Otta...,0
4,0,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,unrelated,Giant 8ft 9in catfish weighing 19 stone caught...,0


In [None]:
df_train.shape

(49972, 6)

In [None]:
# Build the model input for every test row: "<headline> [SEP] <article body>"
df_test["headlineWithArticle"] = (
    df_test["Headline"] + f' {tokenizer.sep_token} ' + df_test["articleBody"]
)
df_test.head()

Unnamed: 0,Headline,Body ID,articleBody,headlineWithArticle
0,Ferguson riots: Pregnant woman loses eye after...,2008,A RESPECTED senior French police officer inves...,Ferguson riots: Pregnant woman loses eye after...
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,Dave Morin's social networking company Path is...,Crazy Conservatives Are Sure a Gitmo Detainee ...
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,A bereaved Afghan mother took revenge on the T...,A Russian Guy Says His Justin Bieber Ringtone ...
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,Hewlett-Packard is officially splitting in two...,"Zombie Cat: Buried Kitty Believed Dead, Meows ..."
4,Argentina's President Adopts Boy to End Werewo...,37,An airline passenger headed to Dallas was remo...,Argentina's President Adopts Boy to End Werewo...


# Step 2.1: Going over basic operations to convert sample text to tokens and tokens to unique integers (ids) (OPTIONAL)

In [None]:
# Toy sentence ending in the tokenizer's separator token.
sample_txt = f'When was I last outside? I am stuck at home for 2 weeks. {tokenizer.sep_token}'
# Split the text into word-piece tokens, then look up each token's vocabulary id.
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
for label, value in ((' Sentence', sample_txt), (' Tokens', tokens), (' Token IDs', token_ids)):
    print(f'{label}: {value}')

 Sentence: When was I last outside? I am stuck at home for 2 weeks. [SEP]
 Tokens: ['When', 'was', 'I', 'last', 'outside', '?', 'I', 'am', 'stuck', 'at', 'home', 'for', '2', 'weeks', '.', '[SEP]']
 Token IDs: [1332, 1108, 146, 1314, 1796, 136, 146, 1821, 5342, 1120, 1313, 1111, 123, 2277, 119, 102]


In [None]:
# Encode the sample text into the fixed-length tensors BERT consumes.
encoding = tokenizer.encode_plus(
    sample_txt,
    add_special_tokens=True,      # wrap the text in '[CLS]' ... '[SEP]'
    max_length=32,
    pad_to_max_length=True,       # pad with [PAD] up to max_length
                                  # (alternative spelling kept from the original: padding='max_length')
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='pt',          # return PyTorch tensors
)

encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
# notice the padding and the CLS starting token
print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

32


tensor([ 101, 1332, 1108,  146, 1314, 1796,  136,  146, 1821, 5342, 1120, 1313,
        1111,  123, 2277,  119,  102,  102,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])

In [None]:
# notice the attention mask has the same length
# In short, this indicates to the model which words should be attended to, and which should not, making it faster.
print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

32


tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# inverse the tokenization to look at the special tokens
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

['[CLS]',
 'When',
 'was',
 'I',
 'last',
 'outside',
 '?',
 'I',
 'am',
 'stuck',
 'at',
 'home',
 'for',
 '2',
 'weeks',
 '.',
 '[SEP]',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

# Step 2.2: Choosing Sequence Length
BERT works with fixed-length sequences. We use a simple strategy to choose the max length. First we will examine the token length of each headline and article body.

In [None]:
# token_lens = []

# for txt in df_train_stances.Headline:
#   tokens = tokenizer.encode(txt, max_length=80)
#   token_lens.append(len(tokens))

# sns.displot(token_lens)
# plt.xlim([0, 256]);
# plt.xlabel('Token count for headline');

In [None]:
# token_lens = []

# for txt in df_train_bodies.articleBody:
#   tokens = tokenizer.encode(txt)
#   token_lens.append(len(tokens))

# sns.displot(token_lens)
# plt.xlim([0, 4000]);
# plt.xlabel('Token count for bodies');

# Step 2.3: Create a PyTorch dataset class

- We will be feeding in the headline and article as separate sequences during training
- The headline seems to be <100 tokens
- However, the body content can range into thousands of tokens
- Also, the max acceptable sequence length for BERT is 512
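- So, we concatenate the headline with the article body and truncate the combined sequence to `MAX_LEN` tokens (which must not exceed 512): whatever space the headline leaves is filled with article tokens, and training is conducted on this input data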

In [None]:
# TODO: Experiment with this value, the smaller the better since 12gb ram limitation
MAX_LEN = 350


class FNCDataset(Dataset):
  """Map-style dataset that yields one tokenized headline+article sample per index.

  Each item is a dict holding the raw text, the flattened `input_ids` and
  `attention_mask` produced by the tokenizer, and the stance label as a
  `torch.long` tensor.
  """

  def __init__(self, headlineWithArticle, stance, tokenizer, max_len):
    # Parallel, integer-indexable collections of texts and label ids.
    self.headlineWithArticle = headlineWithArticle
    self.stance = stance
    # Tokenizer exposing `encode_plus`, and the max_length handed to it.
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of samples (one per headline/article text)."""
    return len(self.headlineWithArticle)

  def __getitem__(self, item):
    """Tokenize and return the sample stored at position `item`."""
    text = self.headlineWithArticle[item]
    label = self.stance[item]

    encoded = self.tokenizer.encode_plus(
      text,
      max_length=self.max_len,
      add_special_tokens=True,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_token_type_ids=False,
      return_tensors='pt',
    )

    # The tokenizer output is flattened to 1-D per sample so that the
    # DataLoader's batching produces (batch, length) tensors.
    sample = {'headlineWithArticle': text}
    sample['input_ids'] = encoded['input_ids'].reshape(-1)
    sample['attention_mask'] = encoded['attention_mask'].reshape(-1)
    sample['stance'] = torch.tensor(label, dtype=torch.long)
    return sample

In [None]:
# Hold out 10% of the merged training rows as a validation set; the fixed
# random_state makes the split repeatable. Note this rebinds `df_train` to the
# remaining 90%, so re-running the cell splits the already-reduced frame again.
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=RANDOM_SEED)
df_train.shape, df_val.shape

((44974, 6), (4998, 6))

# Step 2.4: Instantiate Data Loaders and Link to Dataset Class

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size):
  """Wrap a preprocessed FNC dataframe in a batched DataLoader.

  Args:
    df: dataframe with a `headlineWithArticle` text column and an integer
      `stance_label` column.
    tokenizer: tokenizer handed to `FNCDataset` to encode each text.
    max_len: maximum (padded/truncated) token length of every sample.
    batch_size: number of samples per batch.

  Returns:
    A `DataLoader` over an `FNCDataset`, iterating in dataframe order.
  """
  # this is where we shape the raw FNC data to acceptable BERT formatted data
  ds = FNCDataset(
    headlineWithArticle=df.headlineWithArticle.to_numpy(),
    stance=df.stance_label.to_numpy(),
    tokenizer=tokenizer,
    # Honour the caller's value; previously the global MAX_LEN was used here
    # and the `max_len` argument was silently ignored.
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=2
  )

In [None]:
# Batch size shared by the training and validation loaders.
BATCH_SIZE = 32
train_data_loader = create_data_loader(
  df=df_train, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE
)
val_data_loader = create_data_loader(
  df=df_val, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE
)
# No loader is built for the test set (it has no stance_label column):
# test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
# example batch from training data loader
data = next(iter(train_data_loader))
data.keys()


dict_keys(['headlineWithArticle', 'input_ids', 'attention_mask', 'stance'])

In [None]:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['stance'].shape)

torch.Size([32, 350])
torch.Size([32, 350])
torch.Size([32])


# Step 3: Stance Classification with BERT and Hugging Face

In [None]:
# load the bert model
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

# Step 3.1: Test BERT model on sample text

In [None]:
# Run the pre-trained BERT model on the encoded sample text and keep both outputs.
# last_hidden_state holds one hidden vector per input token; pooled_output is the
# hidden state of the first ([CLS]) token passed through BERT's pooling layer.
# NOTE(review): the two-way unpacking assumes the model returns a plain
# (sequence_output, pooled_output) tuple, as the pinned transformers 2.8.0 does —
# confirm before upgrading the library.
last_hidden_state, pooled_output = bert_model(
    input_ids=encoding['input_ids'],
    attention_mask=encoding['attention_mask']
)

In [None]:
# the last hidden state is a sequence of hidden states of the last layer of the model
last_hidden_state.shape

torch.Size([1, 32, 768])

In [None]:
# 768 represents the number of hidden units in the feedforward network
bert_model.config.hidden_size

768

In [None]:
# pooled_output is retrieved by applying the BertPooler on the last_hidden_state
# can think of pooled_output as a summary of the content
pooled_output.shape

torch.Size([1, 768])

# Step 3.2: Create a stance classifier that uses the BERT model

We use a dropout layer for some regularization and a fully-connected layer for our output. Note that we return the raw output of the last layer since it's required for cross-entropy loss in pytorch to work.

In [None]:
class StanceClassifier(nn.Module):
  """Pre-trained BERT encoder with a dropout + linear classification head.

  `forward` returns raw, un-normalized logits of shape (batch, n_classes);
  no softmax is applied because `nn.CrossEntropyLoss` expects logits.
  """

  def __init__(self, n_classes):
    """Load the pre-trained encoder and build a head with `n_classes` outputs."""
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    # TODO: test out different dropout values, this helps prevent overfitting
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the stance logits for a batch of token ids and attention masks."""
    bert_outputs = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # Take the pooled output by position instead of tuple-unpacking: indexing
    # works both when the encoder returns a plain tuple and when it returns a
    # dict-like output object (iterating such an object yields its keys, so
    # unpacking would bind strings instead of tensors).
    pooled_output = bert_outputs[1]
    return self.out(self.drop(pooled_output))

In [None]:
# create an instance and move to GPU
model = StanceClassifier(len(class_names))
model = model.to(device)

In [None]:
# # move sample batch of training data to GPU
# input_ids = data['input_ids'].to(device)
# attention_mask = data['attention_mask'].to(device)

# print(input_ids.shape) # batch size x seq length
# print(attention_mask.shape) # batch size x seq length

In [None]:
# to get predicted probabilities from trained model, apply the softmax function to the outputs
# F.softmax(model(input_ids, attention_mask ), dim=1)

# Step 4: Training
Use the AdamW optimizer by Hugging Face. It corrects weight decay. We also use a linear schedule with no warmup steps.

BERT author recommendations for fine-tuning:
- Batch size: 16, 32
- Learning rate (Adam): 5e-5, 3e-5, 2e-5
- Number of epochs: 2, 3, 4

We're going to ignore the number of epochs recommendation but stick with the rest. Note that increasing the batch size reduces the training time significantly, but gives you lower accuracy.

Let's continue with writing a helper function for training our model for one epoch:

In [None]:
# TODO: Experiment with this value
EPOCHS = 2

# The scheduler is stepped once per batch, so the schedule length is
# (batches per epoch) x (number of epochs).
total_steps = EPOCHS * len(train_data_loader)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# Linear learning-rate decay over the whole run, with no warmup phase.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_training_steps=total_steps,
  num_warmup_steps=0
)

# Cross-entropy over the raw logits returned by the classifier.
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Train `model` for one full pass over `data_loader`.

  Every batch is a dict with "input_ids", "attention_mask" and "stance"
  tensors. For each batch the loss is back-propagated, gradients are clipped
  to a max norm of 1.0, and both the optimizer and the scheduler are stepped.

  Returns:
    (accuracy, mean_loss): the number of correct predictions divided by
    `n_examples` (a double tensor) and the mean of the per-batch losses.
  """
  model = model.train()

  batch_losses = []
  n_correct = 0

  for batch in data_loader:
    ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)
    targets = batch["stance"].to(device)

    logits = model(input_ids=ids, attention_mask=mask)
    loss = loss_fn(logits, targets)

    # Predicted class = index of the largest logit in each row.
    predicted = logits.max(dim=1).indices
    n_correct += (predicted == targets).sum()
    batch_losses.append(loss.item())

    loss.backward()
    # Clip before the update to guard against exploding gradients.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    # Reset gradients so the next batch starts from zero.
    optimizer.zero_grad()

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

Training the model should look familiar, except for two things. The scheduler gets called every time a batch is fed to the model. We're avoiding exploding gradients by clipping the gradients of the model using `clip_grad_norm_`.

Let's write another one that helps us evaluate the model on a given data loader:


In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate `model` on `data_loader` without updating any weights.

  Every batch is a dict with "input_ids", "attention_mask" and "stance"
  tensors. The model is put in eval mode and gradients are disabled.

  Returns:
    (accuracy, mean_loss): the number of correct predictions divided by
    `n_examples` (a double tensor) and the mean of the per-batch losses.
  """
  model = model.eval()

  batch_losses = []
  n_correct = 0

  # No gradients are needed for evaluation.
  with torch.no_grad():
    for batch in data_loader:
      ids = batch["input_ids"].to(device)
      mask = batch["attention_mask"].to(device)
      targets = batch["stance"].to(device)

      logits = model(input_ids=ids, attention_mask=mask)
      # Predicted class = index of the largest logit in each row.
      predicted = logits.max(dim=1).indices

      batch_losses.append(loss_fn(logits, targets).item())
      n_correct += (predicted == targets).sum()

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

Store the training history while using the above functions to write the training loop.

In [None]:
%%time

# Per-epoch metrics, keyed by metric name.
history = defaultdict(list)
# Best validation accuracy seen so far; a checkpoint is written whenever it is
# matched or beaten.
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  # One optimisation pass over the training split.
  train_acc, train_loss = train_epoch(
    model=model,
    data_loader=train_data_loader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    device=device,
    scheduler=scheduler,
    n_examples=len(df_train)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  # Score the updated weights on the held-out validation split.
  val_acc, val_loss = eval_model(
    model=model,
    data_loader=val_data_loader,
    loss_fn=loss_fn,
    device=device,
    n_examples=len(df_val)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc >= best_accuracy:
    print("Saving pre-trained model")
    # NOTE(review): the evaluation step later in this notebook loads
    # 'best_model_state_2e.bin', not this '..._2e_400.bin' file — confirm which
    # checkpoint is intended.
    torch.save(model.state_dict(), "/content/drive/My Drive/MSCI_598_FNC_DATA/best_model_state_2e_400.bin")
    best_accuracy = val_acc

Epoch 1/2
----------


# Step 5: Evaluation
Applying model to competition test dataset

In [None]:
df_test.shape

In [None]:
# given df with Body ID, articleBody, Headline, headlineWithArticle -> dataframe with Headline, Body ID, Stance

def get_predictions(model, df):
  """Predict a stance for every row of `df`, one row at a time.

  Relies on the notebook-level `tokenizer`, `MAX_LEN`, `device` and
  `class_names`.

  Args:
    model: classifier called as `model(input_ids, attention_mask)` and
      returning one row of logits per input.
    df: dataframe with `Headline`, `Body ID` and `headlineWithArticle` columns.

  Returns:
    A new dataframe with columns `Headline`, `Body ID` and the predicted
    `Stance` name, in the same row order (and with the same index) as `df`.
  """
  model = model.eval()
  predictions = []

  # Inference only: disabling autograd avoids building a graph for every row.
  with torch.no_grad():
    for currentRow, row in enumerate(df.itertuples()):
      # encode test data to BERT format
      encoding = tokenizer.encode_plus(
        row.headlineWithArticle,
        add_special_tokens=True,
        max_length=MAX_LEN,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
      )

      # get predictions
      input_ids = encoding['input_ids'].to(device)
      attention_mask = encoding['attention_mask'].to(device)

      output = model(input_ids, attention_mask)
      _, prediction = torch.max(output, dim=1)

      # Convert the 1-element tensor to a plain int before indexing the list.
      stance = class_names[prediction.item()]
      print(currentRow, stance)
      predictions.append(stance)

  # return newly constructed dataframe with predicted labels
  df_competition_test_stances = pd.DataFrame({
    "Headline": df["Headline"],
    "Body ID": df["Body ID"],
  })
  # Assign the list positionally. Wrapping it in a pd.Series would align on
  # index labels and misplace/blank predictions whenever `df` does not carry a
  # default 0..n-1 index.
  df_competition_test_stances["Stance"] = predictions
  return df_competition_test_stances

   




In [None]:
# Rebuild the classifier and restore fine-tuned weights from Drive. The weights
# are first loaded onto the CPU and the model is then moved to `device`.
# NOTE(review): this loads 'best_model_state_2e.bin', while the training loop in
# this notebook saves 'best_model_state_2e_400.bin' — confirm which checkpoint
# is intended.
best_model = StanceClassifier(len(class_names))
best_model.load_state_dict(
  torch.load(
    '/content/drive/My Drive/MSCI_598_FNC_DATA/best_model_state_2e.bin',
    map_location=torch.device('cpu'),
  )
)
best_model = best_model.to(device)

# Label the competition test set and write the answer file.
df_competition_test_stances = get_predictions(best_model, df_test)
df_competition_test_stances.to_csv(
  "/content/drive/My Drive/MSCI_598_FNC_DATA/bert_answer.csv",
  index=False,
  encoding='utf-8',
)


In [None]:
# Preview the answer file. Printing the file object itself only shows its repr,
# so print the first few lines of its contents instead.
# NOTE(review): this reads 'bert_answer_2.csv', while the previous cell writes
# 'bert_answer.csv' — confirm the intended file.
from itertools import islice

with open('/content/drive/My Drive/MSCI_598_FNC_DATA/bert_answer_2.csv', encoding='utf-8') as f:
    print(''.join(islice(f, 5)), end='')