# Overview
- The following code is a DistilBERT implementation that has been fine-tuned on the FNC training dataset for stance detection
- Input: competition_test_stances_unlabeled.csv and competition_test_bodies.csv
- Output: competition_test_stances.csv (after our model has labelled the input data)


# Step 0: Install and import required packages

In [1]:
# Required for running on Paperspace Gradient environment
# !pip install ipywidgets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
!pip install -q transformers



In [3]:
!pip install -q -U watermark



In [4]:
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

Python implementation: CPython
Python version       : 3.8.12
IPython version      : 8.0.1

numpy       : 1.22.2
pandas      : 1.3.5
torch       : 1.11.0a0+17540c5
transformers: 4.18.0



Load distilbert to test and make sure dependencies are correct

In [5]:
from transformers import DistilBertTokenizer, DistilBertModel
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained("distilbert-base-uncased")


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
output

BaseModelOutput(last_hidden_state=tensor([[[ 4.4104e-04, -2.6241e-01, -1.0192e-01,  ..., -6.2764e-02,
           2.7584e-01,  3.7014e-01],
         [ 7.2233e-01,  1.6449e-01,  4.0025e-01,  ...,  1.9161e-01,
           4.0458e-01, -5.8094e-02],
         [ 2.8198e-01, -1.7430e-01,  3.9076e-02,  ...,  2.7681e-02,
           1.1886e-01,  9.1439e-01],
         ...,
         [ 6.8016e-01,  7.9712e-02,  8.3603e-01,  ..., -4.8959e-01,
          -2.5017e-01, -2.3518e-01],
         [ 3.8105e-02, -8.1751e-01, -3.4076e-01,  ...,  4.4815e-01,
           9.6725e-02, -2.0311e-01],
         [ 3.5750e-01,  1.9968e-01,  1.7437e-01,  ...,  1.5028e-01,
          -2.3665e-01,  5.4391e-02]]], grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import transformers
import torch
import torch.nn.functional as F

from collections import defaultdict
from matplotlib import rc
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from textwrap import wrap
from torch import nn, optim
# handles things like batching
from torch.utils.data import Dataset, DataLoader

# lets us use csv data on our drive
# from google.colab import drive
# drive.mount('/content/drive')

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

class_names = ['unrelated', 'agree', 'disagree', 'discuss']

In [8]:
device

device(type='cuda', index=0)

# Step 1: Data Exploration
In this section we load in the training and test dataset that have been provided by the FNC Github.

In [9]:
# # load in training and testing dataset (bodies + stances)
# df_train_bodies = pd.read_csv("drive/My Drive/MSCI_598_FNC_DATA/train_bodies.csv", dtype=str)
# df_train_stances = pd.read_csv("drive/My Drive/MSCI_598_FNC_DATA/train_stances.csv", dtype=str)
# df_test_bodies = pd.read_csv("drive/My Drive/MSCI_598_FNC_DATA/test_bodies.csv", dtype=str)
# df_test_stances_unlabeled = pd.read_csv("drive/My Drive/MSCI_598_FNC_DATA/test_stances_unlabeled.csv", dtype=str)

df_train_bodies = pd.read_csv("train_bodies.csv", dtype=str)
df_train_stances = pd.read_csv("train_stances.csv", dtype=str)
df_test_bodies = pd.read_csv("competition_test_bodies.csv", dtype=str)
df_test_stances_unlabeled = pd.read_csv("competition_test_stances_unlabeled.csv", dtype=str)

# to make things easier, merge the FNC train datasets above
df_train = pd.merge(df_train_stances, df_train_bodies, on='Body ID', how='left')
df_train.head()

# to make things easier, merge the FNC test datasets above
df_test = pd.merge(df_test_stances_unlabeled, df_test_bodies, on='Body ID', how='left')
df_test.head()


Unnamed: 0,Headline,Body ID,articleBody
0,Ferguson riots: Pregnant woman loses eye after...,2008,A RESPECTED senior French police officer inves...
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,Dave Morin's social networking company Path is...
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,A bereaved Afghan mother took revenge on the T...
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,Hewlett-Packard is officially splitting in two...
4,Argentina's President Adopts Boy to End Werewo...,37,An airline passenger headed to Dallas was remo...


In [10]:
# # show table of training dataset stances
# # majority of stances are unrelated
# # TODO: say something about this in the report
# sns.countplot(df_train.Stance)
# plt.xlabel('Stance')

In [11]:
print(df_train.dtypes)

Headline       object
Body ID        object
Stance         object
articleBody    object
dtype: object


In [12]:
# # translate dataset into unrelated and related stances
# def to_related(stance):
#  if stance == 'agree' or stance == 'disagree' or stance == 'discuss':
#    return 'related'
#  else:
#    return 'unrelated'

# df_train['relation'] = df_train.Stance.apply(to_related)
# sns.countplot(df_train.relation)
# plt.xlabel('relation')

## Step 2: Data Preprocessing
In this section we convert the headline and body data to be acceptable in the BERT model.


In [13]:
# Initialize desired BERT model
# TODO: test out both cased and uncased to see what performs better
# PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
from transformers import AutoTokenizer
from transformers import DistilBertTokenizer, DistilBertModel
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'

# Load the pre-trained BertTokenizer
# tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False)
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
# model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

# [SEP] tokens mark the end of a sentence. In our case it will distinguish difference between headline and body.
tokenizer.sep_token, tokenizer.sep_token_id

# [CLS] tokens must be appended to the start of each sentence
# this lets BERT knows we're doing classification
tokenizer.cls_token, tokenizer.cls_token_id

# [UNK] Marks any tokens that are in our current training set that were NOT in the original training set
tokenizer.unk_token, tokenizer.unk_token_id

# [PAD] is the padding token that lets us pass sequences of constant length
tokenizer.pad_token, tokenizer.pad_token_id

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

('[PAD]', 0)

In [14]:
# merge headline and article text separated by a [SEP] token
df_train["headlineWithArticle"] = df_train["Headline"] + ' ' + tokenizer.sep_token + ' ' + df_train["articleBody"]
# convert string stances to a stance label: ['unrelated', 'discuss', 'agree', 'disagree'] -> [0, 1, 2, 3]
df_train['stance_label'] = df_train['Stance'].apply(lambda x: ['unrelated', 'discuss', 'agree', 'disagree'].index(x))
df_train.head()

Unnamed: 0,Headline,Body ID,Stance,articleBody,headlineWithArticle,stance_label
0,Police find mass graves with at least '15 bodi...,712,unrelated,Danny Boyle is directing the untitled film\n\n...,Police find mass graves with at least '15 bodi...,0
1,Hundreds of Palestinians flee floods in Gaza a...,158,agree,Hundreds of Palestinians were evacuated from t...,Hundreds of Palestinians flee floods in Gaza a...,2
2,"Christian Bale passes on role of Steve Jobs, a...",137,unrelated,30-year-old Moscow resident was hospitalized w...,"Christian Bale passes on role of Steve Jobs, a...",0
3,HBO and Apple in Talks for $15/Month Apple TV ...,1034,unrelated,(Reuters) - A Canadian soldier was shot at the...,HBO and Apple in Talks for $15/Month Apple TV ...,0
4,Spider burrowed through tourist's stomach and ...,1923,disagree,"Fear not arachnophobes, the story of Bunbury's...",Spider burrowed through tourist's stomach and ...,3


In [15]:
df_train.shape

(49972, 6)

In [16]:
# merge headline and article text separated by a [SEP] token
df_test["headlineWithArticle"] = df_test["Headline"] + ' ' + tokenizer.sep_token + ' ' + df_test["articleBody"]
df_test.head()

Unnamed: 0,Headline,Body ID,articleBody,headlineWithArticle
0,Ferguson riots: Pregnant woman loses eye after...,2008,A RESPECTED senior French police officer inves...,Ferguson riots: Pregnant woman loses eye after...
1,Crazy Conservatives Are Sure a Gitmo Detainee ...,1550,Dave Morin's social networking company Path is...,Crazy Conservatives Are Sure a Gitmo Detainee ...
2,A Russian Guy Says His Justin Bieber Ringtone ...,2,A bereaved Afghan mother took revenge on the T...,A Russian Guy Says His Justin Bieber Ringtone ...
3,"Zombie Cat: Buried Kitty Believed Dead, Meows ...",1793,Hewlett-Packard is officially splitting in two...,"Zombie Cat: Buried Kitty Believed Dead, Meows ..."
4,Argentina's President Adopts Boy to End Werewo...,37,An airline passenger headed to Dallas was remo...,Argentina's President Adopts Boy to End Werewo...


# Step 2.1: Going over basic operations to convert sample text to tokens and tokens to unique integers (ids) (OPTIONAL)

In [15]:
# sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.' + ' ' + tokenizer.sep_token
# tokens = tokenizer.tokenize(sample_txt)
# token_ids = tokenizer.convert_tokens_to_ids(tokens)
# print(f' Sentence: {sample_txt}')
# print(f' Tokens: {tokens}')
# print(f' Token IDs: {token_ids}')

In [17]:
# # Example of creating encoded version of text for BERT to accept
# encoding = tokenizer.encode_plus (
#     sample_txt,
#     max_length = 32,
#     add_special_tokens = True, # Add '[CLS]' and '[SEP]'
#     return_token_type_ids = False,
#     padding = 'max_length',
#     return_attention_mask = True,
#     return_tensors = 'pt', # return pytorch tensors
# )

# encoding.keys()

In [18]:
# encoding

In [19]:
# # notice the padding and the CLS starting token
# print(len(encoding['input_ids'][0]))
# encoding['input_ids'][0]

In [20]:
# # notice the attention mask has the same length
# # In short, this indicates to the model which words should be attended to, and which should not, making it faster.
# print(len(encoding['attention_mask'][0]))
# encoding['attention_mask']

In [21]:
# # inverse the tokenization to look at the special tokens
# tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

# Step 2.2: Choosing Sequence Length
BERT works with fixed-length sequences. We use a simple strategy to choose the max length. First we will examine the token length of each headline and review.

In [22]:
# token_lens = []

# for txt in df_train_stances.Headline:
#   tokens = tokenizer.encode(txt, max_length=80)
#   token_lens.append(len(tokens))

# sns.displot(token_lens)
# plt.xlim([0, 256]);
# plt.xlabel('Token count for headline');

In [23]:
# token_lens = []

# for txt in df_train_bodies.articleBody:
#   tokens = tokenizer.encode(txt)
#   token_lens.append(len(tokens))

# sns.displot(token_lens)
# plt.xlim([0, 4000]);
# plt.xlabel('Token count for bodies');

# Step 2.3: Create a PyTorch dataset class

- We will be feeding in the headline and article as separate sequences during training
- The headline seems to be <100 tokens
- However, the body content can range into thousands of tokens
- Also, the max acceptable sequence length for BERT is 512
- So, we will be concatenating the headline and fill the remainder of the 512 tokens with article tokens and conduct training with this input data

In [17]:
# TODO: Experiment with this value, the smaller the better since 12gb ram limitation
MAX_LEN = 512
# dataset class
class FNCDataset(Dataset):

  def __init__(self, headlineWithArticle, stance, tokenizer, max_len):
    self.headlineWithArticle = headlineWithArticle
    self.stance = stance
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  # returns the number of headlines in the dataset
  def __len__(self):
    return len(self.headlineWithArticle)
  
  # returns the item after being given an item id
  def __getitem__(self, item):
    headlineWithArticle = self.headlineWithArticle[item]
    stance = self.stance[item]

    encoding = self.tokenizer.encode_plus(
      headlineWithArticle,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'headlineWithArticle': headlineWithArticle,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'stance': torch.tensor(stance, dtype=torch.long)
    }

In [18]:
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=RANDOM_SEED)
df_train.shape, df_val.shape

((44974, 6), (4998, 6))

# Step 2.4: Instantiate Data Loaders and Link to Dataset Class

In [19]:
def create_data_loader(df, tokenizer, max_len, batch_size):
  # this is where we shape the raw FNC data to acceptable BERT formatted data
  
  # instantiate pytorch class with formatted data
  ds = FNCDataset(
    headlineWithArticle=df.headlineWithArticle.to_numpy(),
    stance=df.stance_label.to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=2
  )

In [20]:
BATCH_SIZE = 16
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [28]:
# # example batch from training data loader
# data = next(iter(train_data_loader))
# data.keys()


In [29]:
# print(data['input_ids'].shape)
# print(data['attention_mask'].shape)
# print(data['stance'].shape)

# Step 3: Sentiment Classification with BERT and Hugging Face

In [21]:
# load the bert model
bert_model = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
# bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Step 3.1: Test BERT model on sample text

In [22]:
# apply BERT to encoding of sample text and store the output
# pooled output is just the first element of the last hidden state that is passed through the dense layer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
pooled_output = bert_model(
    input_ids=encoded_input['input_ids'],
    attention_mask=encoded_input['attention_mask'])[0]
cls_token = pooled_output[:, 0, :]

In [23]:
# # the last hidden state is a sequence of hidden states of the last layer of the model
# last_hidden_state.shape

In [24]:
# # 768 represents the number of hidden units in the feedforward network
# bert_model.config.hidden_size

In [25]:
# # pooled_output is retrieved by applying the BertPooler on the last_hidden_state
# # can think of pooled_output as a summary of the content
# pooled_output.shape
cls_token

tensor([[ 4.4104e-04, -2.6241e-01, -1.0192e-01, -9.3064e-02,  1.1966e-02,
         -3.8165e-01,  2.3678e-01,  6.0327e-01,  1.8048e-01, -3.6348e-01,
         -6.8504e-02, -1.4914e-01, -2.5134e-01,  2.5867e-01,  3.7561e-02,
          7.4723e-02, -3.6818e-02,  2.4032e-01, -8.2582e-03,  1.3728e-01,
         -5.1771e-02, -1.5966e-01, -2.5209e-01,  7.9627e-02,  5.8721e-02,
         -1.8414e-02,  1.7948e-02, -1.3264e-01,  5.5113e-03, -2.6995e-01,
          3.6094e-02,  1.8441e-01, -3.4662e-01, -2.4863e-01, -9.1778e-02,
          1.9617e-01,  7.7814e-02, -8.3963e-02, -1.4762e-01, -2.8499e-02,
         -5.6600e-01,  7.5506e-02,  1.4838e-01,  3.2442e-02,  3.1813e-01,
         -1.9298e-01, -2.6886e+00, -1.3958e-01, -2.3598e-01, -4.1291e-01,
          1.6821e-02,  1.1773e-01,  3.5522e-01,  4.4090e-01,  6.3121e-01,
          3.7207e-01, -2.5132e-01,  2.0780e-01, -3.2147e-02,  4.1447e-01,
          1.2069e-01,  6.9812e-02, -2.3853e-01, -1.2722e-01,  7.5114e-03,
          1.0025e-01,  2.9832e-02,  6.

# Step 3.2: Create a stance classifier that uses the BERT model

We use a dropout layer for some regularization and a fully-connected layer for our output. Note that we return the raw output of the last layer since it's required for cross-entropy loss in pytorch to work.

In [26]:
class StanceClassifier(nn.Module):

  def __init__(self, n_classes):
    super(StanceClassifier, self).__init__()
    # self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.bert = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    # TODO: test out different dropout values, this helps prevent overfitting
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
  
  def forward(self, input_ids, attention_mask):
    last_hidden_state = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    cls_token = last_hidden_state[0][:, 0, :]
    output = self.drop(cls_token)
    return self.out(output)

In [27]:
# create an instance and move to GPU
model = StanceClassifier(len(class_names))
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [28]:
# # move sample batch of training data to GPU
# input_ids = data['input_ids'].to(device)
# attention_mask = data['attention_mask'].to(device)

# print(input_ids.shape) # batch size x seq length
# print(attention_mask.shape) # batch size x seq length

In [29]:
# to get predicted probabilities from trained model, apply the softmax function to the outputs
# F.softmax(model(input_ids, attention_mask ), dim=1)

# Step 4: Training
Use the AdamW optimizer by Hugging Face. It corrects weight decay. We also use a linear schedule with no warmup steps.

BERT author recommendations for fine-tuning:
- Batch size: 16, 32
- Learning rate (Adam): 5e-5, 3e-5, 2e-5
- Number of epochs: 2, 3, 4

We're going to ignore the number of epochs recommendation but stick with the rest. Note that increasing the batch size reduces the training time significantly, but gives you lower accuracy.

Let's continue with writing a helper function for training our model for one epoch:

In [30]:
# TODO: Experiment with this vlaue
EPOCHS = 4

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)



In [31]:
def train_epoch(
  model, 
  data_loader, 
  loss_fn, 
  optimizer, 
  device, 
  scheduler, 
  n_examples
):
  model = model.train()

  losses = []
  correct_predictions = 0
  
  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    stance = d["stance"].to(device)

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, stance)

    correct_predictions += torch.sum(preds == stance)
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return correct_predictions.double() / n_examples, np.mean(losses)

Training the model should look familiar, except for two things. The scheduler gets called every time a batch is fed to the model. We're avoiding exploding gradients by clipping the gradients of the model using clip_gradnorm.

Let's write another one that helps us evaluate the model on a given data loader:


In [32]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()

  losses = []
  correct_predictions = 0

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      stance = d["stance"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)

      loss = loss_fn(outputs, stance)

      correct_predictions += torch.sum(preds == stance)
      losses.append(loss.item())

  return correct_predictions.double() / n_examples, np.mean(losses)

store the training history while using the above functions to write the training loop

In [33]:
# torch.cuda.empty_cache()

In [35]:
%%time
EPOCHS = 4

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
    model,
    train_data_loader,    
    loss_fn, 
    optimizer, 
    device, 
    scheduler, 
    len(df_train)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn, 
    device, 
    len(df_val)
    )

    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    metrics = {
        "model": f"distilbert_epoch_{epoch+1}",
        "train_acc": train_acc,
        "train_loss": train_loss,
        "val_acc": val_acc,
        "val_loss": val_loss 
    }
    metrics_df = pd.DataFrame([metrics])
    metrics_df.to_csv(f"512_distilbert_epoch_{epoch+1}_training_metrics.csv", index = False, encoding = 'utf-8')

    # save model after every epoch
    torch.save(model.state_dict(), f'512_distilbert_fnc_{epoch + 1}_epochs.bin')
    best_accuracy = val_acc

print(history)

Epoch 1/4
----------


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Train loss 0.21946929402210247 accuracy 0.9253346377907234


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Val   loss 0.12235623533698393 accuracy 0.9577831132452982

Epoch 2/4
----------


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Train loss 0.0935394619116565 accuracy 0.9696491306087961


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Val   loss 0.11247784732707757 accuracy 0.9723889555822329

Epoch 3/4
----------


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f7eac947dc0>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1337, in __del__
    self._shutdown_workers(

KeyboardInterrupt: 

In [38]:
# given df with Body ID	articleBody	Headline	headlineWithArticle - > csv with Headline, Body ID, Stance
MAX_LEN = 512
def get_predictions(model, df):
  model = model.eval()
  predictions = []
  currentRow = 0 
  for row in df.itertuples():
    headlineWithArticle = row.headlineWithArticle

    # encode test data to BERT format
    encoding = tokenizer.encode_plus(
      headlineWithArticle,
      add_special_tokens=True,
      max_length=MAX_LEN,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    # get predictions
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    output = model(input_ids, attention_mask)
    _, prediction = torch.max(output, dim=1)

    print(currentRow,class_names[prediction])
    currentRow += 1
    predictions.append(class_names[prediction])

  # return newly constructed dataframe with predicted labels
  df_competition_test_stances = pd.DataFrame()
  df_competition_test_stances["Headline"] = df["Headline"]
  df_competition_test_stances["Body ID"] = df["Body ID"]
  df_competition_test_stances["Stance"] = pd.Series(predictions)
  return df_competition_test_stances

In [39]:
best_model = StanceClassifier(len(class_names))
best_model.load_state_dict(torch.load('512_distilbert_fnc_2_epochs.bin',map_location=torch.device('cpu')))
best_model = best_model.to(device)
df_competition_test_stances = get_predictions(best_model, df_test)
df_competition_test_stances.to_csv("512_distilbert_2_epochs_answer.csv", index = False, encoding = 'utf-8')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the 

0 unrelated
1 unrelated
2 unrelated
3 unrelated
4 unrelated
5 unrelated
6 unrelated
7 unrelated
8 agree
9 unrelated
10 agree
11 unrelated
12 agree
13 unrelated
14 agree
15 disagree
16 unrelated
17 unrelated
18 unrelated
19 unrelated
20 agree
21 disagree
22 disagree
23 disagree
24 disagree
25 unrelated
26 unrelated
27 agree
28 unrelated
29 disagree
30 unrelated
31 unrelated
32 unrelated
33 unrelated
34 agree
35 unrelated
36 unrelated
37 unrelated
38 unrelated
39 unrelated
40 unrelated
41 unrelated
42 unrelated
43 unrelated
44 agree
45 unrelated
46 unrelated
47 unrelated
48 agree
49 agree
50 agree
51 unrelated
52 unrelated
53 unrelated
54 unrelated
55 agree
56 unrelated
57 agree
58 unrelated
59 unrelated
60 unrelated
61 unrelated
62 disagree
63 unrelated
64 unrelated
65 unrelated
66 agree
67 unrelated
68 unrelated
69 unrelated
70 agree
71 unrelated
72 disagree
73 unrelated
74 disagree
75 unrelated
76 disagree
77 unrelated
78 agree
79 unrelated
80 unrelated
81 discuss
82 agree
83 unrelate

In [47]:
# FOR CONVENIENCE OF RE-INIT WITH MAX LEN = 300
# # TODO: Experiment with this value, the smaller the better since 12gb ram limitation
# MAX_LEN = 300
# # dataset class
# class FNCDataset(Dataset):

#   def __init__(self, headlineWithArticle, stance, tokenizer, max_len):
#     self.headlineWithArticle = headlineWithArticle
#     self.stance = stance
#     self.tokenizer = tokenizer
#     self.max_len = max_len
  
#   # returns the number of headlines in the dataset
#   def __len__(self):
#     return len(self.headlineWithArticle)
  
#   # returns the item after being given an item id
#   def __getitem__(self, item):
#     headlineWithArticle = self.headlineWithArticle[item]
#     stance = self.stance[item]

#     encoding = self.tokenizer.encode_plus(
#       headlineWithArticle,
#       add_special_tokens=True,
#       max_length=self.max_len,
#       return_token_type_ids=False,
#       pad_to_max_length=True,
#       return_attention_mask=True,
#       return_tensors='pt',
#     )

#     return {
#       'headlineWithArticle': headlineWithArticle,
#       'input_ids': encoding['input_ids'].flatten(),
#       'attention_mask': encoding['attention_mask'].flatten(),
#       'stance': torch.tensor(stance, dtype=torch.long)
#     }

# def create_data_loader(df, tokenizer, max_len, batch_size):
#   # this is where we shape the raw FNC data to acceptable BERT formatted data
  
#   # instantiate pytorch class with formatted data
#   ds = FNCDataset(
#     headlineWithArticle=df.headlineWithArticle.to_numpy(),
#     stance=df.stance_label.to_numpy(),
#     tokenizer=tokenizer,
#     max_len=MAX_LEN
#   )

#   return DataLoader(
#     ds,
#     batch_size=batch_size,
#     num_workers=2
#   )

In [48]:
df_test_bodies = pd.read_csv("competition_test_bodies.csv", dtype=str)
df_test_stances_unlabeled = pd.read_csv("competition_test_stances_unlabeled.csv", dtype=str)

df_test = pd.merge(df_test_stances_unlabeled, df_test_bodies, on='Body ID', how='left')

df_test["headlineWithArticle"] = df_test["Headline"] + ' ' + tokenizer.sep_token + ' ' + df_test["articleBody"]

BATCH_SIZE = 16


distilbert_3_epochs = StanceClassifier(len(class_names))
distilbert_3_epochs.load_state_dict(torch.load('distilbert_fnc_3_epochs.bin',map_location=torch.device('cpu')))
distilbert_3_epochs = distilbert_3_epochs.to(device)
df_competition_test_stances = get_predictions(distilbert_3_epochs, df_test)
df_competition_test_stances.to_csv("distilbert_3_epochs_answer.csv", index = False, encoding = 'utf-8')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0 unrelated
1 unrelated
2 unrelated
3 unrelated
4 unrelated
5 unrelated
6 unrelated
7 unrelated
8 agree
9 unrelated
10 agree
11 unrelated
12 agree
13 unrelated
14 agree
15 disagree
16 unrelated
17 unrelated
18 unrelated
19 unrelated
20 agree
21 disagree
22 disagree
23 discuss
24 disagree
25 unrelated
26 unrelated
27 agree
28 unrelated
29 disagree
30 unrelated
31 unrelated
32 unrelated
33 unrelated
34 agree
35 unrelated
36 unrelated
37 unrelated
38 unrelated
39 unrelated
40 unrelated
41 unrelated
42 unrelated
43 unrelated
44 agree
45 unrelated
46 unrelated
47 unrelated
48 disagree
49 agree
50 agree
51 unrelated
52 unrelated
53 unrelated
54 unrelated
55 agree
56 unrelated
57 agree
58 unrelated
59 unrelated
60 unrelated
61 unrelated
62 disagree
63 unrelated
64 unrelated
65 unrelated
66 agree
67 unrelated
68 unrelated
69 unrelated
70 agree
71 unrelated
72 disagree
73 unrelated
74 disagree
75 unrelated
76 disagree
77 unrelated
78 agree
79 unrelated
80 unrelated
81 discuss
82 agree
83 unrela

In [50]:
df_test_bodies = pd.read_csv("competition_test_bodies.csv", dtype=str)
df_test_stances_unlabeled = pd.read_csv("competition_test_stances_unlabeled.csv", dtype=str)

df_test = pd.merge(df_test_stances_unlabeled, df_test_bodies, on='Body ID', how='left')

df_test["headlineWithArticle"] = df_test["Headline"] + ' ' + tokenizer.sep_token + ' ' + df_test["articleBody"]

BATCH_SIZE = 16


distilbert_2_epochs = StanceClassifier(len(class_names))
distilbert_2_epochs.load_state_dict(torch.load('distilbert_fnc_2_epochs.bin',map_location=torch.device('cpu')))
distilbert_2_epochs = distilbert_2_epochs.to(device)
df_competition_test_stances = get_predictions(distilbert_2_epochs, df_test)
df_competition_test_stances.to_csv("distilbert_2_epochs_answer.csv", index = False, encoding = 'utf-8')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0 unrelated
1 unrelated
2 unrelated
3 unrelated
4 unrelated
5 unrelated
6 unrelated
7 unrelated
8 unrelated
9 unrelated
10 agree
11 unrelated
12 agree
13 unrelated
14 agree
15 disagree
16 unrelated
17 unrelated
18 unrelated
19 unrelated
20 agree
21 discuss
22 disagree
23 discuss
24 discuss
25 unrelated
26 unrelated
27 agree
28 unrelated
29 discuss
30 unrelated
31 unrelated
32 unrelated
33 unrelated
34 agree
35 unrelated
36 unrelated
37 unrelated
38 unrelated
39 unrelated
40 unrelated
41 unrelated
42 unrelated
43 unrelated
44 unrelated
45 unrelated
46 unrelated
47 unrelated
48 disagree
49 agree
50 agree
51 unrelated
52 unrelated
53 unrelated
54 unrelated
55 agree
56 unrelated
57 agree
58 unrelated
59 unrelated
60 unrelated
61 unrelated
62 discuss
63 unrelated
64 unrelated
65 unrelated
66 disagree
67 unrelated
68 unrelated
69 unrelated
70 agree
71 unrelated
72 disagree
73 unrelated
74 disagree
75 unrelated
76 disagree
77 unrelated
78 agree
79 unrelated
80 unrelated
81 discuss
82 agree
83

In [51]:
#  FOR CONVENIENCE OF RE-INIT WITH MAX LEN = 512
# # given df with Body ID	articleBody	Headline	headlineWithArticle - > csv with Headline, Body ID, Stance
# MAX_LEN = 512
# def get_predictions(model, df):
#   model = model.eval()
#   predictions = []
#   currentRow = 0 
#   for row in df.itertuples():
#     headlineWithArticle = row.headlineWithArticle

#     # encode test data to BERT format
#     encoding = tokenizer.encode_plus(
#       headlineWithArticle,
#       add_special_tokens=True,
#       max_length=MAX_LEN,
#       return_token_type_ids=False,
#       pad_to_max_length=True,
#       return_attention_mask=True,
#       return_tensors='pt',
#     )

#     # get predictions
#     input_ids = encoding['input_ids'].to(device)
#     attention_mask = encoding['attention_mask'].to(device)
    
#     output = model(input_ids, attention_mask)
#     _, prediction = torch.max(output, dim=1)

#     print(currentRow,class_names[prediction])
#     currentRow += 1
#     predictions.append(class_names[prediction])

#   # return newly constructed dataframe with predicted labels
#   df_competition_test_stances = pd.DataFrame()
#   df_competition_test_stances["Headline"] = df["Headline"]
#   df_competition_test_stances["Body ID"] = df["Body ID"]
#   df_competition_test_stances["Stance"] = pd.Series(predictions)
#   return df_competition_test_stances

In [52]:
df_test_bodies = pd.read_csv("competition_test_bodies.csv", dtype=str)
df_test_stances_unlabeled = pd.read_csv("competition_test_stances_unlabeled.csv", dtype=str)

df_test = pd.merge(df_test_stances_unlabeled, df_test_bodies, on='Body ID', how='left')

df_test["headlineWithArticle"] = df_test["Headline"] + ' ' + tokenizer.sep_token + ' ' + df_test["articleBody"]

BATCH_SIZE = 16


distilbert_1_epochs = StanceClassifier(len(class_names))
distilbert_1_epochs.load_state_dict(torch.load('512_distilbert_fnc_1_epochs.bin',map_location=torch.device('cpu')))
distilbert_1_epochs = distilbert_1_epochs.to(device)
df_competition_test_stances = get_predictions(distilbert_1_epochs, df_test)
df_competition_test_stances.to_csv("512_distilbert_1_epochs_answer.csv", index = False, encoding = 'utf-8')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0 unrelated
1 unrelated
2 unrelated
3 unrelated
4 unrelated
5 unrelated
6 unrelated
7 unrelated
8 agree
9 unrelated
10 agree
11 unrelated
12 agree
13 unrelated
14 agree
15 disagree
16 unrelated
17 unrelated
18 unrelated
19 unrelated
20 agree
21 disagree
22 disagree
23 disagree
24 disagree
25 unrelated
26 unrelated
27 agree
28 unrelated
29 disagree
30 unrelated
31 unrelated
32 unrelated
33 unrelated
34 agree
35 unrelated
36 unrelated
37 unrelated
38 unrelated
39 unrelated
40 unrelated
41 unrelated
42 unrelated
43 unrelated
44 agree
45 unrelated
46 unrelated
47 unrelated
48 agree
49 disagree
50 agree
51 unrelated
52 unrelated
53 unrelated
54 unrelated
55 agree
56 unrelated
57 agree
58 unrelated
59 unrelated
60 unrelated
61 unrelated
62 disagree
63 unrelated
64 unrelated
65 unrelated
66 agree
67 unrelated
68 unrelated
69 unrelated
70 agree
71 unrelated
72 disagree
73 unrelated
74 disagree
75 unrelated
76 disagree
77 unrelated
78 agree
79 unrelated
80 unrelated
81 disagree
82 agree
83 unre

In [44]:
# # Get metrics for 4 epochs

# # re-init dataloader 
# df_train_bodies = pd.read_csv("train_bodies.csv", dtype=str)
# df_train_stances = pd.read_csv("train_stances.csv", dtype=str)
# df_train = pd.merge(df_train_stances, df_train_bodies, on='Body ID', how='left')

# # merge headline and article text separated by a [SEP] token
# df_train["headlineWithArticle"] = df_train["Headline"] + ' ' + tokenizer.sep_token + ' ' + df_train["articleBody"]
# # convert string stances to a stance label: ['unrelated', 'discuss', 'agree', 'disagree'] -> [0, 1, 2, 3]
# df_train['stance_label'] = df_train['Stance'].apply(lambda x: ['unrelated', 'discuss', 'agree', 'disagree'].index(x))

# df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=RANDOM_SEED)

# BATCH_SIZE = 16

# val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)

# distilbert_4_epochs = StanceClassifier(len(class_names))
# distilbert_4_epochs.load_state_dict(torch.load('distilbert_fnc_4_epochs.bin',map_location=torch.device('cpu')))
# distilbert_4_epochs = distilbert_4_epochs.to(device)


# # Get metrics from training data
# # train_acc, train_loss = eval_model(
# #     model,
# #     train_data_loader,
# #     loss_fn, 
# #     device, 
# #     len(df_train))

# val_acc, val_loss = eval_model(
#     distilbert_4_epochs,
#     val_data_loader,
#     loss_fn, 
#     device, 
#     len(df_val))

# _4_epochs = {
#     "model": "distilbert_4_epochs",
#     "val_acc": val_acc,
#     "val_loss": val_loss 
# }

# print(_4_epochs)
# training_metrics = pd.DataFrame([_4_epochs])
# training_metrics.to_csv("distilbert_4_epochs_training_metrics.csv", index = False, encoding = 'utf-8')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'model': 'distilbert_4_epochs', 'val_acc': tensor(0.9932, device='cuda:0', dtype=torch.float64), 'val_loss': 0.028401949924780918}


In [45]:
# # Get metrics for 3 epochs

# # re-init dataloader 
# df_train_bodies = pd.read_csv("train_bodies.csv", dtype=str)
# df_train_stances = pd.read_csv("train_stances.csv", dtype=str)
# df_train = pd.merge(df_train_stances, df_train_bodies, on='Body ID', how='left')

# # merge headline and article text separated by a [SEP] token
# df_train["headlineWithArticle"] = df_train["Headline"] + ' ' + tokenizer.sep_token + ' ' + df_train["articleBody"]
# # convert string stances to a stance label: ['unrelated', 'discuss', 'agree', 'disagree'] -> [0, 1, 2, 3]
# df_train['stance_label'] = df_train['Stance'].apply(lambda x: ['unrelated', 'discuss', 'agree', 'disagree'].index(x))

# df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=RANDOM_SEED)

# BATCH_SIZE = 16

# val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)

# distilbert_3_epochs = StanceClassifier(len(class_names))
# distilbert_3_epochs.load_state_dict(torch.load('distilbert_fnc_3_epochs.bin',map_location=torch.device('cpu')))
# distilbert_3_epochs = distilbert_3_epochs.to(device)


# # Get metrics from training data
# # train_acc, train_loss = eval_model(
# #     model,
# #     train_data_loader,
# #     loss_fn, 
# #     device, 
# #     len(df_train))

# val_acc, val_loss = eval_model(
#     distilbert_3_epochs,
#     val_data_loader,
#     loss_fn, 
#     device, 
#     len(df_val))

# _3_epochs = {
#     "model": "distilbert_3_epochs",
#     "val_acc": val_acc,
#     "val_loss": val_loss 
# }

# print(_3_epochs)
# training_metrics = pd.DataFrame([_3_epochs])
# training_metrics.to_csv("distilbert_3_epochs_training_metrics.csv", index = False, encoding = 'utf-8')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'model': 'distilbert_3_epochs', 'val_acc': tensor(0.9910, device='cuda:0', dtype=torch.float64), 'val_loss': 0.035733403112770616}


In [46]:
# # Get metrics for 2 epochs

# # re-init dataloader 
# df_train_bodies = pd.read_csv("train_bodies.csv", dtype=str)
# df_train_stances = pd.read_csv("train_stances.csv", dtype=str)
# df_train = pd.merge(df_train_stances, df_train_bodies, on='Body ID', how='left')

# # merge headline and article text separated by a [SEP] token
# df_train["headlineWithArticle"] = df_train["Headline"] + ' ' + tokenizer.sep_token + ' ' + df_train["articleBody"]
# # convert string stances to a stance label: ['unrelated', 'discuss', 'agree', 'disagree'] -> [0, 1, 2, 3]
# df_train['stance_label'] = df_train['Stance'].apply(lambda x: ['unrelated', 'discuss', 'agree', 'disagree'].index(x))

# df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=RANDOM_SEED)

# BATCH_SIZE = 16

# val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)

# distilbert_2_epochs = StanceClassifier(len(class_names))
# distilbert_2_epochs.load_state_dict(torch.load('distilbert_fnc_2_epochs.bin',map_location=torch.device('cpu')))
# distilbert_2_epochs = distilbert_2_epochs.to(device)


# # Get metrics from training data
# # train_acc, train_loss = eval_model(
# #     model,
# #     train_data_loader,
# #     loss_fn, 
# #     device, 
# #     len(df_train))

# val_acc, val_loss = eval_model(
#     distilbert_2_epochs,
#     val_data_loader,
#     loss_fn, 
#     device, 
#     len(df_val))

# _2_epochs = {
#     "model": "distilbert_2_epochs",
#     "val_acc": val_acc,
#     "val_loss": val_loss 
# }

# print(_2_epochs)
# training_metrics = pd.DataFrame([_2_epochs])
# training_metrics.to_csv("distilbert_2_epochs_training_metrics.csv", index = False, encoding = 'utf-8')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'model': 'distilbert_2_epochs', 'val_acc': tensor(0.9856, device='cuda:0', dtype=torch.float64), 'val_loss': 0.046772648038139145}
