<a href="https://colab.research.google.com/github/danielvangelder/Applied-NLP-Project-IN4325/blob/main/albert_fnc1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Using ALBERT on the FNC-1 data set
Mount Google Drive and install the required libraries.
If you get a TQDM Metafile error, re-run this cell.

In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive; the later cells read the data set from
# Drive and write the model there.
drive.mount('/content/drive')
# IPython shell escapes (not plain Python): install the packages used by the later cells.
!pip install simpletransformers
!pip install tqdm

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Data reader class
Reads the data set and performs scoring and data splitting.

In [5]:
from math import ceil
from typing import List, Tuple, Union

import numpy as np
import pandas as pd

class Fnc1Reader:
    """Reads the Fake News Challenge (FNC-1) data set, splits it and scores predictions.

    Stances are encoded as integer labels throughout:
    0 = agree, 1 = disagree, 2 = discuss, 3 = unrelated.
    """

    # Textual stance as found in the CSV files -> integer label.
    _STANCE_LABELS = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3}

    def __init__(self, loc: str):
        """Inits the data reader with the data at the given location. Expects train and test set data.

        :param loc: directory containing the CSV files; a trailing '/' is added when missing.
        """
        self.loc = loc
        # endswith() also copes with an empty location (the original indexing raised IndexError).
        if self.loc and not self.loc.endswith('/'):
            self.loc += '/'
        self.train_bodies, self.train_stances = self.read_train()
        self.test_bodies, self.test_stances = self.read_test()
        self.comp_bodies, self.comp_stances = self.read_comp()

    def read_train(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Reads the train set from the data location."""
        return self.read_labelled('train_bodies.csv', 'train_stances.csv')

    def read_comp(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Reads the competition data set from the data location."""
        return self.read_labelled('competition_test_bodies.csv', 'competition_test_stances.csv')

    def read_labelled(self, bodies_loc: str, stances_loc: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Reads a labelled bodies/stances CSV pair and adds an integer 'Label' column.

        :param bodies_loc: file name of the bodies CSV, relative to the data location.
        :param stances_loc: file name of the stances CSV, relative to the data location.
        :return: (bodies, stances); bodies has columns ['Body ID', 'articleBody'],
                 stances has columns ['Headline', 'Body ID', 'Stance', 'Label'].
        :raises Exception: if a stance value is not one of the four known stances.
        """
        bodies = pd.read_csv(self.loc + bodies_loc, names=['Body ID', 'articleBody'], header=0)
        stances = pd.read_csv(self.loc + stances_loc, names=['Headline', 'Body ID', 'Stance'], header=0)
        assert len(bodies) != 0 and len(stances) != 0
        stances['Label'] = [self.stance_to_label(stance) for stance in stances['Stance'].to_list()]
        assert bodies.columns.to_list() == ['Body ID', 'articleBody'] \
               and stances.columns.to_list() == ['Headline', 'Body ID', 'Stance', 'Label']

        return bodies, stances

    def stance_to_label(self, stance: str) -> int:
        """
        Converts a textual stance to its integer label.

        0, agree: The body text agrees with the headline.
        1, disagree: The body text disagrees with the headline.
        2, discuss: The body text discusses the same topic as the headline, but does not take a position.
        3, unrelated: The body text discusses a different topic than the headline.

        :raises Exception: if the stance is not one of the four values above.
        """
        try:
            return self._STANCE_LABELS[stance]
        except (KeyError, TypeError):
            # str() keeps the message intact for non-string input such as NaN.
            raise Exception('Stance does not exist: ' + str(stance)) from None

    def read_test(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Reads the test set from the data location."""
        # NOTE(review): these are the *train* file names although this method is
        # documented as reading the test set; the official FNC-1 release names the
        # unlabelled test files differently. Confirm which files are intended
        # before relying on test_bodies / test_stances.
        bodies = pd.read_csv(self.loc + 'train_bodies.csv', names=['Body ID', 'articleBody'], header=0)
        stances = pd.read_csv(self.loc + 'train_stances.csv', names=['Headline', 'Body ID'], header=0)
        assert len(bodies) != 0 and len(stances) != 0
        assert bodies.columns.to_list() == ['Body ID', 'articleBody'] \
               and stances.columns.to_list() == ['Headline', 'Body ID']

        return bodies, stances

    def kfold(self, n: int) -> List[pd.DataFrame]:
        """Returns a list of n random folds of the training set.

        The folds are disjoint, together cover every training stance exactly once
        and differ in size by at most one row.

        :param n: number of folds, at least 1.
        :raises ValueError: if n is smaller than 1.
        """
        if n < 1:
            raise ValueError('n must be at least 1, got ' + str(n))
        size = len(self.train_stances.index)
        shuffled = self.train_stances.sample(frac=1).reset_index(drop=True)

        # Fold boundaries ceil(i * size / n) for i = 0..n, computed in integer
        # arithmetic so that rounding can never shift a boundary.
        bounds = [(i * size + n - 1) // n for i in range(n + 1)]
        return [shuffled.iloc[lower:upper].reset_index(drop=True)
                for lower, upper in zip(bounds, bounds[1:])]

    @staticmethod
    def _get_body(bodies: pd.DataFrame, body_id: int) -> str:
        """Returns the first article body with the given ID from a bodies frame."""
        matches = bodies.loc[bodies['Body ID'] == body_id, 'articleBody'].to_list()
        if len(matches) == 0:
            raise Exception('No body with ID ' + str(body_id))
        return matches[0]

    def get_body_train(self, body_id: int) -> str:
        """Returns the right body text from the train set.

        :raises Exception: if no body with the given ID exists.
        """
        return self._get_body(self.train_bodies, body_id)

    def get_body_test(self, body_id: int) -> str:
        """Returns the right body text from the test set.

        :raises Exception: if no body with the given ID exists.
        """
        return self._get_body(self.test_bodies, body_id)

    def get_body_comp(self, body_id: int) -> str:
        """Returns the right body text from the competition set.

        :raises Exception: if no body with the given ID exists.
        """
        return self._get_body(self.comp_bodies, body_id)

    def evaluate_comp(self, labels: Union[List[int], List[str]]) -> float:
        """Evaluates the given labels on the competition data set.

        :param labels: one prediction per competition stance, in file order, either
                       all integer labels (0-3) or all textual stances.
        :return: the total FNC-1 score.
        :raises Exception: if the labels are neither all integers nor all strings.
        """
        # numpy integers (e.g. the result of an argmax) are accepted as well.
        if all(isinstance(label, (int, np.integer)) for label in labels):
            return self.evaluate_fold(self.comp_stances, labels)
        elif all(isinstance(label, str) for label in labels):
            return self.evaluate_fold(self.comp_stances, list(map(self.stance_to_label, labels)))
        else:
            raise Exception('Bad labels format: ' + str(type(labels)))

    def evaluate_fold(self, fold: pd.DataFrame, labels: List[int]) -> float:
        """Evaluates a data fold with the given labels.

        :param fold: frame holding the gold labels, either in a 'labels' column (the
                     renamed transformer input) or in a 'Label' column (as produced
                     by read_labelled).
        :param labels: predicted integer labels, one per row of the fold, in row order.
        :return: the sum of the per-row FNC-1 scores.
        """
        assert len(fold.index) == len(labels)
        gold_column = 'labels' if 'labels' in fold.columns else 'Label'
        # Pair gold and predicted labels by position, so the result does not
        # depend on the fold having a default 0..n-1 index.
        return sum(self.score(actual, output)
                   for actual, output in zip(fold[gold_column].to_list(), labels))

    def score(self, actual: int, output: int) -> float:
        """
        As in scorer.py provided by FNC-1.
        +0.25 for each correct unrelated
        +0.25 for each correct related (label is any of agree, disagree, discuss)
        +0.75 for each correct agree, disagree, discuss

        :param actual: the gold label.
        :param output: the predicted label, must be one of 0, 1, 2, 3.
        """
        assert output in [0, 1, 2, 3]
        score = 0
        if actual == output:
            score += 0.25
            if actual != 3:
                score += 0.50
        if actual in [0, 1, 2] and output in [0, 1, 2]:
            score += 0.25
        return score

### Training the model
The ALBERT model is trained on the whole training set for 5 epochs with a batch size of 10. (If you load an already-trained model from Drive instead, you can skip this step.)

In [6]:
import pandas as pd
from simpletransformers.classification import ClassificationModel


def fold_to_transf_input(fold: pd.DataFrame, reader: "Fnc1Reader"):
    """Converts a fold of training stances into a sentence-pair frame.

    The headline becomes 'text_a', the article body looked up through
    reader.get_body_train becomes 'text_b' and the integer label becomes
    'labels'. The input fold is left unmodified.

    :param fold: frame with exactly the columns ['Headline', 'Body ID', 'Stance', 'Label'].
    :param reader: reader whose train bodies are used to resolve the Body IDs.
    :return: a new frame with the columns ['text_a', 'text_b', 'labels'].
    """
    assert fold.columns.tolist() == ['Headline', 'Body ID', 'Stance', 'Label']
    # Many headlines share one article body, so resolve every distinct Body ID
    # once instead of repeating the lookup for each row.
    body_by_id = {body_id: reader.get_body_train(body_id)
                  for body_id in fold['Body ID'].unique().tolist()}
    result = fold.copy()
    result['Body ID'] = result['Body ID'].map(body_by_id)
    del result['Stance']
    result = result.rename(columns={'Headline': 'text_a', 'Body ID': 'text_b', 'Label': 'labels'})
    return result


if __name__ == '__main__':
    # Read the FNC-1 files from Drive and turn the labelled training stances
    # into the frame built by fold_to_transf_input.
    reader = Fnc1Reader('drive/My Drive/IR-Files/fnc-1/')
    data = fold_to_transf_input(reader.train_stances, reader)

    # ALBERT base checkpoint with a 4-class head (one class per stance).
    model = ClassificationModel(
        'albert',
        'albert-base-v2',
        num_labels=4,
        use_cuda=True,
        args=dict(
            learning_rate=3e-5,
            num_train_epochs=5,
            reprocess_input_data=True,
            overwrite_output_dir=True,
            process_count=10,
            train_batch_size=10,
            eval_batch_size=4,
            max_seq_length=512,
            fp16=True,
            output_dir='drive/My Drive/IR-Files/albert/',
            best_model_dir='drive/My Drive/IR-Files/albert/best_model/',
        ),
    )

    # Train on the complete training set.
    model.train_model(data)

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

  0%|          | 0/49972 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/4998 [00:00<?, ?it/s]



Running Epoch 1 of 5:   0%|          | 0/4998 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/4998 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/4998 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/4998 [00:00<?, ?it/s]

### Evaluating the model
The FNC-1 scoring function is used to evaluate the model on the competition set. The model is loaded from the drive.

In [8]:
def output_to_labels(output):
    """Converts per-class scores to predicted labels (argmax per row).

    :param output: iterable of rows, each an indexable sequence of scores with
                   one entry per class (nested lists or a 2-D array).
    :return: list with, for each row, the index of its highest score; on ties
             the lowest index wins.
    :raises ValueError: if a row is empty.
    """
    # The builtin max returns the first maximal element, which preserves the
    # lowest-index-wins tie-breaking without shadowing `max` with a local.
    return [max(range(len(scores)), key=scores.__getitem__) for scores in output]

def fold_to_transf_input_comp(fold: pd.DataFrame, reader: "Fnc1Reader"):
    """Converts a fold of competition stances into a sentence-pair frame.

    The headline becomes 'text_a', the article body looked up through
    reader.get_body_comp becomes 'text_b' and the integer label becomes
    'labels'. The input fold is left unmodified.

    :param fold: frame with exactly the columns ['Headline', 'Body ID', 'Stance', 'Label'].
    :param reader: reader whose competition bodies are used to resolve the Body IDs.
    :return: a new frame with the columns ['text_a', 'text_b', 'labels'].
    """
    assert fold.columns.tolist() == ['Headline', 'Body ID', 'Stance', 'Label']
    # Many headlines share one article body, so resolve every distinct Body ID
    # once instead of repeating the lookup for each row.
    body_by_id = {body_id: reader.get_body_comp(body_id)
                  for body_id in fold['Body ID'].unique().tolist()}
    result = fold.copy()
    result['Body ID'] = result['Body ID'].map(body_by_id)
    del result['Stance']
    result = result.rename(columns={'Headline': 'text_a', 'Body ID': 'text_b', 'Label': 'labels'})
    return result

# Loads the model stored on Drive. Comment out this declaration to evaluate the
# `model` object that is still in memory from the training cell instead.
model = ClassificationModel(
    'albert',
    'drive/My Drive/IR-Files/albert_all_train/',
    num_labels=4,
    use_cuda=True,
    args=dict(
        learning_rate=3e-5,
        num_train_epochs=5,
        reprocess_input_data=True,
        overwrite_output_dir=True,
        process_count=10,
        train_batch_size=10,
        eval_batch_size=4,
        max_seq_length=512,
        fp16=True,
    ),
)

# Build the competition input, run the model on it and take the highest-scoring
# class per row as the prediction.
reader = Fnc1Reader('drive/My Drive/IR-Files/fnc-1/')
test_data = fold_to_transf_input_comp(reader.comp_stances, reader)
_, output, _ = model.eval_model(test_data)
output_labels = output_to_labels(output)
# Left as a bare expression so the notebook displays the total FNC-1 score.
reader.evaluate_fold(test_data, output_labels)

  0%|          | 0/25413 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/6354 [00:00<?, ?it/s]

9253.75