<a href="https://colab.research.google.com/github/danielvangelder/Applied-NLP-Project-IN4325/blob/main/albert_fnc1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Using ALBERT on the FNC-1 data set
First download the [FNC-1 dataset](http://www.fakenewschallenge.org) and set the locations in the cell below. Then mount your Google Drive and install the libraries.
If you get a TQDM Metafile error, re-run the installation cell.

In [None]:
# Folder with an already trained ALBERT model; loaded by the evaluation cell.
PRETRAINED_MODEL_LOCATION = 'drive/My Drive/IR-Files/albert_all_train/'
# Output folder passed to the training run ('output_dir' / 'best_model_dir').
MODEL_OUT_LOCATION = 'drive/My Drive/IR-Files/albert/'
# Folder holding the FNC-1 CSV files; generated CSV files are written here as well.
FNC1_LOCATION = 'drive/My Drive/IR-Files/fnc-1/'

In [None]:
# Mount Google Drive so the relative 'drive/My Drive/...' locations above resolve
# (assumes the notebook's working directory is /content -- TODO confirm).
from google.colab import drive
drive.mount('/content/drive')
# Notebook shell escapes: install the third-party libraries used below.
!pip install tqdm
!pip install simpletransformers

### Data reader class
Reads the data set and performs scoring and data splitting.

In [None]:
from math import ceil
from typing import List, Tuple, Union

import numpy as np
import pandas as pd

class Fnc1Reader:
    """Reads the FNC-1 (Fake News Challenge) data set, splits it and scores predictions."""

    # Stance string -> integer class label.
    _STANCE_LABELS = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3}

    def __init__(self, loc: str):
        """Inits the data reader with the data at the given location.

        Expects the train, test and competition CSV files to be present in `loc`.
        A trailing '/' is appended to `loc` when it is missing; an empty `loc`
        is left as-is, so files are resolved relative to the working directory.
        """
        self.loc = loc
        if self.loc and not self.loc.endswith('/'):
            self.loc += '/'
        # Each pair is (bodies, stances); see read_labelled / read_test for the columns.
        self.train_bodies, self.train_stances = self.read_train()
        self.test_bodies, self.test_stances = self.read_test()
        self.comp_bodies, self.comp_stances = self.read_comp()

    def read_train(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Reads the train set from the data location."""
        return self.read_labelled('train_bodies.csv', 'train_stances.csv')

    def read_comp(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Reads the competition data set from the data location."""
        return self.read_labelled('competition_test_bodies.csv', 'competition_test_stances.csv')

    def read_labelled(self, bodies_loc: str, stances_loc: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Reads a labelled bodies/stances file pair located under `self.loc`.

        Returns:
            (bodies, stances): `bodies` has the columns ['Body ID', 'articleBody'];
            `stances` has ['Headline', 'Body ID', 'Stance', 'Label'], where 'Label'
            is the integer produced by `stance_to_label`.

        Raises:
            Exception: if a stance value is not one of the four known stances.
            AssertionError: if either file is empty.
        """
        bodies = pd.read_csv(self.loc + bodies_loc, names=['Body ID', 'articleBody'], header=0)
        stances = pd.read_csv(self.loc + stances_loc, names=['Headline', 'Body ID', 'Stance'], header=0)
        assert len(bodies) != 0 and len(stances) != 0
        stances['Label'] = [self.stance_to_label(stance) for stance in stances['Stance']]
        assert bodies.columns.to_list() == ['Body ID', 'articleBody'] \
               and stances.columns.to_list() == ['Headline', 'Body ID', 'Stance', 'Label']

        return bodies, stances

    def stance_to_label(self, stance: str) -> int:
        """
        Converts a stance string to its integer label.

        0, agree: The body text agrees with the headline.
        1, disagree: The body text disagrees with the headline.
        2, discuss: The body text discusses the same topic as the headline, but does not take a position.
        3, unrelated: The body text discusses a different topic than the headline.

        Raises an Exception when the stance is not one of the above.
        """
        try:
            return self._STANCE_LABELS[stance]
        except (KeyError, TypeError):
            # str() so that non-string input (e.g. NaN from an empty CSV cell)
            # still yields this message instead of a concatenation TypeError.
            raise Exception('Stance does not exist: ' + str(stance)) from None

    def read_test(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Reads the (unlabelled) test set from the data location.

        Returns (bodies, stances) where `stances` only has ['Headline', 'Body ID'].
        """
        # NOTE(review): this reads the *train* files; presumably the unlabelled
        # test files were meant -- confirm which files exist at the data location.
        bodies = pd.read_csv(self.loc + 'train_bodies.csv', names=['Body ID', 'articleBody'], header=0)
        # usecols restricts parsing to the first two columns. Without it, a file
        # with a third (Stance) column makes pandas use the first column as the
        # index, shifting the two names onto the wrong columns.
        stances = pd.read_csv(self.loc + 'train_stances.csv', names=['Headline', 'Body ID'],
                              usecols=[0, 1], header=0)
        assert len(bodies) != 0 and len(stances) != 0
        assert bodies.columns.to_list() == ['Body ID', 'articleBody'] \
               and stances.columns.to_list() == ['Headline', 'Body ID']

        return bodies, stances

    def kfold(self, n: int) -> List[pd.DataFrame]:
        """Returns a list of n random folds of the training set.

        The train stances are shuffled and cut into n consecutive slices whose
        sizes differ by at most one row; every row ends up in exactly one fold.
        """
        size = len(self.train_stances.index)
        shuffled = self.train_stances.sample(frac=1).reset_index(drop=True)

        folds = []
        for i in range(n):
            lower = ceil(i / n * size)
            # Pin the last boundary so float rounding can never drop trailing rows.
            upper = size if i == n - 1 else ceil((i + 1) / n * size)
            folds.append(shuffled.iloc[lower:upper].reset_index(drop=True))

        return folds

    @staticmethod
    def _get_body(bodies: pd.DataFrame, body_id: int) -> str:
        """Returns the first article body in `bodies` whose 'Body ID' equals `body_id`."""
        matches = bodies.loc[bodies['Body ID'] == body_id, 'articleBody']
        if matches.empty:
            raise Exception('No body with ID ' + str(body_id))
        return matches.iloc[0]

    def get_body_train(self, body_id: int) -> str:
        """Returns the right body text from the train set. Raises an Exception for an unknown ID."""
        return self._get_body(self.train_bodies, body_id)

    def get_body_test(self, body_id: int) -> str:
        """Returns the right body text from the test set. Raises an Exception for an unknown ID."""
        return self._get_body(self.test_bodies, body_id)

    def get_body_comp(self, body_id: int) -> str:
        """Returns the right body text from the competition set. Raises an Exception for an unknown ID."""
        return self._get_body(self.comp_bodies, body_id)

    def evaluate_comp(self, labels: Union[List[int], List[str]]) -> float:
        """Evaluates the given labels on the competition data set.

        `labels` must be all integers (Python or numpy) or all stance strings,
        in the row order of `self.comp_stances`; anything else raises an Exception.
        """
        if all(isinstance(label, (int, np.integer)) for label in labels):
            return self.evaluate_fold(self.comp_stances, labels)
        elif all(isinstance(label, str) for label in labels):
            return self.evaluate_fold(self.comp_stances, list(map(self.stance_to_label, labels)))
        else:
            raise Exception('Bad labels format: ' + str(type(labels)))

    def evaluate_fold(self, fold: pd.DataFrame, labels: List[int]) -> float:
        """Evaluates a data fold with the given labels.

        The actual labels are taken from the 'labels' column when present (the
        transformer input format) and from 'Label' otherwise (the format produced
        by `read_labelled`). `labels` are matched to the rows by position.
        """
        assert len(fold.index) == len(labels)
        column = 'labels' if 'labels' in fold.columns else 'Label'
        return sum(self.score(actual, output) for actual, output in zip(fold[column], labels))

    def score(self, actual: int, output: int) -> float:
        """
        As in scorer.py provided by FNC-1.
        +0.25 for each correct unrelated
        +0.25 for each correct related (label is any of agree, disagree, discuss)
        +0.75 for each correct agree, disagree, discuss
        """
        assert output in [0, 1, 2, 3]
        score = 0.0
        if actual == output:
            score += 0.25
            if actual != 3:
                score += 0.50
        # Both related (not 'unrelated'): credit for the related/unrelated decision.
        if actual in [0, 1, 2] and output in [0, 1, 2]:
            score += 0.25
        return score

### Training the model
The ALBERT model is trained on the whole training set for 5 epochs with a batch size of 10. (You can skip this step if you load an already trained model from the drive.)

In [None]:
import pandas as pd
from simpletransformers.classification import ClassificationModel


def fold_to_transf_input(fold: pd.DataFrame, reader: Fnc1Reader):
    """Converts a train fold into the sentence-pair frame used for training.

    Returns a new frame with the columns 'text_a' (headline), 'text_b' (article
    body looked up in the train set through `reader`) and 'labels'; `fold`
    itself is not modified.
    """
    assert fold.columns.tolist() == ['Headline', 'Body ID', 'Stance', 'Label']
    # Resolve every body ID to its article text before reshaping the frame.
    body_texts = fold['Body ID'].map(reader.get_body_train)
    pairs = fold.drop(columns=['Stance'])
    pairs['Body ID'] = body_texts
    new_names = {'Headline': 'text_a', 'Body ID': 'text_b', 'Label': 'labels'}
    return pairs.rename(columns=new_names)


if __name__ == '__main__':
    # Load the data set and turn the complete train set into model input.
    reader = Fnc1Reader(FNC1_LOCATION)
    data = fold_to_transf_input(reader.train_stances, reader)

    # Start from the 'albert-base-v2' weights with a 4-way classification head
    # (one class per stance label 0-3).
    model = ClassificationModel('albert', 'albert-base-v2', num_labels=4, use_cuda=True, args={
        'learning_rate':3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
        'process_count': 10,
        'train_batch_size': 10,
        'eval_batch_size': 4,
        'max_seq_length': 512,
        'fp16': True,
        # Training output goes to the drive location configured at the top.
        'output_dir': MODEL_OUT_LOCATION,
        'best_model_dir': MODEL_OUT_LOCATION + 'best/',
    })

    model.train_model(data)

### Evaluating the model
The FNC-1 scoring function is used to evaluate the model on the competition set. The model is loaded from the drive.

In [None]:
import pandas as pd
from simpletransformers.classification import ClassificationModel

def output_to_labels(output):
    """Converts per-class model outputs into predicted label indices.

    Args:
        output: iterable of indexable score rows (one row per sample).

    Returns:
        A list with, for every row, the index of its highest score. On ties the
        lowest index wins.

    Raises:
        ValueError: if a row is empty.
    """
    # max() keeps the first maximal element, matching a strict '>' scan, and
    # avoids shadowing the builtin with a local named `max`.
    return [max(range(len(scores)), key=scores.__getitem__) for scores in output]

def fold_to_transf_input_comp(fold: pd.DataFrame, reader: Fnc1Reader):
    """Converts a competition fold into the sentence-pair frame used for evaluation.

    Same reshaping as for training input, except that article bodies are looked
    up in the competition set. The result has the columns 'text_a', 'text_b'
    and 'labels'; `fold` itself is not modified.
    """
    expected_columns = ['Headline', 'Body ID', 'Stance', 'Label']
    assert fold.columns.tolist() == expected_columns
    texts = fold['Body ID'].map(reader.get_body_comp)
    frame = fold.drop(columns=['Stance'])
    frame['Body ID'] = texts
    return frame.rename(columns={'Headline': 'text_a', 'Body ID': 'text_b', 'Label': 'labels'})

# Comment this model declaration to use the trained model.
# Loads an ALBERT classifier (4 stance classes) from PRETRAINED_MODEL_LOCATION and
# rebinds the global `model` that the evaluation cells below call.
model = ClassificationModel('albert', PRETRAINED_MODEL_LOCATION, num_labels=4, use_cuda=True, args={
        'learning_rate':3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
        'process_count': 10,
        'train_batch_size': 10,
        'eval_batch_size': 4,
        'max_seq_length': 512,
        'fp16': True,
    })

def labels_to_stances(labels):
    """Converts integer labels (0-3) back to their FNC-1 stance strings.

    Args:
        labels: iterable of integer labels.

    Returns:
        A list of stance strings with the same length and order as `labels`.

    Raises:
        ValueError: if a label is not 0, 1, 2 or 3. Skipping such a label would
            shorten the result and misalign it with the rows it is assigned to.
    """
    stance_names = {0: 'agree', 1: 'disagree', 2: 'discuss', 3: 'unrelated'}
    stances = []
    for label in labels:
        if label not in stance_names:
            raise ValueError('Label does not exist: ' + str(label))
        stances.append(stance_names[label])
    return stances

In [None]:
# Evaluate the loaded model on the competition set and print its FNC-1 score.
reader = Fnc1Reader(FNC1_LOCATION)
test_data = fold_to_transf_input_comp(reader.comp_stances, reader)
# Only the second value returned by eval_model (the per-class outputs) is used.
_, output, _ = model.eval_model(test_data)
output_labels = output_to_labels(output)
print(reader.evaluate_fold(test_data, output_labels))

# Write the predictions: re-read the competition stances file, overwrite its
# 'Stance' column with the predicted stances and store it as a new CSV file.
output_stances = labels_to_stances(output_labels)
csv = pd.read_csv(FNC1_LOCATION + 'competition_test_stances.csv', header=0)
csv['Stance'] = output_stances
csv.to_csv(FNC1_LOCATION + 'albert_test_output.csv', index=False)

## Gender Bias Analysis
In order to perform gender bias analysis, we must first create an augmented data set in which all gender definitions are flipped.

In [None]:
# Notebook shell escapes: install the packages imported by the next cell.
!pip install spacy
!pip install faker
!pip install gender_guesser

In [None]:
import spacy
import faker
import gender_guesser.detector as gender

reader = Fnc1Reader(FNC1_LOCATION)
# NOTE(review): `s` is not read by any of the visible cells below
# (create_person_mapping binds its own local `s`) -- possibly a leftover.
s = reader.get_body_train(0)

# Module-level helpers used by create_person_mapping:
# `f` generates replacement first names, `d` guesses the gender of a first name.
f = faker.Faker()
d = gender.Detector()

# spaCy pipeline used for entity recognition and tagging in the functions below.
nlp = spacy.load('en_core_web_sm')

def create_person_mapping(line):
    """Builds a name replacement mapping for the persons mentioned in `line`.

    Every PERSON entity whose first name is guessed as 'male' gets a generated
    female first name and vice-versa; entities with any other guess are skipped.
    The returned dict maps both the bare first name and the full entity text to
    their replacement, the remaining name parts being kept unchanged.
    """
    replaces = {}
    for entity in nlp(line).ents:
        if entity.label_ != 'PERSON':
            continue
        person = entity.text
        first_name, *other_parts = person.split(' ')

        guessed = d.get_gender(first_name)
        if guessed == 'male':
            new_first_name = f.first_name_female()
        elif guessed == 'female':
            new_first_name = f.first_name_male()
        else:
            # Unknown or ambiguous gender: leave this person untouched.
            continue

        replaces[first_name] = new_first_name
        replaces[person] = ' '.join([new_first_name, *other_parts])

    return replaces

def flip_genders(line, mapping, count):
    """
    Flips all gender definitions in a piece of text.

    Gendered pronouns and the words 'man'/'woman' are swapped token by token
    (the original capitalisation of the first letter and the whitespace after
    each token are kept), then every name in `mapping` that occurs in `line`
    is replaced by its mapped value.

    Args:
        line: the text to transform.
        mapping: dict from a name to its replacement.
        count: mutable two-element list updated in place; `count[0]` is
            increased by the number of swapped tokens and `count[1]` by the
            number of occurrences in `line` of the names found in `mapping`.

    Returns:
        The transformed text.
    """
    # Words with a single counterpart regardless of their grammatical role.
    direct_swaps = {
        'he': 'she',
        'him': 'her',
        'his': 'her',
        'she': 'he',
        'man': 'woman',
        'woman': 'man',
    }

    pieces = []
    for token in nlp(line):
        lowered = token.text.lower()
        if lowered in direct_swaps:
            replace = direct_swaps[lowered]
        elif lowered == 'her' and token.tag_ == 'PRP$':
            # Possessive 'her' -> 'his'.
            replace = 'his'
        elif lowered == 'her' and token.tag_ == 'PRP':
            # Personal pronoun 'her' -> 'him'.
            replace = 'him'
        else:
            replace = None

        if replace is None:
            replace = token.text
        else:
            count[0] += 1
            if token.text[0].isupper():
                replace = replace[0].upper() + replace[1:]
        # Re-attach the whitespace that followed the token in the input.
        pieces.append(replace + ' ' * (len(token.text_with_ws) - len(token.text)))
    res = ''.join(pieces)

    #TODO: This could swap one name multiple times
    for name, replacement in mapping.items():
        if name in line:
            count[1] += line.count(name)
            res = res.replace(name, replacement)

    return res

Creates the augmented test set. Can be skipped if it already exists on your drive.

In [None]:
# Global name -> replacement mapping shared by all headlines and bodies.
mapping = {}

augmented_bodies = reader.comp_bodies.copy()
augmented_stances = reader.comp_stances.copy()

# Collect name replacements from every headline. The first replacement found
# for a name is kept, so a name is replaced the same way everywhere.
for i, row in augmented_stances.iterrows():
    m = create_person_mapping(row['Headline'])
    for k in m:
        if k not in mapping:
            mapping[k] = m[k]

# Same for every article body.
for i, row in augmented_bodies.iterrows():
    m = create_person_mapping(row['articleBody'])
    for k in m:
        if k not in mapping:
            mapping[k] = m[k]

# Running totals updated by flip_genders: [swapped tokens, name occurrences].
count = [0, 0]

augmented_stances['Headline'] = augmented_stances['Headline'].map(lambda r: flip_genders(r, mapping, count))
# Drop the derived 'Label' column; Fnc1Reader.read_labelled adds it again when
# the file is read back.
del augmented_stances['Label']
augmented_bodies['articleBody'] = augmented_bodies['articleBody'].map(lambda r: flip_genders(r, mapping, count))

print(count)

augmented_stances.to_csv(FNC1_LOCATION + 'augmented_test_stances.csv', index=False)
augmented_bodies.to_csv(FNC1_LOCATION + 'augmented_test_bodies.csv', index=False)

Reads the augmented test set from the drive and evaluates the performance of the fine-tuned model on it. Then writes the predicted stances to the drive.

In [None]:
reader = Fnc1Reader(FNC1_LOCATION)
# Replace the competition data of the reader with the augmented (gender-flipped)
# set, so get_body_comp looks bodies up in the augmented bodies.
reader.comp_bodies, reader.comp_stances = reader.read_labelled('augmented_test_bodies.csv', 
                                                            'augmented_test_stances.csv')
test_data = fold_to_transf_input_comp(reader.comp_stances, reader)
# Only the second value returned by eval_model (the per-class outputs) is used.
_, output, _ = model.eval_model(test_data)
output_labels = output_to_labels(output)
print(reader.evaluate_fold(test_data, output_labels))

# The output file is based on the original competition stances file (original
# headlines); only its 'Stance' column is overwritten with the predictions.
output_stances = labels_to_stances(output_labels)
csv = pd.read_csv(FNC1_LOCATION + 'competition_test_stances.csv', header=0)
csv['Stance'] = output_stances
csv.to_csv(FNC1_LOCATION + 'albert_test_augmented_output.csv', index=False)