In [1]:
from models.Model import Model
from models.GPT2Generator import GPT2Generator
from models.Logistic.BOWLogistic import BOWLogistic
from models.MLP.BOWMLP import BOWMLP
from models.MLP.CBOW_BOWMLP import CBOW_BOWMLP
from models.MLP.CBOWMLP import CBOWMLP
from models.RandomForest.BOWRandomForest import BOWRandomForest
from models.RandomForest.CBOW_BOWRandomForest import CBOW_BOWRandomForest
from models.RandomForest.CBOWRandomForest import CBOWRandomForest
from models.Logistic.CBOW_BOWLogistic import CBOW_BOWLogistic
from models.Logistic.CBOWLogistic import CBOWLogistic
from models.XGBoost.BOWXGBoost import BOWXGBoost
from models.XGBoost.CBOW_BOWXGBoost import CBOW_BOWXGBoost
from models.XGBoost.CBOWXGBoost import CBOWXGBoost

from models.SequenceLabeller_BiLSTM_CRF import SequenceLabeller_BiLSTM_CRF
from models.SequenceLabeller_BiLSTM_CRF_Beam import SequenceLabeller_BiLSTM_CRF_Beam
from models.SequenceLabeller_BERT import SequenceLabeller_BERT


from languages.LanguageModel import LanguageModel
from DataExploration import DataExploration
# from languages.Japanese import Japanese
from languages.English import English
from languages.Finnish import Finnish
from Preprocess import Preprocess
from Pipeline import Pipeline
from typing import List
import torch
import datasets
import pandas as pd
import random
import numpy as np
import transformers

In [None]:
# Download the pre-trained fastText Common Crawl word vectors (300 dimensions)
# for English (en), Japanese (ja) and Finnish (fi) into ./word_vectors.
# NOTE(review): the Japanese vectors are still downloaded although the Japanese
# language is commented out in the import and in the `languages` list.
!mkdir word_vectors
%cd word_vectors
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.vec.gz

# Decompress the archives into plain-text .vec files
!gzip -d cc.en.300.vec.gz
!gzip -d cc.ja.300.vec.gz
!gzip -d cc.fi.300.vec.gz

# Return to the notebook's original working directory
%cd ..

In [2]:
import os

# Switch off experiment tracking via the WANDB_DISABLED environment variable
# (presumably read by the Hugging Face Trainer's Weights & Biases integration).
os.environ.update({'WANDB_DISABLED': 'true'})

In [3]:
def enforce_reproducibility(seed=42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Operations that rely on atomic adds can still be non-deterministic,
    because the order of parallel operations is not known.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.
    """
    # Python and NumPy global generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU first, then every CUDA device
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: disable the auto-tuner and request deterministic algorithms
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


enforce_reproducibility()

In [4]:
# Only show errors from the `datasets` library to keep the cell output readable
datasets.logging.set_verbosity_error()

# Languages every experiment below iterates over (Japanese is currently disabled)
languages: List[LanguageModel] = [English(), Finnish()]

# Release cached GPU memory that is no longer in use
torch.cuda.empty_cache()


In [5]:
def print_headline(language: str):
    """Print a section banner for ``language``, preceded by two blank lines.

    Args:
        language: Name of the language shown in the banner.
    """
    banner = f'\n\n--- Language: {language} ---'
    print(banner)

# Preprocessing and Dataset Analysis

In [6]:
# Maps a language name to its "train_data" / "validation_data" splits
all_data = {}

for language in languages:
    pipeline = Pipeline()

    # Preprocess with the language-specific tokenizer and cleaner
    preprocessor = Preprocess(language.tokenize, language.clean)
    data = pipeline.get_data(language=language.name, preproccesor=preprocessor)

    # Split into training and validation data and store both under the language name
    train_data, validation_data = pipeline.split_data(data)
    all_data[language.name] = dict(
        train_data=train_data,
        validation_data=validation_data,
    )


Loading data...

Loading data...


In [None]:
# Run the frequent-word analysis on the training split of every language
for language in languages:
    print_headline(language.name)
    training_split = all_data[language.name]["train_data"]
    data_exploration = DataExploration(training_split)
    data_exploration.find_frequent_words()

# Binary Question Classification
Binary classifiers that only take features based on the question, the context document, and combinations of the two.

In [None]:
for language in languages:
    print_headline(language.name)

    # Bag-of-words based classifiers evaluated for this language
    feature_based_classifiers = [
        BOWRandomForest(language.name),
        BOWMLP(language.name),
        BOWLogistic(language.name),
        BOWXGBoost(language.name),
    ]

    # The splits are the same for every classifier of this language
    train_data = all_data[language.name]["train_data"]
    validation_data = all_data[language.name]["validation_data"]

    for classifier in feature_based_classifiers:
        print(f'--- Classifier: {classifier.__class__.__name__} ---')

        pipeline = Pipeline()

        # Fit on the training split
        X_train = classifier.extract_X(train_data)
        y_train = classifier.extract_y(train_data)
        pipeline.train(model=classifier, X=X_train, y=y_train)

        # Validation features are extracted only after training, as before
        X_validation = classifier.extract_X(validation_data)
        y_validation = classifier.extract_y(validation_data)
        pipeline.evaluate(model=classifier, X=X_validation, y=y_validation)
    

# Representation Learning
Extension of our binary question classifiers to also include features based on continuous vector representations.

In [None]:
for language in languages:
    print_headline(language.name)

    # Classifiers that combine bag-of-words features with continuous (CBOW)
    # word-vector features.
    continous_based_classifiers = [
        CBOW_BOWRandomForest(language.name),
        CBOW_BOWMLP(language.name),
        CBOW_BOWLogistic(language.name),
        CBOW_BOWXGBoost(language.name)
    ]

    train_data = all_data[language.name]["train_data"]
    validation_data = all_data[language.name]["validation_data"]

    # Iterate over the list defined in THIS cell. Looping over
    # `feature_based_classifiers` would silently re-run the plain BOW models
    # left over from the previous section (or raise NameError on a fresh kernel).
    for classifier in continous_based_classifiers:
        print(f'--- Classifier: {classifier.__class__.__name__} ---')

        pipeline = Pipeline()
        pipeline.train(
            model=classifier,
            X=classifier.extract_X(train_data),
            y=classifier.extract_y(train_data)
        )

        pipeline.evaluate(
            model=classifier,
            X=classifier.extract_X(validation_data),
            y=classifier.extract_y(validation_data)
        )

We also test how the performance changes if only the continuous representations are used.

In [None]:
for language in languages:
    print_headline(language.name)

    # Classifiers that use only the continuous (CBOW) word-vector features.
    continous_based_classifiers = [
        CBOWRandomForest(language.name),
        CBOWMLP(language.name),
        CBOWLogistic(language.name),
        CBOWXGBoost(language.name)
    ]

    train_data = all_data[language.name]["train_data"]
    validation_data = all_data[language.name]["validation_data"]

    # Iterate over the list defined in THIS cell. Looping over
    # `feature_based_classifiers` would silently re-run the plain BOW models
    # left over from an earlier section (or raise NameError on a fresh kernel).
    for classifier in continous_based_classifiers:
        print(f'--- Classifier: {classifier.__class__.__name__} ---')

        pipeline = Pipeline()
        pipeline.train(
            model=classifier,
            X=classifier.extract_X(train_data),
            y=classifier.extract_y(train_data)
        )

        pipeline.evaluate(
            model=classifier,
            X=classifier.extract_X(validation_data),
            y=classifier.extract_y(validation_data)
        )

# Language modelling
Extension to the classifiers in which word/sentence representations are instead extracted from neural language models

In [None]:
# Train and evaluate one GPT2CBOWLogistic classifier per language.
# NOTE(review): `GPT2CBOWLogistic` is not imported in the import cell at the top
# of this notebook, so this cell raises NameError on a fresh kernel — add the
# import from its module (path not visible here) before running.
for language in languages:
    print_headline(language.name)

    classifier = GPT2CBOWLogistic(language.name)

    train_data = all_data[language.name]["train_data"]
    validation_data = all_data[language.name]["validation_data"]

    pipeline = Pipeline()
    # Fit on the training split
    pipeline.train(
        model=classifier,
        X=classifier.extract_X(train_data),
        y=classifier.extract_y(train_data)
    )

    # Evaluate on the held-out validation split
    pipeline.evaluate(
        model=classifier,
        X=classifier.extract_X(validation_data),
        y=classifier.extract_y(validation_data)
    )

We sample from these language models to see what kinds of sentences they generate. Moreover, we measure their performance on the TyDi QA validation data with a commonly used language-model evaluation metric (perplexity).

In [None]:
# Words used to start the generated questions, keyed by language name.
question_beginning = {
    'english': ['When', 'What', 'How'],
    'finnish': ['Milloin', 'Mikä', 'Missä'],
    'japanese': ['日本', '『', 'アメリカ']
}

for language in languages:
    print_headline(language.name)

    model = GPT2Generator(language.name)

    train_data = all_data[language.name]["train_data"]
    validation_data = all_data[language.name]["validation_data"]

    pipeline = Pipeline()
    # Train the generator itself. Passing `classifier` here would train a
    # stale object leaked from an earlier cell and leave `model` untrained.
    # Assumes GPT2Generator provides extract_y like the other models — TODO confirm.
    pipeline.train(
        model=model,
        X=model.extract_X(train_data),
        y=model.extract_y(train_data)
    )

    # Sample one continuation per prompt, then score the validation data
    for starting_word in question_beginning[language.name]:
        model.generate_text(f'Question: {starting_word}')
    model.get_perplexity(model.extract_X(validation_data))

# Error Analysis and Interpretability

# Sequence Labelling
We implement a sequence labeller, which predicts which parts of a paragraph are likely part of the answer to the corresponding question.

In [8]:
# Hyper-parameters passed to every sequence labeller constructed below
config = dict(
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=200,
    weight_decay=0.01,
    lstm_dim=128,
    dropout_prob=0.1,
    n_workers=0,
    beam_size=2,
)

In [None]:
import os

# Disable Weights & Biases logging before the sequence labellers are trained.
# The variable must be spelled WANDB_DISABLED; the misspelled "WANDDB_DISABLED"
# is not the name used by the earlier cell and would have no effect.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
for language in languages:
    print_headline(language.name)

    # Sequence labellers compared for this language
    sequence_labellers = [
        SequenceLabeller_BERT(language.name, config),
        SequenceLabeller_BiLSTM_CRF(language.name, config),
    ]

    # The splits are the same for every labeller of this language
    train_data = all_data[language.name]["train_data"]
    validation_data = all_data[language.name]["validation_data"]

    for sequence_labeller in sequence_labellers:
        print(f'--- Sequence Labeller: {sequence_labeller.__class__.__name__} ---')

        pipeline = Pipeline()

        # Fit on the training split
        X_train = sequence_labeller.extract_X(train_data)
        y_train = sequence_labeller.extract_y(train_data)
        pipeline.train(model=sequence_labeller, X=X_train, y=y_train)

        # Validation inputs are extracted only after training, as before
        X_validation = sequence_labeller.extract_X(validation_data)
        y_validation = sequence_labeller.extract_y(validation_data)
        pipeline.evaluate(model=sequence_labeller, X=X_validation, y=y_validation)

        # Persist the trained labeller so later cells can load() it
        sequence_labeller.save()

We add an extension to the sequence labeller which uses beam search to select the optimal sequence of labels for the location of the answer in the text. Analyse how the performance of this system differs with beam search.

In [None]:
for language in languages:
    print_headline(language.name)

    sequence_labeller = SequenceLabeller_BiLSTM_CRF_Beam(language.name, config)

    train_data = all_data[language.name]["train_data"]
    validation_data = all_data[language.name]["validation_data"]

    # Train once per language
    pipeline = Pipeline()
    X_train = sequence_labeller.extract_X(train_data)
    y_train = sequence_labeller.extract_y(train_data)
    pipeline.train(model=sequence_labeller, X=X_train, y=y_train)

    # Re-evaluate the same trained model with different beam sizes
    num_beams = [1, 2, 3]
    for beam in num_beams:
        sequence_labeller.beam_size = beam
        X_validation = sequence_labeller.extract_X(validation_data)
        y_validation = sequence_labeller.extract_y(validation_data)
        pipeline.evaluate(model=sequence_labeller, X=X_validation, y=y_validation)

    sequence_labeller.save()

In [16]:
# Qualitative investigation of the predicted answer spans
transformers.logging.set_verbosity_error()
sequence_labeller = SequenceLabeller_BERT('english', config)
sequence_labeller.load()

validation_data = all_data['english']["validation_data"]

# The first 10 validation examples are mostly without an answer, so start at 10.
first_example = 10
num_examples = 10

for example_index in range(first_example, first_example + num_examples):
    # Double brackets keep the selection as a one-row DataFrame
    row = validation_data.iloc[[example_index]]
    X = sequence_labeller.extract_X(row)

    predictions = sequence_labeller.predict(X)[0] # [0] because we only have one example
    tokens = row['tokenized_plaintext'].values[0]

    # Group consecutive B/I-labelled tokens into spans; an 'O' label closes the
    # current span. zip() stops at the shorter sequence, so a prediction list
    # longer than the token list cannot cause an IndexError.
    spans = []
    current_span = []
    for token, predicted_label in zip(tokens, predictions):
        if predicted_label == 'B' or predicted_label == 'I':
            current_span.append(token)
        elif predicted_label == 'O' and current_span:
            spans.append(current_span)
            current_span = []

    # Keep a span that runs up to the last token
    if current_span:
        spans.append(current_span)

    # Joining the tokens avoids the trailing space a final span used to keep
    predicted_answers = [' '.join(span) for span in spans]

    print(f'Question: {row["question_text"].values[0]}')
    print(f'Answer: {row["answer_text"].values[0]}')
    print(f'Predicted answers: {predicted_answers}')
    print()

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokenized_question, tags, tokenized_plaintext, __index_level_0__. If tokenized_question, tags, tokenized_plaintext, __index_level_0__ are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 8


Question: When was github created?
Answer: February 2008
Predicted answers: []



  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokenized_question, tags, tokenized_plaintext, __index_level_0__. If tokenized_question, tags, tokenized_plaintext, __index_level_0__ are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 8


Question: Who was the first leader of West Germany?
Answer: Theodor Heuss
Predicted answers: ['Theodor Heuss']



  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokenized_question, tags, tokenized_plaintext, __index_level_0__. If tokenized_question, tags, tokenized_plaintext, __index_level_0__ are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 8


Question: What martial arts do Marines learn?
Answer: Marine Corps Martial Arts Program
Predicted answers: ['combine existing and new hand-to-hand and close quarters combat techniques with morale and team-building functions and instruction in the Warrior Ethos', '[', 'which began in 2001 , trains Marines', 'of']



  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokenized_question, tags, tokenized_plaintext, __index_level_0__. If tokenized_question, tags, tokenized_plaintext, __index_level_0__ are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 8


Question: What is the world's largest horse show?
Answer: Devon Horse Show
Predicted answers: ['Since 1896 , the Devon Horse', 'is']



  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokenized_question, tags, tokenized_plaintext, __index_level_0__. If tokenized_question, tags, tokenized_plaintext, __index_level_0__ are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 8


Question: When did the first episode of Big Brother Australia air?
Answer: 
Predicted answers: []



  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokenized_question, tags, tokenized_plaintext, __index_level_0__. If tokenized_question, tags, tokenized_plaintext, __index_level_0__ are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 8


Question: When was Nike founded?
Answer: January 25, 1964
Predicted answers: []



  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokenized_question, tags, tokenized_plaintext, __index_level_0__. If tokenized_question, tags, tokenized_plaintext, __index_level_0__ are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 8


Question: When did Final Fantasy Type-0 come out?
Answer: 
Predicted answers: []



  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokenized_question, tags, tokenized_plaintext, __index_level_0__. If tokenized_question, tags, tokenized_plaintext, __index_level_0__ are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 8


Question: What is the oldest city in Myanmar?
Answer: Beikthano
Predicted answers: []



  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokenized_question, tags, tokenized_plaintext, __index_level_0__. If tokenized_question, tags, tokenized_plaintext, __index_level_0__ are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 8


Question: What is the strongest recorded wind?
Answer: was during the passage of Tropical Cyclone Olivia on 10 April 1996: an automatic weather station on Barrow Island, Australia, registered a maximum wind gust of 408km/h
Predicted answers: []



  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokenized_question, tags, tokenized_plaintext, __index_level_0__. If tokenized_question, tags, tokenized_plaintext, __index_level_0__ are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 8


Question: Who was president in 1817?
Answer: 
Predicted answers: []



# Multilingual QA

We start by implementing our binary question system with a multilingual encoder instead of the monolingual ones. With this we perform zero-shot cross-lingual evaluation

In [None]:
# Multilingual binary classification

We now implement our sequence tagger with a multilingual encoding and perform zero-shot cross-lingual evaluation

In [None]:
# Zero-shot cross-lingual evaluation: train the multilingual sequence labeller
# on one language, then evaluate it on the validation data of every language.
for language in languages:
    print(f'\n\n--- Training on {language.name} ---')

    sequence_labeller = SequenceLabeller_BERT('multilingual', config)

    train_data = all_data[language.name]["train_data"]

    pipeline = Pipeline()
    pipeline.train(
        model=sequence_labeller,
        X=sequence_labeller.extract_X(train_data),
        y=sequence_labeller.extract_y(train_data)
    )

    # A separate loop variable keeps the training `language` intact; reusing
    # the same name would overwrite it for the rest of the outer iteration.
    for validation_language in languages:
        print(f'--- Validating on {validation_language.name} ---')
        validation_data = all_data[validation_language.name]['validation_data']
        pipeline.evaluate(
            model=sequence_labeller,
            X=sequence_labeller.extract_X(validation_data),
            y=sequence_labeller.extract_y(validation_data)
        )

    # NOTE(review): every model is constructed with the same 'multilingual'
    # identifier, so save() may overwrite the checkpoint of the previous
    # training language — confirm how save() derives its path.
    sequence_labeller.save()