In [1]:
from models.Model import Model
from models.GPT2CBOWLogistic import GPT2CBOWLogistic
from models.GPT2Generator import GPT2Generator
from models.Logistic.BOWLogistic import BOWLogistic
from models.MLP.BOWMLP import BOWMLP
from models.MLP.CBOW_BOWMLP import CBOW_BOWMLP
from models.MLP.CBOWMLP import CBOWMLP
from models.RandomForest.BOWRandomForest import BOWRandomForest
from models.RandomForest.CBOW_BOWRandomForest import CBOW_BOWRandomForest
from models.RandomForest.CBOWRandomForest import CBOWRandomForest
from models.Logistic.CBOW_BOWLogistic import CBOW_BOWLogistic
from models.Logistic.CBOWLogistic import CBOWLogistic
from models.XGBoost.BOWXGBoost import BOWXGBoost
from models.XGBoost.CBOW_BOWXGBoost import CBOW_BOWXGBoost
from models.XGBoost.CBOWXGBoost import CBOWXGBoost

from models.SequenceLabeller_BiLSTM_CRF import SequenceLabeller_BiLSTM_CRF
from models.SequenceLabeller_BiLSTM_CRF_Beam import SequenceLabeller_BiLSTM_CRF_Beam
from models.SequenceLabeller_BERT import SequenceLabeller_BERT


from languages.LanguageModel import LanguageModel
from DataExploration import DataExploration
# from languages.Japanese import Japanese
from languages.English import English
from languages.Finnish import Finnish
from Preprocess import Preprocess
from Pipeline import Pipeline
from typing import List
import torch
import datasets
import pandas as pd
import random
import numpy as np

In [None]:
def enforce_reproducibility(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic runs.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.

    Note:
        Atomic operations can still be non-deterministic: there is currently
        no simple way to enforce determinism for them, because the order of
        parallel operations is not known.
    """
    # Python and NumPy global generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU first, then every CUDA device
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN flags: deterministic mode on, benchmark mode off
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


enforce_reproducibility()

In [2]:
# Only surface errors from the `datasets` library to keep cell outputs uncluttered
datasets.logging.set_verbosity_error()

# Languages that every experiment below iterates over (Japanese is currently disabled)
languages: List[LanguageModel] = [English(), Finnish()]

# Start the experiments with an emptied CUDA cache
torch.cuda.empty_cache()


In [3]:
def print_headline(language: str) -> None:
    """Print a banner that separates the output of one language from the next.

    Args:
        language: Name of the language whose results follow the banner.
    """
    headline = f'\n\n--- Language: {language} ---'
    print(headline)

# Preprocessing and Dataset Analysis

In [4]:
# Train/validation splits for every language, keyed by language name
all_data = {}

for language in languages:
    pipeline = Pipeline()
    preprocessor = Preprocess(language.tokenize, language.clean)

    # Fetch this language's data through the preprocessor, then split it.
    # NOTE(review): the keyword is spelled `preproccesor` at this call site;
    # keep it in sync with the signature of Pipeline.get_data.
    data = pipeline.get_data(language=language.name, preproccesor=preprocessor)
    train_data, validation_data = pipeline.split_data(data)

    all_data[language.name] = dict(
        train_data=train_data,
        validation_data=validation_data,
    )


Loading data...

Loading data...


In [5]:
# Report the most frequent first and last words of each language's training split
for language in languages:
    print_headline(language.name)
    train_split = all_data[language.name]["train_data"]
    data_exploration = DataExploration(train_split)
    data_exploration.find_frequent_words()



--- Language: english ---

Most frequent first words:
[('When', 1994), ('What', 1967), ('How', 1188), ('Who', 945), ('Where', 419)]
Most frequent last words:
[('?', 6693), ('\\', 2), ('BCE', 2), ("''", 2), ('metabolite', 1)]



--- Language: finnish ---

Most frequent first words:
[('Milloin', 3210), ('Mikä', 2048), ('Missä', 1508), ('Kuka', 1435), ('Mitä', 928)]
Most frequent last words:
[('?', 12292), ('tohtoriksi+', 2), ('syntynyt', 2), ('pinta-ala', 2), ('=', 2)]



# Binary Question Classification
Binary classifiers that only take features based on the question, the context document, and combinations of the two.

In [None]:
for language in languages:
    print_headline(language.name)

    feature_based_classifiers = [
        BOWRandomForest(language.name),
        BOWMLP(language.name),
        BOWLogistic(language.name),
        BOWXGBoost(language.name)
    ]

    # The splits are identical for every classifier, so look them up once per language
    splits = all_data[language.name]
    train_data, validation_data = splits["train_data"], splits["validation_data"]

    for classifier in feature_based_classifiers:
        print(f'--- Classifier: {classifier.__name__} ---')

        # A fresh pipeline per classifier: fit on the training split,
        # then score on the validation split
        pipeline = Pipeline()
        pipeline.train(
            model=classifier,
            X=classifier.extract_X(train_data), y=classifier.extract_y(train_data),
        )
        pipeline.evaluate(
            model=classifier,
            X=classifier.extract_X(validation_data), y=classifier.extract_y(validation_data),
        )
    

# Representation Learning
Extension of our binary question classifiers to also include features based on continuous vector representations.

In [None]:
for language in languages:
    print_headline(language.name)

    # Classifiers for this section (CBOW_BOW* variants), one set per language
    continous_based_classifiers = [
        CBOW_BOWRandomForest(language.name),
        CBOW_BOWMLP(language.name),
        CBOW_BOWLogistic(language.name),
        CBOW_BOWXGBoost(language.name)
    ]

    # Iterate over the list built above. The previous version looped over
    # `feature_based_classifiers`, a leftover from an earlier cell, so the
    # CBOW_BOW* models were never trained or evaluated.
    for classifier in continous_based_classifiers:
        print(f'--- Classifier: {classifier.__name__} ---')
        train_data = all_data[language.name]["train_data"]
        validation_data = all_data[language.name]["validation_data"]

        # Fit on the training split ...
        pipeline = Pipeline()
        pipeline.train(
            model=classifier,
            X=classifier.extract_X(train_data),
            y=classifier.extract_y(train_data)
        )

        # ... and score on the held-out validation split
        pipeline.evaluate(
            model=classifier,
            X=classifier.extract_X(validation_data),
            y=classifier.extract_y(validation_data)
        )

We also test how the performance changes if only the continuous representations are used.

In [None]:
for language in languages:
    print_headline(language.name)

    # Classifiers for this section (CBOW-only variants), one set per language
    continous_based_classifiers = [
        CBOWRandomForest(language.name),
        CBOWMLP(language.name),
        CBOWLogistic(language.name),
        CBOWXGBoost(language.name)
    ]

    # Iterate over the list built above. The previous version looped over
    # `feature_based_classifiers`, a leftover from an earlier cell, so the
    # CBOW-only models were never trained or evaluated.
    for classifier in continous_based_classifiers:
        print(f'--- Classifier: {classifier.__name__} ---')
        train_data = all_data[language.name]["train_data"]
        validation_data = all_data[language.name]["validation_data"]

        # Fit on the training split ...
        pipeline = Pipeline()
        pipeline.train(
            model=classifier,
            X=classifier.extract_X(train_data),
            y=classifier.extract_y(train_data)
        )

        # ... and score on the held-out validation split
        pipeline.evaluate(
            model=classifier,
            X=classifier.extract_X(validation_data),
            y=classifier.extract_y(validation_data)
        )

# Language modelling
Extension to the classifiers in which word/sentence representations are instead extracted from neural language models

In [None]:
for language in languages:
    print_headline(language.name)

    classifier = GPT2CBOWLogistic(language.name)

    # Look up this language's splits
    splits = all_data[language.name]
    train_data, validation_data = splits["train_data"], splits["validation_data"]

    # Fit on the training split, then score on the validation split
    pipeline = Pipeline()
    pipeline.train(
        model=classifier,
        X=classifier.extract_X(train_data), y=classifier.extract_y(train_data),
    )
    pipeline.evaluate(
        model=classifier,
        X=classifier.extract_X(validation_data), y=classifier.extract_y(validation_data),
    )

We sample from these language models to see what kinds of sentences they generate. Moreover, we measure their performance on the TyDi QA validation data with a commonly used language-model evaluation metric (perplexity).

In [None]:
# Prompt words used to start each generated question, keyed by language name
question_beginning = {
    'english': ['When', 'What', 'How'],
    'finnish': ['Milloin', 'Mikä', 'Missä'],
    'japanese': ['日本', '『', 'アメリカ']
}

for language in languages:
    print_headline(language.name)

    model = GPT2Generator(language.name)

    train_data = all_data[language.name]["train_data"]
    validation_data = all_data[language.name]["validation_data"]

    # Train the generator created above. The previous version passed
    # `classifier`, a leftover variable from the preceding cell, so `model`
    # was sampled from and scored without this cell ever training it.
    # NOTE(review): assumes GPT2Generator provides `extract_y` like the
    # other models do — confirm against the Model interface.
    pipeline = Pipeline()
    pipeline.train(
        model=model,
        X=model.extract_X(train_data),
        y=model.extract_y(train_data)
    )

    # Sample text for each configured starting word, then compute the
    # perplexity on the validation split
    for starting_word in question_beginning[language.name]:
        model.generate_text(f'Question: {starting_word}')
    model.get_perplexity(model.extract_X(validation_data))

# Error Analysis and Interpretability

# Sequence Labelling
We implement a sequence labeller which predicts which parts of a paragraph are likely part of the answer to the corresponding question.

In [None]:
for language in languages:
    print_headline(language.name)

    sequence_labellers = [
        SequenceLabeller_BiLSTM_CRF(language.name),
        SequenceLabeller_BERT(language.name),
    ]

    # The splits are identical for every labeller, so look them up once per language
    splits = all_data[language.name]
    train_data, validation_data = splits["train_data"], splits["validation_data"]

    for sequence_labeller in sequence_labellers:
        print(f'--- Sequence Labeller: {sequence_labeller.__name__} ---')

        # A fresh pipeline per labeller: fit on the training split,
        # then score on the validation split
        pipeline = Pipeline()
        pipeline.train(
            model=sequence_labeller,
            X=sequence_labeller.extract_X(train_data), y=sequence_labeller.extract_y(train_data),
        )
        pipeline.evaluate(
            model=sequence_labeller,
            X=sequence_labeller.extract_X(validation_data), y=sequence_labeller.extract_y(validation_data),
        )

We add an extension to the sequence labeller which uses beam search to select the optimal sequence of labels for the location of the answer in the text. Analyse how the performance of this system differs with beam search.

In [None]:
# Beam sizes to compare during evaluation
num_beams = [1, 2, 3]

for language in languages:
    print_headline(language.name)

    sequence_labeller = SequenceLabeller_BiLSTM_CRF_Beam(language.name)

    splits = all_data[language.name]
    train_data, validation_data = splits["train_data"], splits["validation_data"]

    # Train once per language ...
    pipeline = Pipeline()
    pipeline.train(
        model=sequence_labeller,
        X=sequence_labeller.extract_X(train_data), y=sequence_labeller.extract_y(train_data),
    )

    # ... then evaluate the same trained model once for each beam size
    for beam in num_beams:
        sequence_labeller.beam_size = beam
        pipeline.evaluate(
            model=sequence_labeller,
            X=sequence_labeller.extract_X(validation_data), y=sequence_labeller.extract_y(validation_data),
        )

In [None]:
# Qualitative investigation of the predicted answer spans

# Multilingual QA

We start by implementing our binary question system with a multilingual encoder instead of the monolingual ones. With this we perform zero-shot cross-lingual evaluation

In [None]:
# Multilingual binary classification

We now implement our sequence tagger with a multilingual encoding and perform zero-shot cross-lingual evaluation

In [5]:
# Zero-shot cross-lingual evaluation: train on one language, validate on all the others
for language in languages:
    print(f'\n\n--- Training on {language.name} ---')

    # Every configured language except the one being trained on
    val_languages = [other.name for other in languages if other.name != language.name]
    print(f'--- Validating on {" and ".join(val_languages)} ---')

    sequence_labeller = SequenceLabeller_BERT('multilingual', {})

    train_data = all_data[language.name]["train_data"]

    # The validation set is the concatenation of the other languages'
    # validation splits; the training language's own validation split is
    # deliberately not part of it. (A previous, immediately overwritten
    # assignment of that split to `validation_data` was removed.)
    val_df = [all_data[val_name]['validation_data'] for val_name in val_languages]
    validation_data = pd.concat(val_df, ignore_index=True, axis=0)

    pipeline = Pipeline()
    pipeline.train(
        model=sequence_labeller,
        X=sequence_labeller.extract_X(train_data),
        y=sequence_labeller.extract_y(train_data)
    )

    pipeline.evaluate(
        model=sequence_labeller,
        X=sequence_labeller.extract_X(validation_data),
        y=sequence_labeller.extract_y(validation_data)
    )



--- Training on english ---
--- Validating on finnish ---


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint a

  0%|          | 0/7 [00:00<?, ?ba/s]

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokenized_question, tags, tokenized_plaintext, __index_level_0__. If tokenized_question, tags, tokenized_plaintext, __index_level_0__ are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6700
  Num Epochs = 6
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1260
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Training the model...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mchristian2903[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1260 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}


KeyboardInterrupt: 