In [1]:
from models.Model import Model
from models.GPT2Generator import GPT2Generator
from models.Logistic.BOWLogistic import BOWLogistic
from models.MLP.BOWMLP import BOWMLP
from models.MLP.CBOW_BOWMLP import CBOW_BOWMLP
from models.MLP.CBOWMLP import CBOWMLP
from models.RandomForest.BOWRandomForest import BOWRandomForest
from models.RandomForest.CBOW_BOWRandomForest import CBOW_BOWRandomForest
from models.RandomForest.CBOWRandomForest import CBOWRandomForest
from models.Logistic.CBOW_BOWLogistic import CBOW_BOWLogistic
from models.Logistic.CBOWLogistic import CBOWLogistic
from models.XGBoost.BOWXGBoost import BOWXGBoost
from models.XGBoost.CBOW_BOWXGBoost import CBOW_BOWXGBoost
from models.XGBoost.CBOWXGBoost import CBOWXGBoost

from models.SequenceLabeller_BiLSTM_CRF import SequenceLabeller_BiLSTM_CRF
from models.SequenceLabeller_BiLSTM_CRF_Beam import SequenceLabeller_BiLSTM_CRF_Beam
from models.SequenceLabeller_BERT import SequenceLabeller_BERT


from languages.LanguageModel import LanguageModel
from DataExploration import DataExploration
from languages.Japanese import Japanese
from languages.English import English
from languages.Finnish import Finnish
from Preprocess import Preprocess
from Pipeline import Pipeline
from typing import List
import torch
import datasets
import pandas as pd
import random
import numpy as np

In [None]:
# Download word_vectors
!mkdir word_vectors
%cd word_vectors
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.vec.gz

!gzip -d cc.en.300.vec.gz
!gzip -d cc.ja.300.vec.gz
!gzip -d cc.fi.300.vec.gz

%cd ..

In [2]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently 
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

enforce_reproducibility()

In [3]:
# Is used to minimize the clutter in the console
datasets.logging.set_verbosity_error()

# Define the languages to be used
languages: List[LanguageModel] = [
    English(),
    Finnish(),
    Japanese()
]

# gpt2Generator = GPT2Generator()
torch.cuda.empty_cache()


In [4]:
def print_headline(language: str):
    print(f'\n\n--- Language: {language} ---')

# Preprocessing and Dataset Analysis

In [5]:
all_data = {}

for language in languages:
    pipeline = Pipeline()

    # Get the preprocessed data and split it into training and validation data
    preprocessor = Preprocess(language.tokenize, language.clean)
    data = pipeline.get_data(language=language.name, preproccesor=preprocessor)
    train_data, validation_data = pipeline.split_data(data)

    all_data[language.name] = {
        "train_data": train_data,
        "validation_data": validation_data
    }


Loading data...

Loading data...

Loading data...


In [6]:
# Explore the data for each language
for language in languages:
    print_headline(language.name)
    data_exploration = DataExploration(all_data[language.name]["train_data"])
    data_exploration.find_frequent_words()



--- Language: english ---

Most frequent first words:
[('When', 1999), ('What', 1977), ('How', 1182), ('Who', 930), ('Where', 416)]
Most frequent last words:
[('?', 6693), ('zombie', 2), ('metabolite', 2), ('BCE', 2), ('\\', 1)]



--- Language: finnish ---

Most frequent first words:
[('Milloin', 3234), ('Mikä', 2059), ('Missä', 1474), ('Kuka', 1418), ('Mitä', 954)]
Most frequent last words:
[('?', 12290), ('tulitaistelussa', 2), ('syntynyt', 2), ('pinta-ala', 2), ('lintulaji', 2)]



--- Language: japanese ---

Most frequent first words:
[('日本', 326), ('『', 269), ('アメリカ', 96), ('世界', 89), ('第', 50)]
Most frequent last words:
[('？', 5286), ('いつ', 649), ('た', 544), ('どこ', 529), ('何', 407)]



# Binary Question Classification
Binary classfiers that only takes features based on the question, context document and combinations of the two

In [None]:
for language in languages:
    print_headline(language.name)

    feature_based_classifiers = [
        BOWRandomForest(language.name),
        BOWMLP(language.name),
        BOWLogistic(language.name),
        BOWXGBoost(language.name)
    ]
    
    for classifier in feature_based_classifiers:
        print(f'--- Classifier: {classifier.__class__.__name__} ---')
        train_data = all_data[language.name]["train_data"]
        validation_data = all_data[language.name]["validation_data"]

        pipeline = Pipeline()
        pipeline.train(
            model=classifier,
            X=classifier.extract_X(train_data),
            y=classifier.extract_y(train_data)
        )

        pipeline.evaluate(
            model=classifier,
            X=classifier.extract_X(validation_data),
            y=classifier.extract_y(validation_data)
        )
    

# Representation Learning
Extension of our binary question classifers to also include features based on continous vector representations

In [None]:
for language in languages:
    print_headline(language.name)

    continous_based_classifiers = [
        CBOW_BOWRandomForest(language.name),
        CBOW_BOWMLP(language.name),
        CBOW_BOWLogistic(language.name),
        CBOW_BOWXGBoost(language.name)
    ]
    
    for classifier in feature_based_classifiers:
        print(f'--- Classifier: {classifier.__class__.__name__} ---')
        train_data = all_data[language.name]["train_data"]
        validation_data = all_data[language.name]["validation_data"]

        pipeline = Pipeline()
        pipeline.train(
            model=classifier,
            X=classifier.extract_X(train_data),
            y=classifier.extract_y(train_data)
        )

        pipeline.evaluate(
            model=classifier,
            X=classifier.extract_X(validation_data),
            y=classifier.extract_y(validation_data)
        )

We also test how the performance if only the continous representations was to be used

In [None]:
for language in languages:
    print_headline(language.name)

    continous_based_classifiers = [
        CBOWRandomForest(language.name),
        CBOWMLP(language.name),
        CBOWLogistic(language.name),
        CBOWXGBoost(language.name)
    ]
    
    for classifier in feature_based_classifiers:
        print(f'--- Classifier: {classifier.__class__.__name__} ---')
        train_data = all_data[language.name]["train_data"]
        validation_data = all_data[language.name]["validation_data"]

        pipeline = Pipeline()
        pipeline.train(
            model=classifier,
            X=classifier.extract_X(train_data),
            y=classifier.extract_y(train_data)
        )

        pipeline.evaluate(
            model=classifier,
            X=classifier.extract_X(validation_data),
            y=classifier.extract_y(validation_data)
        )

# Language modelling
Extension to the classifiers in which word/sentence representations are instead extracted from neural language models

In [None]:
for language in languages:
    print_headline(language.name)

    classifier = GPT2CBOWLogistic(language.name)

    train_data = all_data[language.name]["train_data"]
    validation_data = all_data[language.name]["validation_data"]

    pipeline = Pipeline()
    pipeline.train(
        model=classifier,
        X=classifier.extract_X(train_data),
        y=classifier.extract_y(train_data)
    )

    pipeline.evaluate(
        model=classifier,
        X=classifier.extract_X(validation_data),
        y=classifier.extract_y(validation_data)
    )

We try to sample from these language models to see what kinds of sentences they generate. Moreover we measure the performance on the TyDi QA validation data with a commonly used language model evaluations metric

In [None]:
question_beginning = {
    'english': ['When', 'What', 'How'],
    'finnish': ['Milloin', 'Mikä', 'Missä'],
    'japanese': ['日本', '『', 'アメリカ']
}

for language in languages:
    print_headline(language.name)

    model = GPT2Generator(language.name)

    train_data = all_data[language.name]["train_data"]
    validation_data = all_data[language.name]["validation_data"]

    pipeline = Pipeline()
    pipeline.train(
        model=classifier,
        X=classifier.extract_X(train_data),
        y=classifier.extract_y(train_data)
    )

    for starting_word in question_beginning[language.name]:
        model.generate_text(f'Question: {starting_word}')
    model.get_perplexity(model.extract_X(validation_data))

# Error Analysis and Interpretability

# Sequence Labelling
We implement a sequence labeller, which predicts which parts of a paragraph are likel part of the answer to the corresponding question

In [7]:
config = {
    'num_train_epochs': 10,
    'learning_rate': 2e-5,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'warmup_steps': 200,
    'weight_decay': 0.01,
    'lstm_dim': 128,
    'dropout_prob': 0.1,
    'n_workers': 0,
    'beam_size': 2,
}

In [8]:
import os
os.environ["WANDDB_DISABLED"] = "true"

In [15]:
try:
    import wandb
    from wandb import init, log, join  # test that these are available
except ImportError:
    print("msg")

msg


In [16]:
import wandb
from wandb import init, log, join

ImportError: cannot import name 'init' from 'wandb' (unknown location)

In [11]:
for language in languages:
    print_headline(language.name)

    sequence_labellers = [
        SequenceLabeller_BERT(language.name, config),
        SequenceLabeller_BiLSTM_CRF(language.name, config),
    ]

    for sequence_labeller in sequence_labellers:
        print(f'--- Sequence Labeller: {sequence_labeller.__class__.__name__} ---')
        train_data = all_data[language.name]["train_data"]
        validation_data = all_data[language.name]["validation_data"]

        pipeline = Pipeline()
        pipeline.train(
            model=sequence_labeller,
            X=sequence_labeller.extract_X(train_data),
            y=sequence_labeller.extract_y(train_data)
        )

        pipeline.evaluate(
            model=sequence_labeller,
            X=sequence_labeller.extract_X(validation_data),
            y=sequence_labeller.extract_y(validation_data)
        )
        sequence_labeller.save()



--- Language: english ---


loading configuration file config.json from cache at C:\Users\chris/.cache\huggingface\hub\models--bert-base-uncased\snapshots\5546055f03398095e385d7dc625e636cc8910bf2\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\chris/.cache\huggingface\hub\models--bert-base-uncased\snapshots\5546055f03398095e385d7dc625e636cc8910bf2\vocab.t

--- Sequence Labeller: SequenceLabeller_BERT ---


  0%|          | 0/7 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokenized_question, tokenized_plaintext, __index_level_0__, tags. If tokenized_question, tokenized_plaintext, __index_level_0__, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6700
  Num Epochs = 6
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5028
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED

Training the model...


AttributeError: module 'wandb' has no attribute 'run'

We add an extension to the sequence labeller which uses beam search to select the optimal sequence of labels for the location of the answer in the text. Analyse how the performance of this system differs with beam search.

In [None]:
for language in languages:
    print_headline(language.name)

    sequence_labeller = SequenceLabeller_BiLSTM_CRF_Beam(language.name, config)

    train_data = all_data[language.name]["train_data"]
    validation_data = all_data[language.name]["validation_data"]

    pipeline = Pipeline()
    pipeline.train(
        model=sequence_labeller,
        X=sequence_labeller.extract_X(train_data),
        y=sequence_labeller.extract_y(train_data)
    )

    num_beams = [1, 2, 3]

    for beam in num_beams:
        sequence_labeller.beam_size = beam
        pipeline.evaluate(
            model=sequence_labeller,
            X=sequence_labeller.extract_X(validation_data),
            y=sequence_labeller.extract_y(validation_data)
        )
    
    sequence_labeller.save()

In [None]:
# Qualitative investigation of the predicted answer spans

# Multilingual QA

We start by implementing our binary question system with a multilingual encoder instead of the monolingual ones. With this we perform zero-shot cross-lingual evaluation

In [None]:
# Mutli-lingual binary classification

We now implement our sequence tagger with a multilingual encoding and perform zero-shot cross-lingual evaluation

In [13]:
# Zero shot classification
for language in languages:
    print(f'\n\n--- Training on {language.name} ---')

    val_languages = [l.name for l in languages if l.name != language.name]
    print(f'--- Validating on {" and ".join(val_languages)} ---')

    sequence_labeller = SequenceLabeller_BERT('multilingual', config)

    train_data = all_data[language.name]["train_data"]
    validation_data = all_data[language.name]["validation_data"]

    val_df = [all_data[val_name]['validation_data'] for val_name in val_languages]

    validation_data = pd.concat(val_df, ignore_index=True, axis=0)

    pipeline = Pipeline()
    pipeline.train(
        model=sequence_labeller,
        X=sequence_labeller.extract_X(train_data),
        y=sequence_labeller.extract_y(train_data)
    )

    pipeline.evaluate(
        model=sequence_labeller,
        X=sequence_labeller.extract_X(validation_data),
        y=sequence_labeller.extract_y(validation_data)
    )

    sequence_labeller.save()



--- Training on english ---
--- Validating on finnish ---


loading configuration file config.json from cache at /Users/christianjensen/.cache/huggingface/hub/models--bert-base-multilingual-uncased/snapshots/800c34f3d5aa174fe531f560b44b8d14592225b7/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": 

  0%|          | 0/7 [00:00<?, ?ba/s]

KeyboardInterrupt: 