In [1]:
from models.Model import Model
from models.Logistic.MultiGPT2Logistic import MultiGPT2Logistic
from languages.LanguageModel import LanguageModel
from DataExploration import DataExploration
# from languages.Japanese import Japanese
from languages.English import English
from languages.Finnish import Finnish
from Preprocess import Preprocess
from Pipeline import Pipeline
from typing import List
import torch
import datasets
import pandas as pd
import random
import numpy as np

2022-10-31 17:44:02.856811: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently 
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

enforce_reproducibility()

In [3]:
# Is used to minimize the clutter in the console
datasets.logging.set_verbosity_error()

# Define the languages to be used
languages: List[LanguageModel] = [
    English(),
    Finnish(),
    # Japanese()
]

# gpt2Generator = GPT2Generator()
torch.cuda.empty_cache()


# Preprocessing and Dataset Analysis

In [4]:
all_data = {}

for language in languages:
    pipeline = Pipeline()

    # Get the preprocessed data and split it into training and validation data
    preprocessor = Preprocess(language.tokenize, language.clean)
    data = pipeline.get_data(language=language.name, preproccesor=preprocessor)
    train_data, validation_data = pipeline.split_data(data)

    all_data[language.name] = {
        "train_data": train_data,
        "validation_data": validation_data
    }


Loading data...

Loading data...


In [5]:
all_data["english"]["train_data"]

Unnamed: 0,question_text,document_title,language,document_plaintext,document_url,answer_start,answer_text,is_answerable,tokenized_question,tokenized_plaintext,cleaned_question,cleaned_plaintext
5415,When did the episode The Sue Sylvester Shuffle...,The Sue Sylvester Shuffle,english,Musical performances also attracted mixed comm...,https://en.wikipedia.org/wiki/The%20Sue%20Sylv...,-1,,False,"[When, did, the, episode, The, Sue, Sylvester,...","[Musical, performances, also, attracted, mixed...","[episod, sue, sylvest, shuffl, air, ?]","[music, perform, also, attract, mix, commentar..."
751,When was Cadillac founded?,Cadillac,english,Cadillac is among the first automobile brands ...,https://en.wikipedia.org/wiki/Cadillac,188,1902,True,"[When, was, Cadillac, founded, ?]","[Cadillac, is, among, the, first, automobile, ...","[cadillac, found, ?]","[cadillac, among, first, automobil, brand, wor..."
3774,How many cricket teams are in Australia?,Cricket in Australia,english,The 2015 Cricket World Cup was jointly hosted ...,https://en.wikipedia.org/wiki/Cricket%20in%20A...,110,Fourteen,True,"[How, many, cricket, teams, are, in, Australia...","[The, 2015, Cricket, World, Cup, was, jointly,...","[mani, cricket, team, australia, ?]","[2015, cricket, world, cup, joint, host, austr..."
7283,Who owns the AMC network?,AMC Networks,english,Rainbow ran the local-minded MSG Metro Channel...,https://en.wikipedia.org/wiki/AMC%20Networks,-1,,False,"[Who, owns, the, AMC, network, ?]","[Rainbow, ran, the, local-minded, MSG, Metro, ...","[own, amc, network, ?]","[rainbow, ran, local-mind, msg, metro, channel..."
4382,What was the first hormone discovered?,Secretin,english,It has been suggested that abnormalities in su...,https://en.wikipedia.org/wiki/Secretin,-1,,False,"[What, was, the, first, hormone, discovered, ?]","[It, has, been, suggested, that, abnormalities...","[first, hormon, discov, ?]","[suggest, abnorm, secretin, releas, could, exp..."
...,...,...,...,...,...,...,...,...,...,...,...,...
4373,What was the first country to adopt socialist ...,History of socialism,english,The initial success of the Russian Revolution ...,https://en.wikipedia.org/wiki/History%20of%20s...,-1,,False,"[What, was, the, first, country, to, adopt, so...","[The, initial, success, of, the, Russian, Revo...","[first, countri, adopt, socialist, principl, ?]","[initi, success, russian, revolut, inspir, rev..."
7891,What station did Monk air on?,Monk (TV series),english,"A ""behind the scenes"" audio podcast entitled ""...",https://en.wikipedia.org/wiki/Monk%20%28TV%20s...,-1,,False,"[What, station, did, Monk, air, on, ?]","[A, ``, behind, the, scenes, '', audio, podcas...","[station, monk, air, ?]","[``, behind, scene, '', audio, podcast, entitl..."
4859,What is the most common element in the Earth's...,Abundance of the chemical elements,english,"The elements – that is, ordinary (baryonic) ma...",https://en.wikipedia.org/wiki/Abundance%20of%2...,-1,,False,"[What, is, the, most, common, element, in, the...","[The, elements, –, that, is, ,, ordinary, (, b...","[common, element, earth, 's, atmospher, ?]","[element, –, ,, ordinari, (, baryon, ), matter..."
3264,Who won the last Super Bowl?,List of Super Bowl champions,english,The Pittsburgh Steelers (6–2) have won the mos...,https://en.wikipedia.org/wiki/List%20of%20Supe...,94,New England Patriots,True,"[Who, won, the, last, Super, Bowl, ?]","[The, Pittsburgh, Steelers, (, 6–2, ), have, w...","[last, super, bowl, ?]","[pittsburgh, steeler, (, 6–2, ), super, bowl, ..."


In [38]:
# import transformers
# from transformers import BertTokenizer, BertLMHeadModel
# import torch
# from torch.nn import functional as F
# tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
# model = BertLMHeadModel.from_pretrained('bert-base-multilingual-uncased',
# return_dict=True, is_decoder = True)
# text = "A knife is very "
# input = tokenizer.encode_plus(text, return_tensors = "pt")
# output = model(**input).logits[:, -1, :]
# softmax = F.softmax(output, -1)
# index = torch.argmax(softmax, dim = -1)
# x = tokenizer.decode(index)
# print("word:", x)

In [6]:
import transformers
from transformers import BertTokenizer, BertLMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
from datasets import Dataset
from torch.nn import functional as F

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertLMHeadModel.from_pretrained('bert-base-multilingual-uncased', is_decoder = True)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
dataset = all_data["english"]["train_data"]
dataset = dataset.head(10)
train_dataset = Dataset.from_pandas(dataset[['question_text', 'document_plaintext']])
val_dataset =  Dataset.from_pandas(dataset[['question_text', 'document_plaintext']])

In [11]:
def tokenize_function(examples):
    input_str = 'Question: ' + \
                examples['question_text'] + '\nContext: ' + \
                examples['document_plaintext']
    return tokenizer(input_str, padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    remove_columns=['question_text', 'document_plaintext'],
)
tokenized_val_dataset = val_dataset.map(
    tokenize_function,
    remove_columns=['question_text', 'document_plaintext'],
)

  0%|          | 0/10 [00:00<?, ?ex/s]

  0%|          | 0/10 [00:00<?, ?ex/s]

In [12]:
model_headlines_path = './model_headlines_news'

training_args = TrainingArguments(
    output_dir=model_headlines_path,          # output directory
    num_train_epochs=6,              # total # of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=200,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=model_headlines_path,            # directory for storing logs
    prediction_loss_only=True,
    save_steps=10000 
)

data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

In [13]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,         # training dataset
    eval_dataset=tokenized_val_dataset            # evaluation dataset
)
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertLMHeadModel.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BertLMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10
  Num Epochs = 6
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 6


  0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
text = "A knife is very "
text_ids = tokenizer.encode(text, return_tensors = 'pt')

generated_text_samples = model.generate(
    text_ids,
    max_length= 50,  
    num_beams=5,
    num_return_sequences= 5,
    early_stopping=True 
)
generated_text_samples

tensor([[  101,   143, 65016, 10127, 12495,   102,   143,   119,   119,   119,
           119,   119,   119,   119,   119,   119,   119,   119,   119,   119,
           119,   119,   119,   119,   119,   119,   119,   119,   119,   119,
           119,   119,   119,   119,   119,   119,   119,   119,   119,   119,
           119,   119,   119,   119,   119,   119,   119,   119,   119,   119],
        [  101,   143, 65016, 10127, 12495,   102,   119,   119,   119,   119,
           119,   119,   119,   119,   119,   119,   119,   119,   119,   119,
           119,   119,   119,   119,   119,   119,   119,   119,   119,   119,
           119,   119,   119,   119,   119,   119,   119,   119,   119,   119,
           119,   119,   119,   119,   119,   119,   119,   119,   119,   119],
        [  101,   143, 65016, 10127, 12495,   102,   157,   157,   157,   157,
           157,   157,   157,   157,   157,   157,   157,   157,   157,   157,
           157,   157,   157,   157,   157,   157,

In [None]:
for i, beam in enumerate(generated_text_samples):
    print(f"{i}: {tokenizer.decode(beam, skip_special_tokens=True)}")
    print()

0: a knife is very a...........................................

1: a knife is very............................................

2: a knife is very o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o o

3: a knife is very de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de

4: a knife is very,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,



# Multilingual QA

We start by implementing our binary question system with a multilingual encoder instead of the monolingual ones. With this we perform zero-shot cross-lingual evaluation

In [5]:
# Zero shot classification
for train_language in languages:
    print(f'\n\n--- Training on {train_language.name} ---')
    model = MultiGPT2Logistic()
    model.set_language(train_language.name)
    
    try:
        model.load()
    except:
        train_data = all_data[train_language.name]["train_data"]
        X_train = model.extract_X(train_data)
        y_train = train_data['is_answerable']
        model = pipeline.train(
            model,
            X_train,
            y_train
        )
        model.save()
    
    for val_language in languages:
        print(f'\n\t- Validating on {val_language.name}')
        validation_data = all_data[val_language.name]["validation_data"]
        X_validation = model.extract_X(validation_data)
        y_validation = validation_data['is_answerable']
        pipeline.evaluate(
            model,
            X_validation,
            y_validation
        )



--- Training on english ---


Downloading vocab.json:   0%|          | 0.00/1.81M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/1.15M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.21G [00:00<?, ?B/s]

: 

: 