# Token prediction (bidirectional ANNs)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/giuliarambelli/Event_Knowledge_Model_Comparison/blob/master/ANNs_predict-token-masked.ipynb)

In [1]:
#!pip install pandas
import os
import math
import pandas as pd

## 1. Load the dataset


In [2]:
# Load a dataset whose first three columns are: id, sentence, position of the word to mask.
def load_data(df):
    """Split a three-column dataset into three parallel lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first three columns are, in order: the item id, the
        sentence, and the position of the word that has to be masked.
        Columns are selected by position, so the column labels are irrelevant.

    Returns
    -------
    tuple of (list, list, list)
        ``(ids, sents, pos)``. Any sentence that does not already end with a
        period gets ``' .'`` appended; an empty sentence becomes ``' .'``
        instead of raising ``IndexError``.
    """
    ids = df.iloc[:, 0].tolist()
    pos = df.iloc[:, 2].tolist()
    # str.endswith is safe on empty strings, unlike indexing with [-1].
    sents = [sent if sent.endswith('.') else sent + ' .' for sent in df.iloc[:, 1]]
    return ids, sents, pos

In [3]:
# Tab-separated, header-less files in the datasets/id_verbs subdirectory
# (the position of the verb has to be given in each row).
dtfit, ev1, events_adapt = (
    pd.read_csv(tsv_path, sep='\t', header=None)
    for tsv_path in (
        'datasets/id_verbs/DTFit_vassallo_deps.verbs.txt',
        'datasets/id_verbs/ev1_deps.verbs.txt',
        'datasets/id_verbs/newsentences_EventsAdapt.verbs.txt',
    )
)

In [4]:
# Preview the first rows of the EventsAdapt file (columns: id, sentence, position of the verb)
events_adapt.head()

Unnamed: 0,0,1,2
0,0,The raider caught the illness .,2
1,1,The illness caught the raider .,2
2,2,The illness was caught by the raider .,3
3,3,The raider was caught by the illness .,3
4,4,The marauder contracted the disease .,2


In [5]:
# Dataset name -> (ids, sentences, positions) as returned by load_data
datasets = {
    dataset_label: load_data(frame)
    for dataset_label, frame in (
        ('ev1', ev1),
        ('dtfit', dtfit),
        ('new-EventsAdapt', events_adapt),
    )
}

In [6]:
#!pip install tensorflow
#!pip install pytokenizations
#!pip install sentencepiece
import numpy as np
import tokenizations   #   pip install pytokenizations  (https://pypi.org/project/pytokenizations/)
import tensorflow as tf  #  TensorFlow 2.0 is required (Python 3.5-3.7, Pip 19.0 or later)

import sentencepiece as spm
from transformers import BertTokenizer, TFBertForMaskedLM
from transformers import RobertaTokenizer, TFRobertaForMaskedLM
from transformers import XLNetTokenizer, TFXLNetLMHeadModel
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

In [9]:
# Tunable constants.
BATCH_SIZE = 256
N_PREDICTIONS = 15

# Pre-trained tokenizers, keyed by checkpoint name.
dict_tokenizers = {
    checkpoint: tokenizer_class.from_pretrained(checkpoint)
    for checkpoint, tokenizer_class in (
        ("bert-large-cased", BertTokenizer),
        ("roberta-large", RobertaTokenizer),
        ("xlnet-large-cased", XLNetTokenizer),
    )
}

# Pre-trained language-model heads, keyed by the same checkpoint names.
dict_mlm_models = {
    checkpoint: model_class.from_pretrained(checkpoint)
    for checkpoint, model_class in (
        ("bert-large-cased", TFBertForMaskedLM),
        ("roberta-large", TFRobertaForMaskedLM),
        ("xlnet-large-cased", TFXLNetLMHeadModel),
    )
}


All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at bert-large-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
All model checkpoint layers were used when initializing TFRobertaForMaskedLM.

All the layers of TFRobertaForMaskedLM were initialized from the model checkpoint at roberta-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForMaskedLM for predictions without further training.
All model checkpoint layers were used when initializing TFXLNetLMHeadModel.

All the layers of TFXLNetLMHeadModel were initialized from the model checkpoint at xlnet-large-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetLMHeadModel for predictions without 

In [10]:
class TransformerModel:
    """Score a target word in context with a pre-loaded tokenizer / LM pair.

    The tokenizer and the model are looked up by checkpoint name in the
    module-level ``dict_tokenizers`` and ``dict_mlm_models`` dictionaries.
    Model-specific behaviour is selected from the prefix of the checkpoint
    name: ``bert``, ``roberta``, ``xlnet`` or ``gpt``.
    """

    # Placeholder written in place of the target word; gpt has no entry
    # because the sentence is left untouched for it.
    _MASK_TOKENS = {"bert": "[MASK]", "roberta": "<mask>", "xlnet": "<mask>"}

    def __init__(self, transf_model):
        """Bind the tokenizer and model registered under ``transf_model``."""
        self.model_name = transf_model
        self.tokenizer = dict_tokenizers[transf_model]
        self.mlm_model = dict_mlm_models[transf_model]

    def _family(self):
        """Return the supported prefix of ``self.model_name`` or raise ValueError."""
        for prefix in ("bert", "roberta", "xlnet", "gpt"):
            if self.model_name.startswith(prefix):
                return prefix
        raise ValueError("Unsupported model name: {}".format(self.model_name))

    def _vocabulary_entry(self, target_token, id_dep, family):
        """Return the vocabulary form of ``target_token`` or its sub-word list.

        The word is decorated with the model's word-boundary marker, then
        round-tripped through the tokenizer vocabulary. If the round trip
        yields the unknown-token string, the word is out of vocabulary and
        the result of ``tokenizer.tokenize(target_token)`` is returned instead.
        """
        if family == "bert":
            candidate, unknown = target_token, "[UNK]"
        elif family == "xlnet":
            # SentencePiece uses this symbol for whitespace.
            candidate, unknown = u"\u2581" + target_token, "<unk>"
        else:
            # roberta / gpt: words that are not sentence-initial carry a "Ġ" prefix.
            unknown = "<unk>" if family == "roberta" else "<|endoftext|>"
            candidate = target_token if id_dep == 0 else "Ġ" + target_token

        round_trip = self.tokenizer.convert_ids_to_tokens(
            self.tokenizer.convert_tokens_to_ids(candidate))
        if round_trip == unknown:
            return self.tokenizer.tokenize(target_token)
        return candidate

    def prepare_input(self, sentences, pos_ids):
        """Mask the target word of each sentence and locate it in model tokens.

        Parameters
        ----------
        sentences : sequence of str
            Sentences whose words are separated by single spaces.
        pos_ids : sequence of int
            For each sentence, the index of the target word in
            ``sentence.split(" ")``.

        Returns
        -------
        tuple of (list, list, list)
            ``target_tokens``: per sentence, either the vocabulary token of
            the target word (str) or, for out-of-vocabulary words, the list
            of its sub-word tokens.
            ``sentences_with_mask``: the sentences with the target replaced by
            the model's mask token (unchanged for gpt).
            ``dependents_indices``: index of the target in the model
            tokenization, shifted by one for bert, roberta and gpt.

        Raises
        ------
        ValueError
            If the model name has no supported prefix.
        """
        family = self._family()
        mask_token = self._MASK_TOKENS.get(family)

        target_tokens = []
        sentences_with_mask = []
        dependents_indices = []
        for i in range(len(sentences)):
            words = sentences[i].split(" ")
            id_dep = pos_ids[i]
            target_tokens.append(self._vocabulary_entry(words[id_dep], id_dep, family))

            # mask the sentence (a mask is not needed for gpt)
            if mask_token is not None:
                words = [mask_token if w == id_dep else word for w, word in enumerate(words)]
            masked_sent = " ".join(words)
            sentences_with_mask.append(masked_sent)

            model_tokenization = self.tokenizer.tokenize(masked_sent)
            if family == "gpt":
                other_tokens_2_model_tokens, _ = tokenizations.get_alignments(
                    words, model_tokenization)
                dependent_index = other_tokens_2_model_tokens[id_dep][0] + 1
            elif family == "xlnet":
                # the xlnet tokenizer does not add a cls token at the beginning of the sequence
                dependent_index = model_tokenization.index(mask_token)
            else:
                # bert / roberta: take the leading special token into account
                dependent_index = model_tokenization.index(mask_token) + 1
            dependents_indices.append(dependent_index)
        return target_tokens, sentences_with_mask, dependents_indices

    def compute_filler_probability(self, list_target_words, list_masked_sentences,
                                   list_dependents_indexes, unidirectional=False):
        """Return the model probability of each target word in its sentence.

        Parameters
        ----------
        list_target_words : list
            Output ``target_tokens`` of ``prepare_input``. A list entry is
            treated as an out-of-vocabulary word and its score is the mean
            probability of its sub-words at the target position.
        list_masked_sentences : list of str
            Output ``sentences_with_mask`` of ``prepare_input``.
        list_dependents_indexes : list of int
            Output ``dependents_indices`` of ``prepare_input``.
        unidirectional : bool, optional
            When true, the attention mask of every sentence is rebuilt so that
            only positions up to and including the target index are attended.

        Returns
        -------
        list
            One probability per sentence, in input order.

        Raises
        ------
        ValueError
            If the model name has no supported prefix.
        """
        family = self._family()
        if family == "gpt":
            self.tokenizer.pad_token = self.tokenizer.eos_token
            # it is necessary to add a token at the beginning of the sentence
            model_sentences = ["<|endoftext|>" + sent + "<|endoftext|>"
                               for sent in list_masked_sentences]
        else:
            if family == "xlnet":
                # instances of the xlnet tokenizer apply padding to the left by default
                self.tokenizer.padding_side = "right"
            model_sentences = list_masked_sentences
        inputs = self.tokenizer(model_sentences, padding=True, return_tensors="tf")

        if unidirectional:
            new_attention_mask = []
            for mask, dep_index in zip(inputs["attention_mask"], list_dependents_indexes):
                mask_array = np.array([0 for _ in mask])
                mask_array[:dep_index + 1] = 1
                new_attention_mask.append(tf.convert_to_tensor(mask_array))
            inputs["attention_mask"] = tf.convert_to_tensor(new_attention_mask)

        # bert / roberta are read at the masked position itself; gpt / xlnet at
        # the position before it. The original unidirectional, in-vocabulary
        # branch read position 0 for gpt / xlnet, unlike the three other
        # equivalent branches; it now uses the same offset as they do.
        # NOTE(review): xlnet shares gpt's previous-position read-out, as in the
        # original code -- confirm this matches TFXLNetLMHeadModel's logits.
        offset = 0 if family in ("bert", "roberta") else 1

        outputs = self.mlm_model(inputs)[0]
        probabilities_fillers = []
        for batch_elem, target_word, dep_index in zip(range(outputs.shape[0]), list_target_words,
                                                      list_dependents_indexes):
            all_probabilities = tf.nn.softmax(outputs[batch_elem, dep_index - offset]).numpy()
            if isinstance(target_word, list):
                # word is OOV, get its subcomponents probability and average them
                prob_subwords = [all_probabilities[self.tokenizer.convert_tokens_to_ids(subword)]
                                 for subword in target_word]
                probabilities_fillers.append(sum(prob_subwords) / len(prob_subwords))
            else:
                probabilities_fillers.append(
                    all_probabilities[self.tokenizer.convert_tokens_to_ids(target_word)])
        return probabilities_fillers

    def run_prediction(self, data_sequences, indexes, unilateral=False, batch_dimension=64):
        """Score all sentences in batches and return the concatenated scores.

        Parameters
        ----------
        data_sequences : sequence of str
            Sentences to score.
        indexes : sequence of int
            Target word position for each sentence.
        unilateral : bool, optional
            Forwarded as ``unidirectional`` to ``compute_filler_probability``.
        batch_dimension : int, optional
            Maximum number of sentences per model call.

        Returns
        -------
        list
            One probability per sentence; empty when there are no sentences.

        Raises
        ------
        ValueError
            If ``batch_dimension`` is not positive.
        """
        if batch_dimension <= 0:
            raise ValueError("batch_dimension must be a positive integer")

        total_scores = []
        # Slicing past the end is safe, so the last (shorter) batch needs no special case.
        for start in range(0, len(data_sequences), batch_dimension):
            stop = start + batch_dimension
            target_words, masked_sentences, positions_dependents = self.prepare_input(
                data_sequences[start:stop], indexes[start:stop])
            scores = self.compute_filler_probability(target_words, masked_sentences,
                                                     positions_dependents, unilateral)
            total_scores.extend(scores)
        return total_scores
        
    
        


## 3. TASK: masked word prediction

In [11]:
def mask_word(model, sentences, pos):
    """Return the natural-log probability of the masked word of each sentence.

    Parameters
    ----------
    model : object
        Any object exposing ``run_prediction(sentences, pos)`` that returns
        one probability per sentence.
    sentences : sequence of str
        Sentences to score.
    pos : sequence of int
        Position of the word to mask in each sentence.

    Returns
    -------
    list of float
        ``log(p)`` per sentence. A probability of exactly 0 maps to ``-inf``
        instead of raising ``ValueError`` from ``math.log``.
    """
    model_probs = model.run_prediction(sentences, pos)
    return [float('-inf') if prob == 0 else math.log(prob) for prob in model_probs]

In [12]:
# Checkpoint names to evaluate; each one must be a key of dict_tokenizers and dict_mlm_models
models = ["bert-large-cased","roberta-large","xlnet-large-cased"]

### 3.1 SUB-TASK: Verb prediction
Given a sentence, mask the verb and compute its probability. 

In [13]:
# Output folder for the verb-probability files.
# verb_prob reads this global when it is called, so run this cell before calling it.
out_dir = 'verb-probs/'
os.makedirs(out_dir, exist_ok=True)

In [14]:
# Score the masked verb of every sentence with each model and save the scores
def verb_prob(models, name):
    """Write the log-probability of the masked verb for one dataset.

    For each checkpoint name in ``models`` a ``TransformerModel`` is built and
    the verb of every sentence in ``datasets[name]`` is scored with
    ``mask_word``. One tab-separated line (id, sentence, score) per sentence
    is written to ``<name>.<model>.verb-prob.txt`` inside the module-level
    ``out_dir``, which is read when the function runs.
    """
    sentence_ids, sentences, verb_positions = datasets[name]

    for model_name in models:
        print('Model ', model_name)
        scorer = TransformerModel(model_name)
        verb_scores = mask_word(scorer, sentences, verb_positions)

        target_path = os.path.join(out_dir, '{}.{}.verb-prob.txt'.format(name, model_name))
        print('Write ', target_path)
        with open(target_path, 'w') as fout:
            fout.writelines(
                '{}\t{}\t{}\n'.format(sentence_id, sentence, score)
                for sentence_id, sentence, score in zip(sentence_ids, sentences, verb_scores)
            )
            
    

In [16]:
# Run verb prediction on every loaded dataset with all configured models
for dataset_name in datasets:
    print('Processing: ', dataset_name)
    verb_prob(models, dataset_name)

Model  bert-large-cased
Write  verb-probs/ev1.bert-large-cased.verb-prob.txt
Model  roberta-large
Write  verb-probs/ev1.roberta-large.verb-prob.txt
Model  xlnet-large-cased
Write  verb-probs/ev1.xlnet-large-cased.verb-prob.txt
Model  bert-large-cased
Write  verb-probs/dtfit.bert-large-cased.verb-prob.txt
Model  roberta-large
Write  verb-probs/dtfit.roberta-large.verb-prob.txt
Model  xlnet-large-cased
Write  verb-probs/dtfit.xlnet-large-cased.verb-prob.txt
Model  bert-large-cased
Write  verb-probs/new-EventsAdapt.bert-large-cased.verb-prob.txt
Model  roberta-large
Write  verb-probs/new-EventsAdapt.roberta-large.verb-prob.txt
Model  xlnet-large-cased
Write  verb-probs/new-EventsAdapt.xlnet-large-cased.verb-prob.txt


### 3.2 SUB-TASK: Last word prediction
Given a sentence, mask the last word (the token right before the final period) and compute its probability.

In [17]:
# Output folder for the last-word files.
# This rebinds the same global `out_dir` that verb_prob also reads at call time.
out_dir = 'lastword-probs/'
os.makedirs(out_dir, exist_ok=True)

In [18]:
# Score the masked last word of every sentence with each model and save the scores
def last_word_prob(models, name):
    """Write the log-probability of the masked last word for one dataset.

    The masked position is the second-to-last token of the stripped,
    space-split sentence. For each checkpoint name in ``models`` the scores
    are computed with ``mask_word`` and written, one tab-separated line
    (id, sentence, score) per sentence, to ``<name>.<model>.last-word-prob.txt``
    inside the module-level ``out_dir``, which is read when the function runs.
    """
    sentence_ids, sentences, _verb_positions = datasets[name]
    # Position of the token right before the final one
    # (assumes the final token is the sentence-closing period).
    last_word_positions = [len(sentence.strip().split(' ')) - 2 for sentence in sentences]

    for model_name in models:
        print('Model ', model_name)
        scorer = TransformerModel(model_name)
        last_word_scores = mask_word(scorer, sentences, last_word_positions)

        target_path = os.path.join(out_dir, '{}.{}.last-word-prob.txt'.format(name, model_name))
        print('Write ', target_path)
        with open(target_path, 'w') as fout:
            fout.writelines(
                '{}\t{}\t{}\n'.format(sentence_id, sentence, score)
                for sentence_id, sentence, score in zip(sentence_ids, sentences, last_word_scores)
            )

In [19]:
# Run last-word prediction on every loaded dataset with all configured models
for dataset_name in datasets:
    print('Processing: ', dataset_name)
    last_word_prob(models, dataset_name)

Processing:  ev1
Model  bert-large-cased
Write  lastword-probs/ev1.bert-large-cased.last-word-prob.txt
Model  roberta-large
Write  lastword-probs/ev1.roberta-large.last-word-prob.txt
Model  xlnet-large-cased
Write  lastword-probs/ev1.xlnet-large-cased.last-word-prob.txt
Processing:  dtfit
Model  bert-large-cased
Write  lastword-probs/dtfit.bert-large-cased.last-word-prob.txt
Model  roberta-large
Write  lastword-probs/dtfit.roberta-large.last-word-prob.txt
Model  xlnet-large-cased
Write  lastword-probs/dtfit.xlnet-large-cased.last-word-prob.txt
Processing:  new-EventsAdapt
Model  bert-large-cased
Write  lastword-probs/new-EventsAdapt.bert-large-cased.last-word-prob.txt
Model  roberta-large
Write  lastword-probs/new-EventsAdapt.roberta-large.last-word-prob.txt
Model  xlnet-large-cased
Write  lastword-probs/new-EventsAdapt.xlnet-large-cased.last-word-prob.txt
