# Verb prediction (bidirectional ANNs)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/giuliarambelli/Event_Knowledge_ANN/blob/master/ANNs_predict-verb-masked.ipynb) 

## Load the dataset


In [None]:
def load_data(inpath):
    """Load a tab-separated dataset of ids, sentences and target positions.

    Every non-blank line of ``inpath`` must contain exactly three
    tab-separated fields: an id, a sentence, and the position of the word
    that has to be masked (third column).

    Returns:
        Three parallel lists ``(idxs, sentences, pos)``; ``pos`` holds ints.

    Raises:
        ValueError: if a non-blank line does not have exactly three fields
            or its third field is not an integer.
    """
    idxs = []
    sentences = []
    pos = []
    with open(inpath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no record and would break the unpacking below.
                continue
            idx, sentence, target_pos = line.split('\t')
            idxs.append(idx)
            sentences.append(sentence)
            pos.append(int(target_pos))
    return idxs, sentences, pos

In [None]:
# Load ids, sentences and target-word positions from
# new-EventsAdapt-sentences.ids.txt into three parallel lists.
idxs_sent, sentences, pos = load_data('originals/new-EventsAdapt-sentences.ids.txt')

In [None]:
def prepare_data(df):
    """Extract ids and period-terminated sentences for word-by-word masking.

    Reads column ``0`` (id) and column ``1`` (sentence) of every row in
    ``df``. A sentence that does not already end with ``'.'`` gets ``' .'``
    appended, so every returned sentence ends with a period.

    Returns:
        A tuple ``(ids, sents)`` of two parallel lists.
    """
    ids = []
    sents = []
    for _, row in df.iterrows():
        sentence = row[1]
        ids.append(row[0])
        # endswith (instead of sentence[-1]) also copes with an empty
        # sentence, which would otherwise raise IndexError.
        if sentence.endswith('.'):
            sents.append(sentence)
        else:
            sents.append(sentence + ' .')
    return (ids, sents)

In [None]:
import pandas as pd
# Read the four tab-separated, header-less dataset files (ids and sentences
# are used for the word-by-word predictions).
dtfit, ev1, ev2, new_ev = (
    pd.read_csv(dataset_path, sep='\t', header=None)
    for dataset_path in (
        'originals/DTFit_vassallo_deps.txt',
        'originals/ev1_deps.txt',
        'originals/ev2_deps.txt',
        'originals/new-EventsAdapt-sentences.ids.txt',
    )
)

In [None]:
# (Re)load the new-EventsAdapt file into `new_ev` as a header-less,
# tab-separated frame.
new_ev=pd.read_csv('originals/new-EventsAdapt-sentences.ids.txt', sep='\t',header=None)

In [None]:
# Preview the first rows of the DTFit frame.
dtfit.head()

In [None]:
# Map every dataset name to the (ids, sentences) pair built by prepare_data.
datasets = {
    dataset_name: prepare_data(frame)
    for dataset_name, frame in (
        ('ev1', ev1),
        ('dtfit', dtfit),
        ('ev2', ev2),
        ('new-EventsAdapt', new_ev),
    )
}

## Transformer object

In [None]:
import numpy as np
import tokenizations   #   pip install pytokenizations  (https://pypi.org/project/pytokenizations/)
import tensorflow as tf  #  TensorFlow 2.0 is required (Python 3.5-3.7, Pip 19.0 or later)

import sentencepiece as spm
from transformers import BertTokenizer, TFBertForMaskedLM
from transformers import RobertaTokenizer, TFRobertaForMaskedLM
from transformers import XLNetTokenizer, TFXLNetLMHeadModel
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel

In [None]:
# Number of sentences per model call; the scoring functions in this notebook
# pass it to TransformerModel.run_prediction as `batch_dimension`.
BATCH_SIZE = 256
# Size of the top-N prediction list; not referenced by any active code
# visible in this notebook.
N_PREDICTIONS = 15

# Tokenizers keyed by model name. TransformerModel.__init__ looks the
# tokenizer up with the same key it uses for `dict_mlm_models`.
# NOTE: every entry is instantiated as soon as this cell runs.
dict_tokenizers = {"bert-base-cased": BertTokenizer.from_pretrained('bert-base-cased'),
                   "bert-large-cased": BertTokenizer.from_pretrained('bert-large-cased'),
                   "roberta-large": RobertaTokenizer.from_pretrained('roberta-large'),
                   "xlnet-large-cased":XLNetTokenizer.from_pretrained('xlnet-large-cased'),
                   "gpt2-medium": GPT2Tokenizer.from_pretrained('gpt2-medium')}


# Language-model heads keyed by the same model names as `dict_tokenizers`.
# NOTE(review): all five models are loaded eagerly here, presumably a
# significant memory cost; confirm this is acceptable on the target machine.
dict_mlm_models = {"bert-base-cased": TFBertForMaskedLM.from_pretrained('bert-base-cased'),
                   "bert-large-cased": TFBertForMaskedLM.from_pretrained('bert-large-cased'),
                   "roberta-large": TFRobertaForMaskedLM.from_pretrained('roberta-large'),
                   "xlnet-large-cased":TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased'),
                   "gpt2-medium": TFGPT2LMHeadModel.from_pretrained('gpt2-medium')}


In [None]:
class TransformerModel:
    """Score a target word in a sentence with a pretrained language model.

    The model family is derived from the prefix of the model name
    (``bert``, ``roberta``, ``xlnet`` or ``gpt``); tokenizer and model are
    taken from the module-level ``dict_tokenizers`` / ``dict_mlm_models``.
    """

    _FAMILY_PREFIXES = ("bert", "roberta", "xlnet", "gpt")

    # Token each tokenizer hands back when a vocabulary lookup fails.
    _UNK_TOKEN = {"bert": "[UNK]", "roberta": "<unk>", "xlnet": "<unk>",
                  "gpt": "<|endoftext|>"}

    # Placeholder written over the target word; None means "keep the word"
    # (mask is not needed for gpt).
    _MASK_TOKEN = {"bert": "[MASK]", "roberta": "<mask>", "xlnet": "<mask>",
                   "gpt": None}

    def __init__(self, transf_model):
        """Bind the tokenizer and model registered under ``transf_model``.

        Raises:
            KeyError: if ``transf_model`` is not a key of the registries.
        """
        self.model_name = transf_model
        self.tokenizer = dict_tokenizers[transf_model]
        self.mlm_model = dict_mlm_models[transf_model]

    def _family(self):
        """Return the family prefix matching ``self.model_name``."""
        for prefix in self._FAMILY_PREFIXES:
            if self.model_name.startswith(prefix):
                return prefix
        raise ValueError("Unsupported model family: {}".format(self.model_name))

    def _vocabulary_entry(self, family, target_token, id_dep):
        """Return the vocabulary form of ``target_token`` or its sub-words.

        The result is a string when the (family-specific) whole-word form is
        in the vocabulary, otherwise the list produced by
        ``tokenizer.tokenize(target_token)``.
        """
        if family == "bert":
            candidate = target_token
        elif family == "xlnet":
            # since in sentencepiece tokenizer this symbol is used for whitespace
            candidate = u"\u2581" + target_token
        else:
            # roberta / gpt: every word but the sentence-initial one is
            # looked up with the "Ġ" prefix.
            candidate = target_token if id_dep == 0 else "Ġ" + target_token

        round_trip = self.tokenizer.convert_ids_to_tokens(
            self.tokenizer.convert_tokens_to_ids(candidate))
        if round_trip == self._UNK_TOKEN[family]:
            return self.tokenizer.tokenize(target_token)
        return candidate

    def prepare_input(self, sentences, pos_ids):
        """Build the model inputs for masking one word per sentence.

        Args:
            sentences: space-tokenised sentences.
            pos_ids: for each sentence, the index (in ``sentence.split(" ")``)
                of the word to predict. Must provide at least one entry per
                sentence; entry ``k`` belongs to sentence ``k``.

        Returns:
            ``(target_tokens, sentences_with_mask, dependents_indices)``:
            the vocabulary form of each target (a string, or a list of
            sub-words when the word is out of vocabulary), the sentence with
            the target replaced by the family's mask token (unchanged for
            gpt), and the target's index in the model tokenisation.

        Raises:
            IndexError: if ``pos_ids`` has fewer entries than ``sentences``.
        """
        if len(pos_ids) < len(sentences):
            raise IndexError("pos_ids must provide one position per sentence")

        family = self._family()
        mask_token = self._MASK_TOKEN[family]

        target_tokens = []
        sentences_with_mask = []
        dependents_indices = []
        for sent, id_dep in zip(sentences, pos_ids):
            words = sent.split(" ")
            target_token = words[id_dep]
            target_tokens.append(self._vocabulary_entry(family, target_token, id_dep))

            # mask the sentence
            replacement = target_token if mask_token is None else mask_token
            masked_sent = " ".join(replacement if w == id_dep else word
                                   for w, word in enumerate(words))
            sentences_with_mask.append(masked_sent)

            model_tokenization = self.tokenizer.tokenize(masked_sent)
            if family == "gpt":
                # No mask token to search for: align our whitespace tokens
                # with the model's tokens instead. The +1 matches the token
                # that compute_filler_probability puts in front of the sentence.
                other_tokens_2_model_tokens, _ = tokenizations.get_alignments(
                    masked_sent.split(" "), model_tokenization)
                dependent_index = other_tokens_2_model_tokens[id_dep][0] + 1
            elif family == "xlnet":
                # since xlnet tokenizer does not add cls token at the
                # beginning of the sequence
                dependent_index = model_tokenization.index(mask_token)
            else:
                # take into account the token added at the start ([CLS] for BERT)
                dependent_index = model_tokenization.index(mask_token) + 1

            dependents_indices.append(dependent_index)
        return target_tokens, sentences_with_mask, dependents_indices

    @staticmethod
    def _left_context_mask(attention_mask, dependents_indexes):
        """Return an attention mask exposing positions ``0..dep_index`` only."""
        rows = []
        for row, dep_index in zip(attention_mask, dependents_indexes):
            visible = (np.arange(row.shape[0]) <= dep_index).astype(int)
            rows.append(tf.convert_to_tensor(visible))
        return tf.convert_to_tensor(rows)

    def compute_filler_probability(self, list_target_words, list_masked_sentences,
                                   list_dependents_indexes, unidirectional=False):
        """Return the model probability of every target word.

        Args:
            list_target_words: targets as returned by ``prepare_input``; a
                list entry is an out-of-vocabulary word whose sub-word
                probabilities are averaged.
            list_masked_sentences: sentences as returned by ``prepare_input``.
            list_dependents_indexes: target indices in the model tokenisation.
            unidirectional: when true, the attention mask is rewritten so
                that only positions up to and including the target index
                are attended to.

        Returns:
            A list with one probability per sentence.
        """
        family = self._family()
        if family == "gpt":
            self.tokenizer.pad_token = self.tokenizer.eos_token
            # it is necessary to add a token at the beginning of the sentence
            texts = ["<|endoftext|>" + sent + "<|endoftext|>" for sent in list_masked_sentences]
        else:
            if family == "xlnet":
                # since instances of xlnet tokenizer by default apply padding to the left
                self.tokenizer.padding_side = "right"
            texts = list_masked_sentences
        inputs = self.tokenizer(texts, padding=True, return_tensors="tf")

        if unidirectional:
            inputs["attention_mask"] = self._left_context_mask(
                inputs["attention_mask"], list_dependents_indexes)

        outputs = self.mlm_model(inputs)[0]

        # BERT and RoBERTa read the distribution at the mask position itself;
        # GPT-2 and XLNet read it one position earlier (dep_index - 1).
        # NOTE(review): the unidirectional whole-token path previously read
        # position 0 for GPT-2/XLNet, unlike every other path; it now uses
        # dep_index - 1 as well -- confirm this matches the intended method.
        offset = 1 if family in ("gpt", "xlnet") else 0

        probabilities_fillers = []
        for batch_elem, target_word, dep_index in zip(range(outputs.shape[0]), list_target_words,
                                                      list_dependents_indexes):
            all_probabilities = tf.nn.softmax(outputs[batch_elem, dep_index - offset]).numpy()
            if isinstance(target_word, list):
                # word is OOV, get its subcomponents probability and average them
                prob_subwords = [all_probabilities[self.tokenizer.convert_tokens_to_ids(subword)]
                                 for subword in target_word]
                probabilities_fillers.append(sum(prob_subwords) / len(prob_subwords))
            else:
                probabilities_fillers.append(
                    all_probabilities[self.tokenizer.convert_tokens_to_ids(target_word)])
        return probabilities_fillers

    def run_prediction(self, data_sequences, indexes, unilateral, batch_dimension=64):
        """Score every sentence, processing ``batch_dimension`` at a time.

        Args:
            data_sequences: sentences to score.
            indexes: target-word position for each sentence, parallel to
                ``data_sequences``.
            unilateral: forwarded as ``unidirectional`` to
                ``compute_filler_probability``.
            batch_dimension: maximum number of sentences per model call.

        Returns:
            A flat list with one probability per sentence, in input order.
        """
        total_scores = []
        for start in range(0, len(data_sequences), batch_dimension):
            stop = start + batch_dimension
            # Slice the positions together with the sentences so that each
            # sentence of a later batch keeps its own target position.
            target_words, masked_sentences, positions_dependents = self.prepare_input(
                data_sequences[start:stop], indexes[start:stop])
            scores = self.compute_filler_probability(target_words, masked_sentences,
                                                     positions_dependents, unilateral)
            total_scores.extend(scores)
        return total_scores
        
    
        


## 1. Verb prediction task

In [None]:
def mask_word(sentences):
    """Return the log-probability of the target word of every sentence.

    Uses the module-level ``model`` and takes the target positions from the
    module-level ``pos`` list, so ``sentences`` must be parallel to ``pos``.

    Returns:
        A list with the natural log of each probability returned by
        ``model.run_prediction``.
    """
    # Local import: the notebook itself only imports math in a later cell.
    import math

    model_probs = model.run_prediction(sentences, pos, False, BATCH_SIZE)
    return [math.log(prob) for prob in model_probs]

In [None]:
# Example: score the target word of every loaded sentence with RoBERTa-large.
# `mask_word` reads the module-level `model`, so it is assigned first.
model = TransformerModel('roberta-large')
probs = mask_word(sentences)

# One "<id> <sentence> <log-probability>" line per sentence.
for i, sent, score in zip(idxs_sent,sentences, probs):
    print(i, sent, score)

## 2. Sequential word prediction

### 2.1 Pseudo-log likelihood

In [None]:
def mask_word_by_word(sentences):
    """Score each sentence by masking its words one at a time.

    For every sentence, each whitespace-separated word is scored in turn by
    the module-level ``model`` (bidirectional mode) and the per-word
    probabilities are summed.

    NOTE(review): the score is the log of the summed probabilities, not the
    sum of the per-word log-probabilities -- confirm this is intended.

    Returns:
        One score per sentence: the natural log of the summed
        probabilities, or ``None`` when the probabilities could not be summed.
    """
    # Local import: the notebook itself only imports math in a later cell.
    import math

    totals = []
    for sent in sentences:
        n_words = len(sent.split(' '))
        # run model: the same sentence once per word position
        model_probs = model.run_prediction([sent] * n_words, list(range(n_words)),
                                           False, BATCH_SIZE)
        try:
            totals.append(sum(model_probs))
        except TypeError:
            totals.append(None)

    # Take the log of the summed value itself and keep the None markers
    # instead of passing them to math.log.
    return [None if total is None else math.log(total) for total in totals]
    

In [None]:
# Example: word-by-word scoring of every loaded sentence with RoBERTa-large.
# `mask_word_by_word` reads the module-level `model`, so it is assigned first.
model = TransformerModel('roberta-large')
probs = mask_word_by_word(sentences)

# One "<id> <sentence> <score>" line per sentence.
for i, sent, score in zip(idxs_sent, sentences, probs):
    print(i, sent, score)


### 2.2 Left-to-right generation

In [None]:
import math
from tqdm import tqdm
def mask_word_left2right(sentences):
    """Score each sentence with the model run in unilateral mode.

    Every whitespace-separated word of a sentence is scored in turn through
    ``model.run_prediction`` with ``unilateral=True`` and the per-word
    probabilities are summed; progress is reported through ``tqdm``.

    Returns:
        One value per sentence: the natural log of its summed probabilities.
    """
    summed_probs = []
    for sentence in tqdm(sentences):
        n_words = len(sentence.split(' '))
        word_probs = model.run_prediction(
            [sentence] * n_words, list(range(n_words)), True, BATCH_SIZE
        )
        summed_probs.append(sum(word_probs))
    # Logs are taken only after every sentence has been scored.
    return [math.log(total) for total in summed_probs]

    


In [None]:
# Imported here because the notebook's own `import os` only appears in a
# later cell.
import os

out_folder = 'left2right_res/'
# open(..., 'w') does not create missing directories; make sure the folder
# exists before the (slow) scoring starts.
os.makedirs(out_folder, exist_ok=True)

model = TransformerModel('bert-large-cased')
ids, sents = prepare_data(new_ev)
d = 'new-EventsAdapt'
probs = mask_word_left2right(sents)

# One "<id>\t<sentence>\t<score>" line per sentence.
with open(os.path.join(out_folder, d + '.' + 'bert-large-cased.l2r.txt'), 'w') as fout:
    for i, sent, score in zip(ids, sents, probs):
        fout.write('{}\t{}\t{}\n'.format(i, sent, score))

In [None]:
import os

out_folder = 'left2right_res/'
# open(..., 'w') does not create missing directories; create the folder up
# front so a missing directory cannot discard already computed scores.
os.makedirs(out_folder, exist_ok=True)

model_name = 'xlnet-large-cased'
model = TransformerModel(model_name)
for d in datasets:
    print(d)
    ids, sents = datasets[d]
    probs = mask_word_left2right(sents)
    # One "<id>\t<sentence>\t<score>" line per sentence, one file per dataset.
    with open(os.path.join(out_folder, '{}.{}.l2r.txt'.format(d, model_name)), 'w') as fout:
        for i, sent, score in zip(ids, sents, probs):
            fout.write('{}\t{}\t{}\n'.format(i, sent, score))

In [None]:
# Debug cell: run the unilateral prediction on a single example sentence.
model = TransformerModel('bert-large-cased')
f=['The babysitter won the game .']
for sent in f:
    s = sent.split(' ')
    # range(0, len(s)-1) leaves out the last token, i.e. the final '.' of
    # the example sentence.
    ids = [w_id for w_id in range(0, len(s)-1)]
    # One copy of the sentence per target position.
    ss = [sent for i in range(0, len(s)-1)]  
    #x=model.prepare_input(ss, ids)
    #print(x)
    # `model_probs` stays available at module level for the next cell.
    model_probs = model.run_prediction(ss, ids, True, BATCH_SIZE)


In [None]:
import math
# Natural log of each per-word probability currently held in `model_probs`.
print([math.log(x) for x in model_probs])

In [None]:
# Inspect how the tokenizer of the currently loaded model splits this word.
model.tokenizer.tokenize('babysitter')