<a href="https://colab.research.google.com/github/giuliarambelli/Event_Knowledge/blob/master/model-jupiter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import argparse
import re
import os
import copy
import numpy as np

#from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel


Experimental code for *What BERT is not: Lessons from a new suite of psycholinguistic diagnostics for language models*, by Allyson Ettinger.
> https://github.com/aetting/lm-diagnostics




In [3]:
#!pip install pytorch_pretrained_bert
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 26.3MB/s eta 0:00:01[K     |█████▎                          | 20kB 2.1MB/s eta 0:00:01[K     |████████                        | 30kB 3.1MB/s eta 0:00:01[K     |██████████▋                     | 40kB 2.1MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 2.6MB/s eta 0:00:01[K     |███████████████▉                | 61kB 3.1MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 3.6MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 2.7MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 3.1MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 3.4MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 3.4MB/s eta 0:00:01[K     |██████████████████████

In [0]:
def load_model(model_name='bert-base-uncased', device=None):
    """Load a pre-trained BERT masked-LM together with its tokenizer.

    Args:
        model_name: Checkpoint identifier passed unchanged to both
            ``BertTokenizer.from_pretrained`` and
            ``BertForMaskedLM.from_pretrained``.
        device: Device the model is moved to. When ``None`` (default) CUDA is
            used if it is available, otherwise the CPU, so the notebook also
            runs on machines without a GPU.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is in evaluation mode.
    """
    print("load model")
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained(model_name)
    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained(model_name)
    # Inference only: switch off training-time behaviour such as dropout.
    model.eval()
    model.to(device)
    return model, tokenizer


def prep_input(input_sents, tokenizer, bert=True):
    """Turn raw sentences into model-ready token id tensors, one at a time.

    Each sentence is split on whitespace, given a final ``'.'`` when it does
    not already end with one, and (for ``bert=True``) wrapped in ``[CLS]`` /
    ``[SEP]`` before being passed to ``tokenizer.tokenize``.

    Args:
        input_sents: Iterable of sentence strings; the position to predict is
            written as ``[MASK]``.
        tokenizer: Object providing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        bert: When False, ``[MASK]`` is replaced by the stand-in word ``X``
            and no ``[CLS]`` / ``[SEP]`` markers are added.

    Yields:
        ``(tokens_tensor, masked_index, tokenized_text)`` where
        ``tokens_tensor`` has shape ``(1, n_tokens)``, ``masked_index`` is the
        position of the last mask token in ``tokenized_text`` (``None`` when
        the sentence contains no mask) and ``tokenized_text`` is the token list.
    """
    # Modify data for Language Model Task
    print("prepare input")
    # Token the mask placeholder is expected to become after tokenization.
    # 'x</w>' is presumably how the OpenAI GPT BPE tokenizer renders the
    # stand-in 'X' -- TODO confirm against that tokenizer.
    mtok = '[MASK]' if bert else 'x</w>'
    for sent in input_sents:
        if not bert:
            # Literal replacement; no regular expression (or escaping) needed.
            sent = sent.replace('[MASK]', 'X')
        words = sent.strip().split()
        # `not words` guards the empty / whitespace-only sentence, which would
        # otherwise fail on words[-1].
        if not words or words[-1] != '.':
            words.append('.')
        if bert:
            words = ['[CLS]'] + words + ['[SEP]']
        tokenized_text = tokenizer.tokenize(' '.join(words))
        masked_index = None
        for i, tok in enumerate(tokenized_text):
            if tok == mtok:
                # No break: with several masks the last one wins.
                masked_index = i
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        # Extra list level adds the batch dimension (batch size 1).
        tokens_tensor = torch.tensor([indexed_tokens])
        yield tokens_tensor, masked_index, tokenized_text


def get_predictions(input_sents, model, tokenizer, k=5, bert=True):
    """Return the k most probable fillers for the mask of every sentence.

    Args:
        input_sents: Iterable of sentences containing a ``[MASK]`` placeholder.
        model: Callable mapping a ``(1, n_tokens)`` id tensor to scores indexed
            as ``[batch, position, vocabulary]``.
        tokenizer: Tokenizer handed to ``prep_input``; must also provide
            ``convert_ids_to_tokens``.
        k: Number of candidates kept per sentence.
        bert: Forwarded to ``prep_input``; when False the ``</w>`` suffix is
            stripped from the predicted tokens.

    Returns:
        ``(token_preds, tok_probs)``: two lists with one entry per sentence,
        holding the top-k tokens and their softmax probabilities in
        descending order of probability.

    Raises:
        ValueError: If a sentence yields no mask position.
    """
    # Run on whatever device the model lives on instead of assuming a GPU.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        # Not an nn.Module, or no parameters: pick the best available device.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    token_preds = []
    tok_probs = []
    for sent_idx, (tokens_tensor, mi, _) in enumerate(
            prep_input(input_sents, tokenizer, bert=bert)):
        if mi is None:
            # Indexing with None would silently insert an axis rather than
            # select a position, so fail loudly instead.
            raise ValueError(
                "no mask token found in input sentence %d" % sent_idx)
        tokens_tensor = tokens_tensor.to(device)
        with torch.no_grad():
            predictions = model(tokens_tensor)
        # Distribution over the vocabulary at the mask position of the single
        # batch item.
        softpred = torch.softmax(predictions[0, mi], 0)
        top_inds = torch.argsort(softpred, descending=True)[:k]
        top_probs = softpred[top_inds].tolist()
        # Plain Python ints for the tokenizer lookup.
        top_tok_preds = tokenizer.convert_ids_to_tokens(top_inds.tolist())
        if not bert:
            top_tok_preds = [tok.replace('</w>', '') for tok in top_tok_preds]

        token_preds.append(top_tok_preds)
        tok_probs.append(top_probs)
    return token_preds, tok_probs

def get_probabilities(input_sents, tgtlist, model, tokenizer, bert=True):
    """Return the probability of a given target word at each sentence's mask.

    Args:
        input_sents: Iterable of sentences containing a ``[MASK]`` placeholder.
        tgtlist: Indexable collection of target tokens; ``tgtlist[i]`` is
            scored against the i-th sentence.
        model: Callable mapping a ``(1, n_tokens)`` id tensor to scores indexed
            as ``[batch, position, vocabulary]``.
        tokenizer: Tokenizer handed to ``prep_input``; must also provide
            ``convert_tokens_to_ids``.
        bert: Forwarded to ``prep_input``.

    Returns:
        A list with one float per sentence: the softmax probability of the
        target at the mask position, or ``nan`` when the tokenizer cannot map
        the target to an id.

    Raises:
        ValueError: If a sentence yields no mask position.
    """
    print("get probabilities")
    # Run on whatever device the model lives on instead of assuming a GPU.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        # Not an nn.Module, or no parameters: pick the best available device.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    token_probs = []
    for i, (tokens_tensor, mi, _) in enumerate(
            prep_input(input_sents, tokenizer, bert=bert)):
        if mi is None:
            # Indexing with None would silently insert an axis rather than
            # select a position, so fail loudly instead.
            raise ValueError("no mask token found in input sentence %d" % i)
        tokens_tensor = tokens_tensor.to(device)
        # Trace of the mask position for each sentence.
        print(mi)
        with torch.no_grad():
            predictions = model(tokens_tensor)
        tgt = tgtlist[i]
        # Distribution over the vocabulary at the mask position of the single
        # batch item.
        softpred = torch.softmax(predictions[0, mi], 0)
        try:
            tgt_ind = tokenizer.convert_tokens_to_ids([tgt])[0]
        except Exception:
            # Best effort: a target the tokenizer cannot look up is reported
            # as NaN. Unlike a bare `except`, this lets KeyboardInterrupt and
            # SystemExit through.
            this_tgt_prob = np.nan
        else:
            this_tgt_prob = softpred[tgt_ind].item()
        token_probs.append(this_tgt_prob)
    return token_probs

In [24]:
# Load the pre-trained BERT masked-LM and its tokenizer once;
# the cells below reuse both objects.
model, tokenizer=load_model()

load model


In [0]:
# Input example.
# Each sentence marks the word to predict with a [MASK] placeholder
# (prep_input appends the final '.' when it is missing).
sentences = ["The fireman is rescuing the [MASK]", "The criminal is arresting the [MASK]"]
# targets[i] is the candidate word whose probability is read at the mask of sentences[i].
targets = ["grandmother", "cop"]



In [27]:
# Print the top-k predictions for each mask (k is left at its default of 5).
# The result is a pair: (predicted tokens per sentence, their probabilities per sentence).
print(get_predictions(sentences, model, tokenizer))

prepare input
([['girl', 'woman', 'children', 'victim', 'victims'], ['girl', 'suspect', 'woman', 'murderer', 'criminal']], [[0.14221793413162231, 0.08759653568267822, 0.07673992216587067, 0.04699753597378731, 0.03726351633667946], [0.07381103187799454, 0.06507331132888794, 0.03878232091665268, 0.02652042917907238, 0.02608479931950569]])


In [22]:
# Print the probability of each target word at the mask of its sentence
# (one value per sentence, in input order).
print(get_probabilities(sentences,targets, model, tokenizer))

get probabilities
prepare input
7
6
[0.00013654639769811183, 0.009434310719370842]
