In [1]:
import torch
import string
import copy
import random

from transformers import BertTokenizer, BertForMaskedLM
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()


In [2]:
top_k = 10

model_dict = {'bert':(bert_tokenizer, bert_model)}


In [89]:
def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token not in ignore_tokens:
            tokens.append(token.replace('##', ''))
    return tokens[:top_clean]


def encode(tokenizer, text_sentence, add_special_tokens=True):
    text_sentence = text_sentence.replace('<mask>', tokenizer.mask_token)
    # if <mask> is the last token, append a "." so that models dont predict punctuation.
    if tokenizer.mask_token == text_sentence.split()[-1]:
        text_sentence += ' .'

    input_ids = torch.tensor([tokenizer.encode(text_sentence, add_special_tokens=add_special_tokens)])
    mask_idx = torch.where(input_ids == tokenizer.mask_token_id)[1].tolist()[0]
    return input_ids, mask_idx


In [186]:
def get_predictions(text_sentence, mask_positions, top_clean=5):
    """
    Psuedocode:
        Get the masked sentence.
        Encode it & pass it through model
        Now, decode at each position.
    """
    # ========================= BERT =================================
    input_ids, mask_idx = encode(bert_tokenizer, text_sentence)
    new_sentences = []
    for i in range(top_clean): 
        predicted_sentence = ('[CLS] '+text_sentence+' [SEP]').strip().split()
        new_sentences.append(predicted_sentence)
    
    print(new_sentences)
    with torch.no_grad():
        predict = bert_model(input_ids)[0]
    
    # Place the predictions in the sentence
    for mask_idx in mask_positions:
        predicted_words = decode(bert_tokenizer, predict[0, mask_idx, :].topk(top_k).indices.tolist(), top_clean)
        print(predicted_words)
        for idx in range((len(predicted_words))): 
            new_sentences[idx][mask_idx] = predicted_words[idx]

    new_sentences = [sent[1:-1] for sent in new_sentences]  #remove the cls and sep tag
    print(new_sentences)

In [187]:
get_predictions("i <mask> confused and don ' t know what to do",[2, 11])

[['[CLS]', 'i', '<mask>', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', 'do', '[SEP]'], ['[CLS]', 'i', '<mask>', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', 'do', '[SEP]'], ['[CLS]', 'i', '<mask>', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', 'do', '[SEP]'], ['[CLS]', 'i', '<mask>', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', 'do', '[SEP]'], ['[CLS]', 'i', '<mask>', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', 'do', '[SEP]']]
['am', 'feel', 'get', 'was', 'look']
['do', 'say', 'think', 'make', '...']
[['i', 'am', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', 'do'], ['i', 'feel', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', 'say'], ['i', 'get', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', 'think'], ['i', 'was', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', 'make'], ['i', 'look', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', '...']]


In [173]:
get_mask_predictions("I am confused and don't know what to do")

i <mask> confused and don ' t know what to do
[['[CLS]', 'i', '<mask>', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', 'do', '[SEP]'], ['[CLS]', 'i', '<mask>', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', 'do', '[SEP]'], ['[CLS]', 'i', '<mask>', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', 'do', '[SEP]'], ['[CLS]', 'i', '<mask>', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', 'do', '[SEP]'], ['[CLS]', 'i', '<mask>', 'confused', 'and', 'don', "'", 't', 'know', 'what', 'to', 'do', '[SEP]']]
['am', 'feel', 'get', 'was', 'look']
['do', 'say', 'think', 'make', '...']


IndexError: list assignment index out of range

In [136]:
strg = [['asdas','asdasdd']]*5

In [166]:
strg = [0,1,2,3]

In [167]:
strg[1:-1]

[1, 2]

In [58]:
def mask_sentence(tokens, tokenizer, style='bert'):
    """
    Masking some random tokens for Language Model task with probabilities as in the original BERT paper.
    :param tokens: list of str, tokenized sentence.
    :param tokenizer: Tokenizer, object used for tokenization (we need it's vocab here)
    :return: (list of str, list of int), masked tokens and related labels for LM prediction

    Replace some with <mask>, some with random words.
    """
    output_label = []
    mask_positions = [] # For storing the position where words are changed

    for i, token in enumerate(tokens):
        prob = random.random()
        # mask token with 15% probability
        if prob < 0.15:
            prob /= 0.15

            # 80% randomly change token to mask token
            if prob < 0.8:
                tokens[i] = "<mask>"

            # 10% randomly change token to random token
            elif prob < 0.9:
                tokens[i] = random.choice(list(tokenizer.vocab.items()))[0]

            # -> rest 10% randomly keep current token

            # Let's store the position where words are changed
            mask_positions.append(i)
            # append current token to output (we will predict these later)
            
            
            try:
                output_label.append(tokenizer.vocab[token])
            except KeyError:
                # For unknown words (should not occur with BPE vocab)
                output_label.append(tokenizer.vocab["[UNK]"])
                print("Cannot find token '{}' in vocab. Using [UNK] insetad".format(token))
        else:
            # no masking token (will be ignored by loss function later)
            output_label.append(-1)

    return tokens, mask_positions, output_label 



In [59]:


def prepare_input( tokenizer: object, style:str, text: str):
    """
    Psuedocode:
    * Tokenize the sentence.
    * Send it to mask_sentence() -> Get the masked sentence and a
      list. This list will have Indices numbers for positions
      where masking is done.
    * Convert sentences to ids.

    Input : 
        :param -> tokenizer (transformer's object)
        :style -> Masking style
        :text  -> Sentence
    Return: 
        :param -> Masked sentence, 
        :param -> Mask labels
    """
    # Tokenize input
    tokenized_text = tokenizer.tokenize(text)
    tokenized_text_ = copy.copy(tokenized_text)
    masktokenized_text, mask_positions, mask_labels = mask_sentence(tokenized_text_, tokenizer, style=style)

    # Convert token to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(masktokenized_text)

    # Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
    # segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
    segments_ids = [0]* len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    input_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
                
    # return tokenized_text, masktokenized_text, input_tensor,\
    #         mask_labels, segments_tensors
    return masktokenized_text, mask_positions, mask_labels


In [79]:
def get_mask_predictions(sentence: str):
    """
    Psuedocode:
    For each model,
        * Mask the sentence
        * Stitch it to a normal sentence (undo BPE). TODO
        * Send it through the model
    """
    masktokenized_text = ''
    for style in model_dict.keys():
        while '<mask>' not in masktokenized_text:
            masktokenized_text, mask_positions, _ = prepare_input(tokenizer=model_dict[style][0], \
                                                            style=f'{style}', text=sentence)
            
        masked_sentence = ' '.join(masktokenized_text)
        print(masked_sentence)
        # Adding 1 for every position since <CLS> & <SEP> are added at the ends.
        mask_positions =  [(pos+1) for pos in mask_positions]

        get_predictions(masked_sentence, mask_positions, top_clean=5)

In [85]:
get_mask_predictions("I am confused and don't know what to do")

i am confused and <mask> <mask> t know <mask> to do
['i am confused and <mask> <mask> t know <mask> to do', 'i am confused and <mask> <mask> t know <mask> to do', 'i am confused and <mask> <mask> t know <mask> to do', 'i am confused and <mask> <mask> t know <mask> to do', 'i am confused and <mask> <mask> t know <mask> to do']


TypeError: 'str' object does not support item assignment