In [46]:
import string
import re
import os
import sys
import csv
import logging
import numpy as np
import torch
import pandas as pd

In [47]:
#Sometimes models place special characters in front of words.   we don't need them.
def remove_starting_character(options, starting_char):
        new_predictions = list()
        for prediction in options:
            if prediction[0] == starting_char:
                new_prediction = prediction[1:]
                new_predictions.append(new_prediction)
            else:
                new_predictions.append(prediction)
        return new_predictions

In [48]:
from transformers import RobertaForMaskedLM, RobertaTokenizer

In [49]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model2 = RobertaForMaskedLM.from_pretrained('roberta-base')


In [50]:
def predict(st):
    input_ids = torch.tensor(tokenizer.encode(st)).unsqueeze(0) 
    outputs = model2(input_ids, masked_lm_labels=input_ids)
    loss, prediction_scores = outputs[:2]
    text = ''
    for i in range(1, prediction_scores.shape[1]-1):
        t = np.argmax(prediction_scores[0, i].tolist())
        options = tokenizer.convert_ids_to_tokens([t])
        options = remove_starting_character(options, "Ġ")
        text = text+ ' '+options[0]
    return text

In [235]:
print(predict("he went to the farmer's <mask> and <mask> a bunch of green <mask>" ))

 he went to the farmer 's market and saw a bunch of green beans


In [239]:
print(predict("they went to the farmer's <mask> and <mask> a bunch of hot <mask>" ))

 they went to the farmer 's market and bought a bunch of hot dogs


In [222]:
print(predict("John were a bad speller"))

 John was a bad spell er


In [223]:
print(predict("John got in <mask> car and then he went to the store where <mask> bought stuff"))

 John got in his car and then he went to the store where he b ought stuff


In [32]:
print(predict("he went to the <mask> office and purchased lots of postage stamps."))

 he went to the post office and purchased lots of postage stamps .


In [8]:
print(predict('Whenever <mask> go to the <mask> <mask>,  they create a lot of <mask>'))

 Whenever people go to the same center ,  they create a lot of problems


In [65]:
print(predict('Whenever <mask> go to the whiskey <mask>,  <mask> have a lot of <mask>'))

 Whenever you go to the whiskey bar , you have a lot of fun


In [231]:
print(predict('Because he was a good runner, <mask> planned to <mask> across the <mask>'))

 Because he was a good runner , he planned to run ac ross the country


In [7]:
print(predict('He  <mask> across the ocean.'))

 He sailed ac ross the ocean .


In [25]:
print(predict('There are many people in <mask> York city'))

 There are many people in New York city


In [27]:
print(predict('New <mask> is a city full of people'))

 New York is a city full of people


The following small experiment show how to draw out the transformer embedding function.

In [51]:
em = model2.get_input_embeddings()
print(em)
x = em(torch.tensor([210]))
x.shape

Embedding(50265, 768, padding_idx=1)


torch.Size([1, 768])

In [52]:
ou = model2.get_output_embeddings()
y = ou(x).tolist()
z = np.argmax(y[0])
z

210

In [53]:
st = 'my home is Seattle'
in_tokens = torch.tensor(tokenizer.encode(st)).unsqueeze(0) 
in_tokens

tensor([[   0, 4783,  184,   16, 3417,    2]])

In [54]:
r = em(in_tokens.squeeze(0))
r.shape

torch.Size([6, 768])

In [55]:
z = torch.mm(r, r.T)/np.sqrt(768)
z

tensor([[ 0.3037,  0.0055, -0.0154,  0.0177, -0.0040,  0.0777],
        [ 0.0055,  0.4223,  0.0042,  0.0188, -0.0081,  0.0260],
        [-0.0154,  0.0042,  0.3387,  0.0122,  0.0103,  0.0136],
        [ 0.0177,  0.0188,  0.0122,  0.1937,  0.0046,  0.0717],
        [-0.0040, -0.0081,  0.0103,  0.0046,  0.3262,  0.0105],
        [ 0.0777,  0.0260,  0.0136,  0.0717,  0.0105,  0.2116]],
       grad_fn=<DivBackward0>)

In [56]:
zsoft = torch.softmax(z, 1)
zsoft

tensor([[0.2104, 0.1562, 0.1529, 0.1581, 0.1547, 0.1678],
        [0.1530, 0.2321, 0.1528, 0.1550, 0.1509, 0.1562],
        [0.1532, 0.1562, 0.2183, 0.1575, 0.1572, 0.1577],
        [0.1605, 0.1607, 0.1596, 0.1914, 0.1584, 0.1694],
        [0.1556, 0.1550, 0.1579, 0.1570, 0.2165, 0.1579],
        [0.1678, 0.1593, 0.1574, 0.1668, 0.1569, 0.1918]],
       grad_fn=<SoftmaxBackward>)

In [57]:
q =torch.mm(zsoft, r)
q

tensor([[-0.0033, -0.0361,  0.0055,  ...,  0.0633,  0.0160, -0.0369],
        [-0.0181, -0.0205, -0.0035,  ...,  0.0598,  0.0162, -0.0402],
        [-0.0159, -0.0460,  0.0125,  ...,  0.0715,  0.0137, -0.0353],
        [-0.0124, -0.0344,  0.0023,  ...,  0.0651,  0.0143, -0.0416],
        [-0.0111, -0.0395, -0.0076,  ...,  0.0736,  0.0236, -0.0372],
        [-0.0112, -0.0369,  0.0021,  ...,  0.0672,  0.0149, -0.0390]],
       grad_fn=<MmBackward>)

In [None]:
#here is the transformer layerNorm function (my version)
def layerNorm(x):
    u = x.mean(-1, keepdim=True)
    print(u)
    print(x-u)
    s = (x - u).pow(2).mean(-1, keepdim=True)
    print(s)
    x = (x - u) / torch.sqrt(s)
    return x