### Load Bert pretrained model and use on text

In [1]:
import torch
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(len(tokenizer))

# Tokenize input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson  was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

30522
['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', 'henson', 'was', 'a', 'puppet', '##eer', '[SEP]']


In [2]:
# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
# assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1,1,1, 1, 1, 1]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
print(tokens_tensor)
print(segments_tensors)

tensor([[  101,  2040,  2001,  3958, 27227,  1029,   102,  3958,   103,  2001,
          1037, 13997, 11510,   102]])
tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])


In [9]:
# Load pre-trained model (weights)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# print(model)
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor
segments_tensors = segments_tensors
# model

# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    outputs_no_tokentype = model(tokens_tensor)

    predictions = outputs[0]
    prediction_no = outputs_no_tokentype[0]

# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
print('predicted_index',predicted_index)

predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
# assert predicted_token == 'henson'

print('Predicted token is:',predicted_token)

predicted_index 27227
Predicted token is: henson


In [4]:
outputs[0].shape

torch.Size([1, 14, 30522])

In [10]:
torch.argmax(predictions[0,8]).item()

27227

In [14]:
tokenizer.convert_ids_to_tokens([27227])[0]

'henson'

In [5]:
# WITH FEATURE TYPE
print(predictions[0])
print(predictions.shape)
index = torch.argmax(predictions[0,8]).item()
print(index)
predicted_token = tokenizer.convert_ids_to_tokens([index])[0]
print(predicted_token)
print('weights for the token masked',torch.argmax(predictions[0][8][27227]))

tensor([[ -7.8798,  -7.7874,  -7.7861,  ...,  -7.0438,  -6.7454,  -4.6013],
        [-13.3633, -13.7694, -13.7819,  ..., -11.8128, -11.1635, -13.8906],
        [-10.9775, -10.5383, -10.9659,  ..., -11.5549,  -8.0309,  -6.3979],
        ...,
        [ -5.2284,  -5.6572,  -5.3550,  ...,  -3.4507,  -3.8718,  -8.6904],
        [ -8.5290,  -8.4146,  -9.0744,  ...,  -7.1710,  -6.9877,  -6.1301],
        [-12.5968, -12.3769, -12.4222,  ..., -10.1020,  -9.8764,  -9.4495]])
torch.Size([1, 14, 30522])
27227
henson
weights for the token masked tensor(0)


In [6]:
# NO FEATURE TYPE
print(prediction_no)
print(prediction_no.shape)
index = torch.argmax(prediction_no[0,8]).item()
print(index)
predicted_token_no = tokenizer.convert_ids_to_tokens([index])[0]
print(predicted_token_no)
print('weights for the token masked without feature type',prediction_no[0][8][27227])

tensor([[[ -6.9487,  -6.9031,  -6.8937,  ...,  -6.2844,  -6.0191,  -4.1828],
         [-13.8302, -14.2852, -14.2682,  ..., -12.3324, -11.4464, -14.9032],
         [-11.7922, -11.4641, -11.7961,  ..., -12.6160,  -8.6125,  -8.0468],
         ...,
         [ -5.7428,  -6.1183,  -5.6917,  ...,  -3.8172,  -3.9389,  -8.3408],
         [ -8.9365,  -8.7451,  -9.1933,  ...,  -7.8316,  -8.3316,  -3.8440],
         [-12.2189, -12.0151, -11.9650,  ...,  -9.8166,  -9.7025,  -9.3017]]])
torch.Size([1, 14, 30522])
27227
henson
weights for the token masked without feature type tensor(15.5186)


In [7]:
torch.argmax(predictions[0][8])#[27227]

tensor(27227)

In [274]:
import torch
from torch import nn
loss = nn.CrossEntropyLoss()
input = torch.randn(25600,26493 , requires_grad=True)
# input = torch.tensor([[2.,4.,3.,4,5],[2,4,3,4,5],[2,4,3,4,5]],requires_grad=True)
print('input',input,input.shape)
print(50*'%')
target = torch.empty(25600,dtype=torch.long).random_(20)
print('target',target,target.shape)
output = loss(input, target)
output.backward()
print(50*'%')
print(output)

input tensor([[-1.4429, -1.2701, -0.2727,  ...,  2.4877, -1.5465,  1.3952],
        [ 0.1014, -0.5270, -0.0786,  ..., -0.4124, -1.6045,  0.6366],
        [-1.2232,  0.3723, -0.2657,  ...,  0.5608,  1.4149, -1.4305],
        ...,
        [ 0.3804,  1.8699, -0.0807,  ...,  1.1345, -1.7949,  0.0969],
        [ 0.7246,  0.8214, -0.5409,  ..., -1.4545,  0.1113,  2.1624],
        [ 0.8984, -0.4903, -0.5561,  ...,  1.1320,  0.6060,  0.6637]],
       requires_grad=True) torch.Size([25600, 26493])
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
target tensor([ 2,  6, 19,  ...,  0, 13,  4]) torch.Size([25600])
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
tensor(10.6953, grad_fn=<NllLossBackward>)


In [275]:
predicted = torch.argmax(input ,1)# 25600 shape

In [276]:
import pandas as pd
pd.Series(predicted==target).value_counts()

False    25600
dtype: int64

In [277]:
((predicted==target).sum())/target.shape[0]

tensor(0)

In [289]:
a = torch.Tensor(np.array([1,2,3,4]))
b = torch.Tensor(np.array([1,4,5,4]))
int((a==b).sum())/10

0.2

In [285]:
(a==b).sum()

tensor(2)

In [172]:
import torch
import numpy as np
from torch import nn
loss = nn.CrossEntropyLoss(ignore_index= -1)
input = torch.randn(3,5 , requires_grad=True)
print('input',input,input.shape)
print(50*'%')
# target = torch.empty(3,dtype=torch.long).random_(3)
target = torch.tensor([-1, 2, 1])
print('target',target,target.shape)
output = loss(input, target)
output.backward()
print(50*'%')
print(output)

input tensor([[ 1.7264e+00,  7.8608e-01,  2.2085e+00,  1.0811e-01,  2.5748e-01],
        [-1.8813e-01,  5.0193e-01,  4.2372e-01, -1.9748e-03,  1.2297e+00],
        [-7.3344e-01,  1.2675e-02, -1.3900e+00,  7.7147e-01, -2.1631e+00]],
       requires_grad=True) torch.Size([3, 5])
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
target tensor([-1,  2,  1]) torch.Size([3])
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
tensor(1.5431, grad_fn=<NllLossBackward>)


In [160]:
-input[target]

tensor([[ 0.9162,  0.2505,  0.0294,  0.4328, -1.0175],
        [-0.3876, -0.7693, -1.2163,  0.0885, -0.5896],
        [-1.7336, -0.6171,  0.8032, -1.2351,  0.6130]], grad_fn=<NegBackward>)

### Below is to get the latest layer (weights of the tokens) out from the model 

In [None]:
from bert_embedding import BertEmbedding
import numpy as np

bert_abstract = """We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers."""
sentences = bert_abstract.split('\n')
bert_embedding = BertEmbedding()
result = bert_embedding(sentences)

In [None]:
print(np.array(result[0][0]))
print(np.array(result[0][0]).shape)    

In [None]:
print(np.array(result[0][1]))
print(np.array(result[0][1]).shape) 