In [26]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
#% matplotlib inline

In [174]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/xwan6/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [4]:
# sample of wordPiece
list(tokenizer.vocab.keys())[5000:5010]

['knight',
 'lap',
 'survey',
 'ma',
 '##ow',
 'noise',
 'billy',
 '##ium',
 'shooting',
 'guide']

In [5]:
# BERT requires [SEP], [CLS], Token IDs, Mask IDs, Segment IDs, Positional Embeddings
# Tokenization
text = "After stealing money from the bank vault, the bank robber was seen " \
       "fishing on the Mississippi river bank."
marked_text = "[CLS] " + text + " [SEP]"

# Tokenize our sentence with the BERT tokenizer.
tokenized_text = tokenizer.tokenize(marked_text)

# Print out the tokens.
print (tokenized_text)

# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Display the words with their indeces.
for tup in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))

['[CLS]', 'after', 'stealing', 'money', 'from', 'the', 'bank', 'vault', ',', 'the', 'bank', 'robber', 'was', 'seen', 'fishing', 'on', 'the', 'mississippi', 'river', 'bank', '.', '[SEP]']
[CLS]           101
after         2,044
stealing     11,065
money         2,769
from          2,013
the           1,996
bank          2,924
vault        11,632
,             1,010
the           1,996
bank          2,924
robber       27,307
was           2,001
seen          2,464
fishing       5,645
on            2,006
the           1,996
mississippi   5,900
river         2,314
bank          2,924
.             1,012
[SEP]           102


In [6]:
# Segment ID
# Mark each of the 22 tokens as belonging to sentence "1".
segments_ids = [1] * len(tokenized_text)

print (segments_ids)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [7]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [8]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [9]:
model.encoder.layer[11]

BertLayer(
  (attention): BertAttention(
    (self): BertSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [10]:
# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 
with torch.no_grad():

    outputs = model(tokens_tensor, segments_tensors)

    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

In [11]:
print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")
layer_i = 0

print ("Number of batches:", len(hidden_states[layer_i]))
batch_i = 0

print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))
token_i = 0

print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))

Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 1
Number of tokens: 22
Number of hidden units: 768


In [12]:
# Current dimensions: [# layers, # batches, # tokens, # features]
# Desired dimensions: [# tokens, # layers, # features]

# Concatenate the tensors for all layers. We use `stack` here to
# create a new dimension in the tensor.
token_embeddings = torch.stack(hidden_states, dim=0)

print(token_embeddings.size())

# Remove dimension 1, the "batches".
token_embeddings = torch.squeeze(token_embeddings, dim=1)

print(token_embeddings.size())

# switch around the “layers” and “tokens” dimensions with permute
# Swap dimensions 0 and 1.
token_embeddings = token_embeddings.permute(1,0,2)

print(token_embeddings.size())

torch.Size([13, 1, 22, 768])
torch.Size([13, 22, 768])
torch.Size([22, 13, 768])


In [13]:
# Creating word and sentence vectors from hidden states
# creating the word vectors by summing together the last four layers.
# Stores the token vectors, with shape [22 x 768]
token_vecs_sum = []

# `token_embeddings` is a [22 x 12 x 768] tensor.

# For each token in the sentence...
for token in token_embeddings:

    # `token` is a [12 x 768] tensor

    # Sum the vectors from the last four layers.
    sum_vec = torch.sum(token[-4:], dim=0)
    
    # Use `sum_vec` to represent `token`.
    token_vecs_sum.append(sum_vec)

print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

Shape is: 22 x 768


In [14]:
len(token_vecs_sum)

22

In [15]:
# Sentence Vectors
# `hidden_states` has shape [13 x 1 x 22 x 768]

# `token_vecs` is a tensor with shape [22 x 768]
token_vecs = hidden_states[-2][0]

# Calculate the average of all 22 token vectors.
sentence_embedding = torch.mean(token_vecs, dim=0)

print ("Our final sentence embedding vector of shape:", sentence_embedding.size())

Our final sentence embedding vector of shape: torch.Size([768])


In [16]:
# Calculate the similarity between different bank in the sentences (6, 10, and 19)
from scipy.spatial.distance import cosine

# Calculate the cosine similarity between the word bank 
# in "bank robber" vs "river bank" (different meanings).
diff_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[19])

# Calculate the cosine similarity between the word bank
# in "bank robber" vs "bank vault" (same meaning).
same_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[6])

print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank)
print('Vector similarity for *different* meanings:  %.2f' % diff_bank)

Vector similarity for  *similar*  meanings:  0.94
Vector similarity for *different* meanings:  0.69


In [9]:
# Finetuning BERT with our own text
import json
with open('generate_abstract.json') as outfile:
    raw_sentences = json.load(outfile)
sentences = []
for k in raw_sentences:
    sentences.append(raw_sentences[k])

In [132]:
# prepare the data
max_len = max([len(a.split(' ')) for a in sentences])
input_ids = []
attention_masks = []
labels = []
for sent in sentences:
    
    encode_label = tokenizer.encode(sent, add_special_tokens=True, 
                                    max_length = int(max_len*1.5),  pad_to_max_length = True,)
    labels.append(encode_label)
    
    masks1 = 5
    new_sent = sent.split(' ')
    new_sent[masks1] = '[MASK]'
    new_sent = ' '.join(new_sent)
    
    encoded_dict = tokenizer.encode_plus(
        new_sent,  add_special_tokens = True,  max_length = int(max_len*1.5),  pad_to_max_length = True, 
        return_attention_mask = True,  return_tensors = 'pt')
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

In [134]:
# splitting and loading
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [137]:
len(sentences)

4776

In [139]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# load data
def ret_dataloader():
    batch_size = 60
    print('batch_size = ', batch_size)
    train_dataloader = DataLoader(train_dataset,sampler = RandomSampler(train_dataset), batch_size = batch_size)

    validation_dataloader = DataLoader( val_dataset, sampler = SequentialSampler(val_dataset), batch_size = batch_size)
    return train_dataloader,validation_dataloader

In [141]:
# load the model
# ues BertForMaskedLM to predict word
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

model = BertForMaskedLM.from_pretrained('bert-base-uncased')

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/xwan6/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:transformers.modeling_utils:loading weights file https://cdn.huggingface.co/bert-base-uncased-pytorch_model.bin from cache at /home/xwan6/.cache/torch/transformers/f2ee78

In [171]:
# freeze the bert layers, only train the output layer
for child in model.bert.children():
    for param in child.parameters():
        param.requires_grad = False 

In [172]:
# Early stopping
class EarlyStopping():
    def __init__(self, patience=5, min_percent_gain=0.1):
        self.patience = patience
        self.loss_list = []
        self.min_percent_gain = min_percent_gain / 100.
        
    def update_loss(self, loss):
        self.loss_list.append(loss)
        if len(self.loss_list) > self.patience:
            del self.loss_list[0]
    
    def stop_training(self):
        if len(self.loss_list) == 1:
            return False
        gain = (max(self.loss_list) - min(self.loss_list)) / max(self.loss_list)
        print("Loss gain: {}%".format(round(100*gain,2)))
        if gain < self.min_percent_gain:
            return True
        else:
            return False

In [177]:
# Loss and optimizer
from torch import nn
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),lr=0.025)
early_stopping = EarlyStopping(patience=5, min_percent_gain=1)

In [179]:
# training the model
# training
import time
import numpy as np
# Set model to train
model.train()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

model.to(device)

train_dataloader,validation_dataloader = ret_dataloader()

epochs = 10
for i in range(epochs):
    losses = []
    for step, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        # use the gpu
        b_input_ids = batch[0].cuda()
        b_input_mask = batch[1].cuda()
        b_labels = batch[2].cuda()
        
        loss, logits = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask, 
                            labels=b_labels)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print("Loss: ", np.mean(losses))
    #early_stopping.update_loss(np.mean(losses))
    #if early_stopping.stop_training():
        #break
#     model.eval()
#     total_eval_accuracy = 0
#     total_eval_loss = 0
#     nb_eval_steps = 0
#     # Evaluate data for one epoch
#     for batch in validation_dataloader:
#         b_input_ids = batch[0].cuda()
#         b_input_mask = batch[1].to(device)
#         b_labels = batch[2].to(device)
#         with torch.no_grad():        
#             (loss, logits) = model(b_input_ids, 
#                                   token_type_ids=None, 
#                                   attention_mask=b_input_mask,
#                                   labels=b_labels)


#         total_eval_loss += loss.item()
#         logits = logits.detach().cpu().numpy()
#         label_ids = b_labels.to('cpu').numpy()
#         total_eval_accuracy += flat_accuracy(logits, label_ids)

#     avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
#     print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
#     avg_val_loss = total_eval_loss / len(validation_dataloader)

#     validation_time = format_time(time.time() - t0)
#     #Log the Avg. validation accuracy
#     wandb.log({'val_accuracy':avg_val_accuracy,'avg_val_loss':avg_val_loss})
#     print("  Validation Loss: {0:.2f}".format(avg_val_loss))


cuda
batch_size =  60
Loss:  1.6780397097269695
Loss:  1.395151553882493
Loss:  1.2484408815701802
Loss:  1.1617248530189197
Loss:  1.1066630772418447
Loss:  1.057557348575857
Loss:  1.0222111567854881
Loss:  0.9924223398168882
Loss:  0.9651898766557375
Loss:  0.9429843541648653


In [128]:
outputs = model(input_ids, labels=labels)
outputs

tensor([[  101,  2120, 20994,  1997,  4163,  1010,  3330,  1998,  4200,  2758,
          1996,  2406,  2323,  2025, 11160,  2006,  4010,  4633,  2000,  2644,
          9530, 15900,  3258,  1012,  1037,  5997,  5531,  2008,  1996,  2913,
          1010,  1997,  9671,  3737,  1997,  3350,  1010,  2079,  2025,  3749,
          1037,  3978,  2000,  2903,  2008,  2621,  4633,  2097, 15115,  2007,
          1996,  3659,  1997,  1996, 21887, 23350,  1012,  1996,  6090,  3207,
          7712,  2089,  2625,  2368,  2138,  1997,  2591,  4487, 12693,  6129,
          1998,  2060,  5761,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [None]:
# training the model

In [10]:
# prepare the data
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        # This function also supports truncation and conversion
                        # to pytorch tensors, but we need to do padding, so we
                        # can't use these features :( .
                        #max_length = 128,          # Truncate all sentences.
                        #return_tensors = 'pt',     # Return pytorch tensors.
                   )
    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)
    
# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  National Academies of Sciences, Engineering and Medicine says the country should not rely on warm weather to stop contagion. A panel concluded that the studies, of varying quality of evidence, do not offer a basis to believe that summer weather will interfere with the spread of the coronavirus. The pandemic may lessen because of social distancing and other measures.
Token IDs: [101, 2120, 20994, 1997, 4163, 1010, 3330, 1998, 4200, 2758, 1996, 2406, 2323, 2025, 11160, 2006, 4010, 4633, 2000, 2644, 9530, 15900, 3258, 1012, 1037, 5997, 5531, 2008, 1996, 2913, 1010, 1997, 9671, 3737, 1997, 3350, 1010, 2079, 2025, 3749, 1037, 3978, 2000, 2903, 2008, 2621, 4633, 2097, 15115, 2007, 1996, 3659, 1997, 1996, 21887, 23350, 1012, 1996, 6090, 3207, 7712, 2089, 2625, 2368, 2138, 1997, 2591, 4487, 12693, 6129, 1998, 2060, 5761, 1012, 102]


In [11]:
# Padding & Truncating
# find the maxmum length of sentences
print('Max sentence length: ', max([len(sen) for sen in input_ids]))

Max sentence length:  141


In [12]:
# We'll borrow the `pad_sequences` utility function to do this.
from keras.preprocessing.sequence import pad_sequences

# Set the maximum sequence length.
MAX_LEN = 150
print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Pad our input tokens with value 0.
# "post" indicates that we want to pad and truncate at the end of the sequence,
# as opposed to the beginning.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                          value=0, truncating="post", padding="post")
print('\Done.')

Using TensorFlow backend.



Padding/truncating all sentences to 150 values...

Padding token: "[PAD]", ID: 0
\Done.


In [13]:
# Attention Masks
# The attention mask simply makes it explicit which tokens are actual words versus which are padding.
# Create attention masks
attention_masks = []
# For each sentence...
for sent in input_ids:
    
    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]
    
    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

In [42]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [63]:
for i in range(len(sentences)):
    sentences[i] = sentences[i].replace('. ','|')

In [64]:
sentences[0]

'National Academies of Sciences, Engineering and Medicine says the country should not rely on warm weather to stop contagion| A panel concluded that the studies, of varying quality of evidence, do not offer a basis to believe that summer weather will interfere with the spread of the coronavirus| The pandemic may lessen because of social distancing and other measures|'

In [65]:
# split the data
from sklearn.model_selection import train_test_split
train_data, val_data = train_test_split(sentences, test_size=0.2)

In [71]:
document = train_data[0]
document.strip().split('|')

['Coronavirus death toll rises to 490 in China, Hong Kong quarantines all arrivals from mainland',
 ' President Trump is acquitted of charges of abuse of power and obstruction of Congress',
 ' Democrats are already planning to continue their investigations, starting with a possible subpoena of John Bolton, whom the Senate did not call to testify',
 '']

In [94]:
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)

In [None]:
def pad_masking(x):
    # x: (batch_size, seq_len)
    padded_positions = x == 0
    return padded_positions.unsqueeze(1)

In [14]:
# ues BertForMaskedLM to predict word
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

In [15]:
# Tokenize input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)

In [16]:
# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']

In [17]:
# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

In [18]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [22]:
tokenizer.encode(
                text,                      # Sentence to encode.
                add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                # This function also supports truncation and conversion
                # to pytorch tensors, but we need to do padding, so we
                # can't use these features :( .
                max_length = 128,          # Truncate all sentences.
                return_tensors = 'pt',     # Return pytorch tensors.
           )

tensor([[  101,   101,  2040,  2001,  3958, 27227,  1029,   102,  3958, 27227,
          2001,  1037, 13997, 11510,   102,   102]])

In [29]:
inputs = tokenizer.tokenize("The capital of France is [MASK].", return_tensors="pt")
labels = tokenizer.tokenize("The capital of France is Paris.", return_tensors="pt")["input_ids"]
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

TypeError: list indices must be integers or slices, not str

In [32]:
tokenizer.tokenize("The capital of France is Paris.", return_tensors="pt")

['the', 'capital', 'of', 'france', 'is', 'paris', '.']

In [48]:
tokenizer.encode("The [MASK] of France is Paris.", return_tensors="pt")

tensor([[ 101, 1996,  103, 1997, 2605, 2003, 3000, 1012,  102]])

In [49]:
tokenizer.encode("The capital of France is [MASK].", return_tensors="pt")

tensor([[ 101, 1996, 3007, 1997, 2605, 2003,  103, 1012,  102]])

In [60]:
torch.tensor(tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Paris")))

tensor([3000])

In [54]:
tokenizer.encode("Paris", return_tensors="pt")

tensor([[ 101, 3000,  102]])

In [55]:
tokenizer.encode("[MASK]", return_tensors="pt")

tensor([[101, 103, 102]])

In [47]:
l = tokenizer.encode("The capital of France is Paris.", return_tensors="pt")

tensor([[ 101, 1996, 3007, 1997, 2605, 2003, 3000, 1012,  102]])


[tensor(2003)]

In [63]:
l[0]

tensor([ 101, 1996, 3007, 1997, 2605, 2003, 3000, 1012,  102])

In [62]:
inputs

tensor([[ 101, 1996, 3007, 1997, 2605, 2003,  103, 1012,  102]])

In [69]:
labels

tensor([[3000]])

In [74]:
outputs[0]

tensor(4.1324, grad_fn=<NllLossBackward>)

In [76]:
model(inputs)

(tensor([[[ -6.4346,  -6.4063,  -6.4097,  ...,  -5.7691,  -5.6326,  -3.7883],
          [-14.0119, -14.7241, -14.2120,  ..., -11.6977, -10.7304, -12.7618],
          [ -9.6561, -10.3124,  -9.7458,  ...,  -8.7781,  -6.6036, -12.6595],
          ...,
          [ -3.7861,  -3.8571,  -3.5644,  ...,  -2.5592,  -3.1093,  -4.3819],
          [-11.6598, -11.4274, -11.9267,  ...,  -9.8772, -10.2103,  -4.7594],
          [-11.7267, -11.7509, -11.8040,  ..., -10.5943, -10.9407,  -7.5151]]],
        grad_fn=<AddBackward0>),)

In [78]:
loss

tensor(4.1324, grad_fn=<NllLossBackward>)

In [80]:
outputs[1].shape

torch.Size([1, 9, 30522])

In [77]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

inputs = tokenizer.encode("The capital of France is [MASK].", return_tensors="pt")
#labels = torch.tensor([tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Paris"))])
labels = tokenizer.encode("The capital of France is Paris.", return_tensors="pt")

outputs = model(inputs, labels=labels)
loss = outputs[0]
loss.backward()
logits = outputs.logits

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/xwan6/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
INFO:transformers.configuration_utils:Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:transformers.modeling_utils:loading weights file https://cdn.huggingface.co/bert-base-uncased-pytorch_model.bin from cache at /home/xwan6/.cache/torch/transformers/f2ee78

AttributeError: 'tuple' object has no attribute 'logits'

In [35]:
model()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr