In [0]:
pip install pytorch_transformers

In [0]:
import torch
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
#import logging
#logging.basicConfig(level=logging.INFO)

# Word-piece tokenizer (vocabulary) shared by every function in this notebook.
# NOTE(review): this checkpoint name differs from the 'bert-base-uncased' weights
# the model code loads; this presumably relies on both uncased checkpoints
# sharing one vocabulary -- confirm before swapping either name.
TOKENIZER_CHECKPOINT = 'bert-large-uncased-whole-word-masking'
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Name of the pre-trained checkpoint whose masked-LM head does the scoring.
_MASKED_LM_NAME = 'bert-base-uncased'

# Filled lazily by _load_masked_lm() with {'model': ..., 'device': ...} so the
# weights are loaded once per kernel session instead of on every call.
_masked_lm_cache = {}


def _wrap_special_tokens(text):
  """Return `text` framed as '[CLS] <text> [SEP]'.

  A leading '[CLS]' or trailing '[SEP]' that is already present is removed first
  so the markers are never doubled.  Whole markers are matched on purpose:
  str.lstrip/str.rstrip take a *set of characters*, so they would also eat
  ordinary leading/trailing letters (e.g. 'C', 'S', 'P') and the brackets of a
  leading or trailing '[MASK]'.
  """
  body = text.strip()
  if body.startswith('[CLS]'):
    body = body[len('[CLS]'):]
  if body.endswith('[SEP]'):
    body = body[:-len('[SEP]')]
  return '[CLS] ' + body.strip() + ' [SEP]'


def _load_masked_lm():
  """Return (model, device), loading the masked-LM weights on first use only."""
  if 'model' not in _masked_lm_cache:
    # Use the GPU when one is present, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BertForMaskedLM.from_pretrained(_MASKED_LM_NAME)
    # Evaluation mode turns dropout off, which keeps the scores reproducible.
    model.eval()
    model.to(device)
    _masked_lm_cache['model'] = model
    _masked_lm_cache['device'] = device
  return _masked_lm_cache['model'], _masked_lm_cache['device']


def _masked_lm_logits(indexed_tokens):
  """Run the masked LM on one sequence of vocabulary ids.

  Returns the first element of the model output for the single batch entry,
  i.e. one row of vocabulary scores per input position.
  """
  model, device = _load_masked_lm()
  tokens_tensor = torch.tensor([indexed_tokens], device=device)
  # Single-sentence input: every position belongs to segment 0.
  segments_tensors = torch.zeros_like(tokens_tensor)
  with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
  return outputs[0][0]


def get_probs(text):
  """Score each token of `text` with the masked-LM head.

  The sentence is fed to the model as-is (no token is masked); for every
  position the softmax over the vocabulary is evaluated at the token that is
  actually there.

  Args:
    text: input sentence; an existing '[CLS]' prefix / '[SEP]' suffix is accepted.

  Returns:
    A list of (token, probability) pairs, one per tokenizer token, with the
    '[CLS]' and '[SEP]' entries dropped and probabilities rounded to 4 decimals.
  """
  tokenized_text = tokenizer.tokenize(_wrap_special_tokens(text))
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

  # One softmax over the whole (position, vocabulary) matrix instead of one per token.
  probs = torch.softmax(_masked_lm_logits(indexed_tokens), dim=-1)
  predicted_probs = [round(probs[position, token_id].item(), 4)
                     for position, token_id in enumerate(indexed_tokens)]

  # Strip the leading [CLS] and trailing [SEP] entries.
  return list(zip(tokenized_text, predicted_probs))[1:-1]


def predict_word(text, topn=10):
  """Predict the most likely tokens for the first '[MASK]' in `text`.

  Args:
    text: input sentence containing a '[MASK]' placeholder; an existing '[CLS]'
      prefix / '[SEP]' suffix is accepted.
    topn: how many candidates to return; applied as a slice bound, so None
      returns the whole ranked vocabulary.

  Returns:
    A list of (token, probability) pairs ordered from most to least likely,
    probabilities rounded to 4 decimals.

  Raises:
    ValueError: if the tokenized text contains no '[MASK]' token.
  """
  tokenized_text = tokenizer.tokenize(_wrap_special_tokens(text))
  if '[MASK]' not in tokenized_text:
    # Without this check the lookup below would silently score another position.
    raise ValueError("text must contain a '[MASK]' token to predict")
  masked_index = tokenized_text.index('[MASK]')

  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
  mask_logits = _masked_lm_logits(indexed_tokens)[masked_index]

  # Rank the vocabulary by score, then keep only the requested slice before
  # converting ids back to tokens.
  ranked_ids = torch.argsort(-mask_logits)[:topn].tolist()
  mask_probs = torch.softmax(mask_logits, 0)
  predicted_probs = [round(mask_probs[token_id].item(), 4) for token_id in ranked_ids]
  predicted_tokens = tokenizer.convert_ids_to_tokens(ranked_ids)

  return list(zip(predicted_tokens, predicted_probs))

def _join_word_pieces(tokens):
  """Join tokens with spaces, gluing '##' continuation pieces onto the previous token."""
  return ' '.join(tokens).replace(' ##', '')


def fix_word(text):
  """Replace the least probable token of `text` with the model's top prediction.

  The token with the lowest probability reported by get_probs() is swapped for
  '[MASK]', predict_word() is asked for its single best candidate, and the
  sentence is rebuilt with that candidate in place.

  Args:
    text: input sentence.

  Returns:
    The rebuilt sentence as a string made from the tokenizer's tokens (so its
    casing/spacing follow the tokenizer, not the original text).  If get_probs()
    yields no tokens (e.g. empty input), `text` is returned unchanged.
  """
  scored = get_probs(text)
  if not scored:
    # Nothing to rank; there is no word to fix.
    return text

  # Take the tokens from the same call that produced the probabilities so the
  # two lists are guaranteed to line up index-for-index.
  tokens = [token for token, _ in scored]
  probs = [prob for _, prob in scored]
  worst = min(range(len(probs)), key=probs.__getitem__)

  tokens[worst] = '[MASK]'
  # Word pieces are merged before re-tokenizing; a raw '##' marker would
  # otherwise be split into separate '#' tokens and pollute the context.
  tokens[worst] = predict_word(_join_word_pieces(tokens), 1)[0][0]
  return _join_word_pieces(tokens)
  



In [0]:
# Ranked candidates for the masked verb slot (default topn).
predict_word(text='The boy [MASK] to the school')

In [0]:
# Ranked candidates for the masked noun slot (default topn).
predict_word(text='Alex likes to have [MASK] with his best friend')

In [0]:
# Per-token probabilities for a sentence with a deliberately wrong word.
get_probs(text='The boy want to the school')

In [0]:
# Swap the least probable token for the model's top suggestion.
fix_word(text='The boy want to the school')