<a href="https://colab.research.google.com/github/eyaler/workshop/blob/master/heb_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
import torch
from transformers import AutoTokenizer, BertTokenizer, AutoModel, AutoModelWithLMHead, BertModel, BertForMaskedLM

# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
#import logging
#logging.basicConfig(level=logging.INFO)

# Load the tokenizer (vocabulary) of the pre-trained checkpoint used throughout this notebook.
# The functions below read this module-level `tokenizer`.
# A different checkpoint is kept here, disabled, for reference:
#tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking')
tokenizer = AutoTokenizer.from_pretrained("TurkuNLP/wikibert-base-he-cased")

def get_probs(text):
  """Return the masked-LM probability of each token of ``text`` at its own position.

  The text is wrapped in ``[CLS] ... [SEP]``, tokenized with the module-level
  ``tokenizer`` and fed, unmasked, through the language-model head. For every
  position the softmax probability assigned to the token that is actually
  there is reported.

  Args:
    text: Input sentence. A leading ``'[CLS] '`` and a trailing ``' [SEP]'``
      are accepted and not duplicated.

  Returns:
    A list of ``(token, probability)`` pairs, probabilities rounded to four
    decimals, with the ``[CLS]`` and ``[SEP]`` entries dropped.

  Note:
    The checkpoint is loaded on every call. The computation runs on the GPU
    when one is available and on the CPU otherwise.
  """
  # str.lstrip/rstrip take a *set of characters*, not a prefix/suffix, so they
  # would also eat real leading/trailing characters such as 'S', 'P' or '['.
  # Remove the exact markers instead.
  cls_prefix, sep_suffix = '[CLS] ', ' [SEP]'
  text = text.strip()
  if text.startswith(cls_prefix):
    text = text[len(cls_prefix):]
  if text.endswith(sep_suffix):
    text = text[:-len(sep_suffix)]
  text = cls_prefix + text + sep_suffix

  tokenized_text = tokenizer.tokenize(text)
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

  # Fall back to the CPU instead of crashing when no GPU is present.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  # Batch of one sentence; every token belongs to segment 0 (single sentence).
  tokens_tensor = torch.tensor([indexed_tokens], device=device)
  segments_tensors = torch.zeros_like(tokens_tensor)

  # Only the LM-head model is needed here; the extra base-model forward pass
  # previously done before this point only fed a shape assertion.
  # NOTE(review): AutoModelWithLMHead is deprecated in recent transformers
  # releases in favour of AutoModelForMaskedLM - consider switching.
  model = AutoModelWithLMHead.from_pretrained("TurkuNLP/wikibert-base-he-cased")
  # Evaluation mode disables dropout so results are reproducible.
  model.eval()
  model.to(device)

  with torch.no_grad():
    # The first output element holds the prediction scores (logits).
    predictions = model(tokens_tensor, token_type_ids=segments_tensors)[0]

  # One softmax over the vocabulary axis for all positions at once.
  probs = torch.softmax(predictions[0], dim=-1)
  predicted_probs = [round(probs[i, j].item(), 4) for i, j in enumerate(indexed_tokens)]

  # Drop the [CLS] and [SEP] entries.
  return list(zip(tokenized_text, predicted_probs))[1:-1]

def predict_word(text, topn=10):
  """Predict the most likely tokens for the first ``[MASK]`` in ``text``.

  The text is wrapped in ``[CLS] ... [SEP]``, tokenized with the module-level
  ``tokenizer`` and run through the language-model head.

  Args:
    text: Input sentence containing at least one ``[MASK]`` token. A leading
      ``'[CLS] '`` and a trailing ``' [SEP]'`` are accepted and not duplicated.
    topn: Number of candidates to return (slice semantics, so ``None``
      returns the whole vocabulary).

  Returns:
    A list of ``(token, probability)`` pairs ordered from most to least
    likely, probabilities rounded to four decimals.

  Raises:
    ValueError: If ``text`` contains no ``[MASK]`` token.

  Note:
    The checkpoint is loaded on every call. The computation runs on the GPU
    when one is available and on the CPU otherwise.
  """
  # str.lstrip/rstrip take a *set of characters*, not a prefix/suffix, so they
  # would also eat real leading/trailing characters such as 'S', 'P' or '['.
  # Remove the exact markers instead.
  cls_prefix, sep_suffix = '[CLS] ', ' [SEP]'
  text = text.strip()
  if text.startswith(cls_prefix):
    text = text[len(cls_prefix):]
  if text.endswith(sep_suffix):
    text = text[:-len(sep_suffix)]
  text = cls_prefix + text + sep_suffix

  tokenized_text = tokenizer.tokenize(text)

  # Locate the first mask. The previous check (`assert i>=0`) tested the loop
  # variable, so a missing mask went unnoticed and index -1 was used.
  try:
    masked_index = tokenized_text.index('[MASK]')
  except ValueError:
    raise ValueError("text must contain a '[MASK]' token") from None

  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

  # Fall back to the CPU instead of crashing when no GPU is present.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  # Batch of one sentence; every token belongs to segment 0 (single sentence).
  tokens_tensor = torch.tensor([indexed_tokens], device=device)
  segments_tensors = torch.zeros_like(tokens_tensor)

  # Only the LM-head model is needed here; the extra base-model forward pass
  # previously done before this point only fed a shape assertion.
  # NOTE(review): AutoModelWithLMHead is deprecated in recent transformers
  # releases in favour of AutoModelForMaskedLM - consider switching.
  model = AutoModelWithLMHead.from_pretrained("TurkuNLP/wikibert-base-he-cased")
  # Evaluation mode disables dropout so results are reproducible.
  model.eval()
  model.to(device)

  with torch.no_grad():
    # The first output element holds the prediction scores (logits).
    predictions = model(tokens_tensor, token_type_ids=segments_tensors)[0]

  mask_scores = predictions[0, masked_index]
  # Rank the vocabulary by descending score and keep only the requested
  # candidates before converting ids back to token strings.
  predicted_inds = torch.argsort(-mask_scores)[:topn]
  mask_probs = torch.softmax(mask_scores, 0)
  predicted_probs = [round(p.item(), 4) for p in mask_probs[predicted_inds]]
  predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_inds.tolist())

  return list(zip(predicted_tokens, predicted_probs))

def fix_word(text):
  """Replace the least likely token of ``text`` with the model's top prediction.

  ``get_probs`` scores every token; the token with the lowest probability is
  swapped for ``[MASK]`` and ``predict_word`` supplies the replacement.

  Args:
    text: Input sentence, without ``[CLS]``/``[SEP]`` markers (the token
      indices from ``get_probs`` are assumed to line up with
      ``tokenizer.tokenize(text)``).

  Returns:
    The corrected sentence, rebuilt from its tokens with
    ``tokenizer.convert_tokens_to_string``. ``text`` is returned unchanged
    when it produces no tokens.
  """
  probs = [p[1] for p in get_probs(text)]
  if not probs:
    # Nothing to score (e.g. empty input): argmin of an empty tensor would raise.
    return text

  ind = int(torch.argmin(torch.tensor(probs)))
  tokenized_text = tokenizer.tokenize(text)
  tokenized_text[ind] = '[MASK]'

  # Rebuild a real string from the tokens: joining with plain spaces would
  # leave '##' continuation pieces as separate words, and they would be
  # tokenized differently on the next pass.
  masked_text = tokenizer.convert_tokens_to_string(tokenized_text)
  fix = predict_word(masked_text, 1)[0][0]

  tokenized_text[ind] = fix
  return tokenizer.convert_tokens_to_string(tokenized_text)
  



In [None]:
# Checkpoint used for both the tokenizer and the LM-head model of this cell.
MODEL_PATH = "TurkuNLP/wikibert-base-he-cased"

# Inference only: turn gradient tracking off globally from here on.
torch.set_grad_enabled(False)

# Rebinds the module-level `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Load the model with its language-modeling head and put it in evaluation
# mode; eval() returns the module itself, so the two steps can be chained.
model = AutoModelWithLMHead.from_pretrained(MODEL_PATH).eval()

def whatisit(text):
    """Encode ``text``, print diagnostics and run it through the LM-head model.

    Uses the module-level ``tokenizer`` and ``model``. The encoded ids are
    passed both as input and as labels, so the model also returns a loss.

    Args:
        text: Sentence to encode (special tokens are added by the tokenizer).

    Returns:
        A ``(loss, prediction_scores)`` tuple taken from the first two model
        outputs.
    """
    tokens = tokenizer.encode(text, add_special_tokens=True)
    print("Encode: " + str(tokens))
    print("Decode: " + str(tokenizer.decode(tokens)))
    # encode() surrounds the text with special tokens, so index 1 is the id
    # of the token itself.
    print("[PAD]: " + str(tokenizer.encode("[PAD]")[1]))
    print("[MASK]: " + str(tokenizer.encode("[MASK]")[1]))

    # Batch size 1, placed on whatever device the model currently lives on.
    input_ids = torch.tensor(tokens).unsqueeze(0).to(model.device)
    # `masked_lm_labels` was renamed to `labels` in transformers and the old
    # keyword is no longer accepted by current releases.
    outputs = model(input_ids, labels=input_ids)

    loss, prediction_scores = outputs[:2]

    print ("len(prediction_scores): " + str(len(prediction_scores)))
    print ("prediction_scores.shape: " + str(prediction_scores.shape))

    return loss, prediction_scores

In [None]:
# Demo sentence: "Freddie Mercury [MASK] singer and musician".
separator = "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"

print(separator)
# Disabled variant: whatisit() on the same words in swapped order
# ("singer and musician" + tokenizer.mask_token + "Freddie Mercury").
print(separator)
print(whatisit("פרדי מרקורי [MASK] זמר ומוזיקאי"))
print(separator)
# Disabled variant: predict_word() on the swapped-order sentence
# ("singer and musician [MASK] Freddie Mercury").
print(separator)
print(predict_word('פרדי מרקורי [MASK] זמר ומוזיקאי'))
print(separator)

# Second demo sentence: "Freddie Mercury was [MASK] and musician".
print(predict_word('פרדי מרקורי היה [MASK] ומוזיקאי'))


In [None]:
# Score each token of the unmasked demo sentence ("Freddie Mercury singer and musician").
get_probs('פרדי מרקורי זמר ומוזיקאי')

In [None]:
# Replace the least likely token of the demo sentence with the model's top prediction.
fix_word('פרדי מרקורי זמר ומוזיקאי')