In [1]:
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments
import datasets
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling

In [2]:
# Fixed seed so the subsample and the train/validation split are reproducible
# across kernel restarts (both were previously unseeded).
SEED = 42
# Upper bound on the number of sentences kept for fine-tuning.
SAMPLE_SIZE = 20000

winohack = pd.read_csv('combined_sentences.csv')

# NOTE(review): assumes the CSV has exactly one column; assigning a one-element
# list of column names raises otherwise.
winohack.columns = ['sentences']

# Cap the sample size at the number of available rows so a smaller CSV does not
# raise; with at least SAMPLE_SIZE rows this still draws exactly SAMPLE_SIZE.
winohack = winohack.sample(min(SAMPLE_SIZE, len(winohack)), random_state=SEED)

winohack_dataset = datasets.Dataset.from_pandas(winohack)

# 80/20 train/validation split of the sampled sentences.
train_data, val_data = train_test_split(winohack, test_size=0.2, random_state=SEED)

model_checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [3]:
def regular_encode(texts, tokenizer, maxlen=25):
    """Tokenize a batch of sentences into fixed-length encodings.

    Args:
        texts: List of sentences to encode.
        tokenizer: Callable Hugging Face tokenizer used for the encoding.
        maxlen: Length that every encoded sequence is padded or truncated to.

    Returns:
        The tokenizer's batch output, unchanged.
    """
    # Honour `maxlen` (it was previously ignored in favour of a hard-coded 24)
    # and truncate explicitly so no sequence ends up longer than the padded
    # length.
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=maxlen,
    )

In [4]:
# Pull the raw sentences out of each split, then tokenize both splits with the
# shared helper.
train_sentences = list(train_data["sentences"].values)
val_sentences = list(val_data["sentences"].values)

X_train = regular_encode(train_sentences, tokenizer, maxlen=25)
X_val = regular_encode(val_sentences, tokenizer, maxlen=25)

In [5]:
# Sanity check: decode the second encoded training example back to text
# (special tokens such as [CLS]/[SEP]/[PAD] appear in the decoded string).
tokenizer.decode(X_train[1].ids)

'[CLS] the chemist provided the advisee with paperwork to return to $ acc _ pronoun upon completion. [SEP] [PAD] [PAD]'

In [6]:
# Wrap the tokenizer outputs in `datasets.Dataset` objects; these are what the
# Trainer receives as train/eval datasets.
# NOTE(review): this rebinds X_train / X_val to a different type, so the
# earlier decode check (which reads `X_train[1].ids`) presumably no longer
# works once this cell has run — re-run the encoding cell first if needed.
X_train = datasets.Dataset.from_dict(X_train)
X_val = datasets.Dataset.from_dict(X_val)

In [7]:
# Load the pretrained checkpoint with a masked-language-modelling head.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Batch collator for masked-language-modelling, configured with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [9]:
# Fine-tuning hyperparameters; "test-mlm" is the Trainer output directory.
training_args = TrainingArguments(
    output_dir="test-mlm",
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [11]:
# Wire the model, training arguments, tokenized splits and the MLM collator
# into a Hugging Face Trainer.
# NOTE(review): the saved output of this cell is
# "ModuleNotFoundError: No module named 'docker'", so the trainer was
# apparently not constructed in the recorded run. The traceback is not shown;
# presumably an optional integration pulls in `docker` — TODO confirm and
# either install it or disable that integration.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=X_train,
    eval_dataset=X_val,
    data_collator=data_collator,
)

ModuleNotFoundError: No module named 'docker'

In [None]:
# Run fine-tuning with the trainer configured above.
# NOTE(review): this cell has no execution count in the saved notebook, so it
# is unclear whether the model used in the later cells was actually
# fine-tuned — confirm before interpreting the predictions.
trainer.train()

In [None]:
import torch

In [43]:
# Probe sentence for the fill-mask check, padded to 24 tokens.
text = "The nurse told the patient that he forgot to bring the documents"
tokenized_text = tokenizer(
    text,
    padding='max_length',
    max_length=24,
)

In [44]:
# Token position to mask. Presumably 7 is the pronoun "he" in `text`
# (counting the leading [CLS] as position 0) — verify if the sentence changes.
masked_index = 7

In [45]:
# Replace the chosen position with [MASK]. Index via `masked_index` rather than
# a literal so the masked slot and the position later read from the
# predictions cannot drift apart.
tokenized_text['input_ids'][masked_index] = tokenizer.mask_token_id

In [46]:
# Wrap the token ids in a PyTorch tensor and add a leading batch dimension.
tokens_tensor = torch.tensor(tokenized_text['input_ids']).unsqueeze(0)

In [47]:
# Token type ids as a tensor with a leading batch dimension.
type_tensor = torch.tensor(tokenized_text['token_type_ids']).unsqueeze(0)

In [48]:
# Prefer the GPU when one is present, but fall back to the CPU so the notebook
# still runs on machines without CUDA (a hard-coded 'cuda' raises there).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokens_tensor = tokens_tensor.to(device)
type_tensor = type_tensor.to(device)
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [49]:
# Switch to inference mode: if the model has just been through training it may
# still be in train mode, and active dropout would make the predictions noisy.
model.eval()

# The sentence was padded to a fixed length, so pass the attention mask;
# without it the [PAD] positions are attended to like real tokens.
attention_tensor = torch.tensor(
    tokenized_text['attention_mask'], device=tokens_tensor.device
).unsqueeze(0)

with torch.no_grad():
    outputs = model(
        tokens_tensor,
        attention_mask=attention_tensor,
        token_type_ids=type_tensor,
    )
    # First output element; indexed below as [batch, position].
    predictions = outputs[0]

# Highest-scoring replacement token for the masked position.
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

In [50]:
# Vocabulary ids of the five highest-scoring candidates for the masked slot.
predictions[0, masked_index].topk(5).indices

tensor([2016, 2002, 2027, 2009, 2619], device='cuda:0')

In [51]:
# Map the five best candidate ids back to their token strings.
top_5_ids = predictions[0, masked_index].topk(5).indices
top_5_preds = tokenizer.convert_ids_to_tokens(top_5_ids)

In [52]:
# Display the five most likely tokens for the masked position.
top_5_preds

['she', 'he', 'they', 'it', 'someone']