In [4]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from transformers import DataCollatorForLanguageModeling
import random

# Define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 1. Prepare Data
class WordsDataset(Dataset):
    def __init__(self, words, tokenizer, max_length):
        self.words = words
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        word = self.words[idx]
        # Dynamically mask random characters in the word
        char_indexes = list(range(len(word)))
        random.shuffle(char_indexes)
        num_to_mask = max(1, len(word) // 2)  # Ensure at least one character is masked
        masked_chars = [word[i] if i not in char_indexes[:num_to_mask] else tokenizer.mask_token for i in range(len(word))]
        masked_word = "".join(masked_chars)
        # Tokenize without creating batch dimension here
        inputs = self.tokenizer(masked_word, max_length=self.max_length, padding='max_length', truncation=True)
        return inputs
    
def load_data(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        words = file.read().splitlines()
    return words

# Assuming a function load_data exists to load your dataset
words = load_data("/kaggle/input/english-words/words_250000_train.txt")
max_length = max([len(word) for word in words]) + 2

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = WordsDataset(words, tokenizer, max_length)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
loader = DataLoader(dataset, batch_size=32, collate_fn=data_collator)  # Optimized batch size

# Load and configure model
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.to(device)  # Move model to the defined device

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)  # Optimized learning rate
total_steps = len(loader) * 10  # Assume 10 epochs for example
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

model.train()
for epoch in range(10):  # Increased number of epochs
    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}  # Directly send to device without squeezing
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        print(f"Epoch {epoch}, Loss: {loss.item()}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 0, Loss: 8.947418212890625
Epoch 0, Loss: 4.397143363952637
Epoch 0, Loss: 4.516434669494629
Epoch 0, Loss: 5.941709995269775
Epoch 0, Loss: 5.456535816192627
Epoch 0, Loss: 5.305830478668213
Epoch 0, Loss: 5.066450595855713
Epoch 0, Loss: 3.6946523189544678
Epoch 0, Loss: 4.591305255889893
Epoch 0, Loss: 4.982004165649414
Epoch 0, Loss: 4.504914283752441
Epoch 0, Loss: 2.5949907302856445
Epoch 0, Loss: 4.872231483459473
Epoch 0, Loss: 3.5904722213745117
Epoch 0, Loss: 4.956690788269043
Epoch 0, Loss: 3.9285378456115723
Epoch 0, Loss: 4.29967737197876
Epoch 0, Loss: 4.997945308685303
Epoch 0, Loss: 5.863208293914795
Epoch 0, Loss: 4.5873122215271
Epoch 0, Loss: 4.3738532066345215
Epoch 0, Loss: 4.876739501953125
Epoch 0, Loss: 5.440181732177734
Epoch 0, Loss: 4.758907794952393
Epoch 0, Loss: 4.09415340423584
Epoch 0, Loss: 3.1853275299072266
Epoch 0, Loss: 3.3252766132354736
Epoch 0, Loss: 4.342646598815918
Epoch 0, Loss: 5.2548956871032715
Epoch 0, Loss: 3.860213041305542
Epoch 

In [6]:
import torch

def predict_word(model, length, tokenizer, device, max_iters=10):
    # Start with a fully masked word
    masked_word = [tokenizer.mask_token] * length
    masked_input = ''.join(masked_word)

    for _ in range(max_iters):
        # Tokenize the input
        inputs = tokenizer(masked_input, return_tensors='pt', padding='max_length', max_length=512, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Get model predictions
        with torch.no_grad():
            outputs = model(**inputs)
            predictions = outputs.logits

        # Find the index of the masked token and replace it with the most likely character
        masked_index = masked_input.find(tokenizer.mask_token)
        if masked_index == -1:  # Break if no masked token is found
            break

        # Get the predicted token ID for the masked position
        predicted_index = torch.argmax(predictions[0, masked_index]).item()
        predicted_token = tokenizer.decode([predicted_index])

        # Update the masked word
        masked_word[masked_index] = predicted_token
        masked_input = ''.join(masked_word)

        # Break if no mask token is left
        if tokenizer.mask_token not in masked_input:
            break

    return masked_input

# Usage example
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

predicted_word = predict_word(model, length=5, tokenizer=tokenizer, device=device)  # For a 5-letter word
print(predicted_word)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


.....
