In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import string
from tqdm import tqdm

# Constants
# Token id layout: 0 is padding, 1..26 are 'a'..'z', 27 is the '_' mask symbol.
MAX_LEN = 20
PAD_TOKEN = 0
MASK_TOKEN = 27
CHAR2IDX = dict(zip(string.ascii_lowercase, range(1, 27)))
CHAR2IDX['_'] = MASK_TOKEN  # Mask token
IDX2CHAR = {idx: ch for ch, idx in CHAR2IDX.items()}
VOCAB_SIZE = 28  # 26 letters + _ + padding (0)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Utility Functions
def encode_word(word):
    """Translate each character of ``word`` into its integer token id.

    Characters missing from ``CHAR2IDX`` are mapped to ``PAD_TOKEN``.
    Returns a new list with one id per input character.
    """
    token_ids = []
    for ch in word:
        token_ids.append(CHAR2IDX.get(ch, PAD_TOKEN))
    return token_ids

def pad_word(encoded):
    """Force ``encoded`` to exactly ``MAX_LEN`` entries.

    Sequences longer than ``MAX_LEN`` are cut to their first ``MAX_LEN``
    items; shorter ones are right-filled with ``PAD_TOKEN``.
    """
    shortfall = MAX_LEN - len(encoded)
    if shortfall < 0:
        return encoded[:MAX_LEN]
    return encoded + [PAD_TOKEN] * shortfall

def mask_random(word, mask_ratio=(0.3, 0.5)):
    """Hide a random subset of the characters of ``word`` behind '_'.

    A masking fraction is drawn uniformly from ``mask_ratio`` and at least
    one position is always hidden for a non-empty word.

    Returns:
        (masked, labels): ``masked`` is the list of characters with hidden
        positions replaced by '_'. ``labels`` holds, per position, the
        0-based letter index (a=0, ...) for hidden positions and -1
        ("ignore" for the loss) everywhere else. For an empty word the
        result is ``([], [])``.
    """
    chars = list(word)
    n = len(word)
    if n == 0:
        return chars, []

    # Draw order (uniform, then sample) is kept so seeded runs are reproducible.
    fraction = random.uniform(*mask_ratio)
    hidden = random.sample(range(n), max(1, int(n * fraction)))

    targets = [-1] * n  # -1 means "ignore" for loss
    for pos in hidden:
        targets[pos] = CHAR2IDX[chars[pos]] - 1  # supervised target: 0-indexed (a=0,...)
        chars[pos] = '_'
    return chars, targets

# Dataset
class HangmanMaskedDataset(torch.utils.data.Dataset):
    """Fixed-length (input, label) pairs for masked-letter prediction.

    Each accepted dictionary word is lower-cased and masked once, at
    construction time, via ``mask_random``. Inputs are token ids padded to
    ``MAX_LEN``; labels are 0-based letter indices at masked positions and
    ``IGNORE_LABEL`` everywhere else, including the padding.

    Words are skipped when they are empty, contain anything other than ASCII
    letters, or are longer than ``MAX_LEN``.
    """

    # Must match the value mask_random uses for non-target positions and the
    # ignore_index of the training criterion.
    IGNORE_LABEL = -1

    def __init__(self, dictionary):
        """Build the sample list from an iterable of word strings."""
        self.samples = []
        for word in dictionary:
            # str.isalpha() alone accepts non-ASCII letters (e.g. accented
            # characters) that have no entry in CHAR2IDX and would raise
            # KeyError if chosen for masking, so require ASCII as well.
            if not (word.isascii() and word.isalpha()) or len(word) > MAX_LEN:
                continue
            word = word.lower()
            masked_word, labels = mask_random(word)
            input_encoded = pad_word(encode_word(masked_word))
            # Labels must NOT be padded with PAD_TOKEN: 0 is the class index
            # of 'a', so the loss would train every padded slot to predict 'a'.
            label_encoded = labels + [self.IGNORE_LABEL] * (MAX_LEN - len(labels))
            self.samples.append((input_encoded, label_encoded))

    def __len__(self):
        """Number of usable words kept from the dictionary."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as two long tensors (inputs, labels)."""
        x, y = self.samples[idx]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

# Model
class MaskedTransformer(nn.Module):
    """Transformer encoder that scores the 26 letters at every position.

    Args:
        vocab_size: number of input token ids (letters, mask and padding).
        d_model: embedding / hidden width.
        nhead: attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        dim_ff: width of each layer's feed-forward sub-network.
    """

    def __init__(self, vocab_size=VOCAB_SIZE, d_model=256, nhead=8, num_layers=4, dim_ff=1024):
        super().__init__()
        # Token lookup table; PAD_TOKEN is registered as the padding index.
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=PAD_TOKEN)
        # Learned positional table covering up to MAX_LEN positions.
        self.pos_embed = nn.Parameter(torch.randn(1, MAX_LEN, d_model))
        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_ff,
            dropout=0.1,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.lm_head = nn.Linear(d_model, 26)  # only a–z predictions (no _ or pad)

    def forward(self, x):
        """Map token ids ``x`` of shape (batch, seq) to logits (batch, seq, 26)."""
        seq_len = x.size(1)
        positions = self.pos_embed[:, :seq_len, :]
        hidden = self.transformer(self.embedding(x) + positions)
        return self.lm_head(hidden)

# Load dictionary
with open("/content/drive/MyDrive/words_250000_train.txt") as f:
    dictionary = [line.strip() for line in f if line.strip()]

print(f"Total words loaded: {len(dictionary)}")

# Create dataset and dataloader
dataset = HangmanMaskedDataset(dictionary)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=512, shuffle=True, num_workers=2, pin_memory=True)

# Training length is declared before the scheduler so the cosine schedule is
# derived from it instead of a separately hard-coded epoch count.
NUM_EPOCHS = 10

# Instantiate model
model = MaskedTransformer().to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-2)
# The scheduler is stepped once per batch, so one full cosine period spans
# every optimizer step of the run.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS * len(train_loader))
criterion = nn.CrossEntropyLoss(ignore_index=-1)  # ignore unmasked positions

# Training
for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss = 0.0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")

    for x_batch, y_batch in progress:
        x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)

        # Padded input positions never carry a real target (masked positions
        # hold the mask token, not PAD_TOKEN). Force their labels to the
        # ignore index so padding can never be scored as class 0 ('a').
        y_batch = y_batch.masked_fill(x_batch == PAD_TOKEN, -1)

        optimizer.zero_grad()
        logits = model(x_batch)  # (B, L, 26)

        # Flatten batch and sequence dimensions for the token-level loss.
        loss = criterion(logits.reshape(-1, logits.size(-1)), y_batch.reshape(-1))
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        progress.set_postfix(loss=f"{loss.item():.4f}")

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1} average loss: {avg_loss:.4f}")

# Save model
torch.save(model.state_dict(), "masked_transformer_hangman.pth")
print("Model saved to masked_transformer_hangman.pth")


In [1]:
import torch
import torch.nn as nn
import string

# Constants
# Token id layout: 0 is padding, 1..26 are 'a'..'z', 27 is the '_' mask symbol.
MAX_LEN = 20
CHAR2IDX = {letter: position for position, letter in enumerate(string.ascii_lowercase, start=1)}
CHAR2IDX['_'] = 27  # Mask token
IDX2CHAR = dict((index, char) for char, index in CHAR2IDX.items())
VOCAB_SIZE = 28  # 26 letters + _ + padding (0)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model class
class MaskedTransformer(nn.Module):
    """Inference-side definition of the masked-letter transformer.

    Attribute names and construction arguments determine the state-dict keys
    and tensor shapes, so they have to line up with the checkpoint that
    ``load_model`` reads.
    """

    def __init__(self, vocab_size=VOCAB_SIZE, d_model=256, nhead=8, num_layers=4, dim_ff=1024):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        # One learned vector per position, up to MAX_LEN positions.
        self.pos_embed = nn.Parameter(torch.randn(1, MAX_LEN, d_model))
        block = nn.TransformerEncoderLayer(
            d_model,
            nhead,
            dim_feedforward=dim_ff,
            dropout=0.1,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(block, num_layers)
        self.lm_head = nn.Linear(d_model, 26)  # only a–z predictions

    def forward(self, x):
        """Return per-position letter logits, shape (batch, seq, 26), for token ids ``x``."""
        token_vectors = self.embedding(x)
        position_vectors = self.pos_embed[:, :x.size(1), :]
        encoded = self.transformer(token_vectors + position_vectors)
        return self.lm_head(encoded)

# Lazy load model
_model = None
def load_model():
    """Return the shared inference model, loading it on first use.

    The checkpoint "masked_transformer_hangman.pth" is read from the current
    working directory, moved to ``DEVICE`` and switched to eval mode. Later
    calls reuse the cached instance.

    Raises:
        Whatever ``torch.load`` / ``load_state_dict`` raise for a missing or
        incompatible checkpoint. In that case nothing is cached, so a later
        call retries the load.
    """
    global _model
    if _model is None:
        # Build into a local first. Assigning the global before the weights
        # are loaded would leave a randomly initialised, train-mode model in
        # the cache if loading fails, and later calls would return it silently.
        model = MaskedTransformer().to(DEVICE)
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        state_dict = torch.load("masked_transformer_hangman.pth", map_location=DEVICE)
        model.load_state_dict(state_dict)
        model.eval()
        _model = model
    return _model

def encode_input(word):
    """Convert a space-separated hangman pattern into a model input tensor.

    Every second character of ``word`` is kept (e.g. "_ p p _ e " becomes
    "_pp_e"), any remaining spaces are dropped and the text is lower-cased.
    Characters without an entry in ``CHAR2IDX`` become 0. The ids are cut or
    zero-padded to ``MAX_LEN`` and returned as a (1, MAX_LEN) long tensor on
    ``DEVICE``.
    """
    pattern = word[::2].replace(' ', '').lower()
    token_ids = [CHAR2IDX.get(ch, 0) for ch in pattern][:MAX_LEN]
    token_ids.extend([0] * (MAX_LEN - len(token_ids)))
    batch = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
    return batch.to(DEVICE)

def guess(self, word):  # word example: "_ p p _ e "
    """Choose the next letter to guess for the current hangman pattern.

    The pattern is scored by the masked transformer, the logits of all blank
    positions are averaged, and the most probable letter that is not already
    in ``self.guessed_letters`` is returned.

    Args:
        word: pattern with characters separated by single spaces, blanks
            written as '_' (e.g. "_ p p _ e ").

    Returns:
        A single lowercase letter. If every letter has already been guessed,
        'e' is returned as a last resort.
    """
    model = load_model()

    input_tensor = encode_input(word)  # (1, MAX_LEN)
    with torch.no_grad():
        logits = model(input_tensor)  # (1, L, 26)

    # Get predictions for masked positions only. encode_input truncates the
    # pattern to the model's window, so blanks past the end of the logits
    # must be dropped here; indexing with them would raise IndexError.
    seq_len = logits.size(1)
    mask_positions = [
        i for i, c in enumerate(word[::2])
        if c == '_' and i < seq_len
    ]
    if not mask_positions:
        # If no usable masks, use the last position
        mask_positions = [-1]

    # Average logits across masked positions
    masked_logits = logits[0, mask_positions, :].mean(0)
    probs = torch.softmax(masked_logits, dim=0).cpu().numpy()

    # Walk the letters from most to least probable. This visits all 26
    # letters, so it finds an unguessed one whenever any exists and no
    # separate alphabetical sweep is needed afterwards.
    for idx in probs.argsort()[::-1]:
        letter = string.ascii_lowercase[idx]
        if letter not in self.guessed_letters:
            return letter

    return 'e'  # Emergency fallback