In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import string
from tqdm import tqdm

# Token layout shared by the dataset and the model:
#   0       -> padding
#   1..26   -> 'a'..'z'
#   27      -> '_' (a hidden letter)
PAD_TOKEN = 0
MASK_TOKEN = 27
VOCAB_SIZE = 28
MAX_LEN = 20  # fixed sequence length used for padding/truncation

CHAR2IDX = dict(zip(string.ascii_lowercase, range(1, 27)))
CHAR2IDX['_'] = 27
IDX2CHAR = dict(zip(CHAR2IDX.values(), CHAR2IDX.keys()))

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

def encode_word(word):
    """Translate every character of ``word`` into its vocabulary index.

    Characters that have no entry in ``CHAR2IDX`` are encoded as
    ``PAD_TOKEN``. Returns a list with one integer per input character.
    """
    lookup = CHAR2IDX.get
    return [lookup(ch, PAD_TOKEN) for ch in word]

def pad_word(encoded):
    """Return ``encoded`` as a list of exactly ``MAX_LEN`` items.

    Longer inputs are cut to their first ``MAX_LEN`` items; shorter ones are
    right-padded with ``PAD_TOKEN``.
    """
    missing = MAX_LEN - len(encoded)
    if missing < 0:
        return encoded[:MAX_LEN]
    return encoded + [PAD_TOKEN] * missing

def mask_random(word, mask_ratio=(0.3, 0.5)):
    """Hide a random subset of the characters of ``word`` behind ``'_'``.

    A masking fraction is drawn uniformly from ``mask_ratio`` and at least
    one position is always hidden.

    Returns a ``(masked, labels)`` pair of equal length:
      * ``masked`` - list of characters with the chosen positions set to '_';
      * ``labels`` - ``CHAR2IDX[char] - 1`` at hidden positions and ``-1``
        everywhere else.

    An empty ``word`` yields ``([], [])``.
    """
    length = len(word)
    if not length:
        return list(word), []

    fraction = random.uniform(*mask_ratio)
    count = int(length * fraction)
    if count < 1:
        count = 1
    hidden = set(random.sample(range(length), count))

    labels = [CHAR2IDX[ch] - 1 if pos in hidden else -1 for pos, ch in enumerate(word)]
    masked = ['_' if pos in hidden else ch for pos, ch in enumerate(word)]
    return masked, labels

class HangmanMaskedDataset(torch.utils.data.Dataset):
    """Fixed set of (masked word, target letters) pairs built from a word list.

    Each accepted word is lower-cased, masked once with ``mask_random`` at
    construction time, and stored as two ``MAX_LEN``-long integer lists:
    the encoded masked word and the per-position labels.
    """

    # Label written at every position that must not contribute to the loss.
    # It matches the -1 that ``mask_random`` assigns to unmasked positions.
    # PAD_TOKEN (0) cannot be used here because label 0 is the class for 'a'.
    IGNORE_LABEL = -1

    def __init__(self, dictionary):
        """Build the samples from ``dictionary``, an iterable of words.

        Words that are not purely ASCII letters, or that are longer than
        ``MAX_LEN``, are skipped.
        """
        self.samples = []
        for word in dictionary:
            # isalpha() alone also accepts non-ASCII letters, which have no
            # entry in CHAR2IDX and would raise KeyError inside mask_random.
            if not (word.isascii() and word.isalpha()) or len(word) > MAX_LEN:
                continue
            word = word.lower()
            masked_word, labels = mask_random(word)
            input_encoded = pad_word(encode_word(masked_word))
            # Pad the labels with the ignore value rather than PAD_TOKEN so
            # padded positions are not trained to predict class 0 ('a').
            label_encoded = labels + [self.IGNORE_LABEL] * (MAX_LEN - len(labels))
            self.samples.append((input_encoded, label_encoded))

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a pair of ``torch.long`` tensors (input, labels)."""
        x, y = self.samples[idx]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

class EnhancedTransformer(nn.Module):
    """Transformer encoder that scores the 26 letters at every sequence position.

    Token ids are embedded, summed with a learned positional embedding,
    passed through a stack of encoder layers and a small residual MLP, and
    finally projected to 26 logits per position.
    """

    def __init__(self, vocab_size=VOCAB_SIZE, d_model=512, nhead=16, num_layers=12, dim_ff=2048, dropout=0.2):
        """Create the layers.

        Args:
            vocab_size: number of distinct input token ids.
            d_model: embedding / hidden width.
            nhead: attention heads per encoder layer.
            num_layers: number of stacked encoder layers.
            dim_ff: width of each encoder layer's feed-forward sub-layer.
            dropout: dropout probability inside the encoder layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=PAD_TOKEN)
        # Learned positional embedding, one vector per position up to MAX_LEN.
        self.pos_embed = nn.Parameter(torch.randn(1, MAX_LEN, d_model))

        self.pre_norm = nn.LayerNorm(d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_ff,
            dropout=dropout,
            activation='gelu',
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)

        # Residual MLP applied on top of the encoder output.
        self.post_norm = nn.LayerNorm(d_model)
        self.fc1 = nn.Linear(d_model, d_model)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(d_model, d_model)
        self.final_norm = nn.LayerNorm(d_model)

        # One logit per lowercase letter.
        self.lm_head = nn.Linear(d_model, 26)

        self._init_weights()

    def _init_weights(self):
        """Xavier-initialise every parameter with more than one dimension."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        # The loop above also overwrites the embedding matrix, including the
        # row that nn.Embedding zeroes for padding_idx. Restore that row so
        # the padding token keeps an all-zero embedding.
        with torch.no_grad():
            self.embedding.weight[PAD_TOKEN].zero_()

    def forward(self, x):
        """Return per-position letter logits for a batch of token ids.

        Args:
            x: integer tensor of token ids, batch first, with at most
               ``MAX_LEN`` positions per sequence.

        Returns:
            Tensor with a last dimension of 26 holding the letter logits for
            each position of each sequence.
        """
        # True where the input is padding; those keys are excluded from attention.
        padding_mask = (x == PAD_TOKEN)

        emb = self.embedding(x) + self.pos_embed[:, :x.size(1), :]
        emb = self.pre_norm(emb)

        h = self.transformer(emb, src_key_padding_mask=padding_mask)

        h = self.post_norm(h)
        residual = h
        h = self.fc1(h)
        h = self.act(h)
        h = self.fc2(h)
        h = residual + h
        h = self.final_norm(h)

        logits = self.lm_head(h)
        return logits

# Read the word list: one word per line, surrounding whitespace stripped,
# blank lines dropped.
with open("words_250000_train.txt") as f:
    dictionary = [entry for entry in map(str.strip, f) if entry]

print(f"Total words loaded: {len(dictionary)}")

# Masked training samples and the loader that batches and shuffles them.
dataset = HangmanMaskedDataset(dictionary)
train_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=256,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)

model = EnhancedTransformer().to(DEVICE)

optimizer = optim.AdamW(
    model.parameters(),
    lr=1e-4,
    weight_decay=0.01,
    betas=(0.9, 0.999),
    eps=1e-8,
)

def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, min_lr=1e-6):
    """Build a linear-warmup, cosine-decay learning-rate scheduler.

    The learning rate rises linearly from 0 to each parameter group's base
    rate over ``num_warmup_steps`` scheduler steps, then follows half a
    cosine down towards ``min_lr`` at ``num_training_steps``. Stepping past
    ``num_training_steps`` keeps the rate at the floor.

    Args:
        optimizer: the optimizer whose parameter groups are scheduled.
        num_warmup_steps: number of warmup steps.
        num_training_steps: total number of steps, warmup included.
        min_lr: absolute lower bound for the learning rate after warmup.

    Returns:
        A ``LambdaLR`` scheduler attached to ``optimizer``.
    """
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def make_lr_lambda(base_lr):
        # LambdaLR multiplies the base rate by the lambda's return value, so
        # the absolute floor has to be expressed as a fraction of base_lr.
        # It is capped at 1 so the floor can never raise the rate above base.
        floor = min(1.0, min_lr / base_lr) if base_lr > 0 else 0.0

        def lr_lambda(current_step):
            if current_step < num_warmup_steps:
                return float(current_step) / warmup_span
            # Clamp so the cosine does not climb back up once the planned
            # number of training steps has been exceeded.
            progress = min(1.0, float(current_step - num_warmup_steps) / decay_span)
            return max(floor, 0.5 * (1.0 + math.cos(math.pi * progress)))

        return lr_lambda

    # One lambda per parameter group, since groups may have different base rates.
    lr_lambdas = [
        make_lr_lambda(group.get('initial_lr', group['lr']))
        for group in optimizer.param_groups
    ]
    return optim.lr_scheduler.LambdaLR(optimizer, lr_lambdas)

import math

# Single source of truth for the epoch count: it sizes both the learning-rate
# schedule and the training loop, so the two cannot drift apart.
NUM_EPOCHS = 20
total_steps = NUM_EPOCHS * len(train_loader)
warmup_steps = int(0.1 * total_steps)  # first 10% of all steps are warmup
scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)

# Positions labelled -1 are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-1)

from torch.cuda.amp import autocast, GradScaler
# Mixed precision is only enabled when running on CUDA; on CPU both the
# autocast context and the scaler are explicitly disabled pass-throughs.
use_amp = DEVICE == 'cuda'
scaler = GradScaler(enabled=use_amp)

best_loss = float('inf')

for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss = 0.0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")

    for x_batch, y_batch in progress:
        x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)

        optimizer.zero_grad()

        with autocast(enabled=use_amp):
            logits = model(x_batch)
            # Flatten to (positions, 26) logits against (positions,) labels.
            logits = logits.view(-1, 26)
            y_batch = y_batch.view(-1)
            loss = criterion(logits, y_batch)

        scaler.scale(loss).backward()
        # Unscale before clipping so the norm threshold applies to true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        # Read the scalar once; each .item() call forces a device sync.
        loss_value = loss.item()
        running_loss += loss_value
        progress.set_postfix(loss=f"{loss_value:.4f}", lr=f"{scheduler.get_last_lr()[0]:.6f}")

    # max(1, ...) avoids a ZeroDivisionError when the loader has no batches.
    avg_loss = running_loss / max(1, len(train_loader))
    print(f"Epoch {epoch+1} loss: {avg_loss:.4f}")

    if avg_loss < best_loss:
        best_loss = avg_loss
        # Saving the best-epoch checkpoint is currently disabled:
        ## torch.save(model.state_dict(), "masked_transformer_hangman_best.pth")
        print(f"loss: {best_loss:.4f}")

# Persist the weights from the last epoch.
torch.save(model.state_dict(), "masked_transformer_hangman_final.pth")

In [2]:
import torch
import torch.nn as nn
import string

# Vocabulary and sequence-length constants for inference:
# 0 is padding, 1..26 are 'a'..'z', 27 is the '_' blank.
MAX_LEN = 20
VOCAB_SIZE = 28

CHAR2IDX = {letter: index for index, letter in enumerate(string.ascii_lowercase, start=1)}
CHAR2IDX.update({'_': 27})
IDX2CHAR = {index: symbol for symbol, index in CHAR2IDX.items()}

# Prefer the GPU when torch can see one.
DEVICE = 'cpu'
if torch.cuda.is_available():
    DEVICE = 'cuda'

class EnhancedTransformer(nn.Module):
    """Inference-side definition of the masked-letter Transformer.

    Embeds token ids, adds a learned positional embedding, runs a stack of
    encoder layers followed by a residual MLP, and projects every position
    to 26 letter logits.
    """

    def __init__(self, vocab_size=VOCAB_SIZE, d_model=512, nhead=16, num_layers=12, dim_ff=2048, dropout=0.2):
        """Create the layers; see the argument names for the size of each part."""
        super().__init__()
        # Token id 0 is the padding index.
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_embed = nn.Parameter(torch.randn(1, MAX_LEN, d_model))

        self.pre_norm = nn.LayerNorm(d_model)

        layer = nn.TransformerEncoderLayer(
            d_model,
            nhead,
            dim_feedforward=dim_ff,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers=num_layers)

        # Residual MLP on top of the encoder output.
        self.post_norm = nn.LayerNorm(d_model)
        self.fc1 = nn.Linear(d_model, d_model)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(d_model, d_model)
        self.final_norm = nn.LayerNorm(d_model)

        # One output logit per lowercase letter.
        self.lm_head = nn.Linear(d_model, 26)

    def forward(self, x):
        """Return 26 letter logits for every position of the token-id batch ``x``."""
        # Padding positions (id 0) are masked out as attention keys.
        is_padding = x.eq(0)

        token_vectors = self.embedding(x)
        position_vectors = self.pos_embed[:, :x.size(1), :]
        hidden = self.pre_norm(token_vectors + position_vectors)

        hidden = self.transformer(hidden, src_key_padding_mask=is_padding)
        hidden = self.post_norm(hidden)

        refined = self.fc2(self.act(self.fc1(hidden)))
        return self.lm_head(self.final_norm(hidden + refined))

# Process-wide cache so the checkpoint is read from disk only once.
_model = None
def load_model():
    """Return the trained model, loading it on first use.

    The network is built on ``DEVICE``, filled with the weights stored in
    ``masked_transformer_hangman_final.pth`` and switched to eval mode.
    Later calls return the same cached instance.

    Raises:
        Whatever ``torch.load`` / ``load_state_dict`` raise when the
        checkpoint is missing or does not match the architecture. Nothing is
        cached in that case, so a later call retries the load.
    """
    global _model
    if _model is not None:
        return _model

    # Build and load into a local first. The cache is only assigned after the
    # weights loaded successfully, so a failed load can never leave a
    # randomly initialised network behind for later callers.
    net = EnhancedTransformer().to(DEVICE)
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state = torch.load("masked_transformer_hangman_final.pth", map_location=DEVICE)
    net.load_state_dict(state)
    net.eval()

    _model = net
    return _model

def encode_input(word):
    """Turn a displayed hangman word into a ``(1, MAX_LEN)`` tensor of token ids.

    Every second character of ``word`` is kept (presumably the input is in
    a space-separated form such as ``"_ p p _ e "`` - confirm against the
    caller), remaining spaces are removed and the text is lower-cased.
    Characters missing from ``CHAR2IDX`` become 0. The result is cut or
    zero-padded to ``MAX_LEN`` and placed on ``DEVICE``.
    """
    symbols = word[::2].replace(' ', '').lower()
    ids = [CHAR2IDX.get(ch, 0) for ch in symbols][:MAX_LEN]
    ids.extend([0] * (MAX_LEN - len(ids)))
    batch = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    return batch.to(DEVICE)

def guess(self, word):
    """Pick the next letter to guess for the displayed hangman ``word``.

    The model scores all 26 letters at every blank position. The
    per-position probabilities are merged with weights proportional to
    ``1 / (rank + 1)``, where rank is the blank's order of appearance, and
    the highest-scoring letter not yet in ``self.guessed_letters`` is
    returned.

    Args:
        word: the displayed word; every second character is read as a letter
            or ``'_'`` (same convention as ``encode_input``).

    Returns:
        A single lowercase letter. Falls back to ``'e'`` only when all 26
        letters are already in ``self.guessed_letters``.
    """
    model = load_model()

    input_tensor = encode_input(word)
    with torch.no_grad():
        logits = model(input_tensor)

    # encode_input truncates to MAX_LEN, so blanks beyond the model's sequence
    # length have no logits; skip them instead of indexing out of range.
    seq_len = logits.size(1)
    mask_positions = [
        i for i, c in enumerate(word[::2]) if c == '_' and i < seq_len
    ]
    if not mask_positions:
        # No usable blank: fall back to the last position's prediction.
        mask_positions = [-1]

    position_probs = [
        torch.softmax(logits[0, pos, :], dim=0).cpu().numpy()
        for pos in mask_positions
    ]

    # Earlier blanks get larger weights; weights are normalised to sum to 1.
    weights = [1.0 / (i + 1) for i in range(len(position_probs))]
    weight_sum = sum(weights)
    weights = [w / weight_sum for w in weights]

    combined_probs = sum(w * p for w, p in zip(weights, position_probs))

    # Indices of the 26 letters ordered from most to least probable.
    letter_indices = combined_probs.argsort()[::-1]

    for idx in letter_indices:
        letter = string.ascii_lowercase[idx]
        if letter not in self.guessed_letters:
            return letter

    # The loop above visits every letter, so this is reached only when all
    # 26 letters have already been guessed.
    return 'e'
