In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
from tqdm import tqdm

# Load the training words, keeping only those with 3 to 12 characters.
# Each line is stripped once and the file is closed before any sampling.
with open("words_250000_train.txt", "r") as f:
    word_list = [w for w in (line.strip() for line in f) if 2 < len(w) <= 12]

# Down-sample to at most 20k words to limit memory use and training time.
# random.sample raises ValueError when asked for more items than the
# population holds, so the sample size is capped at the number of words
# that survived the length filter.
word_list = random.sample(word_list, min(20000, len(word_list)))

# Build the character vocabulary.
# Index 0 is reserved for padding, the characters seen in word_list take
# 1..N, and the mask symbol "_" takes N + 1.
# "_" is removed from the alphabet before numbering: if a word contained a
# literal underscore, re-assigning its entry below would not grow the dict,
# so the largest index would equal vocab_size and fall outside an embedding
# table of that size.
all_chars = sorted(set("".join(word_list)) - {"_"})
char_to_idx = {ch: i + 1 for i, ch in enumerate(all_chars)}
char_to_idx["_"] = len(char_to_idx) + 1  # MASK
idx_to_char = {i: ch for ch, i in char_to_idx.items()}
PAD_IDX = 0
vocab_size = len(char_to_idx) + 1  # +1 accounts for the padding index 0

# Dataset
class HangmanDataset(Dataset):
    """Single-blank hangman examples built from a list of words.

    For every non-empty word one character position is chosen at random when
    the dataset is constructed, and that character is replaced by the mask
    symbol ``"_"``. The choice is made once, so each epoch sees the same
    masked position for a given word.

    Args:
        words: iterable of strings to build examples from. Empty strings are
            skipped because they have no position to mask.
        max_len: length every encoded sequence is right-padded to. Words
            longer than ``max_len`` are neither padded nor truncated, so
            their encoded length exceeds ``max_len``.
    """

    def __init__(self, words, max_len=16):
        # Each entry is (masked_word, target_char, masked_position).
        self.data = []
        self.max_len = max_len
        for word in words:
            if not word:
                # random.randint(0, -1) would raise ValueError on "".
                continue
            idx = random.randint(0, len(word) - 1)
            target = word[idx]
            masked = list(word)
            masked[idx] = "_"
            self.data.append(("".join(masked), target, idx))

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_ids, target_id, position)`` tensors for one example.

        Characters missing from ``char_to_idx`` are encoded as ``PAD_IDX``.
        A target character missing from ``char_to_idx`` raises ``KeyError``.
        """
        masked, target, pos = self.data[idx]
        x = [char_to_idx.get(c, PAD_IDX) for c in masked]
        # A negative repeat count yields an empty list, so over-length
        # sequences pass through unpadded.
        x += [PAD_IDX] * (self.max_len - len(x))
        return torch.tensor(x), torch.tensor(char_to_idx[target]), torch.tensor(pos)

# Model
class BiLSTMHangman(nn.Module):
    """Bidirectional LSTM that scores every vocabulary entry at each position.

    Token ids are embedded, run through a stacked bidirectional LSTM, and
    projected by a linear layer to ``vocab_size`` logits per position.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output logits per position.
        embed_dim: size of each token embedding.
        hidden_dim: hidden size of each LSTM direction.
        layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim=32, hidden_dim=64, layers=2):
        super().__init__()
        # PAD_IDX is passed as the embedding's padding index.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=PAD_IDX,
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=layers,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated, so the projection reads
        # 2 * hidden_dim features.
        self.fc = nn.Linear(in_features=2 * hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return per-position logits for a batch of token-id sequences."""
        # nn.LSTM returns (output, (h_n, c_n)); only the output is used.
        hidden_states = self.lstm(self.embedding(x))[0]
        return self.fc(hidden_states)

# Training setup: pick the device, then build the model, data pipeline,
# optimizer and loss.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTMHangman(vocab_size).to(device)
dataset = HangmanDataset(word_list)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Run 8 passes over the data, reporting the mean batch loss after each pass.
for epoch in range(8):
    model.train()
    total_loss = 0
    for x, y, pos in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
        x = x.to(device)
        y = y.to(device)
        pos = pos.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        out = model(x)  # [batch, seq_len, vocab_size]
        # For each sample keep only the logits at its masked position.
        logits = out[torch.arange(x.size(0)), pos]
        loss = criterion(logits, y)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} loss: {total_loss / len(dataloader):.4f}")

# Persist the trained weights and the character/index mappings that are
# needed to encode inputs and decode predictions later.
torch.save(model.state_dict(), "bilstm_hangman.pth")
torch.save({"char_to_idx": char_to_idx, "idx_to_char": idx_to_char}, "bilstm_vocab.pth")

Epoch 1: 100%|██████████| 313/313 [00:09<00:00, 34.13it/s]


Epoch 1 loss: 2.6729


Epoch 2: 100%|██████████| 313/313 [00:07<00:00, 42.21it/s]


Epoch 2 loss: 2.3448


Epoch 3: 100%|██████████| 313/313 [00:06<00:00, 46.43it/s]


Epoch 3 loss: 2.2073


Epoch 4: 100%|██████████| 313/313 [00:06<00:00, 46.86it/s]


Epoch 4 loss: 2.1013


Epoch 5: 100%|██████████| 313/313 [00:06<00:00, 47.80it/s]


Epoch 5 loss: 2.0141


Epoch 6: 100%|██████████| 313/313 [00:06<00:00, 46.55it/s]


Epoch 6 loss: 1.9351


Epoch 7: 100%|██████████| 313/313 [00:06<00:00, 46.39it/s]


Epoch 7 loss: 1.8601


Epoch 8: 100%|██████████| 313/313 [00:06<00:00, 47.21it/s]

Epoch 8 loss: 1.7834



