### Step 1: Preprocessing

Load words_250000_train.txt.
Generate millions of (masked_word, correct_next_letter) pairs, where the target letter is one of the letters hidden by the mask.

### Step 2: Model

Input: 1D array of character indices for the masked word, zero-padded to a fixed length (MAX_LEN = 20); after the embedding layer its size is sequence_length × channels.
CNN Layers:
Embedding (characters → vectors)
Two 1D Conv layers (kernel size 3, each followed by ReLU)
Global Max Pool
Dense → softmax over 26 letters.

### Step 3: Training

Train on masked examples.
Loss: CrossEntropy.

### Step 4: Deployment

At every guess:
Pass masked word.
Predict letter probabilities.
Pick highest probability letter NOT YET guessed.

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import string
from tqdm import tqdm

# Constants
# Fixed sequence length used when encoding words for the model.
MAX_LEN = 20
# Token ids: 'a'..'z' map to 1..26 and the blank marker '_' maps to 27;
# id 0 is left free for padding.
CHAR2IDX = dict(zip(string.ascii_lowercase, range(1, 27)))
CHAR2IDX['_'] = 27
# Reverse lookup from token id back to its character.
IDX2CHAR = {token_id: char for char, token_id in CHAR2IDX.items()}
VOCAB_SIZE = 28  # 26 letters + _ + padding (0)
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model
class HangmanCNN(nn.Module):
    """Character-level CNN that scores the 26 letters for a masked word.

    Pipeline: embedding -> two ReLU-activated 1D convolutions -> global max
    pool over positions -> linear layer producing one logit per letter.
    """

    def __init__(self):
        super().__init__()
        # padding_idx=0 marks token id 0 as padding for the embedding table.
        self.embedding = nn.Embedding(VOCAB_SIZE, 32, padding_idx=0)
        # kernel_size=3 with padding=1 keeps the sequence length unchanged.
        self.conv1 = nn.Conv1d(32, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(128, 26)  # 26 letters output

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token ids to (batch, 26) logits."""
        embedded = self.embedding(x)           # (batch, seq_len, embed_dim)
        features = embedded.permute(0, 2, 1)   # (batch, embed_dim, seq_len)
        features = torch.relu(self.conv1(features))
        features = torch.relu(self.conv2(features))
        pooled = self.pool(features)           # (batch, 128, 1)
        return self.fc(pooled.squeeze(2))      # (batch, 26)

# Utility functions
def encode_word(word):
    """Translate each character of ``word`` into its integer token id.

    Characters that are missing from ``CHAR2IDX`` are encoded as 0.
    """
    lookup = CHAR2IDX.get
    return [lookup(symbol, 0) for symbol in word]

def pad_word(encoded):
    """Return a new list holding ``encoded`` fitted to ``MAX_LEN`` items.

    Sequences longer than ``MAX_LEN`` are cut off on the right; all others
    are right-padded with 0.
    """
    shortfall = MAX_LEN - len(encoded)
    if shortfall >= 0:
        return encoded + [0] * shortfall
    return encoded[:MAX_LEN]

def mask_word(word, mask_ratio=0.4):
    """Hide a random subset of positions in ``word`` behind ``'_'``.

    Args:
        word: The string to mask.
        mask_ratio: Fraction of positions to hide. At least one position is
            hidden for any non-empty word, and never more than ``len(word)``.

    Returns:
        A ``(masked, hidden_letters)`` tuple. ``masked`` is ``word`` with the
        chosen positions replaced by ``'_'``; ``hidden_letters`` lists the
        original characters at those positions, in the order they were drawn.
        An empty ``word`` yields ``("", [])``.
    """
    if not word:
        # Nothing to hide; random.sample would raise on an empty population.
        return "", []
    # Clamp so a mask_ratio above 1 cannot request more positions than exist.
    num_to_mask = min(len(word), max(1, int(len(word) * mask_ratio)))
    indices = random.sample(range(len(word)), num_to_mask)
    masked = list(word)
    for idx in indices:
        masked[idx] = '_'
    # NOTE(review): positions are drawn independently, so a repeated letter can
    # stay visible at one position while being hidden at another - confirm
    # that this is the intended training distribution.
    return "".join(masked), [word[i] for i in indices]  # returns masked word and original letters masked

# Load dictionary
# Read the word list, lower-casing every entry and dropping blank lines.
# The encoding is pinned so the result does not depend on the platform default.
with open("/Users/dhairya/cs projects/trexquant assignment/words_250000_train.txt", encoding="utf-8") as f:
    dictionary = [line.strip().lower() for line in f if line.strip()]

# Prepare Dataset
train_data = []
for word in dictionary:
    # Keep only pure a-z words that fit the model input. isalpha() alone also
    # accepts non-ASCII letters, which have no entry in CHAR2IDX.
    if not (word.isascii() and word.isalpha()) or len(word) > MAX_LEN:
        continue
    for _ in range(2):  # augment: 2 masked versions per word
        masked_word, hidden_letters = mask_word(word)
        # The encoded input is the same for every hidden letter of this masked
        # word, so it is built once and shared by the resulting samples.
        x = pad_word(encode_word(masked_word))
        for letter in hidden_letters:
            y = CHAR2IDX[letter]
            train_data.append((x, y))

print(f"Total training samples: {len(train_data)}")

# Dataloader
class HangmanDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-encoded ``(token_ids, label)`` pairs.

    The stored label is decremented by one when an item is fetched, so the
    returned target is a 0-based class index.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        token_ids, label = self.data[idx]
        features = torch.tensor(token_ids, dtype=torch.long)
        target = torch.tensor(label - 1, dtype=torch.long)  # 0-based
        return features, target

# Serve the samples in shuffled mini-batches of 512.
train_loader = torch.utils.data.DataLoader(HangmanDataset(train_data), batch_size=512, shuffle=True)

# Training
# Adam (learning rate 1e-3) minimising cross-entropy between the model's
# 26 logits and the 0-based index of the hidden letter.
model = HangmanCNN().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(10):
    model.train()
    running_loss = 0  # sum of per-batch losses, averaged in the epoch summary below
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}')
    for x_batch, y_batch in progress_bar:
        x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)
        optimizer.zero_grad()  # clear gradients left over from the previous step
        logits = model(x_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
    print(f"Epoch {epoch+1}: Loss {running_loss/len(train_loader):.4f}")

# The weights are written only here, after the last epoch; interrupting the
# loop above leaves no checkpoint on disk.
torch.save(model.state_dict(), "hangman_cnn.pth")
print("Model saved as hangman_cnn.pth")


Total training samples: 1514082


Epoch 1:   1%|          | 31/2958 [00:02<03:38, 13.40it/s, loss=2.8796]


KeyboardInterrupt: 

In [10]:
import torch
import torch.nn as nn
import string
import re
import collections

# Constants for model
# Fixed sequence length used when encoding words for the model.
MAX_LEN = 20
# Token ids: 'a'..'z' map to 1..26 and the blank marker '_' maps to 27;
# id 0 is left free for padding.
CHAR2IDX = dict(zip(string.ascii_lowercase, range(1, 27)))
CHAR2IDX['_'] = 27
# Reverse lookup from token id back to its character.
IDX2CHAR = {token_id: char for char, token_id in CHAR2IDX.items()}
VOCAB_SIZE = 28  # 26 letters + '_' + padding id 0
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model class
class HangmanCNN(nn.Module):
    """CNN used at inference time to score the 26 letters for a masked word.

    Layer names and sizes must match the checkpoint that is loaded into it:
    embedding -> conv1 -> conv2 -> global max pool -> fc.
    """

    def __init__(self):
        super().__init__()
        # padding_idx=0 marks token id 0 as padding for the embedding table.
        self.embedding = nn.Embedding(VOCAB_SIZE, 32, padding_idx=0)
        # kernel_size=3 with padding=1 keeps the sequence length unchanged.
        self.conv1 = nn.Conv1d(32, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(128, 26)  # one logit per letter a-z

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token ids to (batch, 26) logits."""
        # Conv1d expects channels before the sequence axis.
        hidden = self.embedding(x).permute(0, 2, 1)
        for conv in (self.conv1, self.conv2):
            hidden = torch.relu(conv(hidden))
        pooled = self.pool(hidden).squeeze(2)  # (batch, 128)
        return self.fc(pooled)

# Lazy load model
_model = None
def load_model():
    """Return the shared ``HangmanCNN``, loading its weights on first use.

    The network is built on ``DEVICE``, populated from ``hangman_cnn.pth``,
    switched to eval mode and cached in the module-level ``_model``; later
    calls return the cached instance.

    Raises:
        Whatever ``torch.load`` / ``load_state_dict`` raise when the
        checkpoint is missing or does not match the architecture. In that
        case nothing is cached, so a later call retries the load.
    """
    global _model
    if _model is None:
        # Build into a local first: if loading fails, the cache must stay
        # empty instead of holding a randomly initialised, train-mode network
        # that every later call would silently return.
        candidate = HangmanCNN().to(DEVICE)
        # NOTE(review): torch.load unpickles the file - only load checkpoints
        # from a trusted source (consider weights_only=True where the
        # installed torch version supports it).
        candidate.load_state_dict(torch.load("hangman_cnn.pth", map_location=DEVICE))
        candidate.eval()
        _model = candidate
    return _model

# Helper to encode input
def encode_input(word):
    """Turn a spaced board string into a ``(1, MAX_LEN)`` tensor of token ids.

    Every second character of ``word`` is kept (e.g. ``"_ p p _ e "`` becomes
    ``"_pp_e"``), remaining spaces are removed and the text is lower-cased.
    Characters missing from ``CHAR2IDX`` are encoded as 0. The id sequence is
    truncated or right-padded with 0 to ``MAX_LEN`` and moved to ``DEVICE``.
    """
    letters = word[::2].replace(' ', '').lower()
    token_ids = [CHAR2IDX.get(ch, 0) for ch in letters][:MAX_LEN]
    # A no-op when the sequence already has MAX_LEN items.
    token_ids.extend([0] * (MAX_LEN - len(token_ids)))
    batch = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
    return batch.to(DEVICE)

# 🚀 THE REPLACED FUNCTION
def guess(self, word):  # word example: "_ p p _ e "
    """Return the model's most probable letter that has not been guessed yet.

    Args:
        word: Current board as a spaced string, e.g. ``"_ p p _ e "``.

    Returns:
        A single lowercase letter that is not in ``self.guessed_letters``,
        or ``'e'`` when all 26 letters have already been guessed.
    """
    model = load_model()

    input_tensor = encode_input(word)  # (1, MAX_LEN)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = model(input_tensor)   # (1, 26)
        probs = torch.softmax(logits, dim=1).cpu().numpy()[0]

    # Walk the letters from most to least probable and take the first one
    # that is still available. This visits all 26 letters, so no second
    # alphabetical scan is needed afterwards.
    for idx in probs.argsort()[::-1]:
        letter = string.ascii_lowercase[idx]
        if letter not in self.guessed_letters:
            return letter

    # Reached only when every letter has already been guessed.
    return 'e'  # Emergency fallback