<a href="https://colab.research.google.com/github/cs-amy/project-codebase/blob/main/Word_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CNN Sliding-Window Model for 3-Letter Word De-Obfuscation**
Stage 2 of MSc Project — Ashraf Muhammed Yusuf

# **1. Colab Environment Setup**

In [None]:
# Install dependencies
# -q keeps pip's output short. Only tensorflow and matplotlib are installed
# here; the import cell also needs scikit-learn, Pillow and tqdm — presumably
# pre-installed in the Colab image (verify when running elsewhere).
!pip install -q tensorflow matplotlib

In [2]:
# Import dependencies
import os, sys, random, itertools, pathlib, math, shutil
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from pathlib import Path
from glob import glob
from tensorflow.keras import mixed_precision
from google.colab import drive
from tensorflow.keras.callbacks import (ModelCheckpoint, EarlyStopping, ReduceLROnPlateau)
from sklearn.metrics import classification_report, confusion_matrix
from collections import defaultdict
from PIL import Image, ImageDraw, ImageFont
from tqdm import tqdm
from typing import Tuple

In [None]:
# 1.3 Mount Drive
# Mount Google Drive under /content/drive so the notebook can read datasets
# and write checkpoints. (The project base path itself is defined with the
# other constants in the data-generation section, not here.)
# Link to Drive:
# https://drive.google.com/drive/folders/1sfNG1PkmTPBe1wOSQXZmfdkvR97Hn9lk?usp=sharing
drive.mount('/content/drive')

# **2. Data Generation**
This block generates the 'three-letter words' dataset afresh if you do not already have it (you can access it here: https://drive.google.com/drive/folders/1kygA17GiCeCs8qTeDBEndU6TkXnEu-m7?usp=drive_link). It synthesizes images of three-letter words, building on the character dataset (https://drive.google.com/drive/folders/1eUaTNW8zVjTArg0JszbCdCEq0tTdx89n?usp=drive_link).

In [6]:
# paths & constants

# --- Locations on the mounted Drive -----------------------------------------
BASE_PATH = Path("/content/drive/MyDrive/MScProject")
GLYPH_DIR = BASE_PATH / "data" / "characters" / "train"
DATA_ROOT = BASE_PATH / "data" / "words3"
train_dir = DATA_ROOT / "train"
test_dir = DATA_ROOT / "test"
# NOTE: despite its name this is the path of the checkpoint *file* (a str).
CKPT_DIR = f"{BASE_PATH}/words3_ckpt_best.keras"

# --- Image geometry ---------------------------------------------------------
IMG_H = IMG_W = 64
IMG_SHAPE = (IMG_H, IMG_W)
PATCH_W = IMG_W // 3 # 21 when IMG_W = 64

# --- Dataset size / split ---------------------------------------------------
EXPECTED_CLASSES = 26**3 # 26³ = 17,576
N_VARIANTS = 4 # number of images per class
FRACTION = 0.15 # 15 %
# NOTE(review): the two values below are not referenced by the cells shown in
# this notebook; kept in case other code relies on them.
VARIANTS_PER = 5 # per word
FINAL_TEST_FRAC = 0.20

# --- Training / reproducibility ---------------------------------------------
BATCH = 128
SEED = 42

random.seed(SEED)

In [None]:
"""
- Generates a single ‘train/’ directory with 17 576 class folders (AAA … ZZZ)
- Each class contains N_VARIANTS PNG images rendered on-the-fly (no external glyph reuse)
- Obfuscation applied per-character (leet + homoglyph + random spacing jitter)
- Idempotent: if the train folder already has 17 576 classes it exits immediately
"""

# Font used to draw every glyph: DejaVu Sans Bold at size 40, falling back to
# PIL's built-in bitmap font when the TTF file cannot be opened.
FONT_PATH = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
try:
  FONT = ImageFont.truetype(FONT_PATH, 40)
except (IOError, OSError):
  print("DejaVuSans not found; using PIL default bitmap font.")
  FONT = ImageFont.load_default()

# Fast exit: stop the cell when train/ already holds one folder per class.
if train_dir.exists():
  n_class_dirs = sum(1 for entry in train_dir.iterdir() if entry.is_dir())
  if n_class_dirs == EXPECTED_CLASSES:
    print("words3/train already complete – nothing to do.")
    sys.exit(0)

# mapping tables (uppercase only)
#
# LEET: for each plain A–Z letter, the list of substitute glyphs (digits,
# symbols, accented Latin, Greek and Cyrillic look-alikes) that
# obfuscate_char() picks from uniformly in "leet" mode.
# NOTE(review): some substitutes appear under more than one letter (e.g. the
# same-looking glyph is listed for both 'F' and 'Q', 'H' and 'N', 'U' and 'Y'),
# so an image of that glyph alone does not identify the class — confirm this
# label ambiguity is intended.
# NOTE(review): the second entry for 'V' looks like a plain 'V'; if it is, leet
# mode is a no-op for V half of the time — verify the code point.
LEET = {
  'A': ['Α', '4', 'Д', 'Ä', 'Á', 'À', 'Â', '@', 'Δ'],
  'B': ['8', 'β', 'Β', 'В'],
  'C': ['Ç', 'Ć', 'Č', 'С'],
  'D': ['Ð', 'Ď'],
  'E': ['3', 'Σ', 'Έ', 'Ε', 'Е', 'Ë', 'É', 'È', 'Ê'],
  'F': ['Φ', 'Ƒ'],
  'G': ['6', 'Ğ', 'Ģ', 'Γ'],
  'H': ['Η', 'Н'],
  'I': ['1', '|', 'Í', 'Ì', 'Î', 'Ï', 'И'],
  'J': ['Ј'],
  'K': ['Κ', 'К'],
  'L': ['Ι', 'Ł', 'Ĺ', 'Л'],
  'M': ['Μ', 'М'],
  'N': ['Ν', 'Ń', 'Ñ', 'Н'],
  'O': ['0', 'Θ', 'Ο', 'Ө', 'Ø', 'Ö', 'Ó', 'Ò', 'Ô'],
  'P': ['Ρ', 'Р'],
  'Q': ['Φ'],
  'R': ['®', 'Я', 'Ř', 'Ŕ'],
  'S': ['5', '$', 'Ѕ', 'Ś', 'Š'],
  'T': ['Τ', 'Т'],
  'U': ['Υ', 'Ц', 'Ü', 'Ú', 'Ù', 'Û'],
  'V': ['Ѵ', 'V'],
  'W': ['Ω', 'Ѡ', 'Ψ', 'Ш', 'Щ'],
  'X': ['Χ', 'Ж', 'Х'],
  'Y': ['Υ', 'Ү', 'Ý', 'Ÿ'],
  'Z': ['Ζ', 'Ż', 'Ź', 'Ž', 'З', '2']
}
# HOMO: one fixed homoglyph per letter, used by obfuscate_char() in "homo"
# mode. Only the 13 letters listed here have an entry; every other letter is
# returned unchanged in that mode.
HOMO = {
  'A':'Α',
  'B':'Β',
  'C':'С',
  'E':'Ε',
  'H':'Н',
  'K':'Κ',
  'M':'Μ',
  'O':'О',
  'P':'Р',
  'T':'Τ',
  'X':'Χ',
  'Y':'Υ',
  'Z':'Ζ'
}

def obfuscate_char(ch: str) -> str:
  """Return `ch` itself or a randomly chosen look-alike.

  A mode is drawn first: "plain" (weight 0.5), "leet" (0.4) or "homo" (0.1).
  "leet" picks a random entry of LEET[ch]; "homo" returns HOMO[ch]. When the
  drawn mode has no table entry for `ch`, the character is returned as is.
  """
  (mode,) = random.choices(("plain", "leet", "homo"), weights=(0.5, 0.4, 0.1))

  if mode == "leet":
    substitutes = LEET.get(ch)
    if substitutes is not None:
      return random.choice(substitutes)
  elif mode == "homo":
    return HOMO.get(ch, ch)

  return ch

def render_patch(ch: str) -> Image.Image:
  """Draw one (possibly obfuscated) character on a white greyscale tile.

  The tile is PATCH_W pixels wide and IMG_H pixels tall (mode "L"); the glyph
  is drawn in black with FONT, anchored at (4, 4).
  NOTE(review): FONT is loaded at size 40 while the tile is only PATCH_W wide,
  so wide glyphs are presumably clipped at the right edge — confirm.
  """
  glyph_text = obfuscate_char(ch)
  tile = Image.new("L", (PATCH_W, IMG_H), color=255)
  ImageDraw.Draw(tile).text((4, 4), glyph_text, fill=0, font=FONT)
  return tile

def stitch_word(word: str, out_file: Path):
  """Render `word` as one IMG_W × IMG_H greyscale image and save it.

  Each character is drawn by render_patch() (which applies the random
  obfuscation) and pasted side by side, PATCH_W pixels apart, on a white
  canvas. With probability 0.3 the finished canvas is then shifted
  horizontally by a random offset in [-2, 2] pixels.

  Args:
      word: the class label to draw, one patch per character.
      out_file: destination path handed to PIL's Image.save().
  """
  canvas = Image.new("L", (IMG_W, IMG_H), color=255)
  for idx, ch in enumerate(word):
    canvas.paste(render_patch(ch), (idx * PATCH_W, 0))

  # light horizontal jitter
  if random.random() < 0.3:
    dx = random.randint(-2, 2)
    # fillcolor=255: the columns uncovered by the shift are otherwise filled
    # with PIL's default (black), which paints a dark bar on the white page.
    canvas = canvas.transform(
      canvas.size, Image.AFFINE, (1, 0, dx, 0, 1, 0), fillcolor=255
    )
  canvas.save(out_file)

# Start from an empty train/ directory: remove whatever a previous (partial)
# run left behind, then recreate the folder.
if train_dir.exists():
  shutil.rmtree(train_dir)
train_dir.mkdir(parents=True, exist_ok=True)

# Every 3-letter combination AAA … ZZZ, in lexicographic order.
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
all_words = list(map("".join, itertools.product(alphabet, repeat=3)))

# One folder per word, N_VARIANTS independently obfuscated renderings in each.
for word in tqdm(all_words, desc="Generating train"):
  class_folder = train_dir / word
  class_folder.mkdir(parents=True, exist_ok=True)
  for variant in range(N_VARIANTS):
    stitch_word(word, class_folder / f"{word}_{variant}.png")

print("✓ Training set complete.")

In [None]:
"""
- Make a permanent 15 % test split on Drive
- Assumes you have a single words3/train/AAA … ZZZ/*.png structure already.
- Creates /words3/test/AAA … ZZZ/ and MOVES files (no duplication).
- Safe to rerun – will skip classes already processed.
"""

# fast-guard: already split?
if test_dir.exists() and sum(1 for d in test_dir.iterdir() if d.is_dir()) == EXPECTED_CLASSES:
  print("words3/test already holds all", EXPECTED_CLASSES, "class folders – nothing to do.")
  sys.exit(0)

test_dir.mkdir(parents=True, exist_ok=True)

# split loop
# `tqdm` is imported as `from tqdm import tqdm`, so it is called directly
# (`tqdm.tqdm(...)` raises AttributeError with that import).
# Sorting the class folders and the image lists makes the seeded shuffle
# reproducible regardless of the order the filesystem lists entries in.
class_dirs = sorted(d for d in train_dir.iterdir() if d.is_dir())
for cls_dir in tqdm(class_dirs, desc="Creating 15 % test split"):
  tgt_cls = test_dir / cls_dir.name
  tgt_cls.mkdir(parents=True, exist_ok=True)

  # Resume safety: a class that already has test images was handled by an
  # earlier (interrupted) run — moving again would shrink train/ further.
  if any(tgt_cls.glob("*.png")):
    continue

  imgs = sorted(cls_dir.glob("*.png"))
  if not imgs: # nothing left in train/ for this class
    continue

  # number to move: FRACTION rounded down, but at least 1, and always leave
  # at least 1 image in train/. For small classes the "at least 1" rule moves
  # more than 15 % (e.g. 1 of 4 images).
  n_move = max(1, math.floor(len(imgs) * FRACTION))
  n_move = min(n_move, len(imgs) - 1)

  random.shuffle(imgs)
  for img in imgs[:n_move]:
    shutil.move(str(img), str(tgt_cls / img.name))

print("Test split ready.")
print("Train images:", sum(1 for _ in train_dir.rglob("*.png")))
print("Test images:", sum(1 for _ in test_dir.rglob("*.png")))

# **3. Load & Freeze the Single-Char Model**

In [14]:
# Load the single-character CNN checkpoint from Drive and mark the whole model
# non-trainable so it initially acts as a fixed feature extractor.
char_ckpt_path = f"{BASE_PATH}/char_cnn_ckpt_best.keras"
base_model = tf.keras.models.load_model(char_ckpt_path)
base_model.trainable = False # freeze weights initially
n_base_params = base_model.count_params()
print("Base model frozen — params:", n_base_params)

Base model frozen — params: 2455450


# **4. Data Loading & Splitting**

In [None]:
# Options shared by all three splits: one-hot labels inferred from the folder
# names, greyscale images resized to IMG_SHAPE, batches of BATCH images.
_common_ds_args = dict(
  labels="inferred",
  label_mode="categorical",
  batch_size=BATCH,
  image_size=IMG_SHAPE,
  color_mode="grayscale",
)
# Train and validation are carved out of the same folder; an identical
# fraction and seed in both calls keeps the two subsets disjoint.
_holdout_args = dict(validation_split=0.2, seed=42)

_load_images = tf.keras.preprocessing.image_dataset_from_directory

# Train dataset
train_ds = _load_images(train_dir, subset="training", **_common_ds_args, **_holdout_args)

# Val dataset
val_ds = _load_images(train_dir, subset="validation", **_common_ds_args, **_holdout_args)

# Test dataset (kept in directory order so predictions can be matched to files)
test_ds = _load_images(test_dir, shuffle=False, **_common_ds_args)

# **5. Visual Sanity Check**

In [None]:
# Utility to display examples from each set
def show_examples(ds, ds_name, num=5, class_names=None):
  """Plot up to `num` images from the first batch of `ds` with their labels.

  Args:
      ds: batched dataset yielding (images, one-hot labels) pairs.
      ds_name: text shown before the class name in each title.
      num: maximum number of images to draw; clipped to the batch size.
      class_names: label names indexed by class id. Defaults to
          `ds.class_names`, so it must be passed for datasets that do not
          carry that attribute (AttributeError otherwise).
  """
  if class_names is None:
    class_names = ds.class_names

  # Take one batch; an empty dataset yields nothing to draw.
  first_batch = next(iter(ds.take(1)), None)
  if first_batch is None:
    print(f"{ds_name}: dataset is empty – nothing to show.")
    return
  images, labels = (part.numpy() for part in first_batch)

  count = min(num, len(images))
  if count <= 0:
    return

  # Three tiles per row, as many rows as needed (2×2 inches per tile).
  cols = 3
  rows = (count + cols - 1) // cols
  plt.figure(figsize=(6, 2 * rows))
  for i in range(count):
    plt.subplot(rows, cols, i + 1)
    img = images[i].squeeze()  # shape: (H,W) since grayscale
    lbl = class_names[labels[i].argmax()]
    plt.imshow(img, cmap='gray')
    plt.title(f"{ds_name}: {lbl}")
    plt.axis('off')
  plt.tight_layout()
  plt.show()

# Display 5 examples from each split
for split_ds, split_name in ((train_ds, "Train"), (val_ds, "Val"), (test_ds, "Test")):
  show_examples(split_ds, split_name)

# **6. Build the Sliding-Window Model**

In [None]:
def extract_patch(x, idx):
  """Return the idx-th PATCH_W-wide vertical strip of a batch of images.

  `x` is indexed as (batch, height, width, channels). The model below uses
  the equivalent built-in Cropping2D layer; this helper is kept for code
  that slices tensors directly.
  """
  start = idx * PATCH_W
  return x[:, :, start:start+PATCH_W, :] # (None, 64, 21, 1)


inputs = tf.keras.Input(shape=(IMG_H, IMG_W, 1))
logits = []

for i in range(3):
  # Crop columns [i*PATCH_W, (i+1)*PATCH_W). Cropping2D is used instead of a
  # Lambda wrapping a Python lambda so that the saved .keras checkpoint can be
  # reloaded with a plain load_model() call.
  left = i * PATCH_W
  right = IMG_W - left - PATCH_W
  patch = tf.keras.layers.Cropping2D(cropping=((0, 0), (left, right)))(inputs)
  patch = tf.keras.layers.Resizing(IMG_H, IMG_H)(patch) # -> (64 x 64 x 1)
  # Re-use base_model (shared weights) on every patch
  logits.append(base_model(patch)) # one per-character output vector per patch

concat = tf.keras.layers.Concatenate()(logits) # three base outputs side by side
# Hidden layer #1
h1 = tf.keras.layers.Dense(256, activation='relu')(concat)
h1 = tf.keras.layers.BatchNormalization()(h1)
h1 = tf.keras.layers.Dropout(0.5)(h1)
# Hidden layer #2
h1 = tf.keras.layers.Dense(256, activation='relu')(h1)
h1 = tf.keras.layers.Dropout(0.5)(h1)
outputs = tf.keras.layers.Dense(EXPECTED_CLASSES, activation='softmax')(h1)
word_model = tf.keras.Model(inputs, outputs)

# (Optional: if accuracy is not great)
# Train only the last N layers of base_model, keep the rest frozen.
N = 1
# base_model was frozen as a whole when it was loaded. Flipping single child
# layers back on underneath a non-trainable parent is not reliably honoured
# across Keras versions, so use the documented pattern instead: make the
# parent trainable, then freeze everything except the last N layers.
base_model.trainable = True
for layer in base_model.layers[:-N]:
  layer.trainable = False

# Compile model (must happen after changing `trainable` for it to take effect)
word_model.compile(
  optimizer=tf.keras.optimizers.Adam(1e-3),
  loss='categorical_crossentropy',
  metrics=['accuracy']
)

# Print model summary
word_model.summary()

# **7. Callbacks**

In [None]:
# 1. Checkpoint: overwrite the file at CKPT_DIR whenever val_loss improves
checkpoint_cb = ModelCheckpoint(CKPT_DIR, save_best_only=True, monitor='val_loss')
# 2. Early stopping: give up after 4 epochs without val_loss improvement and
#    roll the model back to its best weights
early_stop_cb = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
# 3. Learning rate scheduler: halve the LR after 2 stagnant epochs, floor 1e-6
lr_plateau_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6)

callbacks = [checkpoint_cb, early_stop_cb, lr_plateau_cb]

# **8. Train**

In [None]:
# Stage 1: train for at most 20 epochs, validating on val_ds after each one;
# the callbacks handle checkpointing, early stopping and LR decay.
stage1_fit_args = dict(validation_data=val_ds, epochs=20, callbacks=callbacks)
history = word_model.fit(train_ds, **stage1_fit_args)

In [None]:
# Re-compile with lower LR
word_model.compile(
  optimizer=tf.keras.optimizers.Adam(1e-4),
  loss='categorical_crossentropy',
  metrics=['accuracy']
)

# Continue the epoch numbering where stage 1 stopped. Keras trains until the
# epoch *index* `epochs` is reached, so the end point is start + count; the
# previous `+ 1` / `+ 5` pair ran only 4 fine-tuning epochs.
FINE_TUNE_EPOCHS = 5
ft_start = history.epoch[-1] + 1 if history.epoch else 0

ft_history = word_model.fit(
  train_ds,
  validation_data=val_ds,
  initial_epoch=ft_start,
  epochs=ft_start + FINE_TUNE_EPOCHS,
  callbacks=callbacks
)

# **9. Evaluation**

In [None]:
# Swap the in-memory model for the best checkpoint written during training,
# then score it on the held-out test split.
word_model = tf.keras.models.load_model(CKPT_DIR) # best checkpoint
test_metrics = word_model.evaluate(test_ds)
test_loss, test_acc = test_metrics
print(f"Test accuracy: {test_acc:.4f}")

# Util for plotting confusion matrix
def plot_confusion_matrix(cm, class_names, title="Confusion Matrix", max_annotated=50):
  """
  Draw a confusion matrix as a heat-map.

  Args:
      cm (np.ndarray): square confusion matrix
      class_names (List[str]): labels in the same order used to build cm
      title (str): figure title
      max_annotated (int): tick labels and per-cell counts are drawn only
          when there are at most this many classes; beyond that one text
          object per cell is unreadable and prohibitively slow, so only the
          heat-map itself is shown
  """
  cm = np.asarray(cm)
  n_classes = len(class_names)

  fig, ax = plt.subplots(figsize=(10, 9))
  im = ax.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
  ax.figure.colorbar(im, ax=ax, fraction=0.045)
  ax.set(ylabel="True label", xlabel="Predicted label", title=title)

  if n_classes <= max_annotated:
    # axes & ticks
    ax.set(
      xticks=np.arange(n_classes),
      yticks=np.arange(n_classes),
      xticklabels=class_names,
      yticklabels=class_names,
    )
    plt.setp(ax.get_xticklabels(), rotation=90, ha="center", va="center")

    # annotate cells; "d" only works for integer counts, so normalised
    # (float) matrices fall back to two decimals
    cell_fmt = "d" if np.issubdtype(cm.dtype, np.integer) else ".2f"
    thresh = cm.max() / 2.0 if cm.size else 0.0
    for i in range(cm.shape[0]):
      for j in range(cm.shape[1]):
        ax.text(
          j, i, format(cm[i, j], cell_fmt),
          ha="center", va="center",
          color="white" if cm[i, j] > thresh else "black",
          fontsize=8
        )

  fig.tight_layout()
  plt.show()

# Classification report
# Collect predictions batch by batch. test_ds is built with shuffle=False, so
# position i of y_true / y_pred refers to the i-th test image.
class_names = train_ds.class_names
y_pred, y_true = [], []
for x, y in test_ds:
  # predict_on_batch runs the single batch directly, without the per-call
  # progress bar and setup that predict() adds on every loop iteration.
  batch_probs = word_model.predict_on_batch(x)
  y_pred.extend(np.argmax(batch_probs, axis=1))
  y_true.extend(np.argmax(y.numpy(), axis=1))

# Passing `labels` keeps target_names aligned with the class ids even when a
# class is missing from both the ground truth and the predictions.
all_labels = np.arange(len(class_names))
print(classification_report(
  y_true, y_pred,
  labels=all_labels, target_names=class_names, zero_division=0
))

# Confusion matrix heat-map (optional)
# A matrix over all classes would have len(class_names)² cells (17,576² here),
# too large to hold or plot, so only the classes involved in the most errors
# (as true or as predicted label) are shown.
MAX_CM_CLASSES = 20
y_true_arr = np.asarray(y_true)
y_pred_arr = np.asarray(y_pred)
is_wrong = y_true_arr != y_pred_arr
involved, n_errors = np.unique(
  np.concatenate([y_true_arr[is_wrong], y_pred_arr[is_wrong]]),
  return_counts=True
)
worst = np.sort(involved[np.argsort(-n_errors, kind="stable")[:MAX_CM_CLASSES]])
if worst.size == 0: # no errors at all – fall back to the first few classes
  worst = all_labels[:MAX_CM_CLASSES]

cm = confusion_matrix(y_true_arr, y_pred_arr, labels=worst)
plot_confusion_matrix(
  cm, [class_names[i] for i in worst],
  title="3-Letter Word Confusion Matrix"
)

# **10. Qualitative Error Analysis**

In [None]:
# Plot a few misclassified 3-letter words
mis_idx = [i for i, (t, p) in enumerate(zip(y_true, y_pred)) if t != p]

if not mis_idx:
  print("No misclassified test images to show.")
else:
  # An unbatched, skipped dataset carries no class_names and yields single
  # images, so the examples are gathered and drawn here directly.
  shown_idx = mis_idx[:5]
  wanted = set(shown_idx)
  names = train_ds.class_names

  # One pass over the test images, stopping at the last index needed
  # (mis_idx is in ascending order).
  picked = {}
  for pos, (img, _) in enumerate(test_ds.unbatch()):
    if pos in wanted:
      picked[pos] = img.numpy().squeeze()
    if pos >= shown_idx[-1]:
      break

  fig, axes = plt.subplots(
    1, len(shown_idx), figsize=(3 * len(shown_idx), 3), squeeze=False
  )
  for ax, pos in zip(axes[0], shown_idx):
    ax.imshow(picked[pos], cmap='gray')
    # title reads "true label → predicted label"
    ax.set_title(f"Misclassified example: {names[y_true[pos]]} → {names[y_pred[pos]]}")
    ax.axis('off')
  fig.tight_layout()
  plt.show()