<a href="https://colab.research.google.com/github/cs-amy/project-codebase/blob/main/notebooks/Tesseract_OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Tesseract OCR**
Part of MSc Project - Ashraf Muhammed Yusuf

In [None]:
# Install / import dependencies
!apt-get update && apt-get install -y tesseract-ocr
!pip install -q pytesseract pillow jiwer

import os
import glob
import pytesseract
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from google.colab import drive
from PIL import Image
from jiwer import wer, cer
from tqdm import tqdm
from collections import Counter

# Mount Google Drive so the notebook can reach the dataset folders stored
# under /content/drive/MyDrive.
# Link to the Drive folder:
# https://drive.google.com/drive/folders/1sfNG1PkmTPBe1wOSQXZmfdkvR97Hn9lk?usp=sharing
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

# **Tesseract over Word Dataset**

In [None]:
# Configuration: test/ dir. Every sub-folder name is used as the ground-truth
# word for the PNG images stored inside it.
TEST_DIR = "/content/drive/MyDrive/MScProject/data/words3/test"

# Set up Tesseract: LSTM engine (--oem 1), treat each image as a single text
# line (--psm 7), and only allow uppercase A–Z in the output.
tess_config = r"--oem 1 --psm 7 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Run OCR over every image, collect predictions & ground truth.
# gt_labels[i] and pred_labels[i] always refer to the same image.
gt_labels = []
pred_labels = []

for cls in sorted(os.listdir(TEST_DIR)):
    cls_path = os.path.join(TEST_DIR, cls)
    if not os.path.isdir(cls_path):
        # skip stray files that sit next to the class folders
        continue
    # glob returns files in arbitrary order; sorting keeps the log and the
    # collected lists reproducible from run to run
    for img_path in sorted(glob.glob(os.path.join(cls_path, "*.png"))):
        # ground truth is the folder name
        gt = cls
        # load image as grayscale; the context manager closes the underlying
        # file as soon as the converted copy has been made
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("L")
        # optional thresholding:
        # img = img.point(lambda x: 0 if x<128 else 255, '1')
        pred = pytesseract.image_to_string(img, config=tess_config)
        # drop surrounding whitespace/newlines and normalise the case
        pred = pred.strip().upper()

        gt_labels.append(gt)
        pred_labels.append(pred)
        print(f"Label: {gt} → Pred: {pred}")

# Word-level evaluation of the collected predictions.
# Fail with an explicit message when nothing was evaluated; otherwise the
# means below would silently come out as NaN.
if not gt_labels:
    raise RuntimeError(
        "No word images were evaluated; check that TEST_DIR contains "
        "class sub-folders with *.png files."
    )

# Pair every ground-truth word with its prediction once and reuse the pairs.
word_pairs = list(zip(gt_labels, pred_labels))

# Exact-match accuracy
exact_acc = np.mean([pred == truth for truth, pred in word_pairs])
print(f"Exact match accuracy: {exact_acc:.4%}")

# Average character-error rate (CER) and word-error rate (WER).
# Both are computed per image (ground truth first, prediction second) and
# then averaged, so every image carries the same weight.
avg_cer = np.mean([cer(truth, pred) for truth, pred in word_pairs])
avg_wer = np.mean([wer(truth, pred) for truth, pred in word_pairs])
print(f"Mean CER: {avg_cer:.4f}")
print(f"Mean WER: {avg_wer:.4f}")

# **Tesseract over Character Dataset**

In [None]:
# Test directory: every sub-folder name is used as the ground-truth character
# for the PNG images stored inside it.
CHAR_TEST_DIR = "/content/drive/MyDrive/MScProject/data/characters/test"

# Tell pytesseract to treat each image as a single character, restrict to A–Z
TESSERACT_CONFIG = r"--psm 10 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Run through the dataset; y_true[i] and y_pred[i] refer to the same image.
y_true, y_pred = [], []
for true_char in sorted(os.listdir(CHAR_TEST_DIR)):
    char_dir = Path(CHAR_TEST_DIR) / true_char
    if not char_dir.is_dir():
        # skip stray files that sit next to the class folders
        continue
    # Path.glob yields files in arbitrary order; sorting keeps the log and the
    # collected lists reproducible from run to run
    for img_path in sorted(char_dir.glob("*.png")):
        # load as grayscale; the context manager closes the underlying file
        # as soon as the converted copy has been made
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("L")
        # optional: binarize if your glyphs need thresholding:
        # img = img.point(lambda x: 0 if x<128 else 255, mode='1')
        txt = pytesseract.image_to_string(img, config=TESSERACT_CONFIG)
        # take first character only (in case of noise); slicing yields ""
        # when Tesseract returned no text at all
        pred = txt.strip().upper()[:1]
        y_true.append(true_char)
        y_pred.append(pred)
        print(f"Label: {true_char} → Pred: {pred}")

# Compute accuracy: share of images whose predicted character equals the label.
total = len(y_true)
if total == 0:
    # Without this guard the division below fails with a bare ZeroDivisionError.
    raise RuntimeError(
        "No character images were evaluated; check that CHAR_TEST_DIR "
        "contains class sub-folders with *.png files."
    )
correct = sum(t == p for t, p in zip(y_true, y_pred))
acc = correct / total
print(f"Character‐level Tesseract Accuracy: {acc*100:5.2f}%  ({correct}/{total})")

# Build & plot a confusion matrix (rows: ground truth, columns: prediction).
labels = sorted(set(y_true))
idx = {c: i for i, c in enumerate(labels)}
cm = np.zeros((len(labels), len(labels)), dtype=int)

# Predictions that are not one of the ground-truth classes (for example the
# empty string stored when Tesseract returns no text) are tallied per true
# class, so they appear in the plot as a "?" column instead of vanishing.
UNKNOWN_LABEL = "?"
unknown_counts = np.zeros(len(labels), dtype=int)

for t, p in zip(y_true, y_pred):
    i = idx[t]
    j = idx.get(p)
    if j is None:
        unknown_counts[i] += 1
    else:
        cm[i, j] += 1

# `cm` stays square (known classes only); the extra column exists only in the
# matrix handed to the plot.
cm_display = np.column_stack([cm, unknown_counts])

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_display,
    xticklabels=labels + [UNKNOWN_LABEL],
    yticklabels=labels,
    cmap="Blues",
    ax=ax,
)
ax.set_xlabel("Tesseract Predicted")
ax.set_ylabel("Ground Truth")
ax.set_title("Confusion Matrix on Character Test Set")
plt.show()