# **Tesseract OCR**
Part of MSc Project - Ashraf Muhammed Yusuf

In [1]:
# Install / import dependencies
!apt-get update && apt-get install -y tesseract-ocr
!pip install -q pytesseract pillow jiwer

import os
import glob
import pytesseract
import numpy as np
import pandas as pd
from google.colab import drive
from PIL import Image
from jiwer import wer, cer
from tqdm import tqdm

# Mount Drive & define base path
# Mount Drive so you can read datasets and write checkpoints
# Link to Drive:
# https://drive.google.com/drive/folders/1sfNG1PkmTPBe1wOSQXZmfdkvR97Hn9lk?usp=sharing
drive.mount('/content/drive')

Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading

In [2]:
# Configuration: test/ dir
TEST_DIR = "/content/drive/MyDrive/MScProject/data/words3/test"

# Set up Tesseract: only uppercase A–Z, single line (--psm 7)
tess_config = r"--oem 1 --psm 7 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Run OCR over every image, collect predictions & ground truth
gt_labels = []
pred_labels = []

for cls in sorted(os.listdir(TEST_DIR)):
    cls_path = os.path.join(TEST_DIR, cls)
    if not os.path.isdir(cls_path):
        continue
    for img_path in glob.glob(os.path.join(cls_path, "*.png")):
        # ground truth is the folder name
        gt = cls
        # load image as grayscale
        img = Image.open(img_path).convert("L")
        # optional thresholding:
        # img = img.point(lambda x: 0 if x<128 else 255, '1')
        pred = pytesseract.image_to_string(img, config=tess_config)
        pred = pred.strip().upper()

        gt_labels.append(gt)
        pred_labels.append(pred)
        print(f"Label: {gt} → Pred: {pred}")

# Exact-match accuracy
exact_acc = np.mean([p == g for p, g in zip(pred_labels, gt_labels)])
print(f"Exact match accuracy: {exact_acc:.4%}")

# Average character-error rate (CER) and word-error rate (WER)
avg_cer = np.mean([cer(g, p) for p, g in zip(pred_labels, gt_labels)])
avg_wer = np.mean([wer(g, p) for p, g in zip(pred_labels, gt_labels)])
print(f"Mean CER: {avg_cer:.4f}")
print(f"Mean WER: {avg_wer:.4f}")

# (Optional) Save detailed results to CSV for later analysis
df = pd.DataFrame({
    "ground_truth": gt_labels,
    "prediction":   pred_labels,
    "cer":          [cer(g, p) for p, g in zip(pred_labels, gt_labels)],
    "wer":          [wer(g, p) for p, g in zip(pred_labels, gt_labels)]
})

# df.to_csv("tesseract_eval.csv", index=False)
# print("Detailed results written to tesseract_eval.csv")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
GT: SPV, Pred: SFV
GT: SPW, Pred: SFL
GT: SPX, Pred: SFX
GT: SPY, Pred: SFY
GT: SPZ, Pred: SFZ
GT: SQA, Pred: SA
GT: SQB, Pred: 
GT: SQC, Pred: S
GT: SQD, Pred: SCL
GT: SQE, Pred: SE
GT: SQF, Pred: SF
GT: SQG, Pred: ECG
GT: SQH, Pred: SCF
GT: SQI, Pred: C
GT: SQJ, Pred: CJ
GT: SQK, Pred: EK
GT: SQL, Pred: SL
GT: SQM, Pred: SN
GT: SQN, Pred: S
GT: SQO, Pred: S
GT: SQP, Pred: SF
GT: SQQ, Pred: E
GT: SQR, Pred: SCF
GT: SQS, Pred: 
GT: SQT, Pred: T
GT: SQU, Pred: S
GT: SQV, Pred: 
GT: SQW, Pred: SL
GT: SQX, Pred: X
GT: SQY, Pred: S
GT: SQZ, Pred: CZ
GT: SRA, Pred: SFA
GT: SRB, Pred: SFE
GT: SRC, Pred: SFC
GT: SRD, Pred: SL
GT: SRE, Pred: SFE
GT: SRF, Pred: SFF
GT: SRG, Pred: SFT
GT: SRH, Pred: S
GT: SRI, Pred: SI
GT: SRJ, Pred: SF
GT: SRK, Pred: SFK
GT: SRL, Pred: SFJ
GT: SRM, Pred: SFI
GT: SRN, Pred: SRN
GT: SRO, Pred: FO
GT: SRP, Pred: EFF
GT: SRQ, Pred: S
GT: SRR, Pred: SFF
GT: SRS, Pred: SFS
GT: SRT, Pred: SFT
GT: SRU, Pr