In [1]:
!pip install pytesseract opencv-python-headless jiwer scikit-learn

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, pytesseract, jiwer
Successfully installed jiwer-3.1.0 pytesseract-0.3.13 rapidfuzz-3.13.0


In [2]:
import os, time, re
import pandas as pd
import numpy as np
import cv2
from PIL import Image
import pytesseract
from sklearn.metrics import precision_score, recall_score, f1_score
from jiwer import wer, cer

In [3]:
# Input folder: each image is paired with a same-named .txt ground-truth file.
IMAGE_DIR = "/content/drive/MyDrive/OCR_Project/test_images"
# CSV file that receives one row of metrics per evaluated image.
RESULTS_PATH = "/content/drive/MyDrive/OCR_Project/Tesseract_Results/results.csv"
# Folder that receives copies of the images with detected character boxes drawn.
VIS_DIR = "/content/drive/MyDrive/OCR_Project/Tesseract_Results/vis"

# Fail fast when the input folder is missing (e.g. Google Drive is not mounted).
# Without this check, makedirs below would silently create a local
# /content/drive/... tree and the run would only fail later, at os.listdir.
if not os.path.isdir(IMAGE_DIR):
    raise FileNotFoundError(f"Image directory not found: {IMAGE_DIR}")

os.makedirs(VIS_DIR, exist_ok=True)
# Create the results folder explicitly so writing the CSV does not rely on
# VIS_DIR happening to be nested inside the same directory.
os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)


def natural_sort_key(s):
    """Build a sort key that orders embedded numbers by value ("img2" < "img10").

    The string is split into alternating non-digit and digit runs. Digit runs
    are converted to ints; every other piece is lower-cased, which makes the
    ordering case-insensitive.

    Args:
        s: The string (here, a file base name) to build a key for.

    Returns:
        A list that alternates str and int pieces, always starting with a
        (possibly empty) str, usable as a ``key=`` for ``sorted``/``list.sort``.
    """
    # isdecimal() instead of isdigit(): isdigit() is also True for characters
    # such as superscript two (U+00B2), which the regex \d does not match and
    # int() cannot parse, so such pieces used to raise ValueError.
    return [
        int(piece) if piece.isdecimal() else piece.lower()
        for piece in re.split(r'(\d+)', s)
    ]


def get_sorted_pairs(image_dir, extensions=('.png', '.jpg', '.jpeg')):
    """Pair every image in ``image_dir`` with its same-named ``.txt`` file.

    Only files whose extension (case-insensitive) is in ``extensions`` and
    that have a sibling ``<base>.txt`` file are returned. Images without a
    text file are skipped.

    Args:
        image_dir: Directory containing both the images and the text files.
        extensions: Accepted image extensions, each including the leading dot.

    Returns:
        A list of ``(image_path, txt_path)`` tuples in natural order of the
        base name (so "image2" precedes "image10"). Images sharing a base
        name are further ordered by file name so the result is deterministic.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``image_dir`` cannot be
            listed (for example, it does not exist).
    """
    allowed = tuple(ext.lower() for ext in extensions)
    candidates = []
    for name in os.listdir(image_dir):
        base, ext = os.path.splitext(name)
        # Compare the real extension: a bare suffix test such as
        # endswith('png') would also accept an extension-less file "notapng".
        if ext.lower() not in allowed:
            continue
        txt_path = os.path.join(image_dir, f"{base}.txt")
        if os.path.isfile(txt_path):
            candidates.append((base, name, txt_path))

    # The file name is the tie-breaker; otherwise "a.png" and "a.jpg" would be
    # ordered by whatever order os.listdir happened to return them in.
    candidates.sort(key=lambda item: (natural_sort_key(item[0]), item[1]))
    return [(os.path.join(image_dir, name), txt_path) for _, name, txt_path in candidates]


def calculate_metrics(gt, pred):
    """Score one OCR prediction against its ground-truth text.

    Precision, recall and F1 are computed on word *presence*: both texts are
    split on whitespace and reduced to sets, so repeated words and word order
    do not influence these three scores. Accuracy is 1 only when the two
    strings are exactly equal. CER and WER are delegated to jiwer's
    ``cer`` and ``wer``.

    Args:
        gt: Ground-truth text.
        pred: Text produced by the OCR engine.

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'Recall', 'F1', 'CER'
        and 'WER', inserted in that order.
    """
    reference_vocab = set(gt.split())
    predicted_vocab = set(pred.split())

    # One binary label per distinct word seen in either text.
    vocabulary = list(reference_vocab | predicted_vocab)
    y_true = [int(word in reference_vocab) for word in vocabulary]
    y_pred = [int(word in predicted_vocab) for word in vocabulary]

    scores = {}
    scores['Accuracy'] = int(gt == pred)
    scores['Precision'] = precision_score(y_true, y_pred, zero_division=0)
    scores['Recall'] = recall_score(y_true, y_pred, zero_division=0)
    scores['F1'] = f1_score(y_true, y_pred, zero_division=0)
    scores['CER'] = cer(gt, pred)
    scores['WER'] = wer(gt, pred)
    return scores


# Run Tesseract on every (image, ground-truth) pair, save an annotated copy of
# each image, and collect one metrics row per successfully processed image.
results = []
for img_path, txt_path in get_sorted_pairs(IMAGE_DIR):
    try:
        # The ground truth is read inside the try so that one unreadable file
        # is reported and skipped like any other per-image failure instead of
        # aborting the whole evaluation. utf-8-sig decodes plain UTF-8 and also
        # drops a leading BOM, which would otherwise count as a character error.
        with open(txt_path, 'r', encoding='utf-8-sig') as f:
            gt_text = f.read().strip()

        # Timed region: image decoding plus the text-recognition call.
        start = time.perf_counter()
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise ValueError("cv2.imread could not read the image")
        text = pytesseract.image_to_string(img).strip()
        time_taken = time.perf_counter() - start

        # Draw one rectangle per detected character. Box lines are read as
        # "<char> x1 y1 x2 y2 ..."; the y values are subtracted from the image
        # height because OpenCV's origin is the top-left corner.
        h = img.shape[0]
        boxes = pytesseract.image_to_boxes(img)
        for b in boxes.splitlines():
            b = b.split()
            x1, y1, x2, y2 = int(b[1]), int(b[2]), int(b[3]), int(b[4])
            cv2.rectangle(img, (x1, h - y2), (x2, h - y1), (255, 0, 0), 2)

        vis_path = os.path.join(VIS_DIR, os.path.basename(img_path))
        cv2.imwrite(vis_path, img)

        metrics = calculate_metrics(gt_text, text)
        metrics.update({'Image': os.path.basename(img_path), 'Time': time_taken})
        results.append(metrics)

    except Exception as e:
        # Best effort: report the failure and continue with the next image.
        # Images that fail produce no row in the results.
        print(f"Error processing {img_path}: {e}")

pd.DataFrame(results).to_csv(RESULTS_PATH, index=False)
print("Tesseract evaluation complete!")

✅ Tesseract evaluation complete!


In [4]:
# Build the per-image metrics table from the in-memory results and preview
# its first rows (the bare last expression is what the notebook displays).
df = pd.DataFrame(data=results)
df.head()

Unnamed: 0,Accuracy,Precision,Recall,F1,CER,WER,Image,Time
0,0,0.0,0.0,0.0,0.956522,1.0,image1.jpg,1.044674
1,0,0.526316,0.47619,0.5,0.518325,0.857143,image2.jpg,9.109387
2,0,0.366667,0.293333,0.325926,0.598921,0.847826,image3.jpg,6.578062
3,0,0.769231,0.3125,0.444444,0.643432,0.688889,image4.jpg,4.573883
4,0,0.65625,0.355932,0.461538,0.575617,0.768116,image5.jpg,6.139045
