# In [None]:
import os
import cv2
import pytesseract
import pandas as pd
import matplotlib.pyplot as plt

# 📌 Set path to Tesseract executable
# pytesseract shells out to this binary; the path is machine-specific (Windows
# default install location) and must be adjusted on other machines.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# 📁 Dataset base path and subfolders
# Each entry of `subfolders` is joined onto `dataset_path` and scanned for
# image files; the results spreadsheet is also written into `dataset_path`.
dataset_path = r"C:\Users\NITRO 5\OneDrive - Swinburne Sarawak\General - COS30018 INTELLIGENT SYSTEMS\Dataset\OCR\Combined"
subfolders = ['train', 'test']

# 🔧 Tesseract config
# Passed verbatim to pytesseract.image_to_string. Per the Tesseract CLI docs,
# `--oem 1` selects the LSTM engine and `--psm 6` treats the image as a single
# uniform block of text.
tess_config = r'--oem 1 --psm 6'

# 📊 Store all results here
# `results` collects one dict per processed image (file, folder, ground truth,
# prediction, match flag).
results = []
# `failed_samples` collects (result dict, thresholded image) pairs for
# mismatched predictions so they can be displayed later.
failed_samples = []

# 🔍 OCR processing function with debug image return
def preprocess_and_tesseract(image_path):
    """Run Tesseract on a binarized copy of the image at *image_path*.

    The image is converted to grayscale, resized to 400x100 pixels and
    thresholded with Otsu's method before being handed to Tesseract using
    the module-level ``tess_config``.

    Returns:
        A ``(text, binary_image)`` tuple. ``text`` is the OCR output reduced
        to upper-case alphanumeric characters; ``binary_image`` is the
        thresholded image that was fed to Tesseract. If the file cannot be
        read by OpenCV, ``("", None)`` is returned instead.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        # cv2.imread signals an unreadable file by returning None.
        return "", None

    # Grayscale -> fixed-size canvas -> Otsu binarization.
    resized = cv2.resize(cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY), (400, 100))
    _otsu_level, binary = cv2.threshold(
        resized, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    raw_text = pytesseract.image_to_string(binary, config=tess_config)
    # Keep only letters and digits, then normalize the case.
    cleaned = ''.join(ch for ch in raw_text if ch.isalnum()).upper()
    return cleaned, binary

# 🔁 Loop through both train and test folders
# For every image file, the file stem is the ground-truth label; it is compared
# against the OCR prediction and the outcome is appended to `results`.
# Mismatches are also queued in `failed_samples` together with the processed
# image for later inspection.
for sub in subfolders:
    folder = os.path.join(dataset_path, sub)
    if not os.path.isdir(folder):
        # A missing split should not abort the run and discard the results
        # already collected from the other folders.
        print(f"⚠️ Skipping missing folder: {folder}")
        continue

    # os.listdir returns entries in arbitrary order; sorting keeps the log and
    # the saved results reproducible between runs.
    for filename in sorted(os.listdir(folder)):
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(folder, filename)
        # Normalize the label exactly like the prediction (alphanumerics only,
        # upper-case). The prediction never contains punctuation, so a label
        # that kept characters such as '-' or '_' could never match.
        stem = os.path.splitext(filename)[0]
        ground_truth = ''.join(filter(str.isalnum, stem)).upper()
        predicted_text, processed_img = preprocess_and_tesseract(image_path)

        if processed_img is None:
            # The OCR helper returns no image when the file could not be read;
            # make it visible that this is an I/O problem, not an OCR miss.
            print(f"⚠️ Could not read image: {image_path}")

        correct = (predicted_text == ground_truth)
        result_entry = {
            "file": filename,
            "folder": sub,
            "ground_truth": ground_truth,
            "predicted": predicted_text,
            "match": correct
        }
        results.append(result_entry)

        if not correct:
            failed_samples.append((result_entry, processed_img))

        status = "✅" if correct else "❌"
        print(f"[{filename}] GT: {ground_truth} | Pred: {predicted_text} | {status}")

# 🧮 Final accuracy calculation
total = len(results)
# Note: `correct` is rebound here from the per-image flag to the overall count
# of matching predictions (True counts as 1 in the sum).
correct = sum(r["match"] for r in results)
# Guard against an empty run (no images found): dividing by zero would abort
# the script before the summary is printed and the results are saved.
accuracy = correct / total * 100 if total else 0.0
print(f"\n🔍 Total Samples: {total} | Correct: {correct} | Accuracy: {accuracy:.2f}%")

# 💾 Persist the per-image results as an Excel sheet inside the dataset folder
output_file = os.path.join(dataset_path, "ocr_results.xlsx")
results_df = pd.DataFrame(data=results)
# index=False: leave the DataFrame's row index out of the sheet.
results_df.to_excel(excel_writer=output_file, index=False)
print(f"📁 Results saved to: {output_file}")

# 🖼️ Show up to 5 failed predictions
print("\n🔍 Showing up to 5 failed predictions:")
# Entries for unreadable files carry img=None (the OCR helper returns no image
# when cv2.imread fails); plt.imshow(None) would raise, so only samples that
# actually have a processed image are displayed.
displayable = [(info, img) for info, img in failed_samples if img is not None]
for info, img in displayable[:5]:
    plt.figure(figsize=(8, 2))
    # The processed image is single-channel, so render it with a gray colormap.
    plt.imshow(img, cmap='gray')
    plt.title(f"[{info['file']}] GT: {info['ground_truth']} | Pred: {info['predicted']}")
    plt.axis("off")
    plt.show()
