# In [None]:
import os
import cv2
import pandas as pd
import matplotlib.pyplot as plt
from paddleocr import PaddleOCR

# Split folders to evaluate, and the dataset root directory that contains them
subfolders = ["train", "test"]
dataset_path = r"C:\Users\NITRO 5\OneDrive - Swinburne Sarawak\General - COS30018 INTELLIGENT SYSTEMS\Dataset\OCR\Combined"

# OCR engine: English model, with use_angle_cls=True to enable angle correction
# (pass use_gpu=True to run on GPU)
ocr = PaddleOCR(
    lang="en",
    use_angle_cls=True,
)

# Per-image evaluation records (one dict per image) and the subset that failed.
# Each failed sample is a (filename, image_path, ground_truth, predicted) tuple.
results = []
failed_samples = []

# Run OCR on every image in each split folder and compare the recognised text
# with the ground truth encoded in the file name.
for sub in subfolders:
    folder = os.path.join(dataset_path, sub)
    if not os.path.isdir(folder):
        # A missing split should not abort the evaluation of the other splits.
        print(f"[WARN] Skipping missing folder: {folder}")
        continue

    # os.listdir returns entries in arbitrary order; sort so the log and the
    # saved report are reproducible between runs.
    for filename in sorted(os.listdir(folder)):
        if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        image_path = os.path.join(folder, filename)
        # Ground truth is the file name without extension, upper-cased and
        # with spaces removed.
        ground_truth = os.path.splitext(filename)[0].upper().replace(" ", "")

        # Run OCR
        result = ocr.ocr(image_path, cls=True)

        # Concatenate the text of every detection. Top-level entries that are
        # not lists are skipped; each detection is unpacked as (box, text) and
        # text[0] is taken as the recognised string.
        detected_text = ""
        if result and isinstance(result, list):
            detected_text = "".join(
                text[0]
                for line in result
                if isinstance(line, list)
                for _box, text in line
            )

        # Normalise the prediction the same way as the ground truth.
        detected_text = detected_text.upper().replace(" ", "")

        # Evaluate: exact match after normalisation.
        match = detected_text == ground_truth
        result_entry = {
            "file": filename,
            "folder": sub,
            "ground_truth": ground_truth,
            "predicted": detected_text,
            "match": match
        }
        results.append(result_entry)

        if not match:
            failed_samples.append((filename, image_path, ground_truth, detected_text))

        print(f"[{filename}] GT: {ground_truth} | Pred: {detected_text} | {'✅' if match else '❌'}")

# Final accuracy: percentage of images whose prediction matched exactly.
total = len(results)
correct = sum(r["match"] for r in results)
# Guard against an empty run (no images found), which would otherwise raise
# ZeroDivisionError; report 0% in that case.
accuracy = correct / total * 100 if total else 0.0
print(f"\n🔍 Total: {total} | Correct: {correct} | Accuracy: {accuracy:.2f}%")

# Write the per-image results to an Excel workbook inside the dataset folder
output_file = os.path.join(dataset_path, "paddleocr_results.xlsx")
df = pd.DataFrame(results)
df.to_excel(
    output_file,
    index=False,  # leave the DataFrame index out of the sheet
)
print(f"📁 Results saved to: {output_file}")

# Display the first few mismatched images with their ground truth and prediction.
print("\n🔍 Showing up to 5 failed predictions:")
for fname, img_path, gt, pred in failed_samples[:5]:
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None instead of raising when the file cannot be
        # read; skip it so cvtColor does not crash on a None input.
        print(f"[WARN] Could not read image: {img_path}")
        continue
    # Convert from BGR to RGB before handing the image to matplotlib.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.figure(figsize=(8, 3))
    plt.imshow(img_rgb)
    plt.title(f"[{fname}] GT: {gt} | Pred: {pred}")
    plt.axis("off")
    plt.show()
