In [8]:
import cv2
import numpy as np
import os
import pytesseract
from imutils.object_detection import non_max_suppression
from sklearn.metrics import precision_score, recall_score, f1_score
import time

# Point pytesseract at the Tesseract executable. The path below is an
# absolute, machine-specific Windows location; update it (if needed) so it
# matches the local Tesseract installation.
pytesseract.pytesseract.tesseract_cmd = (
    r"C:\Users\shiku\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"
)


def load_ground_truth(gt_file):
    """Load ground truth bounding boxes from an ICDAR2015 label file.

    Every non-blank line is split on commas and its first eight fields are
    parsed as integers ``x1,y1,x2,y2,x3,y3,x4,y4``. Anything after the eighth
    field (the transcription, which may itself contain commas) is ignored.
    Blank or whitespace-only lines are skipped instead of aborting the whole
    file.

    Args:
        gt_file: Path to the label file. It is decoded as UTF-8 and a leading
            byte-order mark, if present, is dropped.

    Returns:
        A list with one entry per parsed line; each entry is a list of
        ``(x, y)`` tuples built from consecutive coordinate pairs.

    Raises:
        ValueError: If one of the leading fields of a non-blank line is not
            an integer.
    """
    boxes = []
    with open(gt_file, "r", encoding="utf-8-sig") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # e.g. a trailing empty line; int("") would raise otherwise
                continue
            parts = stripped.split(",")
            coords = [int(value) for value in parts[:8]]  # the 8 coordinates
            box = [(coords[i], coords[i + 1]) for i in range(0, len(coords), 2)]
            boxes.append(box)
    return boxes


def decode_predictions(scores, geometry, confThreshold=0.5):
    """Turn the EAST score and geometry maps into candidate rectangles.

    Args:
        scores: 4-D array read as ``scores[0, 0, row, col]``; each value is
            compared against ``confThreshold``.
        geometry: 4-D array read as ``geometry[0, channel, row, col]``.
            Channels 0-3 are used as distances (0 + 2 give the box height,
            1 + 3 the box width) and channel 4 as a rotation angle.
        confThreshold: Cells whose score is strictly below this value are
            discarded.

    Returns:
        A ``(rects, confidences)`` pair of equally long lists. ``rects``
        holds ``(startX, startY, endX, endY)`` integer tuples and
        ``confidences`` the score of the cell each rectangle came from.
    """
    row_count, col_count = scores.shape[2:4]
    rects, confidences = [], []

    for row in range(row_count):
        row_scores = scores[0, 0, row]
        dist0, dist1, dist2, dist3, row_angles = (
            geometry[0, channel, row] for channel in range(5)
        )

        for col in range(col_count):
            score = row_scores[col]
            if score < confThreshold:
                continue

            # Map cell -> input coordinates: every cell advances 4 units.
            origin_x = col * 4.0
            origin_y = row * 4.0

            theta = row_angles[col]
            cos_t = np.cos(theta)
            sin_t = np.sin(theta)

            box_h = dist0[col] + dist2[col]
            box_w = dist1[col] + dist3[col]

            # The far corner is derived first; the near corner follows by
            # subtracting the box extent from it.
            endX = int(origin_x + (cos_t * dist1[col]) + (sin_t * dist2[col]))
            endY = int(origin_y - (sin_t * dist1[col]) + (cos_t * dist2[col]))

            rects.append((int(endX - box_w), int(endY - box_h), endX, endY))
            confidences.append(score)

    return rects, confidences


def compute_iou(boxA, boxB):
    """Compute Intersection over Union (IoU) between two boxes.

    Boxes are ``(x1, y1, x2, y2)`` sequences whose corners are treated as
    inclusive, so a box spanning ``x1..x2`` is ``x2 - x1 + 1`` units wide.

    Args:
        boxA: First box.
        boxB: Second box.

    Returns:
        The IoU as a float in ``[0.0, 1.0]``. Degenerate input (boxes with no
        area, so the union is empty) yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """

    def _area(box):
        # Clamp each extent at zero so an inverted box (x2 < x1 or y2 < y1)
        # contributes no area rather than a spurious positive product.
        width = max(0, box[2] - box[0] + 1)
        height = max(0, box[3] - box[1] + 1)
        return width * height

    # Corners of the overlapping region
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)
    unionArea = _area(boxA) + _area(boxB) - interArea

    if unionArea <= 0:
        return 0.0
    return interArea / float(unionArea)


def match_boxes(pred_boxes, gt_boxes, iou_threshold=0.2):
    """Count predictions that can be paired with a ground-truth box.

    Predictions are visited in order. Each one claims the first ground-truth
    box that is still unclaimed and whose IoU with it reaches
    ``iou_threshold``; a ground-truth box can be claimed at most once.

    Args:
        pred_boxes: Iterable of predicted ``(x1, y1, x2, y2)`` boxes.
        gt_boxes: Sequence of ground-truth ``(x1, y1, x2, y2)`` boxes.
        iou_threshold: Minimum IoU for a pairing to count.

    Returns:
        The number of pairings made.
    """
    claimed = set()  # indices of ground-truth boxes already paired

    for candidate in pred_boxes:
        first_hit = next(
            (
                index
                for index, truth in enumerate(gt_boxes)
                if index not in claimed
                and compute_iou(candidate, truth) >= iou_threshold
            ),
            None,
        )
        if first_hit is not None:
            claimed.add(first_hit)

    return len(claimed)


def detect_and_evaluate(
    image_path,
    net,
    gt_file=None,
    apply_gray=True,
    apply_blur=True,
    conf_threshold=0.1,
):
    """Detect text in one image and evaluate the boxes against ground truth.

    Args:
        image_path: Path of the image, read with ``cv2.imread``.
        net: Loaded network; it must expose the two EAST output layers named
            in ``layerNames`` below.
        gt_file: Path to the ICDAR2015 label file for this image. When it is
            falsy no evaluation is done and ``(0, 0, 0)`` is returned.
        apply_gray: Convert the image to grayscale (and back to three
            channels) before inference.
        apply_blur: Apply a 5x5 Gaussian blur before inference.
        conf_threshold: Minimum score a cell needs to produce a candidate
            box; passed to ``decode_predictions``.

    Returns:
        A ``(matches, num_predictions, num_ground_truth)`` tuple, or
        ``(0, 0, 0)`` when the image cannot be read or ``gt_file`` is falsy.
    """
    # Load image
    image = cv2.imread(image_path)
    if image is None:
        print(f"Failed to load image {image_path}")
        return 0, 0, 0
    (h, w) = image.shape[:2]

    # Optionally apply grayscale and Gaussian blur
    if apply_gray:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)  # Ensure 3 channels
    if apply_blur:
        image = cv2.GaussianBlur(image, (5, 5), 0)

    # The network runs on a fixed 320x320 input; keep the ratios so the
    # boxes can be mapped back to the original resolution afterwards.
    newW, newH = 320, 320
    rW, rH = w / float(newW), h / float(newH)

    blob = cv2.dnn.blobFromImage(
        image, 1.0, (newW, newH), (123.68, 116.78, 103.94), swapRB=True, crop=False
    )
    net.setInput(blob)

    # Run the model: first layer gives the score map, second the geometry
    layerNames = ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"]
    (scores, geometry) = net.forward(layerNames)

    # Decode candidates, then collapse overlapping ones
    (rects, confidences) = decode_predictions(
        scores, geometry, confThreshold=conf_threshold
    )
    pred_boxes = non_max_suppression(np.array(rects), probs=confidences)

    # Scale the bounding boxes back to the original image size
    pred_boxes = [
        (int(x * rW), int(y * rH), int(x2 * rW), int(y2 * rH))
        for (x, y, x2, y2) in pred_boxes
    ]

    if not gt_file:
        return 0, 0, 0

    # Reduce every ground-truth polygon to its axis-aligned bounding box
    # (x1, y1, x2, y2) so it can be compared with the predictions.
    gt_boxes = [
        (
            min(pt[0] for pt in polygon),
            min(pt[1] for pt in polygon),
            max(pt[0] for pt in polygon),
            max(pt[1] for pt in polygon),
        )
        for polygon in load_ground_truth(gt_file)
    ]

    # Match predicted and ground truth boxes
    matches = match_boxes(pred_boxes, gt_boxes)
    return matches, len(pred_boxes), len(gt_boxes)


# Load the EAST model
east_model = "frozen_east_text_detection.pb"
net = cv2.dnn.readNet(east_model)

# Define dataset paths
input_folder = "icdar2015dataset/ch4_training_images"
gt_folder = "icdar2015dataset/ch4_training_localization_transcription_gt"

# Performance evaluation on the first 1000 samples
max_samples = 1000
start_time = time.time()
total_matches = total_pred = total_gt = 0

# os.listdir() gives no ordering guarantee, so sort to make the evaluated
# subset reproducible, and filter before applying the limit so that
# non-image entries do not use up sample slots.
image_files = sorted(
    name for name in os.listdir(input_folder) if name.endswith(".jpg")
)[:max_samples]

for filename in image_files:
    image_path = os.path.join(input_folder, filename)
    # Label files are named gt_<image stem>.txt
    gt_file = os.path.join(gt_folder, f"gt_{os.path.splitext(filename)[0]}.txt")
    try:
        matches, pred_count, gt_count = detect_and_evaluate(
            image_path, net, gt_file, apply_gray=True, apply_blur=True
        )
        total_matches += matches
        total_pred += pred_count
        total_gt += gt_count
    except Exception as e:
        # Best effort: report the failing image and carry on with the rest.
        print(f"Error when processing {image_path}: {e}")

# Compute precision, recall, and F1 score for bounding box detection.
# The counts are pooled over all images before dividing.
precision = total_matches / total_pred if total_pred > 0 else 0
recall = total_matches / total_gt if total_gt > 0 else 0
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

end_time = time.time()
elapsed_time = end_time - start_time

# Display results
print(f"Mean Precision: {precision:.2f}")
print(f"Mean Recall: {recall:.2f}")
print(f"Mean F1-Score: {f1:.2f}")
print(
    f"Time taken for text detection on {len(image_files)} samples: "
    f"{elapsed_time:.2f} seconds"
)

Mean Precision: 0.75
Mean Recall: 0.15
Mean F1-Score: 0.25
Time taken for text detection on 1000 samples: 82.15 seconds


- confThreshold = 0.1
    Mean Precision: 0.75
    Mean Recall: 0.15
    Mean F1-Score: 0.25
    Time taken for text detection on 1000 samples: 80.49 seconds
- confThreshold = 0.5
    Mean Precision: 0.78
    Mean Recall: 0.14
    Mean F1-Score: 0.24
    Time taken for text detection on 1000 samples: 78.85 seconds
- confThreshold = 0.8
    Mean Precision: 0.80
    Mean Recall: 0.14
    Mean F1-Score: 0.23
    Time taken for text detection on 1000 samples: 79.59 seconds



With confThreshold = 0.1, by preprocessing:
- No preprocessing
    Mean Precision: 0.78
    Mean Recall: 0.18
    Mean F1-Score: 0.30
    Time taken for text detection on 1000 samples: 81.88 seconds
- Grayscale
    Mean Precision: 0.73
    Mean Recall: 0.17
    Mean F1-Score: 0.27
    Time taken for text detection on 1000 samples: 78.82 seconds
- Blur
    Mean Precision: 0.80
    Mean Recall: 0.16
    Mean F1-Score: 0.27
    Time taken for text detection on 1000 samples: 79.97 seconds
- Grayscale + Blur
    Mean Precision: 0.75
    Mean Recall: 0.15
    Mean F1-Score: 0.25
    Time taken for text detection on 1000 samples: 80.49 seconds


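With confThreshold = 0.1, by preprocessing (second copy of the comparison above; NOTE: the Blur figures below are identical to the Grayscale row and differ from the Blur figures in the first copy, so re-check which run is correct):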
- No preprocessing
    Mean Precision: 0.78
    Mean Recall: 0.18
    Mean F1-Score: 0.30
    Time taken for text detection on 1000 samples: 81.88 seconds
- Grayscale
    Mean Precision: 0.73
    Mean Recall: 0.17
    Mean F1-Score: 0.27
    Time taken for text detection on 1000 samples: 78.82 seconds
- Blur
    Mean Precision: 0.73
    Mean Recall: 0.17
    Mean F1-Score: 0.27
    Time taken for text detection on 1000 samples: 78.82 seconds
- Grayscale + Blur
    Mean Precision: 0.75
    Mean Recall: 0.15
    Mean F1-Score: 0.25
    Time taken for text detection on 1000 samples: 80.49 seconds
