In [1]:
import gc
import torch
import pandas as pd
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from craft_text_detector import Craft
from PIL import Image
import matplotlib.pyplot as plt
import cv2
import time
import jiwer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def read(image_path):
    craft = Craft(output_dir=None, crop_type="box", cuda=False)
    image = cv2.imread(image_path)
    result = craft.detect_text(image_path)
    boxes = result["boxes"]
    pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    texts = []
    for box in boxes:
        crop = pil_image.crop([box[0][0], box[0][1], box[2][0], box[2][1]])
        pixel_values = processor(crop, return_tensors="pt").pixel_values
        with torch.no_grad():
            generated_ids = model.generate(pixel_values)
        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        texts.append(text)
    return texts

In [4]:
# image_path = "test/eng_AF_022.jpg"
# img = Image.open(image_path)

# plt.figure(figsize=(10, 8))
# plt.imshow(img)
# plt.axis('off')  
# plt.show()

In [5]:
# start_time = time.time()
# texts = read(image_path)
# text_data = " ".join(texts)
# end_time = time.time()
# time_difference = end_time - start_time
# print(f"Time: {time_difference} seconds")
# print("Data\n")
# print(text_data)
# gc.collect()
# torch.cuda.empty_cache()

In [6]:
test_data = pd.read_csv("test_dataset.csv")

In [7]:
test_data.head()

Unnamed: 0,image_path,content
0,/Users/chigi/Developer/cd_ocr_code_runner/test...,* Cross road at next lights and continue strai...
1,/Users/chigi/Developer/cd_ocr_code_runner/test...,Language for communication use single -word an...
2,/Users/chigi/Developer/cd_ocr_code_runner/test...,Everything will be okay in the end. If it's no...
3,/Users/chigi/Developer/cd_ocr_code_runner/test...,"communication directly. knowledge, recepient c..."
4,/Users/chigi/Developer/cd_ocr_code_runner/test...,Date: 01/31/18 limit the request to the smalle...


In [8]:
def calculate_wer(reference, hypothesis):
    """Calculate Word Error Rate using jiwer"""
    try:
        return jiwer.wer(reference, hypothesis)
    except:
        if len(reference) == 0:
            return 0 if len(hypothesis) == 0 else 1
        return 1.0

def calculate_cer(reference, hypothesis):
    """Calculate Character Error Rate using jiwer"""
    try:
        return jiwer.cer(reference, hypothesis)
    except:
        if len(reference) == 0:
            return 0 if len(hypothesis) == 0 else 1
        return 1.0

In [9]:
results = []
total_images = len(test_data)

print(f"Starting OCR evaluation on {total_images} images...")

for idx, row in enumerate(test_data.iterrows()):
    idx = idx + 1  
    _, row = row 
    
    image_path = row['image_path']
    ground_truth = str(row['content'])
    
    print(f"[{idx}/{total_images}] Processing: {image_path}")
    
    try:
        start_time = time.time()
        predicted_texts = read(image_path)
        prediction = " ".join(predicted_texts)
        processing_time = time.time() - start_time
        
        cer = calculate_cer(ground_truth, prediction)
        wer = calculate_wer(ground_truth, prediction)
        
        print(f"    Results - CER: {cer:.4f}, WER: {wer:.4f}, Time: {processing_time:.2f}s")
        
        results.append({
            'image_path': image_path,
            'ground_truth': ground_truth,
            'prediction': prediction,
            'cer': cer,
            'wer': wer,
            'processing_time': processing_time
        })
        
    except Exception as e:
        print(f"    Error processing {image_path}: {e}")
    
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    print("-" * 50)

results_df = pd.DataFrame(results)

print("\nResults summary:")
print(results_df[['image_path', 'cer', 'wer', 'processing_time']])

overall_cer = results_df['cer'].mean()
overall_wer = results_df['wer'].mean()
avg_processing_time = results_df['processing_time'].mean()

print(f"\nOverall metrics:")
print(f"Average CER: {overall_cer:.4f}")
print(f"Average WER: {overall_wer:.4f}")
print(f"Average processing time: {avg_processing_time:.2f} seconds per image")

Starting OCR evaluation on 172 images...
[1/172] Processing: /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_EU_307.jpg


  copyStateDict(torch_utils.load(weight_path, map_location="cpu"))
  copyStateDict(torch_utils.load(weight_path, map_location="cpu"))


    Results - CER: 0.4535, WER: 0.8148, Time: 56.57s
--------------------------------------------------
[2/172] Processing: /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_EU_184.jpg
    Results - CER: 0.4071, WER: 0.7222, Time: 25.98s
--------------------------------------------------
[3/172] Processing: /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_NA_034.jpg
    Results - CER: 0.6742, WER: 0.8947, Time: 36.47s
--------------------------------------------------
[4/172] Processing: /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_AS_016.jpg
    Error processing /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_AS_016.jpg: Coordinate 'lower' is less than 'upper'
--------------------------------------------------
[5/172] Processing: /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_AS_041.jpg
    Error processing /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_AS_041.jpg: Coordinate 'lower' is less than 'upper'
-------------------------------

The channel dimension is ambiguous. Got image shape (1, 211, 3). Assuming channels are the first dimension.


    Error processing /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_AF_027.jpg: mean must have 1 elements if it is an iterable, got 3
--------------------------------------------------
[65/172] Processing: /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_EU_117.jpg
    Results - CER: 0.6948, WER: 0.9013, Time: 67.79s
--------------------------------------------------
[66/172] Processing: /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_EU_140.jpg
    Results - CER: 0.3691, WER: 0.8000, Time: 40.08s
--------------------------------------------------
[67/172] Processing: /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_EU_013.jpg
    Results - CER: 0.2577, WER: 0.5600, Time: 28.66s
--------------------------------------------------
[68/172] Processing: /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_AF_067.jpg
    Results - CER: 0.6031, WER: 0.9091, Time: 19.51s
--------------------------------------------------
[69/172] Processing: /Users/chigi/Deve

The channel dimension is ambiguous. Got image shape (1, 2195, 3). Assuming channels are the first dimension.


    Error processing /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_NA_052.jpg: mean must have 1 elements if it is an iterable, got 3
--------------------------------------------------
[80/172] Processing: /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_EU_048.jpg
    Results - CER: 0.4961, WER: 0.7083, Time: 39.63s
--------------------------------------------------
[81/172] Processing: /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_AF_047.jpg
    Results - CER: 0.9524, WER: 1.1471, Time: 48.20s
--------------------------------------------------
[82/172] Processing: /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_EU_199.jpg
    Results - CER: 0.7554, WER: 1.0476, Time: 27.98s
--------------------------------------------------
[83/172] Processing: /Users/chigi/Developer/cd_ocr_code_runner/test_data/eng_AF_010.jpg
    Results - CER: 0.8188, WER: 0.9333, Time: 25.86s
--------------------------------------------------
[84/172] Processing: /Users/chigi/Deve

In [10]:
results_df.to_csv('results.csv', index=False)