# Setup

In [None]:
# Parameters
# Overridable inputs for this notebook. Every value left as None is replaced
# by a fallback in the setup cell that follows.
bs = None             # OCR batch size (fallback: 16)
keyframes_dir = None  # root directory of the keyframes (fallback depends on the runtime environment)
save_dir = None       # output directory for the OCR results (fallback: './ocr')

In [1]:
import os
from collections import defaultdict

dir_path = os.getcwd()

# Pick a keyframe root only when the parameters cell did not supply one.
if not keyframes_dir:
    if any(host in str(get_ipython()) for host in ('google.colab', 'kaggle')):
        # Hosted runtimes (Colab / Kaggle): keyframes sit next to the notebook.
        # Update this path as necessary
        keyframes_dir = f'{dir_path}/Keyframes'
    else:
        # Local runs: keyframes come from the sibling `transnet` directory.
        parent_dir_path = os.path.dirname(dir_path)
        keyframes_dir = f'{parent_dir_path}/transnet/Keyframes'

# Fall back to the defaults for anything still unset (None / 0 / '').
bs = bs or 16
save_dir = save_dir or './ocr'

In [2]:
! pip install easyocr



In [3]:
import os
import cv2
import glob
import json
import torch
import easyocr
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Parse data path

In [4]:
def parse_keyframe_info(keyframes_dir = '../transnet/Keyframes'):
    """Index keyframe image paths by data part and video id.

    The expected layout is ``<keyframes_dir>/<data_part>/<video_dir>/*.jpg``.

    Args:
        keyframes_dir: Root directory that contains one sub-directory per
            data part.

    Returns:
        A dict mapping each data-part directory name to a dict that maps a
        video id (the text after the last ``_`` in the video directory name,
        or the whole name when it has no ``_``) to the sorted list of
        ``.jpg`` paths found in that video directory.

    Raises:
        FileNotFoundError: If ``keyframes_dir`` does not exist.
    """
    all_keyframe_paths = {}

    for data_part in sorted(os.listdir(keyframes_dir)):
        data_part_path = f'{keyframes_dir}/{data_part}'
        # Stray files in the root (README, .DS_Store, ...) are not data parts;
        # listing them as directories would raise NotADirectoryError.
        if not os.path.isdir(data_part_path):
            continue

        video_keyframes = {}
        for video_dir in sorted(os.listdir(data_part_path)):
            video_path = f'{data_part_path}/{video_dir}'
            # Same reasoning one level down: only directories hold keyframes.
            if not os.path.isdir(video_path):
                continue
            video_id = video_dir.split('_')[-1]
            # Escape the directory part so glob metacharacters in the path
            # (e.g. '[' or '*') are matched literally.
            video_keyframes[video_id] = sorted(
                glob.glob(f'{glob.escape(video_path)}/*.jpg'))
        all_keyframe_paths[data_part] = video_keyframes

    return all_keyframe_paths

# Inference

In [7]:
def create_directory(path):
    """Create ``path`` (and any missing parents) if it does not exist yet.

    Calling this on an already existing directory is a no-op.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True removes the check-then-create race of
    # `if not exists: makedirs` and still fails loudly when a regular
    # file occupies the path.
    os.makedirs(path, exist_ok=True)


def save_ocr_results(save_dir, key, video_id, ocr_results):
    """Write ``ocr_results`` as JSON to ``<save_dir>/<key>/<video_id>.json``.

    The file is UTF-8 encoded and non-ASCII characters are stored verbatim
    rather than as ``\\uXXXX`` escapes. The ``<save_dir>/<key>`` directory
    is not created here and must already exist.
    """
    target_path = os.path.join(save_dir, key, f"{video_id}.json")
    with open(target_path, "w", encoding='utf-8') as out_file:
        json.dump(ocr_results, out_file, ensure_ascii=False)


def process_video_keyframes(reader, video_keyframe_paths, batch_size=16, min_confidence=0.5):
    """Run OCR over one video's keyframes and collect the detected text.

    Args:
        reader: Object exposing ``readtext_batched(images, batch_size=...)``
            (an ``easyocr.Reader`` in this notebook).
        video_keyframe_paths: Sequence of keyframe image paths, in the order
            the results should be reported.
        batch_size: Number of keyframes handed to the reader per call.
        min_confidence: Detections whose third element (``item[2]``, the
            score reported by the reader) is not strictly greater than this
            value are dropped. Defaults to the previously hard-coded 0.5.

    Returns:
        A list with one entry per keyframe; each entry is the list of text
        strings left after filtering and paragraph merging (possibly empty).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() yield nothing and silently skip the
    # whole video; zero already raised ValueError, so use the same type.
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}")

    video_ocr_results = []

    for start in range(0, len(video_keyframe_paths), batch_size):
        image_paths = video_keyframe_paths[start:start + batch_size]
        # The reader gets the actual chunk length so the final, shorter chunk
        # is handled the same way as the full ones.
        results = reader.readtext_batched(
            image_paths, batch_size=len(image_paths))
        for result in results:
            confident = [item for item in result if item[2] > min_confidence]
            # Merge neighbouring boxes into paragraphs; the merged entries
            # carry their text at index 1.
            paragraphs = easyocr.utils.get_paragraph(confident)
            video_ocr_results.append([item[1] for item in paragraphs])
    return video_ocr_results


def ocr_and_save_results(reader, all_keyframe_paths, save_dir, batch_size=16):
    """OCR every video of every data part and store one result file per video.

    ``all_keyframe_paths`` maps a data-part key to a mapping of video id to
    keyframe paths. Keys and video ids are visited in sorted order; for each
    video the OCR output of ``process_video_keyframes`` is handed to
    ``save_ocr_results`` under ``<save_dir>/<key>/``. Output directories are
    created on demand and progress is reported through ``tqdm``.
    """
    create_directory(save_dir)

    for key in tqdm(sorted(all_keyframe_paths.keys()), desc="Processing keys"):
        paths_by_video = all_keyframe_paths[key]
        ordered_video_ids = sorted(paths_by_video.keys())
        # The per-part output folder must exist before any video is written.
        create_directory(os.path.join(save_dir, key))

        for video_id in tqdm(ordered_video_ids, desc=f"Processing {key}"):
            ocr_texts = process_video_keyframes(
                reader, paths_by_video[video_id], batch_size)
            save_ocr_results(save_dir, key, video_id, ocr_texts)

In [8]:
# Index every keyframe image found under the configured root directory.
all_keyframe_paths = parse_keyframe_info(keyframes_dir=keyframes_dir)
# Build the reader a single time: constructing it loads the model into memory.
reader = easyocr.Reader(['vi'], gpu=True)
# OCR all indexed keyframes and write the results below `save_dir`.
ocr_and_save_results(
    reader=reader,
    all_keyframe_paths=all_keyframe_paths,
    save_dir=save_dir,
    batch_size=bs,
)

Processing keys:   0%|          | 0/1 [00:00<?, ?it/s]

['/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/Keyframes/L01_extra/V001/000000.jpg', '/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/Keyframes/L01_extra/V001/000010.jpg', '/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/Keyframes/L01_extra/V001/000021.jpg', '/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/Keyframes/L01_extra/V001/000032.jpg', '/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/Keyframes/L01_extra/V001/000043.jpg', '/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/Keyframes/L01_extra/V001/000044.jpg', '/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/Keyframes/L01_extra/V001/000113.jpg', '/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extractio

Processing L01_extra: 100%|██████████| 1/1 [00:03<00:00,  3.87s/it]
Processing keys: 100%|██████████| 1/1 [00:03<00:00,  3.88s/it]

['/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/Keyframes/L01_extra/V001/026705.jpg', '/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/Keyframes/L01_extra/V001/026750.jpg', '/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/Keyframes/L01_extra/V001/026796.jpg', '/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/Keyframes/L01_extra/V001/026797.jpg', '/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/Keyframes/L01_extra/V001/026810.jpg', '/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/Keyframes/L01_extra/V001/026824.jpg', '/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extraction/transnet/Keyframes/L01_extra/V001/026838.jpg', '/home/jiggle/personal/competition/hcm-ai/Pipeline_HCM_AI/notebooks/data_extractio


