# Setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/AIO-2024/pipeline-hcm-ai/notebooks

/content/drive/MyDrive/AIO-2024/pipeline-hcm-ai/notebooks


In [None]:
import torch
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): this plain string is reassigned as a ``torch.device`` in the
# main-execution cell further down.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Parameters
# Placeholders only: any value still unset (None) is given a default in the
# default-filling cell further down.
bs = None  # batch size handed to the OCR pipeline
keyframes_dir = None  # directory holding one sub-folder of .jpg keyframes per video
save_dir = None  # directory that receives one OCR JSON file per video

In [None]:
# Concrete values for this run; they replace the None placeholders above.
# NOTE(review): these are absolute Google Drive paths, so the notebook only
# runs unchanged where the drive is mounted at /content/drive with this layout.
bs = 16
keyframes_dir = "/content/drive/MyDrive/AIO-2024/pipeline-hcm-ai/notebooks/data_extraction/transnet/Keyframes"
save_dir = "/content/drive/MyDrive/AIO-2024/pipeline-hcm-ai/notebooks/data_extraction/metadata/ocr"

In [None]:
import os

dir_path = os.getcwd()

# Give every parameter that is still unset (None / empty / 0) a default.
if not keyframes_dir:
    # The shell's repr reveals a hosted runtime (Colab or Kaggle); both use
    # the same layout, with keyframes next to the working directory.
    shell_name = str(get_ipython())
    on_hosted_runtime = 'google.colab' in shell_name or 'kaggle' in shell_name
    if on_hosted_runtime:
        # Update this path as necessary
        keyframes_dir = f'{dir_path}/keyframes'
    else:
        # Elsewhere the keyframes are looked up under ../transnet/keyframes.
        parent_dir_path = os.path.dirname(dir_path)
        keyframes_dir = f'{parent_dir_path}/transnet/keyframes'

bs = bs or 16
save_dir = save_dir or './ocr'

In [None]:
! pip install aiohttp aiofiles

Collecting aiofiles
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Downloading aiofiles-24.1.0-py3-none-any.whl (15 kB)
Installing collected packages: aiofiles
Successfully installed aiofiles-24.1.0


In [None]:
! pip install aiofiles



In [None]:
! pip install git+https://github.com/JaidedAI/EasyOCR.git

Collecting git+https://github.com/JaidedAI/EasyOCR.git
  Cloning https://github.com/JaidedAI/EasyOCR.git to /tmp/pip-req-build-awfpdhuq
  Running command git clone --filter=blob:none --quiet https://github.com/JaidedAI/EasyOCR.git /tmp/pip-req-build-awfpdhuq
  Resolved https://github.com/JaidedAI/EasyOCR.git to commit 3d3852d8e4ddd9e5044a6f70b561ca7186d9cbac
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-bidi (from easyocr==1.7.1)
  Downloading python_bidi-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting pyclipper (from easyocr==1.7.1)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr==1.7.1)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)
Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
! pip install easyocr



In [None]:
! pip install transformers




In [None]:
import os
import json
import asyncio
import glob
from tqdm import tqdm
import easyocr
import aiofiles
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Parse data path

In [None]:
def parse_keyframe_info(keyframes_dir):
    """Map each video sub-directory of ``keyframes_dir`` to its keyframe paths.

    Args:
        keyframes_dir: Directory containing one sub-directory per video, each
            holding that video's ``.jpg`` keyframes.

    Returns:
        dict: ``{sub_directory_name: [jpg_path, ...]}``. Keys are inserted in
        sorted order and every path list is sorted. A sub-directory without
        ``.jpg`` files maps to an empty list.

    Raises:
        OSError: If ``keyframes_dir`` cannot be listed (raised by
            ``os.listdir``).
    """
    all_keyframe_paths = {}
    for part in sorted(os.listdir(keyframes_dir)):
        data_part_path = f'{keyframes_dir}/{part}'
        # Only directories are videos. A stray file next to them (for example
        # ``.DS_Store``) would otherwise become a key with an empty list.
        if not os.path.isdir(data_part_path):
            continue
        # Escape the directory part so glob metacharacters in a folder name
        # (``[``, ``*``, ``?``) are matched literally instead of as a pattern.
        pattern = f'{glob.escape(data_part_path)}/*.jpg'
        all_keyframe_paths[part] = sorted(glob.glob(pattern))
    return all_keyframe_paths

# Inference

In [16]:
async def create_directory(directory):
    """Make sure ``directory`` exists, creating missing parent folders too.

    The function is a coroutine so callers can ``await`` it, but the
    ``os.makedirs`` call inside runs synchronously. A directory that already
    exists is left as it is.
    """
    os.makedirs(name=directory, exist_ok=True)

async def save_ocr_results(save_dir, key, video_ocr_results):
    """Write one video's OCR results to ``<save_dir>/<key>.json``.

    Args:
        save_dir: Existing directory that receives the JSON file.
        key: File stem; the output file is named ``<key>.json``.
        video_ocr_results: JSON-serialisable object to store.

    Raises:
        TypeError: If ``video_ocr_results`` is not JSON-serialisable (raised
            by ``json.dumps`` before any file is opened).
        OSError: If the file cannot be opened or written.
    """
    filename = f"{save_dir}/{key}.json"
    # Serialise first so a serialisation error cannot leave a truncated file.
    payload = json.dumps(video_ocr_results, ensure_ascii=False, indent=2)
    # ensure_ascii=False keeps non-ASCII characters unescaped, so the encoding
    # is pinned to UTF-8 instead of depending on the platform default.
    async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
        await f.write(payload)

async def ocr_and_save_results(reader, tokenizer, model, device, all_keyframe_paths, save_dir, batch_size=16):
    """Create ``save_dir``; no OCR, translation, or saving happens in this body.

    NOTE(review): this is a truncated stub. A second ``ocr_and_save_results``
    with the same signature is defined further down in this cell and replaces
    this one when the cell runs. Consider deleting this definition.
    """
    await create_directory(save_dir)

async def translate_text(text, tokenizer, model, device):
    """Translate ``text`` with the given seq2seq model and return the result.

    The text is tokenised into PyTorch tensors, moved to ``device``, decoded
    with 5-beam search starting from the tokenizer's ``en_XX`` language code,
    and the first decoded sequence is returned with special tokens stripped.
    The coroutine never suspends: generation runs synchronously inside it.
    """
    encoded = tokenizer(text, return_tensors="pt")
    source_ids = encoded.input_ids.to(device)

    generation_options = {
        "decoder_start_token_id": tokenizer.lang_code_to_id["en_XX"],
        "num_return_sequences": 1,
        "num_beams": 5,
        "early_stopping": True,
    }
    generated_ids = model.generate(source_ids, **generation_options)

    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

async def process_image(reader, image_path, min_confidence=0.6):
    """Run OCR on one image and return its text grouped into paragraphs.

    Args:
        reader: OCR reader exposing ``readtext(image_path)``. Each detection
            it returns is indexed here as ``item[1]`` (text) and ``item[2]``
            (confidence score).
        image_path: Path of the image to read.
        min_confidence: Detections whose confidence is not strictly greater
            than this value are dropped. Defaults to ``0.6``.

    Returns:
        list: One text string per paragraph produced by
        ``easyocr.utils.get_paragraph``; empty when nothing passes the
        confidence filter.
    """
    # readtext is a blocking call, so it is pushed to a worker thread to keep
    # the event loop free for other tasks.
    detections = await asyncio.to_thread(reader.readtext, image_path)
    confident = [item for item in detections if item[2] > min_confidence]
    paragraphs = easyocr.utils.get_paragraph(confident)
    return [item[1] for item in paragraphs]

async def process_video_keyframes(reader, tokenizer, model, device, video_keyframe_paths, batch_size=16):
    """OCR every keyframe of one video and translate the detected text.

    Args:
        reader: OCR reader forwarded to ``process_image``.
        tokenizer: Tokenizer forwarded to ``translate_text``.
        model: Translation model forwarded to ``translate_text``.
        device: Device forwarded to ``translate_text``.
        video_keyframe_paths: Image paths of the video's keyframes.
        batch_size: Maximum number of OCR calls allowed in flight at once.

    Returns:
        dict: ``{frame_file_name: [translated_text, ...]}`` in the order of
        ``video_keyframe_paths``. Frames without detected text are omitted.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Scheduling one task per image with no limit made ``batch_size`` a no-op.
    # The semaphore caps concurrent OCR calls at ``batch_size`` while still
    # letting OCR of later frames overlap with translation of earlier ones.
    ocr_slots = asyncio.Semaphore(batch_size)

    async def _ocr_with_slot(image_path):
        # Hold one slot for the duration of a single OCR call.
        async with ocr_slots:
            return await process_image(reader, image_path)

    pending = [
        (os.path.basename(image_path), asyncio.create_task(_ocr_with_slot(image_path)))
        for image_path in video_keyframe_paths
    ]

    video_ocr_results = {}
    try:
        for frame_name, task in pending:
            text_detected = await task
            if not text_detected:
                continue
            # All paragraphs of a frame are translated in one call, joined by
            # a separator that is split on again afterwards.
            # NOTE(review): this assumes the translation model reproduces
            # " ||| " verbatim; confirm, or translate each paragraph separately.
            joined_text = " ||| ".join(text_detected)
            translated_text = await translate_text(joined_text, tokenizer, model, device)
            video_ocr_results[frame_name] = translated_text.split(" ||| ")
    finally:
        # On an error, stop OCR tasks that are still outstanding instead of
        # orphaning them; cancelling a finished task is a no-op.
        for _, task in pending:
            task.cancel()

    return video_ocr_results

async def ocr_and_save_results(reader, tokenizer, model, device, all_keyframe_paths, save_dir, batch_size=16):
    """Run OCR plus translation for every video and store one JSON file each.

    ``save_dir`` is created first. The keys of ``all_keyframe_paths`` are then
    visited in sorted order behind a progress bar; each key's keyframes go
    through ``process_video_keyframes`` and the result is handed to
    ``save_ocr_results`` before the next key starts.
    """
    await create_directory(save_dir)

    ordered_keys = sorted(all_keyframe_paths.keys())
    progress = tqdm(ordered_keys, desc="Processing keys")
    for video_key in progress:
        ocr_by_frame = await process_video_keyframes(
            reader,
            tokenizer,
            model,
            device,
            all_keyframe_paths[video_key],
            batch_size,
        )
        await save_ocr_results(save_dir, video_key, ocr_by_frame)

In [17]:
# Main execution
all_keyframe_paths = parse_keyframe_info(keyframes_dir)
reader = easyocr.Reader(['vi'], gpu=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("vinai/vinai-translate-vi2en-v2", src_lang="vi_VN")
model = AutoModelForSeq2SeqLM.from_pretrained("vinai/vinai-translate-vi2en-v2").to(device)

await ocr_and_save_results(reader, tokenizer, model, device, all_keyframe_paths, save_dir, bs)

  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
Processing keys: 100%|██████████| 4/4 [13:05<00:00, 196.33s/it]
