# Importing Packages

In [92]:
import os
import shutil
from pathlib import Path
import re
import math
from typing import Tuple, Union
from tqdm import tqdm

import numpy as np

import cv2
from PIL import Image
import pytesseract
from deskew import determine_skew

<h3>Setting Environment Paths</h3>

In [2]:
directory = Path(os.path.abspath('')).parents[0]
temp_data_dir = os.path.join(directory,"data/temp")
sample_data_dir = os.path.join(directory,"data/rvl_cdip_1000_samples")

# Preprocessing


<h3> Orientation Correction</h3>

In [115]:
def rotate_image(image_path,output_image_path):
    
    image = cv2.imread(image_path)
    try:
        osd_result = pytesseract.image_to_osd(Image.open(image_path),config='--psm 0 -c min_characters_to_try=5')
        
        
        # GET OSD Results 
        angle = re.search(r'Orientation in degrees: \d+', osd_result).group().split(':')[-1].strip()
        confidence= re.search(r'Orientation confidence: \d+\.\d+', osd_result).group().split(':')[-1].strip()
        confidence = float(confidence)
        
    except Exception as e:
        angle = 0 
        confidence = 0
        
    
    # Rotating images based on OSD RESULTS
    if angle=='90':
        rotated_image = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)    
    if angle=='180' and confidence :
        rotated_image = cv2.rotate(image, cv2.ROTATE_180)
    if angle=='270':
        rotated_image = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    else:
        rotated_image = image

    cv2.imwrite(output_image_path,rotated_image)
    


<h3> Binarization </h3>

In [116]:
def binarize_image(image_path,output_image_path, threshold=128):
    image = cv2.imread(image_path)
    # Convert the image to grayscale 
    if len(image.shape) == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply binary thresholding
    _, binary_image = cv2.threshold(image, threshold, 255, cv2.THRESH_BINARY)

    cv2.imwrite(output_image_path,binary_image)

<h3> Deniosing </h3>

In [117]:
def remove_noise(image_path,output_image_path):
    image = cv2.imread(image_path)
    # if len(image.shape) == 3:
    #     image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    
    denoised_image = cv2.fastNlMeansDenoising(image, None, 20, 7, 21)
    cv2.imwrite(output_image_path,denoised_image)
 

<h3>Deskew</h3>

In [118]:
# Reference : https://pypi.org/project/deskew/


def adjust_skew(image: np.ndarray, angle: float, background: Union[int, Tuple[int, int, int]]) -> np.ndarray:
    old_width, old_height = image.shape[:2]
    angle_radian = math.radians(angle)
    width = abs(np.sin(angle_radian) * old_height) + abs(np.cos(angle_radian) * old_width)
    height = abs(np.sin(angle_radian) * old_width) + abs(np.cos(angle_radian) * old_height)

    image_center = tuple(np.array(image.shape[1::-1]) / 2)
    rot_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)
    rot_mat[1, 2] += (width - old_width) / 2
    rot_mat[0, 2] += (height - old_height) / 2
    return cv2.warpAffine(image, rot_mat, (int(round(height)), int(round(width))), borderValue=background)


def deskew(image_path,output_image_path):
    image = cv2.imread(image_path)
    angle = determine_skew(image)
    adjusted_skew_image = adjust_skew(image, angle, (0, 0, 0))
    cv2.imwrite(output_image_path,adjusted_skew_image)

<h3>Normalization </h3>



In [119]:
def normalize_image(image_path,output_image_path):
    image = cv2.imread(image_path)
    norm_img = np.zeros((image.shape[0], image.shape[1]))
    normalized_image = cv2.normalize(image, norm_img, 0, 255, cv2.NORM_MINMAX)
    cv2.imwrite(output_image_path,normalized_image)


<h3> Image Scaling </h3>

In [31]:
def scale_image(file_path,output_file):
    im = Image.open(file_path)
    length_x, width_y = im.size
    factor = min(1, float(1024.0 / length_x))
    size = int(factor * length_x), int(factor * width_y)
    scaled_image = im.resize(size, Image.Resampling.LANCZOS)
    scaled_image.save(output_file, dpi=(300, 300))


<h3> Removing Lines </h3>

In [44]:
# This function removes Vertical and Horizontal Lines in an Image, like Tables and Borders 
# It is a sensitive function as reducing with higher strength might lead to misspellings. like H can become  l l 
# Source: https://stackoverflow.com/questions/57961119/how-to-remove-all-the-detected-lines-from-the-original-image-using-python

def remove_lines(image_path,output_image_path):
    image = cv2.imread(image_path)

    kernel_vertical = cv2.getStructuringElement(cv2.MORPH_RECT, (1,40))
    remove_vertical = 255 - cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel_vertical)

    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40,1))
    remove_horizontal = 255 - cv2.morphologyEx(image, cv2.MORPH_CLOSE, horizontal_kernel)

    remove_both = cv2.add(remove_vertical, remove_horizontal)
    removed_lines_image = cv2.add(remove_both, image)
    cv2.imwrite(output_image_path,removed_lines_image)

<h3> Enchance Contrast </h3>

In [46]:
def enhance_contrast(image_path,output_image_path):
    # Read the image
    original_image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # Apply histogram equalization
    equalized_image = cv2.equalizeHist(original_image,)
    
    cv2.imwrite(output_image_path,equalized_image)


<h3> Image Segmentation</h3>

In [131]:
def extract_text_from_image(image):
    # Convert the OpenCV image to a format that Tesseract expects (PIL Image)
    pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    # Use Tesseract to do OCR on the image
    text = pytesseract.image_to_string(pil_image, config='--psm 6')

    return text


def get_text_from_boxes(image_path,output_image_path):

    temp_data_dir = os.path.join(directory,"data/temp")

    image = cv2.imread(image_path)
    base_image = image.copy()

    #grayscale
    gray = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)
    cv2.imwrite(temp_data_dir+"/boxes/sample_gray.png", gray)

    #resize
    im = cv2.resize(image,None,fx=4, fy=4, interpolation = cv2.INTER_CUBIC)

    # #binary
    ret,thresh = cv2.threshold(gray,127,255,cv2.THRESH_BINARY_INV)
    cv2.imwrite(temp_data_dir+"/boxes/sample_threshold.png", thresh)

    # #dilation
    kernel = np.ones((5,50), np.uint8)
    img_dilation = cv2.dilate(thresh, kernel, iterations=1)
    cv2.imwrite(temp_data_dir+"/boxes/sample_dilated.png", img_dilation)


    # #find contours
    # contours, hierarchy
    ctrs, hier = cv2.findContours(img_dilation.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    #sort contours
    sorted_ctrs = sorted(ctrs, key=lambda ctr: cv2.boundingRect(ctr)[1])
    # sorted(cnts, key=lambda x: cv2.boundingRect(x)[1])

    text_results = []
    for i, ctr in enumerate(sorted_ctrs):
        # Get bounding box
        x, y, w, h = cv2.boundingRect(ctr)

        # Getting ROI
        roi = base_image[y:y+h, x:x+w]
        text_results.append(extract_text_from_image(roi))

        # cv2.imshow('segment no:'+str(i),roi)
        cv2.rectangle(image,(x,y),( x + w, y + h ),(90,0,255),2)

    cv2.imwrite(output_image_path, image)
    
    total_text_extracted =  "".join(text_results).replace("\n",".")
    return total_text_extracted



# MAIN

<h3> Util Functions</h3>


In [132]:
def delete_all_files(directory_path : os.path ):
    for item in os.listdir(directory_path):
        item_path = os.path.join(directory_path, item)

        if os.path.isfile(item_path):
            os.remove(item_path)

        elif os.path.isdir(item_path):
            shutil.rmtree(item_path)

In [133]:
def preprocess_single_image(image_path,output_folder_path):
    
    each_image = image_path.split("/")[-1]
    image_folder_name =  each_image.replace(".png","")
    each_image_folder = os.path.join(output_folder_path,image_folder_name)
    
    if not os.path.exists(each_image_folder):
        os.mkdir(each_image_folder)

    
    # Rotating the Image
    rotated_image_path = os.path.join(each_image_folder,each_image.replace(".png","_rotated.png"))
    rotate_image(image_path=image_path,output_image_path=rotated_image_path)
    # print(rotated_image_path)

    # Binarize Image
    binary_image_path = os.path.join(each_image_folder,each_image.replace(".png","_binary.png"))
    binarize_image(rotated_image_path,binary_image_path)
    # print(binary_image_path)

    # Denoise Image
    denoised_image_path = os.path.join(each_image_folder,each_image.replace(".png","_denoised.png"))
    # remove_noise(binary_image_path,denoised_image_path)
    remove_noise(rotated_image_path,denoised_image_path)
    # print(denoised_image_path)



    # Desknew Image
    deskewed_image_path = os.path.join(each_image_folder,each_image.replace(".png","_deskewed.png"))
    remove_noise(denoised_image_path,deskewed_image_path)
    # print(deskewed_image_path)

    # Enchance Contrast
    contrast_corrected_image_path = os.path.join(each_image_folder,each_image.replace(".png","_contrast.png"))
    enhance_contrast(denoised_image_path,contrast_corrected_image_path)
    # print(contrast_corrected_image_path)



    # Normalize Image
    normalized_image_path = os.path.join(each_image_folder,each_image.replace(".png","_normalized.png"))
    normalize_image(deskewed_image_path,normalized_image_path)
    # normalize_image(contrast_corrected_image_path,normalized_image_path)
    # print(normalized_image_path)

    # Scaling Image
    scaled_image_path = os.path.join(each_image_folder,each_image.replace(".png","_scaled.png"))
    scale_image(normalized_image_path,scaled_image_path)
    # print(scaled_image_path)

    # Removing Dark Lines in Image
    line_removed_image_path = os.path.join(each_image_folder,each_image.replace(".png","_lines.png"))
    remove_lines(scaled_image_path,line_removed_image_path)
    # print(line_removed_image_path)

     # Extract Text from Images Bounded Boxes
    boxed_image_path = os.path.join(each_image_folder,each_image.replace(".png","_boxed.png"))
    extracted_text = get_text_from_boxes(line_removed_image_path,boxed_image_path)

    return extracted_text




In [134]:
directory = Path(os.path.abspath('')).parents[0]
sample_data_dir = os.path.join(directory,"data/rvl_cdip_1000_samples")
# output_folder_dir = os.path.join(directory,"data/samples/sample_image")
output_folder_dir = os.path.join(directory,"data/temp")

# image_path = os.path.join(sample_data_dir,"1/image_0121.png")
image_path = os.path.join(sample_data_dir,"6/image_0732.png")
# image_path = os.path.join(sample_data_dir,"15/image_0756.png")
# image_path = os.path.join(sample_data_dir,"15/image_0660.png")
delete_all_files(output_folder_dir)

preprocess_single_image(image_path,output_folder_dir)



In [135]:
directory = Path(os.path.abspath('')).parents[0]
sample_data_dir = os.path.join(directory,"data/rvl_cdip_1000_samples")
temp_data_dir = os.path.join(directory,"data/temp")
processed_data_dir = os.path.join(directory,"data/processed")
output_folder_dir = os.path.join(directory,"data/temp")

ocr_dataset = []


for each_class in tqdm(sorted(os.listdir(sample_data_dir))):
    # Not able to remove ".DS_Store" hidden file, therefore using if clause
    if each_class != ".DS_Store":
        class_dir = os.path.join(sample_data_dir,each_class)
        class_folder_path = os.path.join(processed_data_dir,each_class)
        

        if not os.path.exists(class_folder_path):
            os.makedirs(class_folder_path)

        for each_image in os.listdir(class_dir):
            image_data = {}
            image_data["image"] = each_image
            image_data["image_class"] = each_class

            image_folder_name = each_class + "_" +each_image.replace(".png","")
            each_image_folder = os.path.join(temp_data_dir,image_folder_name)
            
            image_path = os.path.join(class_dir,each_image)
            print(image_path)

            exracted_text = preprocess_single_image(image_path,output_folder_dir)
            image_data['text'] = exracted_text
            ocr_dataset.append(image_data)

  0%|          | 0/16 [00:00<?, ?it/s]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/0/image_0794.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/0/image_0757.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/0/image_0395.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/0/image_0427.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/0/image_0552.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/0/image_0036.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/0/image_0535.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/0/image_0534.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/0/image_0050.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/0/image_0722.png
/Users/dip

  6%|▋         | 1/16 [02:00<30:08, 120.58s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/1/image_0595.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/1/image_0218.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/1/image_0782.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/1/image_0168.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/1/image_0790.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/1/image_0182.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/1/image_0380.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/1/image_0591.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/1/image_0988.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/1/image_0793.png
/Users/dip

 12%|█▎        | 2/16 [05:59<44:23, 190.26s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/10/image_0033.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/10/image_0345.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/10/image_0351.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/10/image_0435.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/10/image_0192.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/10/image_0145.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/10/image_0019.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/10/image_0596.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/10/image_0018.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/10/image_0622.png


 19%|█▉        | 3/16 [10:15<47:43, 220.28s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/11/image_0378.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/11/image_0556.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/11/image_0637.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/11/image_0031.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/11/image_0227.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/11/image_0742.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/11/image_0959.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/11/image_0626.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/11/image_0154.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/11/image_0864.png


 25%|██▌       | 4/16 [14:21<46:06, 230.57s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/12/image_0393.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/12/image_0350.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/12/image_0191.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/12/image_0608.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/12/image_0755.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/12/image_0966.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/12/image_0621.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/12/image_0635.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/12/image_0379.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/12/image_0347.png


 31%|███▏      | 5/16 [18:25<43:06, 235.18s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/13/image_0146.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/13/image_0973.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/13/image_0797.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/13/image_0542.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/13/image_0437.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/13/image_0226.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/13/image_0408.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/13/image_0829.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/13/image_0815.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/13/image_0949.png


 38%|███▊      | 6/16 [23:11<42:06, 252.66s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/14/image_0741.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/14/image_0384.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/14/image_0804.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/14/image_0540.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/14/image_0971.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/14/image_0385.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/14/image_0791.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/14/image_0433.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/14/image_0194.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/14/image_0590.png


 44%|████▍     | 7/16 [26:57<36:33, 243.76s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/15/image_0344.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/15/image_0813.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/15/image_0152.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/15/image_0580.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/15/image_0999.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/15/image_0769.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/15/image_0423.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/15/image_0838.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/15/image_0743.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/15/image_0555.png


 50%|█████     | 8/16 [31:11<32:57, 247.17s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/2/image_0967.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/2/image_0224.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/2/image_0623.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/2/image_0025.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/2/image_0569.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/2/image_0965.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/2/image_0811.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/2/image_0434.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/2/image_0381.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/2/image_0418.png
/Users/dip

 56%|█████▋    | 9/16 [32:45<23:14, 199.17s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/3/image_0620.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/3/image_0768.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/3/image_0032.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/3/image_0972.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/3/image_0184.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/3/image_0806.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/3/image_0186.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/3/image_0964.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/3/image_0030.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/3/image_0424.png
/Users/dip

 62%|██████▎   | 10/16 [35:25<18:41, 186.98s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/4/image_0147.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/4/image_0812.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/4/image_0583.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/4/image_0150.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/4/image_0144.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/4/image_0839.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/4/image_0578.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/4/image_0419.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/4/image_0343.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/4/image_0425.png
/Users/dip

 69%|██████▉   | 11/16 [37:30<14:00, 168.11s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/5/image_0740.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/5/image_0230.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/5/image_0231.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/5/image_0219.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/5/image_0153.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/5/image_0409.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/5/image_0421.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/5/image_0780.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/5/image_0183.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/5/image_0975.png
/Users/dip

 75%|███████▌  | 12/16 [41:12<12:17, 184.42s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/6/image_0998.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/6/image_0754.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/6/image_0557.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/6/image_0594.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/6/image_0390.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/6/image_0178.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/6/image_0034.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/6/image_0551.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/6/image_0586.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/6/image_0627.png
/Users/dip

 81%|████████▏ | 13/16 [45:07<09:59, 199.78s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/7/image_0783.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/7/image_0190.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/7/image_0151.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/7/image_0970.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/7/image_0582.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/7/image_0233.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/7/image_0541.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/7/image_0805.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/7/image_0020.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/7/image_0746.png
/Users/dip

 88%|████████▊ | 14/16 [49:13<07:07, 213.83s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/8/image_0436.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/8/image_0422.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/8/image_0185.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/8/image_0634.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/8/image_0581.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/8/image_0027.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/8/image_0609.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/8/image_0386.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/8/image_0353.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/8/image_0958.png
/Users/dip

 94%|█████████▍| 15/16 [51:32<03:11, 191.34s/it]

/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/9/image_0387.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/9/image_0807.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/9/image_0026.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/9/image_0543.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/9/image_0225.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/9/image_0796.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/9/image_0392.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/9/image_0140.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/9/image_0974.png
/Users/dipanjanchowdhury/Documents/Labs/raft-interview/data/rvl_cdip_1000_samples/9/image_0155.png
/Users/dip

100%|██████████| 16/16 [53:38<00:00, 201.14s/it]


In [136]:
import pandas as pd

data_df = pd.DataFrame(ocr_dataset)

In [138]:
datasets_data_dir = os.path.join(directory,"data/datasets")

data_df.to_csv(datasets_data_dir+"/extracted_dataset.csv",index= False)

In [137]:
data_df.head()

Unnamed: 0,image,image_class,text
0,image_0794.png,0,nairo |.Bis {NBIFO Institut fiir biologische F...
1,image_0757.png,0,Aqua L “Risa.2} 30/48.Ro).SMEy.PHILIP MORRIS.‘...
2,image_0395.png,0,fa) Re fer pos.CG.NOC.Ne.(sR).jae.R.J. REYNOLD...
3,image_0427.png,0,c.0.P.TY.Dr. Murco HN. Roegholt.Baarn 2h<9+57....
4,image_0552.png,0,in.GALLAHER LIMITED.nO aaa.g HINGEWAY + LONDON...
