In [13]:
import pandas as pd
import numpy as np

import glob
import os

In [14]:
# Directory into which the per-receipt CSV files produced below are written.
PROCESSED_PATH='./Data/ICDAR_SROIE/processed/'

In [15]:
def index_files_by_stem(pattern):
    """
    Build a {file stem: path} dict for every file matching the glob `pattern`.

    The stem is the base name with only its final extension removed
    (os.path.splitext), so a name that happens to contain the extension text
    elsewhere is not mangled. Files whose stem ends with ")" are skipped
    (presumably duplicate copies such as "name(1)" -- verify against the dataset).

    Paths are visited in sorted order so the dict has a reproducible iteration
    order; glob itself returns matches in arbitrary order.
    """
    indexed = {}
    for path in sorted(glob.glob(pattern)):
        stem = os.path.splitext(os.path.basename(path))[0]
        if not stem.endswith(")"):
            indexed[stem] = path
    return indexed


# Receipt images and OCR text files come from the task1 folder, label files from the task2 folder.
receipt_train_img = index_files_by_stem("./Data/ICDAR_SROIE/0325updated.task1train(626p)/*.jpg")
ocr_data = index_files_by_stem("./Data/ICDAR_SROIE/0325updated.task1train(626p)/*.txt")
label_data = index_files_by_stem("./Data/ICDAR_SROIE/0325updated.task2train(626p)/*.txt")


In [16]:
# Check that all three sets cover exactly the same receipts, not merely the same
# number of them: each key is later used to look up both its OCR file and its label file.
assert receipt_train_img.keys() == ocr_data.keys() == label_data.keys(), (
    "image / OCR / label files do not cover the same set of receipts"
)

In [17]:
import json
from fuzzywuzzy import fuzz

def extract_ocr_data_fromtxt(file_path,key,save=False):
    """
    Extract the bounding box coordinates and text from an OCR txt file.

    Every non-empty line is split on commas: fields 0-1 become xmin/ymin,
    fields 4-5 become xmax/ymax, and everything from field 8 onwards is
    re-joined with commas as the text, so text containing commas stays intact.
    (Presumably the first eight fields are four corner points -- confirm
    against the dataset description.)

    Parameters
    ----------
    file_path : path of the txt file to read.
    key : base name (without extension) of the CSV written when `save` is true.
    save : when true, also write the frame to PROCESSED_PATH/<key>.csv.

    Returns
    -------
    pandas.DataFrame with columns xmin, ymin, xmax, ymax, text; all values
    are the raw strings read from the file.
    """
    rows = []
    with open(file_path,'r') as in_file:
        for raw_line in in_file:
            line = raw_line.strip()
            if not line:
                continue
            # Split once and reuse the pieces for every column.
            fields = line.split(",")
            rows.append(fields[:2] + fields[4:6] + [",".join(fields[8:])])

    df = pd.DataFrame(rows, columns=['xmin','ymin','xmax','ymax','text'])

    # Option to save as a csv. The write must happen whether or not the folder
    # already existed, so it is not nested under the directory check.
    if save:
        os.makedirs(PROCESSED_PATH, exist_ok=True)
        df.to_csv(os.path.join(PROCESSED_PATH, key + '.csv'), index=False)
    return df


def extract_label_data_fromtxt(file_path):
    """
    Parse the JSON label file at `file_path` and return the decoded content
    (a dictionary, for the label files this notebook works with).
    """
    with open(file_path) as label_file:
        raw_json = label_file.read()
    return json.loads(raw_json)



def map_labels(text,k):
    """
    Decide whether the OCR string `text` corresponds to the label value `k`.

    Rules, checked in order:
      * neither value parses as a number -> fuzzy match (token_set_ratio above 90)
      * both parse as numbers            -> the two strings must be identical
      * only `k` parses as a number      -> `text` must end with `k`
        (OCR output sometimes has symbols attached in front of the amount)
      * anything else                    -> no match
    """
    def _to_float(value):
        # None signals "not a number"; any conversion failure counts as that.
        try:
            return float(value)
        except Exception:
            return None

    text_number = _to_float(text)
    label_number = _to_float(k)

    # Both are plain text: fall back to a fuzzy comparison.
    if pd.isnull(text_number) and pd.isnull(label_number):
        return fuzz.token_set_ratio(text,k) > 90

    # Both are numerical: only an exact string match counts.
    if text_number is not None and label_number is not None:
        return text == k

    # Numeric label inside a non-numeric OCR string (e.g. the total).
    if label_number is not None and text_number is None:
        return text.endswith(k)

    return False
        

In [18]:
def mapped_label_ocr(key):
    """
    Load the OCR lines and the labels for receipt `key`, tag every OCR line
    with the label it matches, write the result to PROCESSED_PATH/<key>.csv
    and return the resulting dataframe.
    """
    ocr_frame = extract_ocr_data_fromtxt(ocr_data[key],key)
    label_values = extract_label_data_fromtxt(label_data[key])

    line_labels = []
    for line_text in ocr_frame.text:
        matches = [name for name, value in label_values.items() if map_labels(line_text, value)]
        # A line matching several labels is tagged as address, to avoid
        # company and address overlap in some cases.
        if len(matches) > 1:
            line_labels.append('address')
        else:
            line_labels.append("".join(matches))
    ocr_frame['labels'] = line_labels

    if not os.path.exists(PROCESSED_PATH):
        os.mkdir(PROCESSED_PATH)
    csv_path = os.path.join(PROCESSED_PATH,key + '.csv')
    ocr_frame.to_csv(csv_path, index =None)

    return ocr_frame
    
    

In [19]:
# Run the label mapping for every OCR file; results are keyed by file stem.
mapped_data = dict((key, mapped_label_ocr(key)) for key in ocr_data)

In [20]:
# Let's plot some of these labels and see the results
import cv2
from matplotlib import pyplot as plt

# Folder for labelled receipt images (presumably filled by the plotting code that follows).
LABELLED_IMG = "./Data/ICDAR_SROIE/labelled_img/"
# makedirs with exist_ok also creates missing parent folders and is a no-op
# when the folder is already there, so no separate existence check is needed.
os.makedirs(LABELLED_IMG, exist_ok=True)

ModuleNotFoundError: No module named 'cv2'