In [None]:
!python --version

import cv2
import pytesseract

import os
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bisect import bisect


In [None]:
def myround(x, base=10):
    """Return the multiple of ``base`` nearest to ``x``.

    The quotient ``x / base`` is passed to the built-in ``round`` and the
    result is scaled back up by ``base``.
    """
    n_steps = round(x / base)
    return base * n_steps

def myceil(x, base=10):
    """Round ``x`` up to the next multiple of ``base``.

    The quotient is rounded up with ``np.ceil``, so the result is a NumPy
    floating-point value; callers that need an ``int`` must convert it.
    """
    n_steps = np.ceil(x / base)
    return base * n_steps

In [None]:
# Directory the input image is read from / directory the CSV is written to.
ipath, opath = "img_data/", "output/"

In [None]:
# Load the receipt image and record its dimensions.
filename = os.path.join(ipath, "receipt3_1.png")
img = cv2.imread(filename)

# cv2.imread does not raise on a missing/unreadable file; it returns None.
# Fail here with a clear message instead of an AttributeError on `img.shape`.
if img is None:
    raise FileNotFoundError("Could not read image file: {!r}".format(filename))

# height, width and number of colour channels of the loaded array
hh, ww, cc = img.shape

In [None]:
# Preview the image with a labelled tick grid so that the ROI and the column
# separator positions can be read off in pixel coordinates.
fig1, ax1 = plt.subplots(figsize=(16, 12))
# The array comes from cv2.imread (BGR channel order) while matplotlib's
# imshow interprets 3-channel arrays as RGB, so convert for display only.
ax1.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

# ticks every 200 px, up to the image size rounded up to the next 1000
xticks = range(0, int(myceil(ww, base=1000)), 200)
yticks = range(0, int(myceil(hh, base=1000)), 200)

ax1.set_xticks(xticks)
ax1.set_yticks(yticks)
ax1.set_xticklabels(xticks, rotation=90)
ax1.grid(which="both", axis="both")


In [None]:
# --- user input ---
y1, y2, x1, x2 = 1250, 6400, 200, 4600  # ROI edges: top, bottom, left, right
vlines = [2600, 3000, 3600, 4200]       # x positions of the column separators
conf_level = 35                         # assigned here; not read within this cell

# Replot the image with the ROI and the column separators overlaid.
fig1, ax1 = plt.subplots(figsize=(16, 12))
# The array comes from cv2.imread (BGR channel order) while matplotlib's
# imshow interprets 3-channel arrays as RGB, so convert for display only.
ax1.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

# ticks every 200 px, up to the image size rounded up to the next 1000
xticks = range(0, int(myceil(ww, base=1000)), 200)
yticks = range(0, int(myceil(hh, base=1000)), 200)

ax1.set_xticks(xticks)
ax1.set_yticks(yticks)
ax1.set_xticklabels(xticks, rotation=90)
ax1.grid(which="both", axis="both")

# draw ROI (blue rectangle)
ax1.plot([x1, x2], [y1, y1], 'b-', linewidth=2)
ax1.plot([x1, x2], [y2, y2], 'b-', linewidth=2)
ax1.plot([x1, x1], [y1, y2], 'b-', linewidth=2)
ax1.plot([x2, x2], [y1, y2], 'b-', linewidth=2)

# draw column separators (red), spanning the ROI height; one line each
for vline in vlines:
    ax1.plot([vline, vline], [y1, y2], 'r-', linewidth=1)

In [None]:
# Run Tesseract on the full image and collect its per-element output
# (position, size, confidence, text, ...) in a DataFrame.
config_option = "--psm 1"
# pytesseract assumes RGB channel order for NumPy input, whereas the array
# loaded by cv2.imread is BGR, so convert before handing it over.
d = pytesseract.image_to_data(
    cv2.cvtColor(img, cv2.COLOR_BGR2RGB),
    output_type=pytesseract.Output.DICT,
    lang="eng",
    config=config_option,
)

df = pd.DataFrame(data=d)
df.head(15)

In [None]:
# Keep only text objects ("level" == 5) whose top-left corner lies strictly
# inside the ROI, and remember their row labels in `df`.
inside_rows = (df["top"] > y1) & (df["top"] < y2)
inside_cols = (df["left"] > x1) & (df["left"] < x2)
is_text = df["level"] == 5

idxroi = df.loc[inside_rows & inside_cols & is_text].index


In [None]:
# Draw the bounding box of every detected text inside the ROI onto a copy of
# the image, leaving `img` itself untouched.
imgbb = img.copy()

linewidth = 2
r, g, b = 0, 255, 0

for idx in idxroi:
    x = df.at[idx, "left"]
    y = df.at[idx, "top"]
    w = df.at[idx, "width"]
    h = df.at[idx, "height"]
    # rectangle is given by its top-left and bottom-right corners
    imgbb = cv2.rectangle(imgbb, (x, y), (x + w, y + h), (r, g, b), linewidth)


In [None]:
# Show the image with the detected text bounding boxes, the ROI and the
# column separators.
fig, ax = plt.subplots(figsize=(20, 16))
# `imgbb` is a copy of the cv2-loaded image (BGR channel order) while
# matplotlib's imshow interprets 3-channel arrays as RGB; convert for display.
ax.imshow(cv2.cvtColor(imgbb, cv2.COLOR_BGR2RGB));

# draw ROI (blue rectangle)
ax.plot([x1, x2], [y1, y1], 'b-', linewidth=2);
ax.plot([x1, x2], [y2, y2], 'b-', linewidth=2);
ax.plot([x1, x1], [y1, y2], 'b-', linewidth=2);
ax.plot([x2, x2], [y1, y2], 'b-', linewidth=2);

# draw column separators (red), spanning the ROI height; one line each
for vline in vlines:
    ax.plot([vline, vline], [y1, y2], 'r-', linewidth=1);

ax.set_xticks(xticks);
ax.set_yticks(yticks);
ax.set_xticklabels(xticks, rotation=90);

In [None]:
# `idxroi` holds row *labels* of `df`, so select with the label-based .loc
# (.iloc is positional and only coincides with labels for a default index).
# reset_index() keeps the original label in an "index" column and gives the
# subset a fresh 0..n-1 index.
subset = df.loc[idxroi].reset_index()
subset.head(10)

In [None]:
# Use the user-supplied column separators to bin the detected texts into columns.
seplines = sorted(vlines + [x1, x2])
ncols = len(seplines) - 1
# bisect() returns the number of separators at or to the left of a text's
# left edge, i.e. the 1-based number of the column the text starts in.
col = [bisect(seplines, left) for left in subset["left"]]

# Remove the "block_num"/"par_num" fields and add "column"/"diff".
# drop() returns a new frame and errors="ignore" tolerates already-missing
# columns, so re-running this cell no longer raises a KeyError.
subset = subset.drop(columns=["block_num", "par_num"], errors="ignore")
subset["column"] = col

# change of the word counter relative to the previous row
subset["diff"] = subset["word_num"].diff()

subset.head(10)

In [None]:
# Split the ROI texts into text lines and write one CSV row per line, with the
# words of each line joined per column.

# A row of `subset` starts a new text line when the line counter changes or
# when the word counter does not increase. Looking only for a *drop* of
# word_num misses boundaries where the next line's first word number is not
# smaller than the previous row's (e.g. two consecutive single-word lines).
word_step = subset["word_num"].diff()
line_step = subset["line_num"].diff()
# The first row has NaN steps; NaN != 0 is True, so it is always a line start
# and no special case for the leading line is needed.
is_line_start = ((word_step <= 0) | (line_step != 0)).to_numpy()

istarts = np.flatnonzero(is_line_start)
iends = np.append(istarts[1:], len(subset))  # exclusive slice ends

# make sure the output directory exists before opening the file
os.makedirs(opath, exist_ok=True)

# newline="" lets the csv module control line endings (avoids blank rows on
# platforms that translate "\n"); the with-block closes the file on exit.
with open(os.path.join(opath, "receipt3_1.csv"), "w", newline="") as outfile:
    writer = csv.writer(outfile)

    for idxs, idxe in zip(istarts, iends):
        line = subset.iloc[idxs:idxe]

        # columns without any text keep the single-space placeholder
        csvline = [' '] * ncols
        for column, words in line.groupby("column")["text"]:
            # column numbers are 1-based, list positions 0-based
            csvline[column - 1] = ' '.join(words)

        print(csvline)
        writer.writerow(csvline)