## Pytesseract

In [None]:
!sudo apt install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 32 not upgraded.
Need to get 4,795 kB of archives.
After this operation, 15.8 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-eng all 4.00~git24-0e00fe6-1.2 [1,588 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-osd all 4.00~git24-0e00fe6-1.2 [2,989 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr amd64 4.00~git2288-10f4998a-2 [218 kB]
Fetched 4,795 kB in 1s (3,566 kB/s)
debconf: unable to initi

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pytesseract, shutil, os, cv2, re
from glob import glob
from tqdm import tqdm
from statistics import mean, median, stdev
from tabulate import tabulate
from sklearn.cluster import AgglomerativeClustering
from google.colab.patches import cv2_imshow

sns.set_theme(style = "ticks",
              palette = sns.dark_palette("seagreen", reverse=True))

print(f"OpenCV version: {cv2.__version__}")
print(f"PyTesseract version: {pytesseract.__version__}")

OpenCV version: 4.6.0
PyTesseract version: 0.3.10


In [None]:
# Input and output directories
os.mkdir('data/')
os.mkdir('output/')
os.mkdir('output/png/')
os.mkdir('output/csv/')

In [None]:
images = glob('data/*.png')
images

['data/Rent_Report_-_July_2020.width-720-1.png',
 'data/Rent_Report_-_April_2020.width-720-469x1024.png',
 'data/Rent_Report_-_April_2019.width-720-431x1024.png',
 'data/Rent_Report_-_May_2022_.width-720.png',
 'data/Rent_Report_-_October_2020.width-720.png',
 'data/Rent-Report-November-2020-1-1-e1611353009250.png',
 'data/rent_report_-_January_2022_2.width-720.png',
 'data/Rent_Report_-_May_2020.width-720-1-439x1024.png',
 'data/Rentals.ca_-_Rent_Report_-_January_2019.width-720-483x1024.png',
 'data/Rent_Report_-_May_2019.width-720-431x1024.png',
 'data/Rent_Report_-_February_2020.width-720.png',
 'data/Rent_Report_-_August_2020.width-720.png',
 'data/Rent_Report_-_March_2021.width-720.png',
 'data/Rent_Report_-_February_2022_2.width-720.png',
 'data/rent_report_-_November_2021.width-720.png',
 'data/Rent_Report_-_December_2021.width-720.png',
 'data/Rent_Report_Graphic__-_August_2022_2.width-720.png',
 'data/Rent_Report_-_April_2021_1.width-720.png',
 'data/Rent_Report_-_September_20

In [None]:
def preprocess(image,
               resize = False,
               preserve_ar = True,
               grayscale = False,
               gaussian_blur = False,
               thresholding = False,
               thresh_value = 127,
               invert_output = False,
               verbose = True):
  '''
  Preprocess image object input with:
  image: image input file path;
  resize: Resize to desired width and height dimensions. Takes arguments tuple
    (width, height), single Integer as target width or false boolean. Will
    inforce aspect ratio based on passed target width if preserve_ar argument is
    set to True. Default = False. Default = True if resize argument is integer;
  preserve_ar: Boolean argument to preserve original image's Aspect Ratio or
    redefine based on 'resize' input. Default = True;
  grayscale: OpenCV grayscaling. Takes argument boolean = True or False.
    Default = False;
  gaussian_blur: Smooth image input with a gaussian blurring method. Takes
    arguments Integer kernel size or false boolean. Default = False;
  thresholding: OpenCV simple thresholding. Takes arguments [binary, binary_inv]
    or false boolean. Default = False;
  thresh_value: OpenCV threshold value. Takes argument Int. Default = 127;
  invert_output: Boolean argument to invert output binary image. Default = False;
  '''
  # Image load and input dimensions
  input_file = image
  image = cv2.imread(image)
  input_height = int(image.shape[0])
  input_width = int(image.shape[1])
  aspect_ratio = input_height/input_width
  
  if verbose:
    print(f"Processing input file: {input_file}...")

  # Resizing
  if type(resize) == int:
    resize = (resize,)

  if resize:
    if preserve_ar:
      image = cv2.resize(image, (resize[0], int(resize[0]*aspect_ratio)))
    else:
      image = cv2.resize(image, (resize[0], input_height))
  
  output_height = int(image.shape[0])
  output_width = int(image.shape[1])

  # Gray-scaling
  if grayscale:
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

  # Blurring
  if gaussian_blur:
    image = cv2.GaussianBlur(image, (5, 5), gaussian_blur)

  # Thresholding
  if thresholding:
    if thresholding == "binary":
      image = cv2.threshold(image, thresh_value, 255, cv2.THRESH_BINARY_INV)[1]
    elif thresholding == "binary_inv":
      image = cv2.threshold(image, thresh_value, 255, cv2.THRESH_BINARY_INV)[1]
    else:
      print("Invalid thresholding argument!")
  
  # Inverting binary image
  if invert_output:
    image = np.invert(image)

  if verbose:
    print(f"Image input dimensions: {(input_width, input_height)}\n"\
          f"Image output dimensions: {(output_width, output_height)}\n")
  return image

In [None]:
def draw_table(image,
               pytesseract_config = "--psm 4",
               conf_thresh = 0,
               h_padding = 10,
               v_padding = 10,
               h_distance_threshold = 3,
               v_distance_threshold = 15,
               v_cluster_threshold = 10,
               smooth_v_factor = 2.5):
  '''
  pytesseract_config: Pytesseract OCR config argument. String.
  Default = "--psm 4";
  conf_thresh: Minimum confidence value for thresholding OCR results. Positive
  integer. Default = 0;
  h_padding: Horizontal padding on table vertical borders. Positive integer.
  Default = 10;
  v_padding: Vertical padding on table vertical borders. Positive integer.
  Default = 10;
  v_distance_threshold: Vertical lines clustering distance threshold.
  Default = 3;
  h_distance_threshold: Horizontal lines clustering distance threshold.
  Default = 15;
  v_cluster_threshold: Vertical lines clustering threshold. Determines the
  minimum amount of elements to form an x coordinate cluster. Default = 10;
  smooth_v_factor: Smoothening factor for vertical lines. Positive integer.
  Default = 2.5;
  '''

  # Pytesseract image_to_data method on input image
  OCRdict = pytesseract.image_to_data(image,
                                      lang = 'eng',
                                      output_type = pytesseract.Output.DICT,
                                      config = pytesseract_config)
  
  # Initializing coords, gaps, and OCR text list
  coords = []
  h_gaps = []
  v_gaps = []
  OCRtext = []
  confs = []

  for i in range(0, len(OCRdict["text"])):
    # Retrieving current text and coordinates
    x0 = OCRdict["left"][i]
    y0 = OCRdict["top"][i]
    w0 = OCRdict["width"][i]
    h0 = OCRdict["height"][i]
    text0 = OCRdict["text"][i]
    conf0 = OCRdict["conf"][i]

    # Retrieving following text and coordinates
    try:
      x1 = OCRdict["left"][i+1]
      y1 = OCRdict["top"][i+1]
      w1 = OCRdict["width"][i+1]
      h1 = OCRdict["height"][i+1]
    except:
      pass
    
    # Calculating vertical and horizontal gaps
    h_gap = x1 - (x0 + w0)
    v_gap = y1 - (y0 + h0)

    if (conf0 > conf_thresh) and (h0 < image.shape[0]/2) and (w0 < image.shape[1]/2):
      coords.append((x0, y0, w0, h0))
      h_gaps.append(h_gap)
      v_gaps.append(v_gap)
      OCRtext.append(text0)
      confs.append(conf0)

  # Clustering x coordinates to determine vertical lines
  x_coords = [(x[0], 0) for x in coords]

  # Hierarchical clustering - vertical lines
  clustering = AgglomerativeClustering(n_clusters = None,
                                      affinity = "manhattan",
                                      linkage = "complete",
                                      distance_threshold = h_distance_threshold)
  clustering.fit(x_coords)

  # Initializing vertical lines list
  v_lines = []

  for cluster in np.unique(clustering.labels_):
    ids = np.where(clustering.labels_ == cluster)[0]
    
    if len(ids) > v_cluster_threshold:
      avg_x = np.average([coords[i][0] for i in ids])
      v_lines.append(int(avg_x) - h_padding)

  v_lines.sort()
  n_columns = len(v_lines) # Number of columns defined by vertical lines

  # Hierarchical clustering - horizontal lines
  y_coords = [(0, y[1]) for y in coords]

  clustering = AgglomerativeClustering(n_clusters = None,
                                      affinity = "manhattan",
                                      linkage = "complete",
                                      distance_threshold = v_distance_threshold)
  clustering.fit(y_coords)

  # Initializing horizontal lines list
  h_lines = []

  for cluster in np.unique(clustering.labels_):
    ids = np.where(clustering.labels_ == cluster)[0]
    # Thresholding on clusters that have at least half of columns populated
    # rounded up
    if len(ids) > (int(n_columns / 2) + 1):
      avg_y = np.average([coords[i][1] for i in ids])
      h_lines.append(int(avg_y) - v_padding)

  h_lines.sort()

  # Calculating vertical gaps and log of vertical gaps defined by vertical lines
  lines_v_gaps = [h_lines[i+1] - h_lines[i] for i in range(len(h_lines) - 1)]
  log_lines_v_gaps = np.log(lines_v_gaps)

  # Vertical gaps smoothening factor
  stdev_v_gaps = stdev(log_lines_v_gaps) # Standard deviation of vertical gaps
  mean_v_gaps = mean(log_lines_v_gaps) # Mean of vertical gaps

  # Vertical gaps smoothened increment and interval
  smooth_v_increment = smooth_v_factor * (stdev_v_gaps / np.sqrt(len(lines_v_gaps)))
  smooth_v_interval = (mean_v_gaps - smooth_v_increment, mean_v_gaps + smooth_v_increment)

  # Converting back to original scale
  smooth_v_interval = np.exp(smooth_v_interval).astype("int8")

  # Converting to a range interval
  smooth_v_interval = range(smooth_v_interval[0], smooth_v_interval[1])

  # Updating horizontal lines based on smoothened vertical interval
  smooth_h_lines = []
  for i, line in enumerate(h_lines):
    try:
      # Looking forward for at least 2 gaps in a row within smoothened interval
      if h_lines[i+2] - h_lines[i+1] in smooth_v_interval:
        if h_lines[i+1] - h_lines[i] in smooth_v_interval:
          smooth_h_lines.append(line)
      # Looking backward for at least 2 gaps in a row within smoothened interval
      elif h_lines[i-1] - h_lines[i-2] in smooth_v_interval:
        if h_lines[i] - h_lines[i-1] in smooth_v_interval:
          smooth_h_lines.append(line)
    except:
      pass

  # Defining external borders
  # Calculating mean vertical spacing within table
  v_spacings = []
  for i in range(0, len(smooth_h_lines) - 1):
    v_spacings.append(smooth_h_lines[i+1] - smooth_h_lines[i])

  v_spacing = int(mean(v_spacings) - mean([h[3] for h in coords]))

  # Iterating through last column elements to determine last column max width
  last_column_widths = []
  for id in np.where([x[0] for x in coords] > np.max(v_lines))[0]:
    last_column_widths.append(coords[id][2])

  # Last column max width with padding
  last_column_width = np.max(last_column_widths) + 2 * h_padding

  # Final table dimensions
  n_rows = len(smooth_h_lines)
  n_columns = len(v_lines)
  table_dim = (n_rows, n_columns)

  # Redefining preprocessed image as a colored cv2 image
  try:
    color_preprocessed = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
  except:
    color_preprocessed = image

  # Borders and lines colors
  border_color = [51, 102, 0] # dark green
  lines_color = [76, 153, 0] # mild pale green

  # Table corners
  x_min = np.min(v_lines)
  y_min = np.min(smooth_h_lines)
  x_max = np.max(v_lines) + last_column_width
  y_max = np.max(smooth_h_lines) + v_spacing + v_padding

  # Drawing external borders
  cv2.rectangle(color_preprocessed,
                (x_min, y_min),
                (x_max, y_max),
                color = border_color,
                thickness = 3)

  for v_line in v_lines:
    if v_line != x_min:
      cv2.line(color_preprocessed,
              (v_line, y_min),
              (v_line, y_max),
              color = lines_color,
              thickness = 2)
      
  for h_line in smooth_h_lines:
    if h_line != y_min:
      cv2.line(color_preprocessed,
              (x_min, h_line),
              (x_max, h_line),
              color = lines_color,
              thickness = 2)

  # Columns ranges
  columns = []
  for i in range(0, len(v_lines) -1):
    columns.append(range(v_lines[i], v_lines[i+1]))

  # Appending last column
  columns.append(range(v_lines[-1], x_max))

  # Rows ranges
  rows = []
  for i in range(0, len(smooth_h_lines) -1):
    rows.append(range(smooth_h_lines[i], smooth_h_lines[i+1]))

  # Appending last row
  rows.append(range(smooth_h_lines[-1], y_max))

  # Initializing empty table to store text
  table = np.empty(table_dim, dtype = 'object')

  # Iterating through OCR text and storing values on initialized table
  for coord, text in zip(coords, OCRtext):
    for j in range(0, len(columns)):
      for i in range(0, len(rows)):
        if (int(coord[0] + coord[2]/2) in columns[j]) and (int(coord[1] + coord[3]/2) in rows[i]):
          if table[i, j] is not None:
            table[i, j] += f" {text}"
          else:
            table[i, j] = text

  df = pd.DataFrame(table[1:], columns = table[0])

  table = {}
  table['coords'] = coords
  table['OCRtext'] = OCRtext
  table['confs'] = confs
  table['image'] = color_preprocessed
  table['df'] = df

  return table

In [254]:
# Preprocessing parameters
preprocess_args = {
        "resize": 1000,
        "grayscale": True,
        "thresholding": "binary",
        "thresh_value": 165
}

# OCR table extraction parameters
draw_table_args = {
    "pytesseract_config": "--psm 4",
    "h_padding": 10,
    "v_padding": 10,
    "h_distance_threshold": 3,
    "v_distance_threshold": 25,
    "smooth_v_factor": 3
}

In [None]:
# Extracting table for images on input folder
tables = {}

for image in tqdm(images):
  match = re.search("[A-Z][a-z]*.[0-9][0-9][0-9][0-9]", image)
  if match is not None:
    table_name = match.group(0).replace("-", "_")
    preprocessed = preprocess(image, **preprocess_args, verbose = False)
    tables[table_name] = draw_table(preprocessed, **draw_table_args)

100%|██████████| 40/40 [06:15<00:00,  9.38s/it]


In [None]:
# Writting png and csv files
for table in tables:
  cv2.imwrite(f"output/png/{str(table)}.png", tables[table]['image'])
  tables[table]['df'].to_csv(f"output/csv/{str(table)}.csv")

# Creating output zip folder
shutil.make_archive("output", "zip", 'output/')

'/content/output.zip'