In [198]:
import os
import pandas as pd
from pdf2image import convert_from_path
import numpy as np
import pytesseract
import spacy
from pytesseract import Output
import cv2
import json
from spacy.matcher import PhraseMatcher
from spacy.util import filter_spans
from spacy.tokens import DocBin
from pytesseract import Output
import matplotlib.pyplot as plt
import camelot.io as camelot
from utils.boundingbox import BoundingBoxes

In [182]:
train_dir = r'input/train'
test_dir = r'input/test'
fields_dir = r'input/fields'

testFilePath = train_dir+'/'+os.listdir(train_dir)[3]

# Train Named Entity Recognizer

In [3]:
fields = os.listdir(fields_dir)
invoices = os.listdir(train_dir)

In [4]:
print(len(fields))
print(len(invoices))

0
16


In [None]:
set_fields  = set(map(lambda f: f.split('.')[0], fields))
set_invoices = set(map(lambda f: f.split('.')[0], invoices))

training_set = list(set_fields.intersection(set_invoices))
print(training_set)

In [None]:
data = pd.DataFrame(columns=["filename", "text"])

data["filename"] = training_set

data_text = []
for file in data["filename"]:
    data_text.append(get_text(f'{train_dir}/{file}.pdf'))
data["text"] = data_text

ent_list = []
for file in data["filename"]:
    with open(f'{fields_dir}/{file}.json') as f:
        entity_dict = json.load(f)
        ent_list.append(entity_dict)
data["entity_dictionary"] = ent_list

data.shape

In [None]:
training_data = []
id_ent = []

nlp_match = spacy.load('en_core_web_sm')
matcher = PhraseMatcher(nlp_match.vocab)
for index, row in data.iterrows():
    ent_dic = row["entity_dictionary"]
    ent = []
    phrases = list(ent_dic.values())
    patterns = [nlp_match.make_doc(phrase) for phrase in phrases]
    matcher.add("EntityList", None, *patterns)

    doc = nlp_match(row["text"])
    matches = matcher(doc)
    for match_id, start, end in matches:
        try:
            span = doc[start:end]
            if start>0:
                sb = doc[0:start]
                start_index=len(sb.text)+1
            else:
                start_index=0
            end_index= start_index+len(span.text)
        except:
            pass

        for key, value in ent_dic.items():
            if value==span.text:
                ent_tup=(start_index, end_index, key)
                ent.append(ent_tup)
                
    ent_set = {"total", "invnr"}
    detected_entities = set([key for start, end, key in ent])
    missed_entities = list(ent_set - detected_entities)
    if "total" in missed_entities:
        value = ent_dic["total"]
        if len(value)>0:
            catch_total = re.search(value, str(row["text"]).replace(",", ""))
            ent_tup = (catch_total.span()[0], catch_total.span()[1], "total")
            ent.append(ent_tup)
    if "invnr" in missed_entities:
        value = ent_dic["invnr"]
        if len(value)>0:
            catch_total = re.search(value, str(row["text"]).replace(",", ""))
            ent_tup = (catch_total.span()[0], catch_total.span()[1], "invnr")
            ent.append(ent_tup)
    id_ent.append(len(ent))
    training_data.append({"entities": ent, "text":row["text"]})

In [None]:
len(training_data)

In [None]:
TRAIN_DATA = training_data
output_dir=r'model'
n_iter = 80

In [None]:
nlp = spacy.blank("en") # load a new spacy model
doc_bin = DocBin() # create a DocBin object

for training_example  in TRAIN_DATA: 
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents 
    doc_bin.add(doc)

doc_bin.to_disk("training_data.spacy") # save the docbin object

# Testing NER

In [None]:
nlp_ner = spacy.load("model/model-best")

In [None]:
test_files = os.listdir(test_dir)
test_files

In [None]:
len(test_files)

In [None]:
test_data = pd.DataFrame(columns=["filename", "text"])

test_data["filename"] = test_files

data_text = []
for file in test_data["filename"]:
    data_text.append(get_text(f'{test_dir}/{file}'))
test_data["text"] = data_text

In [None]:
test_data

In [None]:
for index, row in test_data.iterrows():
    op_dict = {"total": "", "invnr": ""}
    doc = nlp_ner(row["text"])
    for ent in doc.ents:      #identifying the entities using the trained model
        op_dict[ent.label_] = ent.text
  
    
    print("Entities", op_dict)

In [None]:
def get_text_old(filename):
    doc = convert_from_path(filename)

    for page_number, page_data in enumerate(doc):
        txt = pytesseract.image_to_string(page_data).replace('\n', ', ') 
    return txt

# Table Extraction

In [22]:
def plot(image,cmap=None):
    plt.figure(figsize=(15,15))
    plt.imshow(image,cmap=cmap) 

In [161]:
#Generate two text boxes a larger one that covers them
def merge_boxes(box1, box2):
    (x1, y1, w1, h1, text1) = box1
    (x2, y2, w2, h2, text2) = box2

    return [min(x1, x2), 
         min(y1, y2), 
         w1 + w2 + calc_horizontal_distance(box1, box2),
         max(h1, h2),
         text1 + ' ' + text2]


def calc_horizontal_distance(box1, box2):
    (x1, y1, w1, h1, text1) = box1
    (x2, y2, w2, h2, text2) = box2

    return abs(min(x1+w1-x2,x2+w2-x1))

def get_rows(boxes, cell_threshold=50):
    rows = {}

    # Clustering the bounding boxes by their positions
    for box in boxes:
        (x, y, w, h, text) = box
        row_key = y // cell_threshold
        rows[row_key] = [box] if row_key not in rows else rows[row_key] + [box]

    return rows


def merge(boxes, dist_limit = 10, cell_threshold=50):
    rows = get_rows(boxes, cell_threshold)
    for row in rows.values():
        i=0
        while i<len(row)-1:
            if calc_horizontal_distance(row[i], row[i+1]) <= dist_limit:
                row[i] = merge_boxes(row[i], row[i+1])
                row.pop(i+1)
            else:
                i += 1
    return [item for sublist in list(rows.values()) for item in sublist]

In [172]:
# This only works if there's only one table on a page
# Important parameters:
#  - morph_size
#  - min_text_height_limit
#  - max_text_height_limit
#  - cell_threshold
#  - min_columns


def pre_process_image(img, morph_size=(8, 8)):
    pre = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    pre = cv2.threshold(pre, 240, 255, cv2.THRESH_BINARY)[1]
    # dilate the text to make it solid spot
    cpy = pre.copy()
    struct = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
    cpy = cv2.dilate(~cpy, struct, anchor=(-1, -1), iterations=1)
    pre = ~cpy
    return pre


def find_text_boxes(pre, min_text_height_limit=6, max_text_height_limit=40):
    boxes = []

    d = pytesseract.image_to_data(pre, output_type=Output.DICT)
    for i in range(len(d['level'])):
        (x, y, w, h, text) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i], d['text'][i])

        if min_text_height_limit < h < max_text_height_limit and d['text'][i].strip():
            boxes.append([x, y, w, h, text])

    return boxes


def find_table_in_boxes(boxes, cell_threshold=10, min_columns=2):
    rows = {}
    cols = {}

    # Clustering the bounding boxes by their positions
    for box in boxes:
        (x, y, w, h, text) = box
        col_key = x // cell_threshold
        row_key = y // cell_threshold
        cols[col_key] = [box] if col_key not in cols else cols[col_key] + [box]
        rows[row_key] = [box] if row_key not in rows else rows[row_key] + [box]

    # Filtering out the clusters having less than 2 cols
    table_cells = list(filter(lambda r: len(r) >= min_columns, rows.values()))
    # Sorting the row cells by x coord
    table_cells = [list(sorted(tb)) for tb in table_cells]
    # Sorting rows by the y coord
    table_cells = list(sorted(table_cells, key=lambda r: r[0][1]))

    return table_cells


def build_lines(table_cells):
    if table_cells is None or len(table_cells) <= 0:
        return [], []

    max_last_col_width_row = max(table_cells, key=lambda b: b[-1][2])
    max_x = max_last_col_width_row[-1][0] + max_last_col_width_row[-1][2]

    max_last_row_height_box = max(table_cells[-1], key=lambda b: b[3])
    max_y = max_last_row_height_box[1] + max_last_row_height_box[3]

    hor_lines = []
    ver_lines = []

    for box in table_cells:
        x = box[0][0]
        y = box[0][1]
        hor_lines.append((x, y, max_x, y))

    for box in table_cells[0]:
        x = box[0]
        y = box[1]
        ver_lines.append((x, y, x, max_y))

    (x, y, w, h, text) = table_cells[0][-1]
    ver_lines.append((max_x, y, max_x, max_y))
    (x, y, w, h, text) = table_cells[0][0]
    hor_lines.append((x, max_y, max_x, max_y))

    return hor_lines, ver_lines

In [199]:
pages = convert_from_path(train_dir+'/'+os.listdir(train_dir)[12])
img = np.array(pages[0])

pre_processed = pre_process_image(img, morph_size=(1,1))
rows = BoundingBoxes.get_bounding_boxes_from_img(pre_processed)
table_cells = find_table_in_boxes(rows, min_columns=4, cell_threshold=25)
hor_lines, ver_lines = build_lines(table_cells)

# Visualize the result
vis = img.copy()

for cell in table_cells:
    for c in cell:
        (x, y, w, h, text) = c
        cv2.rectangle(vis, (x, y), (x + w - 2, y + h - 2), (255, 0, 0), 3)

for box in rows:
    (x, y, w, h, text) = box
    cv2.rectangle(vis, (x, y), (x + w - 2, y + h - 2), (0, 255, 0), 1)

for line in hor_lines:
    [x1, y1, x2, y2] = line
    cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)

for line in ver_lines:
    [x1, y1, x2, y2] = line
    cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)

plot(vis)



TypeError: BoundingBoxes.get_bounding_boxes_from_img() missing 1 required positional argument: 'img'