In [4]:
import numpy as np
import pandas as pd
import cv2
import pytesseract
from glob import glob
import spacy
import re
import string

In [5]:
def cleanText(txt):
    """Normalise an OCR token: lowercase it, then delete whitespace and selected punctuation.

    The input is coerced with ``str`` first, so non-string values are accepted.
    Characters outside the deletion set are returned unchanged.
    """
    # A single table covers both character classes; deleting them in one pass
    # yields the same string as deleting whitespace first and punctuation second.
    unwanted = string.whitespace + "!#$%&\'()*+:;<=>?[\\]^`{|}~"
    return str(txt).lower().translate(str.maketrans('', '', unwanted))

In [6]:
import warnings
# Installs a blanket 'ignore' filter: every warning category is silenced for the
# rest of the session, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [7]:
### Load NER model
# Loads the spaCy pipeline stored under ./output/model-best/ (relative to the
# working directory); it is applied to the OCR text further down.
model_ner = spacy.load('./output/model-best/')

In [None]:
# Load Image
image_path = './stubs/Slip.png'
image = cv2.imread(image_path)
# cv2.imread reports an unreadable or missing file by returning None instead of
# raising, so fail here with a clear message rather than inside the OCR call.
if image is None:
    raise FileNotFoundError('Could not read image: {}'.format(image_path))

# extract data using Pytesseract (tab-separated text, first line is the header)
tessData = pytesseract.image_to_data(image)

# convert into dataframe
tessList = list(map(lambda x:x.split('\t'), tessData.split('\n')))
# Lines with fewer fields than the header (e.g. a trailing empty line) end up
# with missing values and are dropped.
df = pd.DataFrame(tessList[1:],columns=tessList[0]).dropna()
df['text'] = df['text'].apply(cleanText)

# convert data into content
# .copy() makes df_clean an independent frame: a later cell adds columns to it,
# which should not be done on a filtered selection of df.
df_clean = df.query('text != "" ').copy()
content = " ".join(df_clean['text'])
print(content)

# get prediction from NER model
doc = model_ner(content)

In [10]:
from spacy import displacy

In [None]:
# NOTE(review): displacy.serve starts a web server on port 5001 and is expected
# to block this cell until interrupted — confirm it is still wanted next to the
# displacy.render call that follows.
displacy.serve(doc,style='ent',port=5001)

In [None]:
# Render the entity annotations of `doc`.
displacy.render(doc,style='ent')

### Tagging

In [None]:
# Serialise the processed document to a dict; the 'text', 'tokens' and 'ents'
# entries are read in the following cells.
docjson = doc.to_json()
docjson.keys()

In [14]:
# Full document text; token start/end offsets are used to slice into it.
doc_text = docjson['text']

In [None]:
# One row per token as reported by the document JSON.
datafram_tokens = pd.DataFrame(docjson['tokens'])
# Recover each token's text by slicing the document text with its offsets.
# The columns are addressed by label: integer keys such as x[0] on a row that is
# labelled 'start'/'end' rely on positional fallback, which recent pandas deprecates.
datafram_tokens['token'] = [
    doc_text[start:end]
    for start, end in zip(datafram_tokens['start'], datafram_tokens['end'])
]
datafram_tokens.head(10)

In [16]:
# Entity start offsets and labels, left-joined onto the token table by 'start';
# tokens without a matching entity start are left with a missing label.
right_table = pd.DataFrame(docjson['ents']).loc[:, ['start','label']]
datafram_tokens = datafram_tokens.merge(right_table, on='start', how='left')

In [None]:
# Replace every missing value in the table with 'O' (in place); after the left
# merge these are the tokens that received no entity label.
datafram_tokens.fillna('O',inplace=True)
datafram_tokens.head(10)

In [18]:
# join label to df_clean dataframe
# Character offsets of every word inside `content`, which was built by joining
# the words with single spaces: `end` is exclusive, `start` is inclusive.
# The offsets are computed from whole columns (no positional x[0]/x[1] lookups on
# a labelled row) and attached with assign(), which returns a new frame instead
# of setting columns on a filtered selection.
word_lengths = df_clean['text'].str.len()
word_ends = (word_lengths + 1).cumsum() - 1
df_clean = df_clean.assign(end=word_ends, start=word_ends - word_lengths)

In [19]:
# inner join with start
# Keeps only the OCR words whose computed start offset coincides with a token
# start, and brings the token text and label alongside the OCR columns.
dataframe_info = pd.merge(df_clean,datafram_tokens[['start','token','label']],how='inner',on='start')

In [None]:
# Inspect the last rows of the joined OCR/label table.
dataframe_info.tail(10)

### Bounding Box

In [21]:
# Only rows that carry an entity label are drawn. .copy() detaches the selection
# from dataframe_info, because later cells add and overwrite columns on bb_df.
bb_df = dataframe_info.query("label != 'O' ").copy()
img = image.copy()

# The box geometry is cast to int before it is handed to the drawing calls.
boxes = bb_df[['left','top','width','height']].astype(int).values.tolist()
for (x,y,w,h),label in zip(boxes, bb_df['label']):
    cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)
    cv2.putText(img,str(label),(x,y),cv2.FONT_HERSHEY_PLAIN,1,(255,0,255),2)

# Show the annotated image in an OpenCV window and wait for a key press.
cv2.imshow('Predictions',img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
# Drop the BIO prefix ("B-" / "I-") so that only the entity name remains.
# Matching the prefix explicitly, instead of slicing off the first two
# characters, keeps the cell idempotent: running it a second time leaves
# already-stripped labels untouched rather than truncating them again.
bb_df['label'] = bb_df['label'].str.replace(r'^[BI]-', '', regex=True)
bb_df.head()

In [23]:
# group the label
class groupgen():
    """Hands out a running group id that advances whenever the incoming value changes."""

    def __init__(self):
        self.id = 0      # id returned for the current run of equal values
        self.text = ''   # value seen on the previous call

    def getgroup(self,text):
        """Return the current group id, incrementing it first if `text` differs from the previous value."""
        if self.text != text:
            self.text = text
            self.id += 1
        return self.id

grp_gen = groupgen()

In [24]:
# Give every row a group id: grp_gen keeps the id while the label repeats on
# consecutive rows and increments it when the label changes.
bb_df['group'] = bb_df['label'].apply(grp_gen.getgroup)

In [25]:
# right and bottom of bounding box
# Geometry columns are converted to integers first so the edges can be summed.
box_cols = ['left','top','width','height']
bb_df[box_cols] = bb_df[box_cols].astype(int)
bb_df['right'] = bb_df['left'].add(bb_df['width'])
bb_df['bottom'] = bb_df['top'].add(bb_df['height'])

In [26]:
# tagging: groupby group
# Keep only the columns needed for the aggregation and group the rows by the
# running group id.
col_group = ['left','top','right','bottom','label','token','group']
group_tag_img = bb_df[col_group].groupby(by='group')

In [27]:
# One row per group: the box enclosing all of its tokens (smallest left/top,
# largest right/bottom), the distinct label value(s) and the tokens joined by
# single spaces. The column order matters to the drawing loop that unpacks rows.
# The reducers are named by string ('min'/'max') instead of passing the Python
# builtins, which recent pandas releases deprecate as groupby aggregation callables.
img_tagging = group_tag_img.agg({
    'left':'min',
    'right':'max',
    'top':'min',
    'bottom':'max',
    'label':np.unique,
    'token':lambda x: " ".join(x)
})

In [None]:
# Display the aggregated table (one row per group).
img_tagging

In [None]:
img_bb = image.copy()
for l,r,t,b,label,token in img_tagging.values:
    # Rows come out of a mixed-type table; cast the corners to plain ints
    # explicitly so the drawing calls always receive integer pixel coordinates.
    top_left = (int(l),int(t))
    bottom_right = (int(r),int(b))
    cv2.rectangle(img_bb,top_left,bottom_right,(0,255,0),2)
    cv2.putText(img_bb,str(label),top_left,cv2.FONT_HERSHEY_PLAIN,1,(255,0,255),2)


# Show the annotated image in an OpenCV window and wait for a key press.
cv2.imshow('Bounding Box Slips',img_bb)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [31]:
def parser(text,label):
    """Normalise the text of a token according to its entity label.

    Parameters
    ----------
    text : str
        Raw token text.
    label : str
        Entity label without the BIO prefix (e.g. ``'NAME'``).

    Returns
    -------
    str
        * ``NAME``: lowercase letters and spaces only, then title-cased.
        * ``ORG`` / ``ROLE``: letters, digits and spaces only, then title-cased.
        * ``DATE``: digits and ``/`` only.
        * ``BASE`` / ``HOURS`` / ``QTD`` / ``GAINS`` / ``DEDUCTIONS`` / ``NETTE``:
          digits, ``.`` and ``,`` only.
        * any other label: ``text`` unchanged.
    """
    # Labels are compared exactly. ``label in ('NAME')`` would be a substring
    # test against the string 'NAME' (the parentheses do not make a tuple), so an
    # empty label or a fragment such as 'AME' would be treated as a NAME.
    if label == 'NAME':
        return re.sub(r'[^a-z ]','',text.lower()).title()

    if label in ('ORG','ROLE'):
        return re.sub(r'[^a-z0-9 ]','',text.lower()).title()

    # For the two numeric families only digits and separators survive the
    # substitution, so no case conversion is needed.
    if label == 'DATE':
        return re.sub(r'[^0-9/]','',text)

    if label in ('BASE','HOURS','QTD','GAINS','DEDUCTIONS','NETTE'):
        return re.sub(r'[^0-9.,]','',text)

    return text

In [32]:
# Walk the (token, label) pairs in document order and collect the cleaned text
# of every entity under its label.
info_array = dataframe_info[['token','label']].values
entities = dict(NAME=[],ORG=[],DATE=[],ROLE=[],BASE=[],HOURS=[],QTD=[],GAINS=[],DEDUCTIONS=[],NETTE=[])
previous = 'O'

for token, label in info_array:
    # Labels look like "B-NAME" / "I-NAME": first character is the BIO tag,
    # everything after the dash is the entity name.
    bio_tag, label_tag = label[0], label[2:]

    # step -1 parse the token
    text = parser(token,label_tag)

    if bio_tag in ('B','I'):
        # A new entry starts on a "B" tag or whenever the entity name differs
        # from the one on the previous token.
        if bio_tag == "B" or previous != label_tag:
            entities[label_tag].append(text)
        else:
            # Continuation: textual entities are joined with a space, all the
            # others are concatenated directly.
            separator = " " if label_tag in ("NAME",'ORG','ROLE') else ""
            entities[label_tag][-1] = entities[label_tag][-1] + separator + text

    previous = label_tag

In [None]:
# Display the collected entities (label -> list of extracted strings).
entities