In [1]:
import numpy as np
import pandas as pd
import cv2
import pytesseract
from glob import glob
import spacy
import re
import string

In [2]:
def cleanText(txt):
    whitespace = string.whitespace
    punctuation = '!"#$%&\'()*+:;<=>?[\\]^`{|}~'
    tableWhitespace = str.maketrans('','',whitespace) # replace whitespace with ''
    tablePunctuation = str.maketrans('','',punctuation)
    # maketrans(x,y,z) args: if found x, replace with y, if found z, remove z
    text = str(txt)
    text = text.lower()
    removewhitespace = text.translate(tableWhitespace)
    removepunctuation = removewhitespace.translate(tablePunctuation)

    return str(removepunctuation)
# punctuation = punctuation to remove

In [88]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
### Load NER model
model_ner = spacy.load('./output/model-best')

In [4]:
# load Image
image = cv2.imread("./data/6.jpg")

# cv2.imshow("businesscard", image)
# cv2.waitKey(0)
# cv2.destroyAllWindows()

# extract data using Pytesseract
tessData = pytesseract.image_to_data(image)

# convert into dataframe
tessList = list(map(lambda x:x.split('\t'), tessData.split('\n')))
df = pd.DataFrame(tessList[1:], columns=tessList[0])
df.dropna(inplace=True) # drop missing values
df['text'] = df['text'].apply(cleanText)


# convert data into content
df_clean = df.query('text != ""')
# df_clean['text']
content = " ".join([w for w in df_clean['text']])
content

# get prediction from NER model
doc = model_ner(content)



In [5]:
from spacy import displacy

In [14]:
displacy.serve(doc, style='ent')




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [7]:
# displacy.render(doc, style='ent')

In [8]:
docjson = doc.to_json()
docjson.keys()

dict_keys(['text', 'ents', 'tokens'])

In [9]:
doc_text = docjson['text']
doc_text

'cell 8099948528 ga 8466045457 email lictsrikant@gmail.com life insurance corporation of india seosrika ntht@gmail “com thathineni srikanth insurance advisor agent code no. 0316164y life insurance corporation of india br. off. lic office, trimulgherry, sec’bad - 500 016. add. borabanda, hyderabad - 500 018. lictsrikant8099948528.blogspot.in, interviewsinhyderabad.blogspot.in facebook.com/lictsrikant8099948528, facebook.com/thathineni.srikanth.9 promote your business online pybo'

In [89]:
datafram_tokens = pd.DataFrame(docjson['tokens'])
# datafram_tokens.head()
datafram_tokens['token'] = datafram_tokens[['start', 'end']].apply(
    lambda x:doc_text[x[0]:x[1]], axis=1)
datafram_tokens.head(10)

Unnamed: 0,id,start,end,token
0,0,0,4,cell
1,1,5,15,8099948528
2,2,16,18,ga
3,3,19,29,8466045457
4,4,30,35,email
5,5,36,57,lictsrikant@gmail.com
6,6,58,62,life
7,7,63,72,insurance
8,8,73,84,corporation
9,9,85,87,of


In [11]:
# Join the tables
right_table = pd.DataFrame(docjson['ents'])[['start','label']]
datafram_tokens = pd.merge(datafram_tokens, right_table, how='left', on='start')

In [12]:
datafram_tokens.fillna('O', inplace=True) # to replace empty words with 'O' tag
datafram_tokens.head(10)

Unnamed: 0,id,start,end,token,label
0,0,0,4,cell,O
1,1,5,15,8099948528,B-PHONE
2,2,16,18,ga,O
3,3,19,29,8466045457,B-PHONE
4,4,30,35,email,O
5,5,36,57,lictsrikant@gmail.com,B-EMAIL
6,6,58,62,life,B-ORG
7,7,63,72,insurance,I-ORG
8,8,73,84,corporation,I-ORG
9,9,85,87,of,I-ORG


In [90]:
# join label to df_clean dataframe
df_clean['end'] = df_clean['text'].apply(lambda x: len(x) + 1).cumsum() - 1 # cumsum() is the sum of the
# indexes of the rows before the current one so it keeps adding up
df_clean['start'] = df_clean[['text', 'end']].apply(lambda x: x[1] - len(x[0]),axis=1)

In [26]:
# iner join with start position
dataframe_info = pd.merge(df_clean, datafram_tokens[['start','token','label']], how='inner',on='start')

In [29]:
dataframe_info.tail(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
41,5,1,6,3,1,6,495,474,53,21,96.027008,018.,307,303,018,O
42,5,1,6,3,2,1,46,506,430,27,85.618164,"lictsrikant8099948528.blogspot.in,",342,308,lictsrikant8099948528.blogspot.in,I-EMAIL
43,5,1,6,3,2,2,488,506,429,27,89.912384,interviewsinhyderabad.blogspot.in,376,343,interviewsinhyderabad.blogspot.in,B-WEB
44,5,1,6,3,3,1,44,539,472,25,87.967415,"facebook.com/lictsrikant8099948528,",412,377,facebook.com/lictsrikant8099948528,O
45,5,1,6,3,3,2,526,539,443,22,90.545654,facebook.com/thathineni.srikanth.9,447,413,facebook.com/thathineni.srikanth.9,O
46,5,1,6,3,4,1,46,571,106,21,96.344551,promote,455,448,promote,O
47,5,1,6,3,4,2,161,576,56,22,95.835091,your,460,456,your,O
48,5,1,6,3,4,3,226,571,111,21,96.409111,business,469,461,business,O
49,5,1,6,3,4,4,347,571,74,21,93.162605,online,476,470,online,O
50,5,1,6,3,4,5,432,571,96,27,92.261208,pybo,481,477,pybo,O


### Bounding Box

In [37]:
bb_df = dataframe_info.query("label != 'O'")
img = image.copy()

for x,y,w,h,label in bb_df[['left', 'top','width','height','label']].values:
    x = int(x)
    y = int(y)
    w = int(w)
    h = int(h)

    cv2.rectangle(img,(x,y),(x+w, y+h),(0, 255, 0), 2)
    cv2.putText(img,str(label),(x+w, y+h),cv2.FONT_HERSHEY_PLAIN, 1,(255, 0, 255), 2)

cv2.imshow('Predictions', img)
cv2.waitKey(0)
cv2.destroyAllWindows()


In [91]:
bb_df['label'] = bb_df['label'].apply(lambda x: x[2:])
bb_df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label,group,right,bottom
1,5,1,3,1,1,3,822,53,203,28,96.483879,8099948528,15,5,8099948528,ONE,1,1025,81
3,5,1,3,2,1,2,822,95,203,28,96.435081,8466045457,29,19,8466045457,ONE,1,1025,123
5,5,1,3,2,2,3,709,136,316,31,86.249245,lictsrikant@gmail.com,57,36,lictsrikant@gmail.com,AIL,2,1025,167
6,5,1,3,3,1,1,46,170,33,14,96.406654,life,62,58,life,G,3,79,184
7,5,1,3,3,1,2,85,151,92,42,95.806709,insurance,72,63,insurance,G,3,177,193


In [41]:
# group the label
class groupgen():
    def __init__(self):
        self.id = 0
        self.text = ''

    def getgroup(self, text):
        if self.text ==text:
            return self.id
        else:
            self.id += 1
            self.text = text
            return self.id

grp_gen = groupgen()

In [92]:
bb_df['group'] = bb_df['label'].apply(grp_gen.getgroup)

In [93]:
# right and bottom of bounding box
bb_df[['left', 'top','width','height']] = bb_df[['left', 'top','width','height']].astype(int)
bb_df['right'] = bb_df['left'] + bb_df['width']
bb_df['bottom'] = bb_df['top'] + bb_df['height']

In [50]:
# tagging = groupby group
# take min of lift and top, take max of right and bottom to group
col_group = ['left', 'top','right', 'bottom', 'width','height','label','token', 'group']
group_tag_img = bb_df[col_group].groupby(by='group')

In [94]:
img_tagging = group_tag_img.agg({

    'left':min,
    'right':max,
    'top':min,
    'bottom':max,
    'label':np.unique,
    'token':lambda x:" ".join(x)
})

In [52]:
img_tagging

Unnamed: 0_level_0,left,right,top,bottom,label,token
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,822,1025,53,123,[PHONE],8099948528 8466045457
2,709,1025,136,167,[EMAIL],lictsrikant@gmail.com
3,46,374,151,193,[ORG],life insurance corporation of india
4,668,1025,165,209,[EMAIL],seosrika ntht@gmail “
5,310,755,227,259,[NAME],thathineni srikanth
6,399,669,271,296,[DES],insurance advisor
7,47,882,395,427,[ORG],life insurance corporation of india
8,46,476,506,533,[EMAIL],lictsrikant8099948528.blogspot.in
9,488,917,506,533,[WEB],interviewsinhyderabad.blogspot.in


In [56]:
img_bb = image.copy()
for l,r,t,b,label,token in img_tagging.values:
    cv2.rectangle(img_bb, (l,t),(r,b),(0,255,0),2)
    cv2.putText(img_bb,str(label),(l,t),cv2.FONT_HERSHEY_PLAIN,1,(255,0,255),2)

cv2.imshow('Bounding box BusinessCard',img_bb)
cv2.waitKey(0)
cv2.destroyAllWindows()

### Parser

In [78]:
def parser(text,label):
    if label == 'PHONE':
        text = text.lower()
        text = re.sub(r'\D','',text) # replaces non digits with empty. \d is digits, \D is not
    elif label == 'EMAIL':
        text = text.lower()
        allow_special_char = "@_.\\-"
        text = re.sub(r'[^A-Za-z0-9{} ]'.format(allow_special_char),'',text)
    elif label == 'WEB':
        text = text.lower()
        allow_special_char = ':/.%#\\-'
        text = re.sub(r'[^A-Za-z0-9{} ]'.format(allow_special_char),'',text)
    elif label in ('NAME', 'DES'):
        text = text.lower()
        text = re.sub(r'[^a-z ]','',text)
        text = text.title()

    elif label == 'ORG':
        text = text.lower()
        text = re.sub(r'[^a-z0-9 ]','',text)
        text = text.title()

    return text

In [77]:
parser('dwad-awd)&#$@GMAIL.COM', 'EMAIL')

'dwad-awd@gmail.com'

### Entities

In [86]:
info_array = dataframe_info[['token','label']].values
entities = dict(NAME=[],ORG=[],DES=[],PHONE=[],EMAIL=[],WEB=[])
previous = 'O'

for token,label in info_array:
    bio_tag = label[0]
    label_tag = label[2:]

    # step 1 parse the token
    text = parser(token,label_tag)

    if bio_tag in ('B','I'):
        if previous != label_tag:
            entities[label_tag].append(text)
            
        else:
            if bio_tag == 'B':
                entities[label_tag].append(text)
            else:
                if label_tag in ("NAME","ORG","DES"):
                    entities[label_tag][-1] = entities[label_tag][-1] + " " + text
                else:
                    entities[label_tag][-1] = entities[label_tag][-1] + text

    previous = label_tag
            
    




In [87]:
print(entities)

{'NAME': ['Thathineni Srikanth'], 'ORG': ['Life Insurance Corporation Of India', 'Life Insurance Corporation Of India'], 'DES': ['Insurance Advisor'], 'PHONE': ['8099948528', '8466045457'], 'EMAIL': ['lictsrikant@gmail.com', 'seosrikantht@gmail', 'lictsrikant8099948528.blogspot.in'], 'WEB': ['interviewsinhyderabad.blogspot.in']}
