In [1]:
import numpy as np
import pandas as pd
import cv2
import pytesseract
from glob import glob
import spacy
import re
import string

In [2]:
def cleanText(txt):
    whitespace = string.whitespace
    punctuation = "!#$%&\'()*+:;<=>?[\\]^`{|}~"
    tableWhitespace = str.maketrans('','',whitespace)
    tablePunctuation = str.maketrans('','',punctuation)
    text = str(txt)
    text = text.lower()
    removewhitespace = text.translate(tableWhitespace)
    removepunctuation = removewhitespace.translate(tablePunctuation)
    
    return str(removepunctuation)

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
### Load NER model
model_ner = spacy.load('./output/model-best/')
model_ner

<spacy.lang.en.English at 0x2ef1b638590>

In [5]:
# Load Image
image = cv2.imread('./test_templates/Alendemeltest.png')

#cv2.imshow('businesscard',image)
#cv2.waitKey(0)
#cv2.destroyAllWindows()

In [6]:
# extract data using Pytesseract 
tessData = pytesseract.image_to_data(image)

In [7]:
# convert into dataframe
tessList = list(map(lambda x:x.split('\t'), tessData.split('\n')))
df = pd.DataFrame(tessList[1:],columns=tessList[0])
df.dropna(inplace=True) # drop missing values
df['text'] = df['text'].apply(cleanText)
df

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text
0,1,1,0,0,0,0,0,0,793,1123,-1,
1,2,1,1,0,0,0,270,92,251,30,-1,
2,3,1,1,1,0,0,270,92,251,30,-1,
3,4,1,1,1,1,0,270,92,251,30,-1,
4,5,1,1,1,1,1,270,92,83,30,96.573158,alan
...,...,...,...,...,...,...,...,...,...,...,...,...
221,4,1,12,1,2,0,367,997,58,9,-1,
222,5,1,12,1,2,1,367,997,58,9,76.755676,4666331
223,4,1,12,1,3,0,274,1012,246,13,-1,
224,5,1,12,1,3,1,274,1013,54,9,93.223030,e-mail


In [8]:
# convet data into content
df_clean = df.query('text != "" ')
content = " ".join([w for w in df_clean['text']])
print(content)

alan de maid works order to srarter appliances ltd date 23/05/2024 m stedeford ‘order bros-16009551 386-137 high street shracombe ‘ordered by jeri davison bromley pmc bg4 sez con.fax access tenant key return keys please execute the appliance - repairs at oionods24 property 000004324 11 hilda vale close ‘orpington bre tah contact ms stephanie susan sawyer 07742 677619 cooker in white - hotpoint indesit -£649vat delivery, installation and removal - £125 vat. stoptap please contact the office below with a quotation prior to carrying out any works. required by as this works order has been placed on behalf of the landlord, your invoice must be addressed directly to the landlord mr john worton jo bromley adm and alan de maid bromley 18 east street, bromey ent br 1qe l_ _ please email / send all invoices to the office address below office address 19 east street, bromley, kent, bri 1qe phone 02084667172 fax 0208 4666331 e-mail lettings@alandemaid.co.uk


In [9]:
# get prediction from NER model
doc = model_ner(content)

In [10]:
from spacy import displacy

In [11]:
#displacy.serve(doc,style='ent')

In [12]:
#displacy.render(doc,style='ent')

In [13]:
### Tagging
docjson = doc.to_json()
docjson.keys()


dict_keys(['text', 'ents', 'tokens'])

In [14]:
doc_text = docjson['text']

In [15]:
datafram_tokens = pd.DataFrame(docjson['tokens'])
datafram_tokens['token'] = datafram_tokens[['start','end']].apply(
    lambda x:doc_text[x[0]:x[1]] , axis = 1)
datafram_tokens.head(10)

Unnamed: 0,id,start,end,token
0,0,0,4,alan
1,1,5,7,de
2,2,8,12,maid
3,3,13,18,works
4,4,19,24,order
5,5,25,27,to
6,6,28,35,srarter
7,7,36,46,appliances
8,8,47,50,ltd
9,9,51,55,date


In [16]:
right_table = pd.DataFrame(docjson['ents'])[['start','label']]
datafram_tokens = pd.merge(datafram_tokens,right_table,how='left',on='start')

datafram_tokens.fillna('O',inplace=True)
datafram_tokens.head(10)



Unnamed: 0,id,start,end,token,label
0,0,0,4,alan,I-PAYMENTCOMPANYNAME
1,1,5,7,de,I-PAYMENTCOMPANYNAME
2,2,8,12,maid,I-PAYMENTCOMPANYNAME
3,3,13,18,works,O
4,4,19,24,order,O
5,5,25,27,to,O
6,6,28,35,srarter,O
7,7,36,46,appliances,O
8,8,47,50,ltd,O
9,9,51,55,date,O


In [17]:
# join lable to df_clean dataframe
df_clean['end'] = df_clean['text'].apply(lambda x: len(x)+1).cumsum() - 1 
df_clean['start'] = df_clean[['text','end']].apply(lambda x: x[1] - len(x[0]),axis=1)

# inner join with start 
dataframe_info = pd.merge(df_clean,datafram_tokens[['start','token','label']],how='inner',on='start')

dataframe_info.tail(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
146,5,1,12,1,1,7,369,981,34,11,91.937225,"kent,",882,877,kent,O
147,5,1,12,1,1,8,409,981,25,9,41.557808,bri,886,883,bri,O
148,5,1,12,1,1,9,440,981,25,11,71.14949,1qe,890,887,1qe,O
149,5,1,12,1,1,10,475,981,53,9,85.978706,phone,896,891,phone,B-PAYMENTBYERPHONE
150,5,1,12,1,1,11,538,981,101,9,74.419983,02084667172,908,897,02084667172,O
151,5,1,12,1,1,12,652,981,32,9,94.912079,fax,912,909,fax,B-PAYMENTBYERFAX
152,5,1,12,1,1,13,694,981,33,9,95.495956,0208,917,913,0208,I-PAYMENTBYERFAX
153,5,1,12,1,2,1,367,997,58,9,76.755676,4666331,925,918,4666331,I-PAYMENTBYERFAX
154,5,1,12,1,3,1,274,1013,54,9,93.22303,e-mail,932,926,e,O
155,5,1,12,1,3,2,339,1012,181,13,66.468765,lettings@alandemaid.co.uk,958,933,lettings@alandemaid.co.uk,B-PAYMENTBYEREMAIL


In [18]:
### Bounding Box

bb_df = dataframe_info.query("label != 'O' ")
img = image.copy()

for x,y,w,h,label in bb_df[['left','top','width','height','label']].values:
    x = int(x)
    y = int(y)
    w = int(w)
    h = int(h)
    
    cv2.rectangle(img,(x,y),(x+w,y+h),(0,255,0),2)
    cv2.putText(img,str(label),(x,y),cv2.FONT_HERSHEY_PLAIN,1,(255,0,255),2)
    
    
cv2.imshow('Predictions',img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [19]:
bb_df['label'] = bb_df['label'].apply(lambda x: x[2:])
bb_df.head()

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
0,5,1,1,1,1,1,270,92,83,30,96.573158,alan,4,0,alan,PAYMENTCOMPANYNAME
1,5,1,1,1,1,2,370,92,45,30,96.573158,de,7,5,de,PAYMENTCOMPANYNAME
2,5,1,1,1,1,3,432,92,89,30,96.904709,maid,12,8,maid,PAYMENTCOMPANYNAME
10,5,1,4,1,2,7,586,225,67,12,92.514702,23/05/2024,66,56,23/05/2024,REQUIREDDATE
14,5,1,4,1,3,4,588,236,92,29,55.614967,bros-16009551,99,86,bros-16009551,FAULTDETAIL


In [20]:
# group the label
class groupgen():
    def __init__(self):
        self.id = 0
        self.text = ''
        
    def getgroup(self,text):
        if self.text == text:
            return self.id
        else:
            self.id +=1
            self.text = text
            return self.id
        
grp_gen = groupgen()

bb_df['group'] = bb_df['label'].apply(grp_gen.getgroup)

# right and bottom of bounding box
bb_df[['left','top','width','height']] = bb_df[['left','top','width','height']].astype(int)
bb_df['right'] = bb_df['left'] + bb_df['width']
bb_df['bottom'] = bb_df['top'] + bb_df['height']

# tagging: groupby group
col_group = ['left','top','right','bottom','label','token','group']
group_tag_img = bb_df[col_group].groupby(by='group')

img_tagging = group_tag_img.agg({
    
    'left':min,
    'right':max,
    'top':min,
    'bottom':max,
    'label':np.unique,
    'token':lambda x: " ".join(x)
    
})

img_tagging


Unnamed: 0_level_0,left,right,top,bottom,label,token
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,270,521,92,122,[PAYMENTCOMPANYNAME],alan de maid
2,586,653,225,237,[REQUIREDDATE],23/05/2024
3,588,680,236,265,[FAULTDETAIL],bros-16009551
4,184,221,257,265,[INSTRUCTIONNOTES],386
5,503,556,265,275,[PAYMENTPONUMBER],‘
6,585,747,265,278,[PAYMENTBYERNAME],jeri davison bromley pmc
7,184,206,282,290,[PAYMENTBYERADDRESS],bg4
8,209,622,282,315,[ACCESSKEY],sez tenant
9,132,333,374,385,[FAULTDETAIL],execute the appliance
10,186,247,409,418,[INSTRUCTIONNOTES],11 hilda vale


In [21]:
img_bb = image.copy()
for l, r, t, b, label, token in img_tagging.values:
    cv2.rectangle(img_bb, (l, t), (r, b), (0, 255, 0), 2)
    
    # Ensure label is a string
    cv2.putText(img_bb, str(label), (l, t), cv2.FONT_HERSHEY_PLAIN, 1, (255, 0, 255), 2)

cv2.imshow('Bounding Box workorder', img_bb)
cv2.waitKey(0)
cv2.destroyAllWindows()


In [23]:
#parser
def parser(text, label):
    if label in ['[PAYMENTBYERPHONE]']:
        # Standardize phone numbers (keep digits only)
        text = re.sub(r'\D', '', text)
        
    elif label in ['[PAYMENTBYEREMAIL]']:
        # Standardize email (keep alphanumeric, @, ., _, and -)
        text = text.lower()
        allow_special_char = '@_.\-'
        text = re.sub(r'[^A-Za-z0-9{}]'.format(allow_special_char), '', text)
        
    elif label == 'WEB':
        # Standardize URLs (keep common URL characters)
        text = text.lower()
        allow_special_char = ':/.%#\-'
        text = re.sub(r'[^A-Za-z0-9{}]'.format(allow_special_char), '', text)
        
    elif label in ['[PAYMENTBYERNAME]', '[SHIPPINGNAME]', '[PAYMENTCOMPANYNAME]']:
        # Standardize names (convert to title case, allow only alphabet and spaces)
        text = text.lower()
        text = re.sub(r'[^a-z ]', '', text)
        text = text.title()
        
    elif label in ['[PAYMENTCOMPANYNAME]', '[PAYMENTBYERADDRESS]', '[SHIPPINGSTREET]']:
        # Standardize organization names and addresses (allow alphanumeric and spaces)
        text = text.lower()
        text = re.sub(r'[^a-z0-9 ]', '', text)
        text = text.title()
        
    elif label in ['[REQUIREDDATE]']:
        # For date, ensure it's in a valid format (e.g., DD/MM/YYYY)
        # You can add a date parser to validate the date if needed
        text = text.strip()  # Simple clean up
        
    else:
        # General clean up for other labels (remove extra spaces)
        text = text.strip()
    
    return text


In [27]:
# Example data
#test_data = [
#    ("0208 4666331", "[PAYMENTBYERPHONE]"),
#    ("lettings@alandemaid.co.uk", "[PAYMENTBYEREMAIL]"),
#    ("Alan de Maid", "[PAYMENTCOMPANYNAME]"),
#    ("23/05/2024", "[REQUIREDDATE]"),
#    ("Bromley ADM and Alan de Maid", "[PAYMENTCOMPANYNAME]"),
#]

#for text, label in test_data:
#    cleaned_text = parser(text, label)
#    print(f"Label: {label}, Parsed: {cleaned_text}")


In [None]:
#### Entities

info_array = dataframe_info[['token','label']].values
entities = dict(PAYMENTBYERPHONE=[],ORG=[],DES=[],PHONE=[],EMAIL=[],WEB=[])
previous = 'O'

for token, label in info_array:
    bio_tag = label[0]
    label_tag = label[2:]
    
    # step -1 parse the token
    text = parser(token,label_tag)
    
    if bio_tag in ('B','I'):
        
        if previous != label_tag:
            entities[label_tag].append(text)
            
        else:
            if bio_tag == "B":
                entities[label_tag].append(text)
                
            else:
                if label_tag in ("NAME",'ORG','DES'):
                    entities[label_tag][-1] = entities[label_tag][-1] + " " + text
                    
                else:
                    entities[label_tag][-1] = entities[label_tag][-1] + text
                    
    
    
    previous = label_tag

entities

In [28]:
info_array = dataframe_info[['token','label']].values
# Updated entities dictionary to include all relevant entity types
entities = dict(
    PAYMENTBYERPHONE=[], 
    PAYMENTBYEREMAIL=[], 
    PAYMENTCOMPANYNAME=[], 
    REQUIREDDATE=[], 
    PAYMENTBYERNAME=[], 
    PAYMENTBYERADDRESS=[], 
    SHIPPINGNAME=[], 
    SHIPPINGSTREET=[], 
    FAULTDETAIL=[], 
    INSTRUCTIONNOTES=[], 
    ACCESSKEY=[], 
    PAYMENTPONUMBER=[], 
    WEB=[], 
    ORG=[], 
    DES=[]
)

previous = 'O'

# Iterate over info_array (which contains token and label)
for token, label in info_array:
    bio_tag = label[0]  # 'B' or 'I' tag
    label_tag = label[1:-1]  # Extract label from brackets

    # Ensure label_tag matches keys in entities dictionary
    if label_tag not in entities:
        print(f"Warning: Unrecognized label tag '{label_tag}' encountered.")
        continue

    # Step 1: Parse the token based on its label
    text = parser(token, label)  # Use parser to clean the text

    if bio_tag in ('B', 'I'):  # If token is part of a named entity (B/I tags)
        if previous != label_tag:
            # Add new token to the corresponding entity
            entities[label_tag].append(text)
        else:
            if bio_tag == "B":
                # Start of a new entity (B tag)
                entities[label_tag].append(text)
            else:
                # Continue the entity (I tag)
                if label_tag in ('NAME', 'ORG', 'DES', 'PAYMENTCOMPANYNAME', 'PAYMENTBYERNAME'):
                    # Concatenate tokens with space for specific labels
                    entities[label_tag][-1] = entities[label_tag][-1] + " " + text
                else:
                    # Concatenate without space for labels like phone numbers, etc.
                    entities[label_tag][-1] = entities[label_tag][-1] + text

    previous = label_tag  # Track the previous label tag for proper entity joining

# Final entities output
print(entities)

{'PAYMENTBYERPHONE': [], 'PAYMENTBYEREMAIL': [], 'PAYMENTCOMPANYNAME': [], 'REQUIREDDATE': [], 'PAYMENTBYERNAME': [], 'PAYMENTBYERADDRESS': [], 'SHIPPINGNAME': [], 'SHIPPINGSTREET': [], 'FAULTDETAIL': [], 'INSTRUCTIONNOTES': [], 'ACCESSKEY': [], 'PAYMENTPONUMBER': [], 'WEB': [], 'ORG': [], 'DES': []}
