In [None]:
'''
    File name: Parsing client information from utility bills in various formats.
    Author: Xiangyang Meng
    Date created: 6/2/2021
    Version:1.0
    License:MIT License
    Email: xmeng1@umbc.edu
'''

In [1]:
from PyPDF2 import PdfFileReader
import re
import os
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfpage import PDFPage
import pdfminer
import nltk 


In [2]:
# Folder that holds the sample bills (relative to the notebook's working directory)
current_dir='Datasource/'
# Names of the entries in that folder whose name ends with ".pdf"
doc = list(filter(lambda entry: entry.endswith(".pdf"), os.listdir(current_dir)))

In [3]:
#Function of getting the metadata,such as author, creation date, and modification date

def extract_information(pdf_path):
    """Print and return the document-information metadata of one PDF.

    The author, creation date and modification date are printed; a field
    that the PDF does not provide is printed as ``None`` instead of
    raising.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to open.

    Returns
    -------
    The object returned by ``PdfFileReader.documentInfo``. This may be
    ``None`` when the PDF carries no info dictionary (per the PyPDF2
    documentation -- confirm for the installed version).
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        if pdf.isEncrypted:
            # Attempt to unlock with the empty password.
            pdf.decrypt('')

        information = pdf.documentInfo

        def _field(key):
            # Use item access (not dict.get) so the value is read the same way
            # the original code read it, but tolerate a missing key / no info.
            if information is None or key not in information:
                return None
            return information[key]

        # Read every value while the file is still open.
        author = information.author if information is not None else None
        creation_date = _field('/CreationDate')
        modification_date = _field('/ModDate')

    txt = f"""
    Information about {pdf_path}: 
    Author: {author}
    Creation date:{creation_date}
    Modification date: {modification_date}
    """

    print(txt)
    return information

In [4]:
#Print the metadata (author, creation date, and modification date) of every pdf found in the data directory
for i in doc:
    pdf_path = ''.join((current_dir, i))
    print(i)
    information = extract_information(pdf_path)

104191-Sample Bill.pdf

    Information about Datasource/104191-Sample Bill.pdf: 
    Author: None
    Creation date:D:20191209101020-05'00'
    Modification date: D:20191209101522-05'00'
    
ccrutilitybillsample.pdf

    Information about Datasource/ccrutilitybillsample.pdf: 
    Author: becky
    Creation date:D:20130329152445-04'00'
    Modification date: D:20130405154117-04'00'
    
DocOrigin_Utility_Bill_Tyler-Technologies.pdf

    Information about Datasource/DocOrigin_Utility_Bill_Tyler-Technologies.pdf: 
    Author: None
    Creation date:D:20150421085458-04'00'
    Modification date: D:20150424084953-04'00'
    
National-Grid-1.pdf

    Information about Datasource/National-Grid-1.pdf: 
    Author: roach
    Creation date:D:20060119095607-05'00'
    Modification date: D:20060123095406-05'00'
    
OhioSampleBilll.pdf

    Information about Datasource/OhioSampleBilll.pdf: 
    Author: s133375
    Creation date:D:20180521103041-04'00'
    Modification date: D:20180521105823-04'0



In [5]:
#Function of identifying the next word
def nextword(target, source):
    """Return the element that follows ``target`` in ``source``.

    The first occurrence of ``target`` that has a successor is used.
    Returns ``None`` when ``target`` is absent or only occurs as the last
    element.

    The end-of-sequence guard is positional: an earlier occurrence is still
    honoured when the last element happens to equal ``target`` as well.
    """
    last_pos = len(source) - 1
    for i, w in enumerate(source):
        if w == target and i < last_pos:
            return source[i+1]
    return None
    

In [6]:
#Function of identifying the previous word
def previousword(target, source):
    """Return the element located just before the first occurrence of ``target``.

    Returns ``None`` when ``target`` does not occur in ``source``.
    Note: when the first match sits at position 0, the lookup uses index -1,
    so the last element of ``source`` is returned.
    """
    matches = (source[pos-1] for pos, item in enumerate(source) if item == target)
    return next(matches, None)


In [7]:
#Function of finding names
#The first number in the tokenized textbox should be the street number or PO Box number, so the words before the first number
#or PO Box should be the names. 
def name(num):
    """Guess the person/company name from a tokenized address block.

    ``num`` is a list of string tokens. Each all-digit token is treated as a
    candidate street number or PO Box number, in order:

    * if the token right before it is ``'Box'``/``'BOX'``, the name is taken
      from the tokens preceding the two tokens in front of the number
      (assumed to be ``PO Box`` -- the ``PO`` token itself is not checked);
    * otherwise the name is taken from the tokens right before the number.

    The name is ``first last`` or, when the token before the last name is a
    single character (a middle initial) and a token exists before it,
    ``first initial last``. A candidate with fewer than two tokens available
    for the name is skipped and the next numeric token is tried.

    Returns the name string, or ``None`` when no candidate qualifies.
    """
    for idx, tok in enumerate(num):
        if not tok.isdigit():
            continue
        # enumerate gives the real position of this token, even when the same
        # number appears more than once in the list.
        after_po_box = idx > 0 and num[idx-1] in ('Box', 'BOX')
        last_idx = idx - 3 if after_po_box else idx - 1
        first_idx = last_idx - 1
        if first_idx < 0:
            # Not enough tokens in front of the number; never let a negative
            # index wrap around to the end of the list.
            continue
        ln = num[last_idx]
        if len(num[first_idx]) == 1 and first_idx >= 1:
            # Single-character token is taken as a middle initial.
            mn = num[first_idx]
            fn = num[first_idx-1]
            return fn+' '+mn+' '+ln
        fn = num[first_idx]
        return fn+' '+ln
    return None




In [8]:
#Function of finding the street address index
def street(num):
    """Return the position of the first all-digit token in ``num``.

    Returns ``None`` when no token consists solely of digits.
    """
    digit_positions = (pos for pos, tok in enumerate(num) if tok.isdigit())
    return next(digit_positions, None)

In [9]:
#strip every lowercase ASCII letter (a-z) from the text
def remove_lower(text):
    """Return ``text`` with all characters in the range a-z removed."""
    return re.sub('[a-z]', '', text)


Intuition behind identifying the name and address in the following code:
1. Identify the textbox or container that holds the zipcode. The name and address are in the same textbox or container.
2. Tokenize the identified text.
3. Remove certain punctuation marks and meaningless words.
4. Remove words made of only lowercase letters, since names and addresses should contain uppercase letters.
5. Find the state abbreviation.
6. Some addresses do not have a state abbreviation, so look for US or USA instead.
7. Find the first number in the tokenized list. The first number should be the street number or the PO Box number.
8. Identify the PO Box.
9. Otherwise, the text between the first number and the state abbreviation (or US/USA) should be the local address.
10. The text before the first number (the street number) is the name.
11. For some PDFs the name or address cannot be parsed because the layout is disordered.


In [10]:
#Parse the address and owner name

# A text box is treated as a name/address block when it contains a 5-digit or
# ZIP+4 code, or the word USA. Raw strings are required here: in a normal
# string literal '\b' is a backspace character, not a regex word boundary.
ADDRESS_MARKERS = (
    re.compile(r'\s\d{5}-\d{4}\s'),
    re.compile(r'\s\d{5}\s'),
    re.compile(r'USA\b'),
    re.compile(r'\s\d\d\d\d\d\b'),
    re.compile(r'\s\d{5}-\d{4}\b'),
)


def parse_obj(lt_objs):
    """Walk pdfminer layout objects and print name/address details.

    Horizontal text boxes whose text matches one of ADDRESS_MARKERS are
    tokenized and inspected; LTFigure containers are searched recursively.
    Results are printed, nothing is returned.
    """
    # loop over the object list
    for obj in lt_objs:
        # if it's a textbox, look for an address inside it
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            text=obj.get_text() # name and address are in the same textbox or container
            if any(marker.search(text) for marker in ADDRESS_MARKERS):
                token=nltk.word_tokenize(text) #tokenize the identified information (mainly name and address, but also other miscellaneous info)
                token=[word for word in token if word not in (',','.',':','ADDRESS')]#remove certain punctuations or word
                token = [word for word in token if not word.islower()] #remove all-lowercase words since name and address should have uppercase letters
                try:
                    print('Name:',name(token))
                except Exception:
                    # Best effort: report the tokens that could not be handled
                    # without hiding KeyboardInterrupt/SystemExit.
                    print('There is an error in name token:'+str(token))

                for word in token:
                    con1=re.findall('^[A-Z]{2}$',word)
                    if con1 and word not in ('US', 'PO','ST','RD') and 'PO' not in token:#find state abbreviation
                        state=word
                        state_index=token.index(word) #state abbreviation index
                        street_address=token[street(token):state_index]# the text between the first number and state abbreviation should be street address
                        print('Street Address:',' '.join (street_address))
                        print('State:',state)
                        zipcode=nextword(state, token)#the following word of state abbreviation is zipcode
                        print('Zipcode:',zipcode,'\n')

                    if word in ('US','USA') and len(previousword(word,token))!=2:#some address does not contain state abbreviation, but has US or USA
                        street_address=token[street(token):token.index(word)] #identifying street address
                        print('Street Address:',' '.join (street_address))
                        print('Country：US')
                        zipcode=nextword(word, token)
                        print('Zipcode:',zipcode,'\n')

                    if word =='PO':
                        PO_BOX=word,nextword(word,token),nextword(nextword(word,token),token)
                        print('PO BOX:',' '.join (PO_BOX))
                        # NOTE(review): street(token) is None when the block has
                        # no numeric token; the +1 below would then raise.
                        city=token[street(token)+1]
                        print('City:',city)
                        state=token[street(token)+2]
                        print('State',state,'\n')

        # if it's a container, recurse
        elif isinstance(obj, pdfminer.layout.LTFigure):
            parse_obj(obj._objs)


for i in doc:
    print('\033[1m'+i+'\033[0m')
    pdf_path=current_dir+i
    # Keep the file open for the whole page loop and close it afterwards,
    # including when the document is skipped.
    with open(pdf_path, 'rb') as fp:
        # Create a PDF parser
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, skip it.
        if not document.is_extractable:
            print('is not extractable!\n\n')
            continue
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for layout analysis.
        laparams = LAParams()
        # Create a PDF page aggregator object (the device that collects the layout).
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            # read the page into a layout object
            interpreter.process_page(page)
            layout = device.get_result()

            # extract text from this object
            parse_obj(layout._objs)

[1m104191-Sample Bill.pdf[0m
[1mccrutilitybillsample.pdf[0m
[1mDocOrigin_Utility_Bill_Tyler-Technologies.pdf[0m
Name: JOHN Q
Street Address: 123 ANYSTREET ANYWHERE
Country：US
Zipcode: 12345 

Name: JOHN Q SAMPLE
Street Address: 123 ANYSTREET ANYWHERE
Country：US
Zipcode: 12345 

Name: City TylerTown
Street Address: 123 Anystreet Anywhere
Country：US
Zipcode: 12345-1234 

[1mNational-Grid-1.pdf[0m
is not extractable!


[1mOhioSampleBilll.pdf[0m
Name: None
PO BOX: PO BOX 24401
City: CANTON
State OH 

Name: JANE SMITH
Street Address: 123 MAIN ST ANY CITY
State: OH
Zipcode: 43999-9999 

Name: JANE SMITH
Street Address: 123 MAIN ST ANY CITY
State: OH
Zipcode: 43999-9999 

Name: None
PO BOX: PO Box 24417
City: Canton
State OH 

Name: JANE SMITH
Street Address: 123 MAIN ST ANY CITY
State: OH
Zipcode: 43999-9999 

Name: Inquiries To
PO BOX: PO BOX 24401
City: CANTON
State OH 

Name: ELECTRIC POWER
PO BOX: PO BOX 24417
City: CANTON
State OH 

Name: None
Street Address: ANY CITY
State: O