In [1]:
import os
import glob
import re
import pandas as pd
import pypandoc
from tqdm.notebook import tqdm
from datetime import datetime

In [None]:
# Directory that is searched for the input .docx and .txt files.
# It is a relative path, so it is resolved against the current working directory.
data = 'data'

In [None]:
# Street-type abbreviations and their expanded forms. prep_text applies them
# as plain substring replacements, in the order listed here.
# Keys that begin with a space only match when the abbreviation follows a
# space; their replacements keep that space so the preceding word is not
# glued onto the expansion. Hyphenated prefixes expand without adding any
# whitespace after the hyphen.
street_abbrev = {
    'boul.': 'boulevard',
    'St-': 'Saint-',
    'Nve-': 'Neuve-',
    ' pass.': ' passage',
    ' av.': ' avenue',
    'Faub.': 'Faubourg',
}

In [None]:
def prep_text(file, encoding='utf-8', abbreviations=None):
    """Read one source file and split its text into candidate business entries.

    Parameters
    ----------
    file : str
        Path of the file to read. A file whose extension starts with ``.doc``
        (case-insensitive, e.g. ``.docx``) is converted to plain text with
        pypandoc; any other file is read as text.
    encoding : str, optional
        Encoding used when reading a text file. Defaults to UTF-8 so accented
        characters decode the same way on every platform.
    abbreviations : dict, optional
        Mapping of abbreviation -> expansion, applied as plain substring
        replacements in iteration order. Defaults to the module-level
        ``street_abbrev``.

    Returns
    -------
    list of str
        The pieces of text obtained by splitting after each street-number-like
        token, each stripped of surrounding whitespace. Empty pieces are kept.
    """
    # decide on the real file extension rather than a substring of the whole
    # path, so a directory or file name that merely contains '.doc' is not
    # sent to pandoc
    extension = os.path.splitext(file)[1].lower()
    if extension.startswith('.doc'):
        fulltext = pypandoc.convert_file(file, 'plain')
    else:
        with open(file, encoding=encoding) as f:
            fulltext = f.read()

    if abbreviations is None:
        abbreviations = street_abbrev

    # remove the section heading and turn '«' into a comma
    fulltext = fulltext.replace('FLEURS NATURELLES.', '').replace('«', ',')
    # replace line breaks with spaces so entries wrapped over lines are rejoined
    fulltext = ' '.join(fulltext.split('\n'))

    # update street abbreviations
    for short, expanded in abbreviations.items():
        fulltext = fulltext.replace(short, expanded)

    # Split right after a 1-3 digit number that is preceded by whitespace
    # (usually a street number) when the next character is punctuation,
    # whitespace or a capital letter. That next character is the separator
    # and is dropped from the output. The pattern is a raw string so the
    # regex escapes are not interpreted as (invalid) string escapes.
    bus_list = re.split(r'(?:(?<=\s\d)|(?<=\s\d{2})|(?<=\s\d{3}))[\.\:\,\-\sA-Z]', fulltext)

    # remove extra white space
    return [entry.strip() for entry in bus_list]

In [None]:
def fix_street(street):
    """Normalise a street name extracted from a directory entry.

    Parameters
    ----------
    street : str
        Raw street text, typically one comma-separated field of an entry
        (so it often starts with the space that followed the comma).

    Returns
    -------
    str
        The street with spaces around hyphens removed, surrounding whitespace
        stripped, and ``'rue '`` prepended when only a single word remains.
        An empty or whitespace-only input yields an empty string.
    """
    # common problem to have an extra space on either side of a hyphen
    street = street.replace('- ', '-').replace(' -', '-')

    # drop surrounding whitespace; otherwise the leading space left over from
    # comma-splitting ends up inside the result ('rue  Name')
    street = street.strip()

    # default street type is 'rue' if nothing else is present
    # this will catch most of them
    if len(street.split()) == 1:
        street = 'rue {}'.format(street)

    return street

In [None]:
def parse_line(line, file, df):
    """Parse one business entry and return ``df`` with the parsed row appended.

    Parameters
    ----------
    line : str
        One entry, expected to look like ``name, street, number``.
    file : str
        Path of the file the entry came from; stored in the ``source`` column.
    df : pandas.DataFrame
        Frame collecting the parsed rows. It is not modified in place.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``line`` is blank, otherwise a new frame consisting
        of ``df`` plus one row. The row always has ``line`` and ``source``;
        ``name``, ``street``, ``number`` and ``extra text`` are filled
        according to how many fields the entry splits into. Parsed fields are
        stripped of surrounding whitespace; ``line`` is stored unchanged.
    """
    if line.strip() == '':
        return df

    # first try to split on commas to see if there are three values
    parts = [part.strip() for part in line.split(',')]
    # next try - commas are often OCRed as periods
    parts_alt = [part.strip() for part in line.replace('.', ',').split(',')]

    record = {'line': line, 'source': file}

    if len(parts) == 3:
        record.update({'name': parts[0],
                       'street': fix_street(parts[1]),
                       'number': parts[2]})
    elif len(parts_alt) == 3:
        record.update({'name': parts_alt[0],
                       'street': fix_street(parts_alt[1]),
                       'number': parts_alt[2]})
    elif len(parts) >= 2:
        # more (or fewer) than three fields: the last two are taken as street
        # and number, anything between the name and the street is kept as
        # extra text
        # NOTE(review): with exactly two fields the first one is used both as
        # name and as street - confirm that this is intended
        record.update({'name': parts[0],
                       'street': fix_street(parts[-2]),
                       'number': parts[-1],
                       'extra text': ', '.join(parts[1:-2])})
    else:
        # no separator at all: keep the whole entry as the name
        record['name'] = line.strip()

    # DataFrame.append was removed in pandas 2.0; concat is the supported way
    # to add a row and keeps the existing column order of df
    return pd.concat([df, pd.DataFrame([record])], ignore_index=True)

In [None]:
# set empty dataframe with needed columns
df = pd.DataFrame(columns=['line','name','number','street','extra text','source'])

# gather all .docx and .txt files to process (.docx first, then .txt);
# glob returns matches in arbitrary order, so each group is sorted to give
# the same row order on every run
allfiles = sorted(glob.glob('{}/*.docx'.format(data))) + sorted(glob.glob('{}/*.txt'.format(data)))

# process each input file in the directory
for file in tqdm(allfiles):

    # read the file and split its text into candidate entries
    bus_list = prep_text(file)

    # parse every entry into one row of the dataframe
    for line in bus_list:
        df = parse_line(line,file,df)

In [None]:
# preview up to three random rows; capping the size at len(df) avoids the
# ValueError that sample() raises when fewer than three rows were parsed
df.sample(min(3, len(df)))

In [None]:
# timestamp of the current time (YYYYMMDDHHMMSS), used in the export file
# name so successive exports get different names
date = datetime.now().strftime('%Y%m%d%H%M%S')

# write out to Excel; the export directory is created first because
# to_excel does not create missing parent directories
os.makedirs('export', exist_ok=True)
df.to_excel('export/business_list_{}.xlsx'.format(date), index=False)