Import libraries

In [1]:
#data loading and manipulation
import pandas as pd
import numpy as np

#for working with file paths
from os import path

#regular expressions
import re

#punctuation
import string

#spelling
#import enchant

In [2]:
# Let pandas render up to 1000 rows before truncating displayed output
pd.options.display.max_rows = 1000

Load exempt data
* AQ06_where_data.csv is AQ06s submitted from 11/1/2016 to 11/15/2016
* ANF_where_data.csv is ANFs submitted from 11/1/2016 to 11/15/2016

In [3]:
# Resolve both export files inside the local data/ folder, then read each
# one into its own DataFrame (AQ-06 first, ANF second).
aq06_path, anf_path = (
    path.join('data', 'AQ06_where_data.csv'),
    path.join('data', 'ANF_where_data.csv'),
)
aq06, anf = (pd.read_csv(csv_path) for csv_path in (aq06_path, anf_path))

In [4]:
# Report the (rows, columns) size of the AQ-06 frame, then preview its first five rows
print(aq06.shape)
aq06.head(5)

(372, 7)


Unnamed: 0,ADDRESS,FORMTYPE,STICKER.,CONTRACTOR,STARTDATE,ENDDATE,EXPORTDATE
0,"363 HIGHLAND AVENUE, FALL RIVER, MA",AQ-06,100179693R1,GO SERVICES,7/1/2013 0:00:00,12/31/2018 0:00:00,12/9/2016 15:07:04
1,"154-160 GROVE STREET, CHICOPEE, MA",AQ-06,100211920R7,"COSTELLO DISMANTLING COMPANY, INC.",12/29/2014 0:00:00,12/23/2016 0:00:00,12/9/2016 15:07:04
2,"8 NEWBURY STREET, BOSTON, MA",AQ-06,100216252R2,P&R PARTNERS CONSTRUCTION,11/5/2016 0:00:00,11/5/2017 0:00:00,12/9/2016 15:07:04
3,"19,21,23,25 DRYDOCK AVE., BOSTON, MA",AQ-06,100218033R2,"NASDI, LLC",4/20/2015 0:00:00,5/1/2017 0:00:00,12/9/2016 15:07:04
4,"ONE FINANCIAL CENTER, 11TH FLOOR, BOSTON, MA",AQ-06,100225105,CORDERMAN & COMPANY,10/31/2016 0:00:00,12/31/2016 0:00:00,12/9/2016 15:07:04


In [5]:
# Report the (rows, columns) size of the ANF frame, then preview its first five rows
print(anf.shape)
anf.head(5)

(4486, 7)


Unnamed: 0,ADDRESS,FORMTYPE,STICKER.,CONTRACTOR,STARTDATE,ENDDATE,EXPORTDATE
0,"115-119 HEMENWAY ST, BOSTON, MA",ANF-001,100253272,NATIONAL ABATEMENT INC,11/5/2016 0:00:00,11/30/2016 0:00:00,12/9/2016 15:06:29
1,"700 SOUTH STREET, PITTSFIELD, MA",ANF-001,100253330,OMNI ENVIRONMENTAL LLC,11/2/2016 0:00:00,1/13/2017 0:00:00,12/9/2016 15:06:29
2,"7 EDGAR TERRACE, SOMERVILLE, MA",ANF-001,100253332,NATIONAL ABATEMENT INC,11/2/2016 0:00:00,11/2/2016 0:00:00,12/9/2016 15:06:29
3,"238 S MAIN STREET, FREETOWN, MA",ANF-001,100236585R11,BANNER ENVIRONMENTAL SERVICES INC,10/26/2016 0:00:00,11/2/2016 0:00:00,12/9/2016 15:06:29
4,"5 WOODFIN TERRACE, MARBLEHEAD, MA",ANF-001,100251415R2,AIR QUALITY EXPERTS INC,10/17/2016 0:00:00,10/28/2016 0:00:00,12/9/2016 15:06:29


Check for and reconcile missing data

In [6]:
# Number of missing values in each AQ-06 column
pd.isnull(aq06).sum()

ADDRESS       0
FORMTYPE      0
STICKER.      0
CONTRACTOR    0
STARTDATE     0
ENDDATE       0
EXPORTDATE    0
dtype: int64

In [7]:
# Show the AQ-06 rows whose FORMTYPE is missing
aq06.loc[aq06['FORMTYPE'].isnull()]

Unnamed: 0,ADDRESS,FORMTYPE,STICKER.,CONTRACTOR,STARTDATE,ENDDATE,EXPORTDATE


In [8]:
def replace_null(df,col_name,value):
    """Fill the missing entries of one column, modifying ``df`` in place.

    df       -- DataFrame to update
    col_name -- label of the column whose nulls are filled
    value    -- replacement written wherever the column is null

    Returns None; the filled column is assigned back onto ``df``.
    """
    filled = df[col_name].fillna(value)
    df[col_name] = filled

# Any AQ-06 row with a missing FORMTYPE is labelled 'AQ-06'
replace_null(aq06, col_name='FORMTYPE', value='AQ-06')

In [9]:
# Number of missing values in each ANF column
pd.isnull(anf).sum()

ADDRESS       0
FORMTYPE      0
STICKER.      0
CONTRACTOR    0
STARTDATE     0
ENDDATE       0
EXPORTDATE    0
dtype: int64

In [10]:
# Stack the AQ-06 and ANF records into one frame with a single concat
# instead of growing an empty DataFrame via repeated DataFrame.append.
# ignore_index=True gives every row a unique label; without it the row
# labels of aq06 and anf (both numbered from 0) would be duplicated.
open_today = pd.concat([aq06, anf], ignore_index=True)
open_today.shape

(4858, 7)

In [11]:
#check for rows whose CONTRACTOR value is exactly 'SAME'
is_same = open_today['CONTRACTOR'] == 'SAME'
print(open_today.loc[is_same])

Empty DataFrame
Columns: [ADDRESS, FORMTYPE, STICKER., CONTRACTOR, STARTDATE, ENDDATE, EXPORTDATE]
Index: []


In [12]:
#tally how often each FORMTYPE value occurs
formtype_counts = open_today['FORMTYPE'].value_counts()
print(formtype_counts)

ANF-001    4486
AQ-06       372
Name: FORMTYPE, dtype: int64


In [13]:
#clean up FORMTYPE: any value containing the marker is rewritten to the short label
for marker, label in (('06', 'aq06'), ('001', 'anf001')):
    # the mask is recomputed per marker so it reflects the previous rewrite
    matches = open_today['FORMTYPE'].str.contains(marker)
    open_today.loc[matches, 'FORMTYPE'] = label

Remove punctuation from all text fields and convert to uppercase

In [14]:
#copy CONTRACTOR to CON to preserve original name; all edits to be done on CON
# A single column assignment is enough: the former loop iterated over the
# frame's column labels and repeated this same assignment on every pass.
open_today['CON'] = open_today['CONTRACTOR']

In [15]:
#count unique CON, then list how many rows carry each value
con_counts = open_today['CON'].value_counts()
print(len(con_counts))
con_counts

330


AIR SAFE  INC                                  208
FJC ENTERPRISES INC                            208
DEC-TAM CORPORATION                            189
ABIDE INC                                      133
NON LICENSED REMOVAL                           125
NEW ROADS ENVIRONMENTAL SERVICES LLC           123
NORTHSTAR CONTRACTING GROUP INC                121
ENVIRONMENTAL RESPONSE SERVICES  INC           105
NATIONAL ABATEMENT INC                          99
ACME ABATEMENT CONTRACTOR INC                   93
BANNER ENVIRONMENTAL SERVICES INC               92
GREEN ENVIRONMENTAL                             88
S & S ABATEMENT LLC                             86
WALSH ENVIRONMENTAL SERVICES                    85
NEW ENGLAND SURFACE MAINTENANCE LLP             82
A & E ENVIRONMENTAL INC                         81
MORAN ENVIRONMENTAL RECOVERY LLC                73
NORTHEAST REMEDIATION                           72
COMPASS RESTORATION SERVICE SERVICES  LLC       69
CLEAN AIR ENVIRONMENTAL INC    

In [16]:
def upper_no_punct(df,col_name):
    """Strip ASCII punctuation from a text column and upper-case it, in place.

    df       -- DataFrame to update
    col_name -- label of the text column to normalize

    Every character listed in ``string.punctuation`` is deleted (not replaced
    by a space), then the value is upper-cased. Missing values are left as
    they are. Returns None; the cleaned column is assigned back onto ``df``.

    The column is transformed in one pass rather than issuing one
    whole-column replace per row, and the regex-based removal works for both
    byte and unicode strings (``str.translate(None, ...)`` only accepts
    Python 2 byte strings).
    """
    # Character class matching any single punctuation character.
    punct_re = re.compile('[%s]' % re.escape(string.punctuation))

    def _clean(field):
        if pd.isnull(field):
            return field
        return punct_re.sub('', field).upper()

    df[col_name] = df[col_name].map(_clean)

# Normalize the CON column of the combined frame in place
upper_no_punct(open_today, col_name='CON')
# NOTE(review): the figures below look like unique-CON counts before --> after
# from earlier runs -- confirm before relying on them
#370-->341 aq06 only
#502-->463 aq06 + anf

In [17]:
#Replaces 'pattern' in 'string' with 'sub' if 'pattern' starts 'string'
def lreplace(pattern, sub, string):
    """Substitute ``sub`` for ``pattern`` only where it begins ``string``.

    ``pattern`` is interpolated unescaped into a regular expression anchored
    with ``^``. Returns the resulting string; ``string`` is returned unchanged
    when it does not start with a match.
    """
    anchored = re.compile('^%s' % pattern)
    return anchored.sub(sub, string)

#Replaces 'pattern' in 'string' with 'sub' if 'pattern' ends 'string'
def rreplace(pattern, sub, string):
    """Substitute ``sub`` for ``pattern`` only where it ends ``string``.

    ``pattern`` is interpolated unescaped into a regular expression anchored
    with ``$``. Returns the resulting string; ``string`` is returned unchanged
    when it does not end with a match.
    """
    anchored = re.compile('%s$' % pattern)
    return anchored.sub(sub, string)

In [18]:
#standardize abbreviations and misspellings
def remove_abbrev(df, col_name, old_string, new_string):
    """Replace every occurrence of ``old_string`` with ``new_string`` in a column, in place.

    df         -- DataFrame to update
    col_name   -- label of the text column to rewrite
    old_string -- literal substring to look for (not a regular expression)
    new_string -- text substituted for each occurrence

    Each value is rewritten exactly once, so the outcome does not depend on
    row order (previously a rewritten value could be rewritten again if it
    happened to equal another row's original value) and the work is a single
    pass over the column. Missing values are left as they are. Returns None.

    NOTE(review): matching is by substring, so 'SONS' also matches inside a
    longer word such as 'PARSONS'.
    """
    def _swap(field):
        if pd.isnull(field):
            return field
        return field.replace(old_string, new_string)

    df[col_name] = df[col_name].map(_swap)

#remove abbreviations that appear at the end of the string only (ex: CO at the end, but not CO in CONSTRUCTION)
def remove_ending(df, col_name, ending):
    """Drop a trailing ``ending`` (and the whitespace before it) from a column, in place.

    df       -- DataFrame to update
    col_name -- label of the text column to rewrite
    ending   -- literal suffix to remove

    Only values that end with ``ending`` are touched; one occurrence is
    removed per call and the remainder is right-stripped. Each value is
    rewritten exactly once, so the result does not depend on row order
    (previously 'X INC INC' could collapse all the way to 'X' only when a
    separate 'X INC' row came later in the frame). Missing values are left as
    they are. Returns None.

    NOTE(review): the suffix test is not word-bounded, so a name such as
    'TEXACO' also loses a trailing 'CO'.
    """
    def _strip_ending(field):
        # Leave missing values and names without the suffix untouched.
        if pd.isnull(field) or not field.endswith(ending):
            return field
        # len()-based slice keeps an empty ending from wiping the value.
        return field[:len(field) - len(ending)].rstrip()

    df[col_name] = df[col_name].map(_strip_ending)

In [19]:
#slower?
# Normalization tables for the CON column.
# abbrevs maps each canonical spelling to the tuple of variants rewritten to it.
abbrevs = {
    'AND':('ADN',),
    'BRO':('BROS',
           'BROTHER',
           'BROTHERS'),
    'BUILDING':('BUIDLING',
                'BUILIDNG'),
    'CAULFIELD':('CAULLFIELD',),
    'CONSTRUCTION':('CONSTRCTION',
                    'CONSTRCUTION',
                    'CONSTRUCION',
                    'CONSTRUCTON',
                    'CONSTRUCTUON',
                    'CONSTRUTION',
                    'CONTRUCTION',),
    'CORP':('CORPORATION',
            'CORPORATIO',
            'CORPOTATION',),
    'DESIGN':('DEISGN',),
    'DEMO':('DEMOLIITION',
            'DEMOLITION',
            'DEMOLTION'),
    'ENVIRONMENTAL':('ENVIRONMENATL',
                    'ENVIRONMETAL'),
    'INC':('INCORPORATED',),
    'JK SCANLAN':('JKS',),
    'SHAWMUT':('SAHWMUT',),
    'STRUCTURETONE':('STRUCTURE TONE',
                    'STURUCTURE TONE',),
    'SON':('SONS',),
    'DISMANTLING':('DISMANTLIING',),
}

# spacing uses the same {replacement: (variants,)} layout: double space -> single space
spacing = {
    ' ':('  ',),
}

# Suffixes stripped from the end of CON, tried in this order
endings = [
    'LLC',
    'COINC',
    'INC',
    'COMPANY',
    'CORP',
    'LLP',
    'DESIGN',
    'CO',
    'CONC',
    'MANAGERS',
    'GROUP',
    'SERVICES',
    'SERVICE',
    'AND',
]

# Flatten abbrevs into (variant, canonical) pairs and apply the longest
# variants first. Otherwise 'BROTHER' is rewritten before 'BROTHERS', turning
# 'BROTHERS' into 'BROS' after the 'BROS' rule has already run. Sorting also
# makes the order independent of dict iteration order.
replacements = sorted(
    ((old_string, new_string)
     for new_string, old_strings in abbrevs.items()
     for old_string in old_strings),
    key=lambda pair: (-len(pair[0]), pair[0]),
)
for old_string, new_string in replacements:
    remove_abbrev(open_today, 'CON', old_string, new_string)

for ending in endings:
    remove_ending(open_today, 'CON', ending)

# Collapse double spaces last, after the rewrites above have run
for new_string, old_strings in spacing.items():
    for old_string in old_strings:
        remove_abbrev(open_today, 'CON', old_string, new_string)
#341-->273 aq06 only
#463--382 aq06 + anf

In [20]:
#count unique CON, then list how many rows carry each value
con_counts = open_today['CON'].value_counts()
print(len(con_counts))
con_counts

275


FJC ENTERPRISES                             208
AIR SAFE                                    208
DECTAM                                      192
ABIDE                                       133
NON LICENSED REMOVAL                        125
NEW ROADS ENVIRONMENTAL                     123
NORTHSTAR CONTRACTING                       122
ENVIRONMENTAL RESPONSE                      105
NATIONAL ABATEMENT                           99
ACME ABATEMENT CONTRACTOR                    93
BANNER ENVIRONMENTAL                         92
GREEN ENVIRONMENTAL                          88
S S ABATEMENT                                86
WALSH ENVIRONMENTAL                          85
NEW ENGLAND SURFACE MAINTENANCE              82
THE AULSON                                   81
A E ENVIRONMENTAL                            81
MORAN ENVIRONMENTAL RECOVERY                 73
NORTHEAST REMEDIATION                        72
COMPASS RESTORATION                          69
CLEAN AIR ENVIRONMENTAL                 

#faster?
# Flat lookup: each misspelling/variant maps directly to its canonical spelling.
abbrevs = {
    'ADN':'AND',
    'BROS':'BRO',
    'BROTHER':'BRO',
    'BROTHERS':'BRO',
    'BUIDLING':'BUILDING',
    'BUILIDNG':'BUILDING',
    'CAULLFIELD':'CAULFIELD',
    'CONSTRCTION':'CONSTRUCTION',
    'CONSTRCUTION':'CONSTRUCTION',
    'CONSTRUCION':'CONSTRUCTION',
    'CONSTRUCTON':'CONSTRUCTION',
    'CONSTRUCTUON':'CONSTRUCTION',
    'CONSTRUTION':'CONSTRUCTION',
    'CONTRUCTION':'CONSTRUCTION',
    'CORPORATION':'CORP',
    'CORPORATIO':'CORP',
    'CORPOTATION':'CORP',
    'DEISGN':'DESIGN',
    'DEMOLIITION':'DEMO',
    'DEMOLITION':'DEMO',
    'DEMOLTION':'DEMO',
    'DISMANTLIING':'DISMANTLING',
    'ENVIRONMENATL':'ENVIRONMENTAL',
    'ENVIRONMETAL':'ENVIRONMENTAL',
    'INCORPORATED':'INC',
    'JKS':'JK SCANLAN',
    'SAHWMUT':'SHAWMUT',
    'STRUCTURE TONE':'STRUCTURETONE',
    'STURUCTURE TONE':'STRUCTURETONE',
    'SONS':'SON',
}

# Double space -> single space
spacing = {
    '  ':' ',
}

# Suffixes stripped from the end of CON, tried in this order
endings = [
    'LLC',
    'COINC',
    'INC',
    'COMPANY',
    'CORP',
    'LLP',
    'DESIGN',
    'CO',
    'CONC',
    'MANAGERS',
    'GROUP',
    'SERVICES',
    'SERVICE',
    'AND',
]

# Apply the longest variants first so that 'BROTHERS' is handled before
# 'BROTHER' (which would otherwise leave a stray 'BROS'); sorting also makes
# the order independent of dict iteration order.
for old_string, new_string in sorted(abbrevs.items(),
                                     key=lambda pair: (-len(pair[0]), pair[0])):
    remove_abbrev(open_today, 'CON', old_string, new_string)

for ending in endings:
    remove_ending(open_today, 'CON', ending)

# The loop target must be new_string: it used to be unpacked as new_strings,
# so the call reused the stale new_string left over from the abbrevs loop and
# replaced double spaces with an abbreviation instead of a single space.
for old_string, new_string in spacing.items():
    remove_abbrev(open_today, 'CON', old_string, new_string)
#341-->273 aq06 only
#463--382 aq06 + anf

#standardize abbreviations and misspellings using re.sub
def re_remove_abbrev(df, col_name, old_string, new_string):
    """Regex-substitute ``new_string`` for ``old_string`` in every value of a column, in place.

    df         -- DataFrame to update
    col_name   -- label of the text column to rewrite
    old_string -- regular-expression pattern to look for
    new_string -- replacement passed to ``re.sub``

    The substitution is applied to each field. The earlier version passed
    ``old_string`` as the text to search, so ``new_field`` was the same for
    every row and each value in the column was overwritten with it. Each
    value is rewritten once in a single pass; missing values are left as they
    are. Returns None.
    """
    pattern = re.compile(old_string)

    def _swap(field):
        if pd.isnull(field):
            return field
        return pattern.sub(new_string, field)

    df[col_name] = df[col_name].map(_swap)

#fastest?
# Flat lookup: each misspelling/variant maps directly to its canonical spelling.
abbrevs = {
    'ADN':'AND',
    'BROS':'BRO',
    'BROTHER':'BRO',
    'BROTHERS':'BRO',
    'BUIDLING':'BUILDING',
    'BUILIDNG':'BUILDING',
    'CAULLFIELD':'CAULFIELD',
    'CONSTRCTION':'CONSTRUCTION',
    'CONSTRCUTION':'CONSTRUCTION',
    'CONSTRUCION':'CONSTRUCTION',
    'CONSTRUCTON':'CONSTRUCTION',
    'CONSTRUCTUON':'CONSTRUCTION',
    'CONSTRUTION':'CONSTRUCTION',
    'CONTRUCTION':'CONSTRUCTION',
    'CORPORATION':'CORP',
    'CORPORATIO':'CORP',
    'CORPOTATION':'CORP',
    'DEISGN':'DESIGN',
    'DEMOLIITION':'DEMO',
    'DEMOLITION':'DEMO',
    'DEMOLTION':'DEMO',
    'DISMANTLIING':'DISMANTLING',
    'ENVIRONMENATL':'ENVIRONMENTAL',
    'ENVIRONMETAL':'ENVIRONMENTAL',
    'INCORPORATED':'INC',
    'JKS':'JK SCANLAN',
    'SAHWMUT':'SHAWMUT',
    'STRUCTURE TONE':'STRUCTURETONE',
    'STURUCTURE TONE':'STRUCTURETONE',
    'SONS':'SON',
}

# Double space -> single space (only used by the disabled loop below)
spacing = {
    '  ':' ',
}

# Suffixes for the disabled remove_ending loop below
endings = [
    'LLC',
    'COINC',
    'INC',
    'COMPANY',
    'CORP',
    'LLP',
    'DESIGN',
    'CO',
    'CONC',
    'MANAGERS',
    'GROUP',
    'SERVICES',
    'SERVICE',
    'AND',
]

# Apply the longest variants first so that 'BROTHERS' is handled before
# 'BROTHER' (which would otherwise leave a stray 'BROS'); sorting also makes
# the order independent of dict iteration order.
for old_string, new_string in sorted(abbrevs.items(),
                                     key=lambda pair: (-len(pair[0]), pair[0])):
    re_remove_abbrev(open_today, 'CON', old_string, new_string)

# Disabled steps, kept for reference. The spacing loop must unpack into
# new_string (it was written as new_strings, which would reuse a stale value).
#for ending in endings:
#    remove_ending(open_today, 'CON', ending)

#for old_string, new_string in spacing.items():
#    remove_abbrev(open_today, 'CON', old_string, new_string)
#341-->273 aq06 only
#463--382 aq06 + anf

In [21]:
#write data to csv: selected columns only, without a header row or the index
export_columns = ('ADDRESS','FORMTYPE','STICKER.','CON','STARTDATE','ENDDATE','EXPORTDATE')
open_today.to_csv(
    'where.csv',
    columns=export_columns,
    header=False,
    index=False,
)

In [22]:
# Spot check: FORMTYPE breakdown of the rows whose CON contains 'NASDI'
nasdi_rows = open_today['CON'].str.contains('NASDI')
con = open_today[nasdi_rows]
print(con['FORMTYPE'].value_counts())

anf001    22
aq06       1
Name: FORMTYPE, dtype: int64
