# Data Profiling and Cleaning

We profiled and cleaned the NYC opendata `DOB Job Application Filings` data using pandas and openclean

Run all the cells in order to profile and clean the data

Robert Ronan, Sheng Tong, Jerry Lee

In [1]:
import openclean
import glob
import pandas as pd
import numpy as np


# Data Downloading

Download the data using openClean

In [2]:
import gzip
import humanfriendly
import os

from openclean.data.source.socrata import Socrata

dataset = Socrata().dataset('ipu4-2q9a')
datafile = './ipu4-2q9a.tsv.gz'

if not os.path.isfile(datafile):
    with gzip.open(datafile, 'wb') as f:
        print('Downloading ...\n')
        dataset.write(f)


fsize = humanfriendly.format_size(os.stat(datafile).st_size)
print("Using '{}' in file {} of size {}".format(dataset.name, datafile, fsize))

Using 'DOB Permit Issuance' in file ./ipu4-2q9a.tsv.gz of size 508.16 MB


# Data Loading

Load the data into pandas and openClean dataset object

In [3]:
import pandas as pd
from openclean.pipeline import stream

df  = pd.read_csv(datafile, dtype='object', sep='\t')
ds = stream(datafile, encoding='utf8', delim='\t')

In [4]:
np.__version__

'1.21.3'

In [5]:
pd.__version__

'1.3.4'

In [6]:
import glob

In [7]:
glob.glob("*")

['DOB_Job_Application_Filings.csv',
 'DOB_Job_Cleaning.ipynb',
 'DOB_Job_Cleaning_2-Copy1.ipynb',
 'DOB_Job_Cleaning_2.ipynb',
 'ic3t-wcy2.tsv.gz',
 'ipu4-2q9a.tsv.gz',
 'README.md',
 'xubg-57si.tsv.gz']

### Get some basic info about the dataset columns

In [8]:
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3777844 entries, 0 to 3777843
Data columns (total 60 columns):
 #   Column                            Non-Null Count    Dtype 
---  ------                            --------------    ----- 
 0   BOROUGH                           3777844 non-null  object
 1   Bin #                             3777844 non-null  object
 2   House #                           3777840 non-null  object
 3   Street Name                       3777840 non-null  object
 4   Job #                             3777844 non-null  object
 5   Job doc. #                        3777844 non-null  object
 6   Job Type                          3777844 non-null  object
 7   Self_Cert                         2501176 non-null  object
 8   Block                             3777345 non-null  object
 9   Lot                               3777336 non-null  object
 10  Community Board                   3773003 non-null  object
 11  Zip Code                          3775600 non-null

If any rows are complete duplicates, drop them

In [9]:
df = df.drop_duplicates()

Take an a look at some of the rows to get an idea of what the datset looks like

In [10]:
df

Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job doc. #,Job Type,Self_Cert,Block,Lot,...,Owner’s House State,Owner’s House Zip Code,Owner's Phone #,DOBRunDate,PERMIT_SI_NO,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME
0,MANHATTAN,1077287,1230,6TH AVENUE,123725807,01,A2,Y,01264,00005,...,NY,10111,2127150300,12/12/2020 00:00:00,3554580,40.758977,-73.981089,4,96,Midtown-Midtown South
1,STATEN ISLAND,5113169,715,OCEAN TERRACE,500876037,01,A2,Y,00683,00001,...,NY,11101,7184728000,12/12/2020 00:00:00,3719150,40.608512,-74.102067,50,177,Todt Hill-Emerson Hill-Heartland Village-Light...
2,BROOKLYN,3253458,9952,3 AVE,321963014,01,DM,N,06133,00056,...,NY,11234,3478661439,06/18/2020 00:00:00,3765458,40.613341,-74.035582,43,5602,Bay Ridge
3,BROOKLYN,3117942,179,LOTT STREET,322006618,01,DM,N,05136,00058,...,NY,11205,7184146042,06/18/2020 00:00:00,3765459,40.645537,-73.954034,40,792,Erasmus
4,BROOKLYN,3210296,2917,AVENUE N,321996970,01,DM,N,07665,00004,...,NY,11210,3474928492,06/18/2020 00:00:00,3765460,40.617141,-73.945805,45,746,Flatlands
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3777839,QUEENS,4043577,108-19,35TH AVENUE,421155109,01,A1,N,01750,00022,...,,,9176896641,12/06/2021 00:00:00,3884223,40.755485,-73.860206,21,381,North Corona
3777840,QUEENS,4043635,36-09,108 STREET,402639052,01,A2,Y,01752,00050,...,,,3473867093,12/06/2021 00:00:00,3884224,40.754500,-73.860385,21,381,North Corona
3777841,STATEN ISLAND,5158059,210,NAUGHTON AVENUE,510015591,01,NB,,03546,00029,...,,,7183517447,12/06/2021 00:00:00,3884225,40.585325,-74.096908,50,11401,Old Town-Dongan Hills-South Beach
3777842,QUEENS,4100478,25-63,125 STREET,421641049,01,NB,Y,04266,00044,...,,,6462648899,12/06/2021 00:00:00,3884226,40.775061,-73.843574,19,907,College Point


In [11]:
# need 384 samples for 95% +/- 5% confidence

In [12]:
df_sample_data =  df.sample(384).copy()

In [13]:
df_sample_data

Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job doc. #,Job Type,Self_Cert,Block,Lot,...,Owner’s House State,Owner’s House Zip Code,Owner's Phone #,DOBRunDate,PERMIT_SI_NO,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME
2690858,MANHATTAN,1011225,544,HUDSON STREET,110418717,02,A2,N,00621,00004,...,NY,10014,2129893100,2017-11-03,1950362,40.734786,-74.006116,3,73,West Village
952463,MANHATTAN,1040031,300,EAST 59 STREET,101954529,01,A2,,01351,00001,...,NY,10022,2122421840,2017-11-03,853118,40.760546,-73.964206,5,108,Turtle Bay-East Midtown
3661636,BRONX,2012913,2469,CROTONA AVENUE,220482430,01,A2,N,03105,00032,...,NY,10604,6464351897,2019-12-05,3718155,40.855586,-73.882794,15,389,Belmont
1370957,BROOKLYN,3000436,27,SMITH ST,320765141,01,A2,N,00155,00003,...,NY,11201,7182430911,2017-11-03,2549261,40.690776,-73.987711,33,37,DUMBO-Vinegar Hill-Downtown Brooklyn-Boerum Hill
670952,QUEENS,4113559,136-98,ROOSEVELT AVENUE,421015500,01,A2,Y,05019,00063,...,NY,11354,9177038886,2017-11-03,2741013,40.760192,-73.827596,20,853,Flushing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1133799,MANHATTAN,1057086,270,RIVERSIDE DRIVE,104923335,01,A2,Y,01888,00043,...,NY,10025,2129329361,2017-11-03,1793499,40.797890,-73.973215,6,187,Upper West Side
1713306,MANHATTAN,1052712,325,EAST 104 STREET,103508571,01,A2,,01676,00011,...,NY,10029,2124269800,2017-11-03,1808018,40.788997,-73.942628,8,170,East Harlem South
671620,MANHATTAN,1063228,586,WEST 177 STREET,102315166,01,A3,Y,02133,00040,...,NY,11101,7184728000,2017-11-03,1385109,40.846542,-73.935205,10,261,Washington Heights South
3347239,BROOKLYN,3406429,5702 REAR,AVENUE N,321561396,01,DM,N,07902,00038,...,NY,11219,7186866262,2017-11-17,3367993,40.619371,-73.919979,46,688,Flatlands


## Describe columns in groups so they fit on screen

In [14]:
df[df.columns[:20]].describe()

Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job doc. #,Job Type,Self_Cert,Block,Lot,Community Board,Zip Code,Bldg Type,Residential,Special District 1,Special District 2,Work Type,Permit Status,Filing Status,Permit Type
count,3777844,3777844,3777840,3777840,3777844,3777844,3777844,2501176,3777345,3777336,3773003,3775600,3723362,1527401,435216,68077,3108956,3766812,3777844,3777843
unique,5,393869,35907,28175,1626233,12,6,5,13890,2069,146,232,2,1,113,8,13,4,2,8
top,MANHATTAN,1015862,1,BROADWAY,401910438,1,A2,Y,16,1,105,10022,2,YES,MID,IBZ,OT,ISSUED,INITIAL,EW
freq,1558049,3390,29663,103115,96,3430746,2266758,1647350,11268,444449,371371,94968,2814068,1527401,106896,27571,1102619,3699589,2686198,1696227


In [15]:
# Notes:
# Building Type looks binary and has 2 values + maybe NAN
# Cluster looks binary and has 2 values + maybe NAN
# Landmarked looks binary and has 4 values + maybe NAN
# Adult Establishment looks binary and has 2 values + maybe NAN
# Loft Board looks binary and has 2 values + maybe NAN
# City Owned looks binary and has 4 values + maybe NAN
# Little e looks binary and has 5 values + maybe NAN


In [16]:
df[df.columns[20:40]].describe()

Unnamed: 0,Permit Sequence #,Permit Subtype,Oil Gas,Site Fill,Filing Date,Issuance Date,Expiration Date,Job Start Date,Permittee's First Name,Permittee's Last Name,Permittee's Business Name,Permittee's Phone #,Permittee's License Type,Permittee's License #,Act as Superintendent,Permittee's Other Title,HIC License,Site Safety Mgr's First Name,Site Safety Mgr's Last Name,Site Safety Mgr Business Name
count,3777844,2284975,38816,3315766,3777843,3757239,3766039,3777813,3761554,3761538,3728175,3761323,3508177,3538313,1596054,271677,31199,34611,34635,22383
unique,32,15,2,5,13337,13275,16027,15407,38823,91916,366042,158410,14,57417,3,3308,5542,724,1725,1710
top,1,OT,OIL,NONE,2017-03-29,2017-03-29,2007-12-31,2008-06-27,JOHN,SINGH,EVEREST SCAFFOLDING INC,2124816100,GC,0,Y,GC,0,MICHAEL,WAIVER,TOTAL SAFETY CONSULTING
freq,2686208,1038185,36049,1582047,1077,1086,18713,1502,152939,57972,13957,22246,2287705,73818,1587797,105976,788,1808,1446,1765


In [17]:
# PC Filed -- Other all look binary, and have 1-2 values + maybe NAN
#
# Take a look at Other Description for weird strings

#  Lots of the same First and Last name

# Check names and titles

# APPLICATNT LICENCSE # NEEDS TO BE A STRING TO PERSERVE THE 0 ON IT (PROBABLY)

# Professional Cert looks binary and has 5 values + maybe NAN



In [18]:
df[df.columns[40:60]].describe()

Unnamed: 0,Superintendent First & Last Name,Superintendent Business Name,Owner's Business Type,Non-Profit,Owner's Business Name,Owner's First Name,Owner's Last Name,Owner's House #,Owner's House Street Name,Owner’s House City,Owner’s House State,Owner’s House Zip Code,Owner's Phone #,DOBRunDate,PERMIT_SI_NO,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME
count,1727932,1690546,3612932,3617061,3001634,3775957,3776273,3732870,3732625,3733291,3733335,3727825,3728749,3777843,3777844,3763796.0,3763796.0,3763796,3763796,3763796
unique,158019,310425,15,4,487027,98968,176595,41437,138073,12844,57,8304,408107,1370,3777844,231342.0,243453.0,51,1327,194
top,JOHN WHITE,ROCKLEDGE SCAFFOLD,CORPORATION,N,NY SCHOOL CONSTRUCTION AUTHORITY,MICHAEL,SINGH,100,BROADWAY,NEW YORK,NY,10022,7184728000,2017-11-03,3554580,40.748342,-73.984643,4,7,Midtown-Midtown South
freq,13476,6847,1228255,3387948,27065,89612,34548,87903,95650,1050271,3632137,129643,32649,3219110,1,3291.0,3324.0,448847,31271,270202


In [19]:
# Need to convert date columns to pd.datetime
# RENAME PAID TO PAID DATE
# RENAME FULLY PAID TO FULLY PAID DATE
# RENAME ASSISGNED TO ASSIGNED DATE
# RENAME APPROVED TO APPROVED DATE

# CHECK COHERENCE OF PAID DATE <= FULLY PAID DATE
# CHECK COHERENCE OF PRE FILING DATE <= PAID DATE
# CHECK COHERENCE OF ASSIGNED DATE <= APPROVED DATE
# 

# REMOVE $ FROM Initial Cost and Total Estimated Fee, and put them in column name, convert values to floats

# Check What fee status is

# Check Existing Zoning Sqft, Propsed Zoning Sqft, Enlargement SQ Footage for reasonable values
# Change either Sqft to SQ Footage or vvice-versa

# Horizontal Enlargement and Vertical Enlargement are booleans + NAN
# Change Enlrgmt to Enlargement

#Chcek ExisitngNo. of Stories and Proposed # of Stories for reasonableness
# Add space between Existing and No.
# Change either Job# to Job No. or vice versa
# maybe just change all the No./# to "number"

# Check Existing and proposed height for reasonableness. Add unit to column name

# check Existing Dwelling Units for reasonableness 



In [20]:
#df[df.columns[60:80]].describe()

In [21]:
# check Proposed Dwelling Units
## Why does Existing Occupancy have fewer cats than Proposed Occupancy. Check those.

# What is Site Fill. 

# Get list of NYC ZOning Districts and Special Districts

# Checmk Owner Typer for spelling issues

# Non Profit is binary

# Check Owners's First and last name

# Owner'sBuisness Name should not be "OWNER"

# Owner's house number, streeet name, city, state and zip have almost no values

# Why is the same phone number so common

In [22]:
#df[df.columns[80:100]].describe()

In [23]:

# That is a lot of unique job descriptions

# Add spaces to DOB Run Date name
# make DOB Run Date a datetime

# What is Job_S1_NO. It uses underscores.

# All the remaining columns have ALL CAPS NAMES WITH UNDERSCORES 
# TOTAL_CONSTRUCTION_FLOOR_AREA, WITHDRAWAL_FLAG

# SIGNOFF_DATE needs to be datetime
# SPECIAL_ACTION_STATUS
# SPECIAL_ACTION_DATE needs to be datetime
# BUILDING_CLASS
# What is JOB_NO_GOOD_COUNT
#
# maybe need GIS DATA
# GIS_LATITUDE
# GIS_LONGITUDE
# GIS_COUNCIL_DISTRICT
# GIS_CENSUS_TRACT
# GIS_NTA_NAME
# GIS_BIN
# 

In [24]:
df.columns

Index(['BOROUGH', 'Bin #', 'House #', 'Street Name', 'Job #', 'Job doc. #',
       'Job Type', 'Self_Cert', 'Block', 'Lot', 'Community Board', 'Zip Code',
       'Bldg Type', 'Residential', 'Special District 1', 'Special District 2',
       'Work Type', 'Permit Status', 'Filing Status', 'Permit Type',
       'Permit Sequence #', 'Permit Subtype', 'Oil Gas', 'Site Fill',
       'Filing Date', 'Issuance Date', 'Expiration Date', 'Job Start Date',
       'Permittee's First Name', 'Permittee's Last Name',
       'Permittee's Business Name', 'Permittee's Phone #',
       'Permittee's License Type', 'Permittee's License #',
       'Act as Superintendent', 'Permittee's Other Title', 'HIC License',
       'Site Safety Mgr's First Name', 'Site Safety Mgr's Last Name',
       'Site Safety Mgr Business Name', 'Superintendent First & Last Name',
       'Superintendent Business Name', 'Owner's Business Type', 'Non-Profit',
       'Owner's Business Name', 'Owner's First Name', 'Owner's Last Name',
 

## Renaming columns

In [25]:
df = df.rename(columns={
    
    "Job doc. #": "Job Document #",
    "Self_Cert": "Self Cert",
    "Bldg Type": "Building Type",
    "Site Safety Mgr's First Name" : "Site Safety Manager's First Name",
    "Site Safety Mgr's Last Name" : "Site Safety Manager's Last Name",
    "Site Safety Mgr Business Name" : "Site Safety Manager's Buisness Name",

    
                          "Owner'sPhone #": "Owner's Phone #" 
                        , "Owner'sHouse Street Name": "Owner's House Street Name"
                        , "Owner’s House City": "Owner's House City"
                        , "Owner’s House State" : "Owner's House State"
                        , "Owner’s House Zip Code" : "Owner's House Zip Code"
                        , "Paid": "Paid Date"
                        , "Fully Paid": "Fully Paid Date"
                        , "Assigned": "Assigned Date"
                        , "Approved": "Approved Date"
                        , "Job Status Descrp": "Job Status Description"
                        , "Community - Board": "Community Board"
                        , "Adult Estab": "Adult Establishment"
                        , "Pre- Filing Date": "Pre-Filing Date"
                        , "Total Est. Fee": "Total Estimated Fee"
                        , "Horizontal Enlrgmt": "Horizontal Enlargement"
                        , "Vertical Enlrgmt": "Vertical Enlargement"
                        , "ExistingNo. of Stories": "Existing # of Stories"
                        , "Proposed No. of Stories": "Proposed # of Stories"
                        , "Zoning Dist1": "Zoning District 1"
                        , "Zoning Dist2": "Zoning District 2"
                        , "Zoning Dist3": "Zoning District 3"
                        , "City ": "Owner's House City"
                        , "State": "Owner's House State"
                        , "Zip": "Owner's House Zip"
                        , "DOBRunDate": "DOB Run Date"
                       })

#Lower cased but not sure if this is necessary

# df = df.rename(columns={
#                          "PERMIT_SI_NO": "Permit Si #"
#                         , "TOTAL_CONSTRUCTION_FLOOR_AREA": "Total Construction Floor Area"
#                         , "WITHDRAWAL_FLAG": "Withdrawl Flag"
#                         , "SIGNOFF_DATE": "Signoff Date"
#                         , "SPECIAL_ACTION_STATUS": "Special Action Status"
#                         , "SPECIAL_ACTION_DATE": "Special Action Date"
#                         , "BUILDING_CLASS": "Building Class"
#                         , "JOB_NO_GOOD_COUNT": "Job No Good Count"
#                         , "LATITUDE": "GIS Latitude"
#                         , "LONGITUDE": "GIS Longitude"
#                         , "COUNCIL_DISTRICT": "GIS Council District"
#                         , "CENSUS_TRACT": "GIS Census Tract"
#                         , "NTA_NAME": "GIS NTA Name"
#                         , "GIS_BIN": "GIS Bin"
#                         })


#### Method to get an idea of the top 10 values of a column

In [26]:
def show_vals(column_name, show_rows=10, df=df):
    print("Top {} {}:\n".format(show_rows, column_name))
    print(df[column_name].value_counts(dropna=False)[:show_rows])
    print()

### Examining Job #s

Some repition in the Job #'s, but nothing major. We will check some of the repeated Job #s to be sure they actually refer to the same jobs

In [27]:
df['Job #'].value_counts(dropna=False)

401910438    96
402887550    72
200718330    72
402885776    69
201190059    66
             ..
300888243     1
320176750     1
120419252     1
401000812     1
340829330     1
Name: Job #, Length: 1626233, dtype: int64

Nothing weird looking here

In [28]:
df['Job #'].min()

'100030011'

In [29]:
df['Job #'].max()

'566001839'

No Job #s starting with 0, so there are integer values, unlike Doc # which has values 01, 02, etc.

In [30]:
df.loc[df['Job #'].str.startswith('0')]

Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job Document #,Job Type,Self Cert,Block,Lot,...,Owner's House State,Owner's House Zip Code,Owner's Phone #,DOB Run Date,PERMIT_SI_NO,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME


Fraction of Rows with unique job numbers

In [31]:
df['Job #'].nunique()/df['Job #'].count()

0.4304658953625401

Group by Job # and check if latitude and longitude are the same all the same for the job, which would indicate different instances of the Job # all refer to the same Job.

In [32]:
group = df[['Job #', 'LATITUDE', 'LONGITUDE']].groupby('Job #')

This will take a little while to run

In [33]:
tranformed = group.aggregate(lambda x: x.unique().shape[0])

Jobs with multiple latitude and longitudes:

In [34]:
# 4344 Potential bad jobs this time
tranformed.loc[(tranformed['LATITUDE']!=1)
              |(tranformed['LONGITUDE']!=1)]

Unnamed: 0_level_0,LATITUDE,LONGITUDE
Job #,Unnamed: 1_level_1,Unnamed: 2_level_1
100107493,2,2
100134472,2,2
100134481,2,2
100134515,2,2
100134524,2,2
...,...,...
540163767,2,2
540166014,2,1
540166032,2,1
540169280,2,2


In [35]:
potential_bad_jobs = list(tranformed.loc[(tranformed['LATITUDE']!=1)
              |(tranformed['LONGITUDE']!=1)].index.unique())

Separate these into a temporary dataframe to play around with:

In [36]:
df_temp = df.loc[df['Job #'].isin(potential_bad_jobs)].copy()

In [37]:
df_temp = df_temp.sort_values(['Job #', 'LATITUDE', 'LONGITUDE'])

Most of these are just missing lat and long values.

The others look to be Jobs that manage multiple houses/lots in a small area, so are probably correct

In [38]:
df_temp[df_temp.duplicated(subset=['Job #', 'Block', 'Lot', 'Bin #', 'Job Type'], keep=False)].sort_values('Job #')

Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job Document #,Job Type,Self Cert,Block,Lot,...,Owner's House State,Owner's House Zip Code,Owner's Phone #,DOB Run Date,PERMIT_SI_NO,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME
388233,MANHATTAN,1085348,298A,WEST 137 STREET,100134472,01,NB,,01942,07501,...,NY,10038,2129786310,2017-11-03,397245,40.817208,-73.944222,9,228,Central Harlem North-Polo Grounds
1803971,MANHATTAN,1085348,298A,WEST 137 STREET,100134472,01,NB,,01942,07501,...,NY,10038,2129786310,2017-11-03,346953,40.817208,-73.944222,9,228,Central Harlem North-Polo Grounds
3084547,MANHATTAN,1085348,298A,WEST 137 STREET,100134481,01,NB,,01942,07501,...,NY,10038,2129786310,2017-11-03,397250,40.817208,-73.944222,9,228,Central Harlem North-Polo Grounds
3230146,MANHATTAN,1085348,298A,WEST 137 STREET,100134481,01,NB,,01942,07501,...,NY,10038,2129786310,2017-11-03,346960,40.817208,-73.944222,9,228,Central Harlem North-Polo Grounds
1491879,MANHATTAN,1015300,42,WEST 15 STREET,100172705,01,A2,,00816,00065,...,NY,10011,2127418140,2017-11-03,104499,40.737281,-73.994544,3,54,Hudson Yards-Chelsea-Flatiron-Union Square
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3660452,STATEN ISLAND,5127008,2865,RICHMOND AVENUE,540166032,01,A2,N,02460,00098,...,NY,11042,5168692935,2019-08-20,3679635,40.576224,-74.169772,51,27702,Todt Hill-Emerson Hill-Heartland Village-Light...
3710042,STATEN ISLAND,5049717,475,SEAVIEW AVENUE,540169280,01,A2,N,03355,00032,...,,,7182269079,08/26/2021 00:00:00,3822064,40.583659,-74.086375,50,70,Old Town-Dongan Hills-South Beach
3688828,STATEN ISLAND,5049717,475,SEAVIEW AVENUE,540169280,01,A2,N,03355,00032,...,NY,10305,7182269079,2020-04-11,3754760,40.583953,-74.086930,50,70,Old Town-Dongan Hills-South Beach
62270,STATEN ISLAND,5029201,1686,FOREST AVE,540173113,01,A2,Y,01476,00051,...,NY,07733,7186109104,07/29/2020 00:00:00,3753839,40.624953,-74.145180,50,251,Port Richmond


In [39]:
df_temp['Job #']

1072174    100107493
1412144    100107493
388233     100134472
1803971    100134472
317578     100134472
             ...    
3660452    540166032
3710042    540169280
3688828    540169280
62270      540173113
3690753    540173113
Name: Job #, Length: 28077, dtype: object

In [40]:
df_temp.loc[df_temp['Job #']=='100134472']

Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job Document #,Job Type,Self Cert,Block,Lot,...,Owner's House State,Owner's House Zip Code,Owner's Phone #,DOB Run Date,PERMIT_SI_NO,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME
388233,MANHATTAN,1085348,298A,WEST 137 STREET,100134472,1,NB,,1942,7501,...,NY,10038,2129786310,2017-11-03,397245,40.817208,-73.944222,9,228,Central Harlem North-Polo Grounds
1803971,MANHATTAN,1085348,298A,WEST 137 STREET,100134472,1,NB,,1942,7501,...,NY,10038,2129786310,2017-11-03,346953,40.817208,-73.944222,9,228,Central Harlem North-Polo Grounds
317578,MANHATTAN,1085348,298,WEST 137 STREET,100134472,3,NB,,1942,1,...,NY,10038,2129786310,2017-11-03,375619,40.817618,-73.94519,9,228,Central Harlem North-Polo Grounds


#### Later, after we have cleaned more values, we will fill these missing values by job #

Remove Jobs we know to be just missing data from the list of bad jobs

In [41]:
not_bad_jobs = df_temp[df_temp.duplicated(subset=['Job #', 'Block', 'Lot', 'Bin #', 'Job Type'], keep=False)]['Job #'].unique()

In [42]:
df_temp = df_temp.loc[~df_temp['Job #'].isin(not_bad_jobs)]

All of these are jobs that handle multiple lots or House #s, which explains why the lat/long change 

Job # 122171130 and 122222638 may be exact copies of each other though

In [43]:
df_temp

Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job Document #,Job Type,Self Cert,Block,Lot,...,Owner's House State,Owner's House Zip Code,Owner's Phone #,DOB Run Date,PERMIT_SI_NO,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME
1072174,MANHATTAN,1046972,41,EAST 89 STREET,100107493,1,A1,,1501,7501,...,NY,10017,2122106666,2017-11-03,84615,40.782479,-73.956647,4,15002,Upper East Side-Carnegie Hill
1412144,MANHATTAN,1046972,45,EAST 89 STREET,100107493,2,A1,,1501,20,...,NY,10017,2122106666,2017-11-03,108821,40.782869,-73.957095,4,15002,Upper East Side-Carnegie Hill
1332500,MANHATTAN,1085627,298A,WEST 138 STREET,100134515,1,NB,,2023,7503,...,NY,10038,2129786310,2017-11-03,293431,40.818232,-73.944712,9,228,Central Harlem North-Polo Grounds
1776539,MANHATTAN,1085627,298,WEST 138 STREET,100134515,2,NB,,2023,1,...,NY,10038,2129786310,2017-11-03,303377,40.818362,-73.945016,9,228,Central Harlem North-Polo Grounds
844400,MANHATTAN,1085627,298A,WEST 138 STREET,100134524,1,NB,,2023,7503,...,NY,10038,2129786310,2017-11-03,293482,40.818232,-73.944712,9,228,Central Harlem North-Polo Grounds
873160,MANHATTAN,1085627,298,WEST 138 STREET,100134524,2,NB,,2023,1,...,NY,10038,2129786310,2017-11-03,303385,40.818362,-73.945016,9,228,Central Harlem North-Polo Grounds
2065199,MANHATTAN,1057135,301,WEST 100 STREET,100786295,1,A1,,1889,7501,...,NY,10025,2128655858,2017-11-03,441148,40.797937,-73.971604,6,187,Upper West Side
275572,MANHATTAN,1057135,823,WEST END AVENUE,100786295,2,A1,,1889,17,...,NY,10025,2128655858,2017-11-03,446711,40.797964,-73.971243,6,187,Upper West Side
1931029,MANHATTAN,1083664,120,CLAREMONT AVENUE,121347266,2,A2,N,1993,1,...,NY,10027,2127492802,2017-11-03,2537444,40.812588,-73.961848,7,211,Morningside Heights
1654428,MANHATTAN,1083664,132,CLAREMONT AVENUE,121347266,1,A2,N,1993,2,...,NY,10027,2127492802,2017-11-03,2537443,40.812879,-73.961635,7,211,Morningside Heights


In [44]:
df_temp = df_temp.loc[(df_temp['Job #'].isin(['122171130', '122222638']))]

Latest action date is different, so its probably differnt but very similar jobs 

In [45]:
df_temp[df_temp.columns[0:20]]


Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job Document #,Job Type,Self Cert,Block,Lot,Community Board,Zip Code,Building Type,Residential,Special District 1,Special District 2,Work Type,Permit Status,Filing Status,Permit Type


Check if any Job #s have non-digit values

In [46]:
df['Job #'] = df['Job #'].astype('str')

In [47]:
df.loc[(~df['Job #'].isna())
       &(~df['Job #'].str.isdigit())]['Job #']

Series([], Name: Job #, dtype: object)

All Job #s entirely composed of digits, so we cast them to ints

In [48]:
df['Job #'] = df['Job #'].astype('int')

In [49]:
df['Job #'].describe()

count    3.777844e+06
mean     2.539403e+08
std      1.350176e+08
min      1.000300e+08
25%      1.207598e+08
50%      3.001189e+08
75%      4.005585e+08
max      5.660018e+08
Name: Job #, dtype: float64

## Examining and reparing house #s

House #'s appear to be mostly ints

However, there are legitimate house numbers with dashes so we'll have to make them strings

In [50]:
show_vals('House #', show_rows=10)

Top 10 House #:

1      29663
200    22676
100    16288
40     14448
30     14419
11     13698
10     13642
55     13512
15     13244
150    13003
Name: House #, dtype: int64



Replace NaN values with empty strings, then convert column to string, and make everything uppercase


In [51]:
df['House #'].fillna('', inplace=True)
df['House #'] = df['House #'].astype('str')
df['House #'] = df['House #'].str.upper()

Check for numbers spelled out as words

In [52]:
df.loc[(~df['House #'].isna())
       &(df['House #'].str.isalpha())]['House #']

8222        ONE
8229        ONE
66136       ONE
86695       ONE
1133924    PIER
1300189    PIER
1309270       B
2132095    PIER
2479868    PIER
2946035    PIER
3078145       B
3143977       B
3589793     ONE
3714254     ONE
Name: House #, dtype: object

Maybe the house # and borough were flipped in the 'manhattan' case?

Check if thses are empty strings:

In [53]:
df.loc[(~df['House #'].str.contains('\\d', regex=True))]['House #']

8222             ONE
8229             ONE
66136            ONE
86695            ONE
403344     NO NUMBER
529214              
607922              
1133924         PIER
1234234             
1300189         PIER
1309270            B
2074101    NO NUMBER
2132095         PIER
2479868         PIER
2721408             
2857535    NO NUMBER
2913620    NO NUMBER
2946035         PIER
3078145            B
3143977            B
3589793          ONE
3714254          ONE
Name: House #, dtype: object

Replace spelling of numbers with their value, and remove values 'PIER',  'MANHATTAN',  'NO NUMBER'

In [54]:
df.loc[df['House #'].str.strip('')=='ONE', 'House #'] = '1'
df.loc[df['House #'].str.strip('')=='PIER', 'House #'] = ''
df.loc[df['House #'].str.strip('')=='MANHATTAN', 'House #'] = ''
df.loc[df['House #'].str.strip('')=='NO NUMBER', 'House #'] = ''

# need to add B
df.loc[df['House #'].str.strip('')=='B', 'House #'] = ''


Most of these will probably be legitimate house numbers, since house numbers can have dashes

In [55]:
df.loc[(~df['House #'].isna())
       &(~df['House #'].str.isdigit())]['House #']

14             58A
79             11A
107         106-13
128        4314GAR
131         128-19
            ...   
3777838     106-10
3777839     108-19
3777840      36-09
3777842      25-63
3777843      25-63
Name: House #, Length: 754365, dtype: object

Check non-numeric house #'s that don't have dashes

In [56]:
df.loc[(~df['House #'].isna())
       &(~df['House #'].str.isdigit())
      &(~df['House #'].str.contains('-', regex=False))]['House #'][:25]

14             58A
79             11A
128        4314GAR
271        242 GAR
351      1369 REAR
592          144 A
594          144 A
598           123A
664           438A
835     253 GARAGE
848        476 1/2
995         53 GAR
996         55 GAR
1000        51 GAR
1084           32A
1243       413 GAR
1247       775REAR
1421       249 GAR
1479       9952GAR
1480       9960GAR
1762       415 GAR
2196     2911 REAR
2197      1296 GAR
2313      1568 AIR
2604      1110REAR
Name: House #, dtype: object

We see a mix of reference to the house's garage, the rear house and single letters that likely indicate apartments in multi-occupancy venues. 

We will standardize the formatting, and maintain the reference to garage, rear, and appartment, since there is no apartment column for the job.

First split the numbers and words with a space

In [57]:
df['House #'] = df['House #'].str.replace(pat='(?P<one>\\d)(?P<two>[A-Z]+)', repl='\g<one> \g<two>', regex=True)

Now we will fix the formatting for garage and 
remove references to north, south, east, west, since they should be in street #

In [58]:
df['House #'] = df['House #'].str.replace(pat='(?P<one>GAR$)', repl='GARAGE', regex=True)

In [59]:
df['House #'] = df['House #'].str.replace(pat='NORTH([A-Z]+)?', repl='', regex=True)
df['House #'] = df['House #'].str.replace(pat='EAST([A-Z]+)?', repl='', regex=True)
df['House #'] = df['House #'].str.replace(pat='SOUTH([A-Z]+)?', repl='', regex=True)
df['House #'] = df['House #'].str.replace(pat='WEST([A-Z]+)?', repl='', regex=True)

In [60]:
# Confirm that it worked correctly:
df.loc[(~df['House #'].isna())
       &(~df['House #'].str.isdigit())
       &(~df['House #'].str.contains('-', regex=False))]['House #'][:30]

14             58 A
79             11 A
128     4314 GARAGE
271      242 GARAGE
351       1369 REAR
592           144 A
594           144 A
598           123 A
664           438 A
835      253 GARAGE
848         476 1/2
995       53 GARAGE
996       55 GARAGE
1000      51 GARAGE
1084           32 A
1243     413 GARAGE
1247       775 REAR
1421     249 GARAGE
1479    9952 GARAGE
1480    9960 GARAGE
1762     415 GARAGE
2196      2911 REAR
2197    1296 GARAGE
2313       1568 AIR
2604      1110 REAR
2607    1580 GARAGE
2612    1582 GARAGE
2613    9964 GARAGE
2829          408 A
2917          206 A
Name: House #, dtype: object

## Looking at Binary/Pseudo-binary columns:

For these columns it's clear NaN idicates 'no', however some columns, like site-fill don't quite work

In [61]:
show_vals('Non-Profit')
show_vals('Self Cert')
show_vals('Filing Status')
show_vals('Site Fill')
show_vals('Act as Superintendent')
show_vals('Building Type')
show_vals('Residential')
show_vals('Oil Gas')

Top 10 Non-Profit:

N      3387948
Y       229111
NaN     160783
ï¿½          1
8            1
Name: Non-Profit, dtype: int64

Top 10 Self Cert:

Y      1647350
NaN    1276668
N       853808
R           11
X            4
J            3
Name: Self Cert, dtype: int64

Top 10 Filing Status:

INITIAL    2686198
RENEWAL    1091646
Name: Filing Status, dtype: int64

Top 10 Site Fill:

NONE                   1582047
NOT APPLICABLE         1193902
NaN                     462078
ON-SITE                 348827
OFF-SITE                105300
USE UNDER 300 CU.YD      85690
Name: Site Fill, dtype: int64

Top 10 Act as Superintendent:

NaN    2181790
Y      1587797
N         8256
A            1
Name: Act as Superintendent, dtype: int64

Top 10 Building Type:

2      2814068
1       909294
NaN      54482
Name: Building Type, dtype: int64

Top 10 Residential:

NaN    2250443
YES    1527401
Name: Residential, dtype: int64

Top 10 Oil Gas:

NaN    3739028
OIL      36049
GAS       2767
Name: Oil Gas, dty

Replace the Nan values with False and replace the other values with True, and then cast the columns to be type bool

In [62]:
df['Non-Profit'].fillna(False, inplace=True)
df['Self Cert'].fillna(False, inplace=True)
df['Residential'].fillna(False, inplace=True)
df['Act as Superintendent'].fillna(False, inplace=True)

In [63]:
df.loc[df['Non-Profit']=='Y', 'Non-Profit'] = True
df.loc[df['Self Cert']=='Y', 'Self Cert'] = True
df.loc[df['Residential']=='YES', 'Residential'] = True

df.loc[df['Act as Superintendent']=='Y', 'Act as Superintendent'] = True


In [64]:
df.loc[df['Non-Profit']!=True, 'Non-Profit'] = False
df.loc[df['Self Cert']!=True, 'Self Cert'] = False
df.loc[df['Residential']!=True, 'Residential'] = False

df.loc[df['Act as Superintendent']!=True, 'Act as Superintendent'] = False


In [65]:
df['Non-Profit'] = df['Non-Profit'].astype('bool')
df['Self Cert'] = df['Self Cert'].astype('bool')
df['Residential'] = df['Residential'].astype('bool')
df['Act as Superintendent'] = df['Act as Superintendent'].astype('bool')


In [66]:

# Notes:
# Building Type looks binary and has 2 values + maybe NAN
# Cluster looks binary and has 2 values + maybe NAN
# Landmarked looks binary and has 4 values + maybe NAN
# Adult Establishment looks binary and has 2 values + maybe NAN
# Loft Board looks binary and has 2 values + maybe NAN
# City Owned looks binary and has 4 values + maybe NAN
# Little e looks binary and has 5 values + maybe NAN
# PC Filed -- Other all look binary, and have 1-2 values + maybe NAN
#
# Take a look at Other Description for weird strings

# Professional Cert looks binary and has 5 values + maybe NAN
# Non Profit is binary



These columns are less clear cut since they have yes, no, and NaN values

With the exception of "Landmarked" and "Little e", which both have additional values at high frequency, we can probably assume with relatively high confidence that NaN values would indicate false values for these columns, and we will map the columns to boolean values. 

For Landmarked and Little e, we will map NaN to 'N', and leave all the values as strings

In [67]:
df.columns

Index(['BOROUGH', 'Bin #', 'House #', 'Street Name', 'Job #', 'Job Document #',
       'Job Type', 'Self Cert', 'Block', 'Lot', 'Community Board', 'Zip Code',
       'Building Type', 'Residential', 'Special District 1',
       'Special District 2', 'Work Type', 'Permit Status', 'Filing Status',
       'Permit Type', 'Permit Sequence #', 'Permit Subtype', 'Oil Gas',
       'Site Fill', 'Filing Date', 'Issuance Date', 'Expiration Date',
       'Job Start Date', 'Permittee's First Name', 'Permittee's Last Name',
       'Permittee's Business Name', 'Permittee's Phone #',
       'Permittee's License Type', 'Permittee's License #',
       'Act as Superintendent', 'Permittee's Other Title', 'HIC License',
       'Site Safety Manager's First Name', 'Site Safety Manager's Last Name',
       'Site Safety Manager's Buisness Name',
       'Superintendent First & Last Name', 'Superintendent Business Name',
       'Owner's Business Type', 'Non-Profit', 'Owner's Business Name',
       'Owner's First 

In [68]:
show_vals('Building Type')
show_vals('Site Fill')
show_vals('Oil Gas')

Top 10 Building Type:

2      2814068
1       909294
NaN      54482
Name: Building Type, dtype: int64

Top 10 Site Fill:

NONE                   1582047
NOT APPLICABLE         1193902
NaN                     462078
ON-SITE                 348827
OFF-SITE                105300
USE UNDER 300 CU.YD      85690
Name: Site Fill, dtype: int64

Top 10 Oil Gas:

NaN    3739028
OIL      36049
GAS       2767
Name: Oil Gas, dtype: int64



Fill null values with False/'N'

In [69]:
df['Oil Gas'].fillna('NONE', inplace=True)
df['Site Fill'].fillna('NONE', inplace=True)
df.loc[df['Site Fill']=='NOT APPLICABLE', 'Site Fill'] = 'NONE' 


For the clearly binary cases, fill 'Y' values with True, and the other values with False

## ~Checking Monetary Values for consistency~ No Monetary values here

## Checking owner's information

In [70]:
# Get list of Owner's columns:
owner_cols = df.columns[np.where(np.char.find(np.array(list(df.columns)), 'Owner') > -1)[0]]

In [71]:
np.where(np.char.find(np.array(list(df.columns)), 'Owner') > -1)[0]

array([42, 44, 45, 46, 47, 48, 49, 50, 51, 52], dtype=int64)

In [72]:
owner_cols

Index(['Owner's Business Type', 'Owner's Business Name', 'Owner's First Name',
       'Owner's Last Name', 'Owner's House #', 'Owner's House Street Name',
       'Owner's House City', 'Owner's House State', 'Owner's House Zip Code',
       'Owner's Phone #'],
      dtype='object')

In [73]:
for c in owner_cols:
    show_vals(c)

Top 10 Owner's Business Type:

CORPORATION           1228255
INDIVIDUAL            1185534
PARTNERSHIP            819695
NaN                    164912
OTHER                  156268
CONDO/CO-OP            103162
OTHER GOV'T AGENCY      67876
NYCHA/HHC               16306
NYCHA                   12371
NYC AGENCY              11886
Name: Owner's Business Type, dtype: int64

Top 10 Owner's Business Name:

NaN                                 776210
NY SCHOOL CONSTRUCTION AUTHORITY     27065
OWNER                                23371
NONE                                 22619
NYC SCA                              20026
HPD                                  18702
NYC HOUSING AUTHORITY                14394
NYC HPD                              13648
NYCHA                                12693
-                                     9881
Name: Owner's Business Name, dtype: int64

Top 10 Owner's First Name:

MICHAEL    89612
JOHN       80208
ROBERT     77684
DAVID      72961
JOSEPH     72829
ANTHONY  

## Fixing owner's informations

In [74]:
df.loc[~df["Owner's Business Name"].isna() & df["Owner's Business Name"].str.contains("(?i)new york city")]["Owner's Business Name"].value_counts()

NEW YORK CITY HOUSING AUTHORITY     9744
New York City Housing Authority      559
NEW YORK CITY HPD                    313
NEW YORK CITY SCHOOL CONSTRUCTIO     207
FIRE DEPARTMENT OF NEW YORK CITY     206
                                    ... 
NEW YORK CITY HPD.                     1
NEW YORK CITY H.P.D                    1
NEW YORK CITY HOUSING AUTHROTY.        1
NEW YORK CITY ECONOMIC DEVELPMEN       1
THE NEW YORK CITY HOT DOG CO           1
Name: Owner's Business Name, Length: 263, dtype: int64

Normalizes a couple of duplicate names

In [75]:
df["Owner's Business Name"] = df["Owner's Business Name"].str.replace("NEW YORK CITY", "NYC")
df["Owner's Business Name"] = df["Owner's Business Name"].str.upper()
df["Owner's Business Name"] = df["Owner's Business Name"].str.replace(".", '', regex=False)
df["Owner's Business Name"] = df["Owner's Business Name"].str.replace(",", '', regex=False)

All these are the same thing. Uses clusters to fix

In [76]:
#may have to use fuzzy/cluster to fix this problem
df.loc[~df["Owner's Business Name"].isna() & df["Owner's Business Name"].str.contains("(?i)HOUSING AUTHORITY")]["Owner's Business Name"].value_counts()

NYC HOUSING AUTHORITY              26566
NEW YORK CITY HOUSING AUTHORITY      562
NY CITY HOUSING AUTHORITY            292
NEW YORK HOUSING AUTHORITY           211
NYCHOUSING AUTHORITY                 189
                                   ...  
NHYC HOUSING AUTHORITY                 1
N  Y C HOUSING AUTHORITY               1
NEW YORK    HOUSING AUTHORITY          1
NYC HOUSING AUTHORITY DESIGN DV        1
NEW TORK CITY HOUSING AUTHORITY        1
Name: Owner's Business Name, Length: 74, dtype: int64

Used clusters to try to fix the rest of them further below

In [77]:
df["Owner's House State"].value_counts()

NY        3632137
NJ          54605
CT           6243
FL           5214
CA           5154
PA           4407
IL           3591
MA           3099
NC           2093
VA           1951
OH           1941
MD           1573
TX           1551
GA           1243
RI            859
NV            691
DC            669
CO            608
AZ            603
MI            548
MN            504
TN            479
WA            476
NH            364
UT            351
SC            282
IN            200
MO            195
NM            193
DE            193
KS            187
WI            175
KY            155
VT            101
ME             83
IA             82
NE             69
AR             65
HI             54
AL             50
LA             47
OR             42
OK             38
CN             27
WY             24
AK             22
ND             16
WV             14
SD             14
PR             13
ON             12
MS             12
ID              7
MT              5
sw              2
ï¿½ï¿½    

Since the states can be outside NYC, these are probably fine

In [78]:
df["Owner's House Zip Code"].value_counts()

10022        129643
11101        128463
10017        113966
10038        107855
10019        106392
              ...  
28306             1
91024             1
33326             1
42429             1
113772034         1
Name: Owner's House Zip Code, Length: 8304, dtype: int64

## Looking at Phone Numbers:

In [79]:
show_vals("Owner's Phone #")

Top 10 Owner's Phone #:

NaN           49095
7184728000    32649
2128637625    14848
2128947000    12218
2128637490     9123
7184728534     8846
7189452300     7755
2129786310     7409
2120000000     7029
2125295688     7003
Name: Owner's Phone #, dtype: int64



A lot of the same phone numbers

In [80]:
df["Owner's Phone #"] = df["Owner's Phone #"].astype('str')

In [81]:
df.loc[df["Owner's Phone #"].str.contains("7184728000")][["Owner's First Name", "Owner's Last Name","Owner's Business Name", "Owner's Phone #"]]

Unnamed: 0,Owner's First Name,Owner's Last Name,Owner's Business Name,Owner's Phone #
1,ELAN,ABNERI,SCHOOL CONSTRUCTION AUTHORITY,7184728000
58,ELAN,ABNERI,NYCSCA,7184728000
611,JOSEPH,SCALISI,NYC SCA,7184728000
621,ELAN,ABNERI,NYC SCA,7184728000
767,ELAN,ABNERI,NYC SCA,7184728000
...,...,...,...,...
3777530,JOSEPH,SCALISI,NYC SCHOOL CONST AUTHORITY,7184728000
3777587,COLIN,ALBERT,NYC SCHOOL CONSTRUCTION AUTHORIT,7184728000
3777588,COLIN,ALBERT,NYC SCHOOL CONSTRUCTION AUTHORIT,7184728000
3777602,ELAN,ABNERI,NYCSCA,7184728000


All from the same business name so it makes sense

In [82]:
df.loc[df["Owner's Phone #"]=='nan']

Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job Document #,Job Type,Self Cert,Block,Lot,...,Owner's House State,Owner's House Zip Code,Owner's Phone #,DOB Run Date,PERMIT_SI_NO,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME
8820,QUEENS,4014256,22-57,47 STREET,410022871,01,A1,True,00759,00004,...,NY,11105,,07/29/2020 00:00:00,2998503,40.768882,-73.902890,22,137,Steinway
12364,MANHATTAN,1091658,447,LEXINGTON AVENUE,101491271,01,A2,False,01299,00053,...,NY,11735,,12/25/2020 00:00:00,673977,40.753050,-73.974865,4,92,Turtle Bay-East Midtown
12365,MANHATTAN,1091658,447,LEXINGTON AVENUE,101491271,01,A2,False,01299,00053,...,NY,11735,,12/25/2020 00:00:00,673980,40.753050,-73.974865,4,92,Turtle Bay-East Midtown
25579,QUEENS,4536576,106-29,156 STREET,402172965,01,A1,False,10124,00044,...,NY,11433,,07/29/2020 00:00:00,3617521,40.696960,-73.798163,28,254,South Jamaica
28027,BRONX,2124102,1474,OAKLEY STREET,200985960,01,NB,True,04717,00034,...,NY,10456,,07/29/2020 00:00:00,3623446,40.880173,-73.849612,12,386,Eastchester-Edenwald-Baychester
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3768594,BROOKLYN,3153537,442,86 STREET,301354807,01,A2,False,06045,00023,...,,,,10/09/2021 00:00:00,1259432,40.622239,-74.027449,43,160,Bay Ridge
3768919,QUEENS,4289557,137-05,253 STREET,402622444,01,A2,True,13627,00012,...,,,,10/13/2021 00:00:00,1679137,40.662410,-73.728295,31,656,Rosedale
3769851,BROOKLYN,3129734,1870,51 STREET,301795074,01,A1,False,05468,00034,...,,,,11/05/2021 00:00:00,3876879,40.626078,-73.980338,44,472,Borough Park
3773124,BROOKLYN,3129734,1870,51 STREET,301795074,01,A1,False,05468,00034,...,,,,11/05/2021 00:00:00,3879877,40.626078,-73.980338,44,472,Borough Park


Nothing wrong with these jobs without an owner's phone number

In [83]:
df.loc[~df["Owner's Phone #"].isna() & df["Owner's Phone #"].str.contains("-")]["Owner's Phone #"].value_counts()

71822936-3    77
212386749-     8
718446591-     7
718343645-     7
71835636-7     7
              ..
212478-989     1
718270499-     1
212269689-     1
212753085-     1
2126-37131     1
Name: Owner's Phone #, Length: 83, dtype: int64

Phone numbers should not contain "-"

In [84]:
df.loc[~df["Owner's Phone #"].isna() & df["Owner's Phone #"].str.contains(" ")]["Owner's Phone #"].value_counts()

212 929318    46
718  42334    30
212 226650    17
718  88876    10
718   3891    10
              ..
212490 197     1
718  57593     1
212594  14     1
7182 86600     1
212  47727     1
Name: Owner's Phone #, Length: 209, dtype: int64

Phone numbers should not contain empty space

### Cleaning phone number

#### removes non-numeric characters

In [85]:
df["Owner's Phone #"] = df["Owner's Phone #"].str.extract('(\d+)', expand=False)
df.loc[~df["Owner's Phone #"].isna() & df["Owner's Phone #"].str.contains(" ")]["Owner's Phone #"].value_counts()

Series([], Name: Owner's Phone #, dtype: int64)

#### Turns phone numbers that start with 0, 1, and does not have 10 digits into nan

In [86]:
df["Owner's Phone #"] = df["Owner's Phone #"].astype('str')
df.loc[~df["Owner's Phone #"].isna() & ((df["Owner's Phone #"].str[0] == "0") | (df["Owner's Phone #"].str[0] == "1") | (df["Owner's Phone #"].apply(len) != 10)), ["Owner's Phone #"]] = np.nan

#### Checks to see if there are any others not of length 10

In [87]:
df["Owner's Phone #"] = df["Owner's Phone #"].astype('str')
df.loc[(df["Owner's Phone #"].apply(len) != 10)]["Owner's Phone #"]

266        nan
1270       nan
2343       nan
5535       nan
5548       nan
          ... 
3776801    nan
3776806    nan
3777021    nan
3777023    nan
3777291    nan
Name: Owner's Phone #, Length: 62721, dtype: object

#### Check for non-numeric charaters

In [88]:
df.loc[(~df["Owner's Phone #"].str.isnumeric()) & (~(df["Owner's Phone #"]=='nan'))]["Owner's Phone #"]

Series([], Name: Owner's Phone #, dtype: object)

In [89]:
df.columns

Index(['BOROUGH', 'Bin #', 'House #', 'Street Name', 'Job #', 'Job Document #',
       'Job Type', 'Self Cert', 'Block', 'Lot', 'Community Board', 'Zip Code',
       'Building Type', 'Residential', 'Special District 1',
       'Special District 2', 'Work Type', 'Permit Status', 'Filing Status',
       'Permit Type', 'Permit Sequence #', 'Permit Subtype', 'Oil Gas',
       'Site Fill', 'Filing Date', 'Issuance Date', 'Expiration Date',
       'Job Start Date', 'Permittee's First Name', 'Permittee's Last Name',
       'Permittee's Business Name', 'Permittee's Phone #',
       'Permittee's License Type', 'Permittee's License #',
       'Act as Superintendent', 'Permittee's Other Title', 'HIC License',
       'Site Safety Manager's First Name', 'Site Safety Manager's Last Name',
       'Site Safety Manager's Buisness Name',
       'Superintendent First & Last Name', 'Superintendent Business Name',
       'Owner's Business Type', 'Non-Profit', 'Owner's Business Name',
       'Owner's First 

### ~Checking additional numerical columns for coherency~

## Districts

### Looking at Zoning districts

In [90]:
#Residence (R), Commerical (C), Manufacturing (M)
#show_vals("Zoning District 1")
#show_vals("Zoning District 2")

Some districts may contain invalid formats

In [91]:
#.4-4 looks weird
#df["Zoning District 1"].value_counts(dropna=False)

In [92]:
#Checks for irregular values (values that do not start with (R), (C), (M))
#df["Zoning District 1"] = df["Zoning District 1"].astype('str')
#df.loc[(df["Zoning District 1"] != "nan") & ~df["Zoning District 1"].str.startswith(("C", "R", "M", "PARK", "BPC", "LH"))]["Zoning District 1"].value_counts()

In [93]:
#Checks for irregular values (values that do not start with (R), (C), (M))
#df["Zoning District 2"] = df["Zoning District 2"].astype('str')
#df.loc[(df["Zoning District 2"] != "nan") & ~df["Zoning District 2"].str.startswith(("C", "R", "M", "PARK", "BPC", "LH"))]["Zoning District 2"].value_counts()

In [94]:
#turning the values above into np.nan
#df.loc[(df["Zoning District 1"] != "nan") & ~df["Zoning District 1"].str.startswith(("C", "R", "M", "PARK", "BPC", "LH")), ["Zoning District 1"]] = np.nan
#df.loc[(df["Zoning District 2"] != "nan") & ~df["Zoning District 2"].str.startswith(("C", "R", "M", "PARK", "BPC", "LH")), ["Zoning District 2"]] = np.nan

### Looking at special districts

In [95]:
show_vals("Special District 1")
show_vals("Special District 2")

Top 10 Special District 1:

NaN    3342628
MID     106896
SRD      26695
LM       24853
OP       24006
PI       21676
SR       21012
CL       16681
BR       13874
DB       13523
Name: Special District 1, dtype: int64

Top 10 Special District 2:

NaN     3709767
IBZ       27571
POPS      23082
GW         8519
JAM        7260
HILI        988
BPRK        522
GCP2        101
LPCA         34
Name: Special District 2, dtype: int64



In [96]:
#Checks to see if there are lower case values
df.loc[~df["Special District 1"].isna() & df["Special District 1"].str.islower()]

Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job Document #,Job Type,Self Cert,Block,Lot,...,Owner's House State,Owner's House Zip Code,Owner's Phone #,DOB Run Date,PERMIT_SI_NO,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME


### Analysis

Zoning districts had some zones that were invalid such as number only values (ex.31010)  and we changed those values to nan

The typical format for Zoning districts start with C, R, and N. There are also some special districts like PARK and BPC that we also checked

Special Districts didn't have any noticable values that were out of place

In [97]:
df.columns

Index(['BOROUGH', 'Bin #', 'House #', 'Street Name', 'Job #', 'Job Document #',
       'Job Type', 'Self Cert', 'Block', 'Lot', 'Community Board', 'Zip Code',
       'Building Type', 'Residential', 'Special District 1',
       'Special District 2', 'Work Type', 'Permit Status', 'Filing Status',
       'Permit Type', 'Permit Sequence #', 'Permit Subtype', 'Oil Gas',
       'Site Fill', 'Filing Date', 'Issuance Date', 'Expiration Date',
       'Job Start Date', 'Permittee's First Name', 'Permittee's Last Name',
       'Permittee's Business Name', 'Permittee's Phone #',
       'Permittee's License Type', 'Permittee's License #',
       'Act as Superintendent', 'Permittee's Other Title', 'HIC License',
       'Site Safety Manager's First Name', 'Site Safety Manager's Last Name',
       'Site Safety Manager's Buisness Name',
       'Superintendent First & Last Name', 'Superintendent Business Name',
       'Owner's Business Type', 'Non-Profit', 'Owner's Business Name',
       'Owner's First 

## Quick look at GIS

In [98]:
show_vals("LATITUDE")
show_vals("LONGITUDE")
show_vals("COUNCIL_DISTRICT")
show_vals("CENSUS_TRACT")
show_vals("NTA_NAME")
#show_vals("BIN")

Top 10 LATITUDE:

NaN          14048
40.748342     3291
40.751098     2979
40.758754     2416
40.754162     2383
40.711537     2097
40.582305     2034
40.764020     2032
40.703597     2031
40.733848     1894
Name: LATITUDE, dtype: int64

Top 10 LONGITUDE:

NaN           14048
-73.984643     3324
-73.992926     2967
-73.978692     2490
-73.976557     2383
-74.015673     2111
-73.973189     2034
-74.169053     2034
-74.009781     2031
-73.973246     1873
Name: LONGITUDE, dtype: int64

Top 10 COUNCIL_DISTRICT:

4     448847
3     288775
1     246939
2     147866
6     129512
33    123278
5      94111
19     86245
39     83769
51     81966
Name: COUNCIL_DISTRICT, dtype: int64

Top 10 CENSUS_TRACT:

7      31271
33     29997
102    25964
104    24574
137    24494
9      24042
96     23414
92     23063
119    22086
94     21540
Name: CENSUS_TRACT, dtype: int64

Top 10 NTA_NAME:

Midtown-Midtown South                         270202
Hudson Yards-Chelsea-Flatiron-Union Square    122303
SoHo-Tri

In [99]:
#Manually looking at some of these
df[["LATITUDE", "LONGITUDE", "COUNCIL_DISTRICT", "CENSUS_TRACT", "NTA_NAME"]]

Unnamed: 0,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME
0,40.758977,-73.981089,4,96,Midtown-Midtown South
1,40.608512,-74.102067,50,177,Todt Hill-Emerson Hill-Heartland Village-Light...
2,40.613341,-74.035582,43,5602,Bay Ridge
3,40.645537,-73.954034,40,792,Erasmus
4,40.617141,-73.945805,45,746,Flatlands
...,...,...,...,...,...
3777839,40.755485,-73.860206,21,381,North Corona
3777840,40.754500,-73.860385,21,381,North Corona
3777841,40.585325,-74.096908,50,11401,Old Town-Dongan Hills-South Beach
3777842,40.775061,-73.843574,19,907,College Point


In [100]:
#shouldn't be 0
df["LATITUDE"] = df["LATITUDE"].astype('float')
df["LATITUDE"].min()

40.498628

In [101]:
df["LATITUDE"].max()

40.913711

In [102]:
df.loc[df["LATITUDE"] == 0.0]

Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job Document #,Job Type,Self Cert,Block,Lot,...,Owner's House State,Owner's House Zip Code,Owner's Phone #,DOB Run Date,PERMIT_SI_NO,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME


#### Removed the the rows above because its obviously a filler job

In [103]:
df = df.drop(df[df["LATITUDE"] == 0.0].index)

In [104]:
df["LATITUDE"].min()

40.498628

#### The min and max makes sense as the values range from Staten Island to the Bronx

In [105]:
df["LONGITUDE"] = df["LONGITUDE"].astype('float')
df["LONGITUDE"].min()

-74.254886

In [106]:
df["LONGITUDE"].max()

-73.700376

In [107]:
df.loc[df["LONGITUDE"] == -73.700376]

Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job Document #,Job Type,Self Cert,Block,Lot,...,Owner's House State,Owner's House Zip Code,Owner's Phone #,DOB Run Date,PERMIT_SI_NO,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME
515062,QUEENS,4179641,270-03,HILLSIDE AVENUE,420236097,1,A2,False,8781,101,...,NY,11001,5163540656,2017-11-03,2156149,40.739112,-73.700376,23,157901,Glen Oaks-Floral Park-New Hyde Park
868949,QUEENS,4179641,270-03,HILLSIDE AVENUE,420236097,1,A2,False,8781,101,...,NY,11001,5163540656,2017-11-03,2180665,40.739112,-73.700376,23,157901,Glen Oaks-Floral Park-New Hyde Park
1009373,QUEENS,4179641,270-03,HILLSIDE AVENUE,420236097,1,A2,False,8781,101,...,NY,11001,5163540656,2017-11-03,2198331,40.739112,-73.700376,23,157901,Glen Oaks-Floral Park-New Hyde Park


#### These longitudes and latitudes range from Queens to Staten Island which is also consistent with our dataset

In [108]:
df["COUNCIL_DISTRICT"] = df["COUNCIL_DISTRICT"].astype('float')
df["COUNCIL_DISTRICT"].min()

1.0

In [109]:
df["COUNCIL_DISTRICT"].max()

51.0

#### 1-51 are all valid districts

In [110]:
df["CENSUS_TRACT"] = df["CENSUS_TRACT"].astype('float')
df["CENSUS_TRACT"].min()

1.0

In [111]:
df["CENSUS_TRACT"].max()

157903.0

In [112]:
df.loc[df["CENSUS_TRACT"] == 157903]

Unnamed: 0,BOROUGH,Bin #,House #,Street Name,Job #,Job Document #,Job Type,Self Cert,Block,Lot,...,Owner's House State,Owner's House Zip Code,Owner's Phone #,DOB Run Date,PERMIT_SI_NO,LATITUDE,LONGITUDE,COUNCIL_DISTRICT,CENSUS_TRACT,NTA_NAME
2405,QUEENS,4179819,84-24,259 STREET,440598471,01,A2,True,08788,00018,...,NY,11001,9176485442,06/24/2020 00:00:00,3767505,40.736156,-73.708866,23.0,157903.0,Glen Oaks-Floral Park-New Hyde Park
9939,QUEENS,4623613,84-14,261 STREET,421895747,01,NB,False,08790,00024,...,NY,11004,3476989824,03/10/2021 00:00:00,3816186,40.737110,-73.707270,23.0,157903.0,Glen Oaks-Floral Park-New Hyde Park
15301,QUEENS,4180389,86-31,256 STREET,420593637,01,A2,True,08815,00022,...,NY,11001,7188501900,07/15/2020 00:00:00,3774752,40.732155,-73.710287,23.0,157903.0,Glen Oaks-Floral Park-New Hyde Park
19794,QUEENS,4615276,261-04,EAST WILLISTON AVENUE,421477994,01,A2,True,08804,00039,...,NY,11596,9177630251,07/29/2020 00:00:00,3601856,40.735917,-73.706637,23.0,157903.0,Glen Oaks-Floral Park-New Hyde Park
20332,QUEENS,4180035,85-45,256 STREET,402596847,01,NB,True,08799,00013,...,NY,11001,5166031575,07/29/2020 00:00:00,3603581,40.733476,-73.710772,23.0,157903.0,Glen Oaks-Floral Park-New Hyde Park
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3760605,QUEENS,4179882,84-28,260 STREET,421793544,01,A2,False,08789,00049,...,,,9172266055,08/25/2021 00:00:00,3867333,40.736379,-73.707974,23.0,157903.0,Glen Oaks-Floral Park-New Hyde Park
3761067,QUEENS,4179911,84-33,260 ST,440657139,01,A2,False,08790,00047,...,,,7187265500,08/28/2021 00:00:00,3869066,40.736340,-73.707938,23.0,157903.0,Glen Oaks-Floral Park-New Hyde Park
3761980,QUEENS,4623613,84-14,261 STREET,421895747,01,NB,False,08790,00024,...,,,3476989824,09/02/2021 00:00:00,3869772,40.737110,-73.707270,23.0,157903.0,Glen Oaks-Floral Park-New Hyde Park
3764871,QUEENS,4180404,256-16,86 AVENUE,401841129,01,NB,False,08815,00039,...,,,,09/21/2021 00:00:00,3872349,40.732672,-73.709696,23.0,157903.0,Glen Oaks-Floral Park-New Hyde Park


#### No irregulars for census tract

In [113]:
#df["BIN"] = df["BIN"].astype('float')
#df["BIN"].min()

In [114]:
#df["BIN"] = df["BIN"].astype('float')
#df["BIN"].max()

In [115]:
#df.loc[df["BIN"] == 1000000]

In [116]:
#df.loc[df["BIN"] == 5799501]

#### Nothing wrong with GIS BIN either

# Data Profilling for datetime columns


Find format problems and outliers in all datetime columns

Using openclean's sklearn modules to detect problems and outliers

In [117]:
from openclean.profiling.anomalies.sklearn import DBSCANOutliers

def findDateOutliers(column_name, eps_setting = 0.05):
    datetime_data = ds.distinct(column_name)
    print("Column: ",column_name)
    
    for rank, val in enumerate(datetime_data.most_common(10)):        
        st, freq = val
        print('{:<3} {:>8}  {:>10}'.format('{}.'.format(rank + 1), st, '{:,}'.format(freq)))

    print('\nTotal number of distinct values in {} is {}'.format(column_name, len(datetime_data)))
    print(DBSCANOutliers().find(datetime_data))
    print(DBSCANOutliers(eps = eps_setting).find(datetime_data))
    print('\n==================================')

In [118]:
date_cols = []

print("Datetime Data columns:\n")
for col in ds.columns:
    if 'Date' in col or 'DATE' in col:
        print(col)
        date_cols.append(col)

print("----------------------------\n")        
        
for col in date_cols:
    findDateOutliers(col, 0.02)

Datetime Data columns:

Filing Date
Issuance Date
Expiration Date
Job Start Date
DOBRunDate
----------------------------

Column:  Filing Date
1.  2017-03-29        1,077
2.  2017-11-08        1,064
3.  2017-11-09        1,044
4.  2016-11-09        1,020
5.  2013-05-03        1,006
6.  2007-03-29        1,003
7.  2015-11-12        1,001
8.  2014-06-27          998
9.  2017-03-30          991
10. 2018-11-07          989

Total number of distinct values in Filing Date is 13338
['']
['', '2020-02-02 ', '08/26/2019', '02/20/2020', '2014-06-27 ', '02/22/2020', '11/12/2020', '2020-02-20 ', '07/06/2020', '03/01/2021', '2000-02-22 ', '2010-11-12 ', '09/22/2020', '2011-11-10 ', '07/07/2020', '2002-02-02 ', '2012-10-11 ', '07/08/2020', '10/20/2020', '2000-02-02 ', '2017-09-26 ', '06/27/2019', '2020-02-22 ', '1994-05-27 ', '2017-11-09 ', '2017-03-29 ', '2002-02-20 ', '07/01/2020', '2015-11-12 ', '11/10/2020', '03/24/2021', '2002-02-22 ', '02/22/2000', '2017-11-08 ', '2007-07-02 ', '2007-03-30 ', 

# Remember that after changing some of the column names, there are some columns that are also datetime data:

"Paid": "Paid Date"\
"Fully Paid": "Fully Paid Date"\
"Assigned": "Assigned Date"\
"Approved": "Approved Date"\
"Pre- Filing Date": "Pre-Filing Date"\
"DOB Run Date": "DOB Run Date"\
"SIGNOFF_DATE": "Signoff Date"\
"SPECIAL_ACTION_DATE": "Special Action Date"\

In [119]:
##date_cols = ["Filing Date","Issuance Date","Expiration Date","Job Start Date", "DOB Run Date"]

for col in date_cols:
    findDateOutliers(col, 0.02)

Column:  Filing Date
1.  2017-03-29        1,077
2.  2017-11-08        1,064
3.  2017-11-09        1,044
4.  2016-11-09        1,020
5.  2013-05-03        1,006
6.  2007-03-29        1,003
7.  2015-11-12        1,001
8.  2014-06-27          998
9.  2017-03-30          991
10. 2018-11-07          989

Total number of distinct values in Filing Date is 13338
['']
['', '2020-02-02 ', '08/26/2019', '02/20/2020', '2014-06-27 ', '02/22/2020', '11/12/2020', '2020-02-20 ', '07/06/2020', '03/01/2021', '2000-02-22 ', '2010-11-12 ', '09/22/2020', '2011-11-10 ', '07/07/2020', '2002-02-02 ', '2012-10-11 ', '07/08/2020', '10/20/2020', '2000-02-02 ', '2017-09-26 ', '06/27/2019', '2020-02-22 ', '1994-05-27 ', '2017-11-09 ', '2017-03-29 ', '2002-02-20 ', '07/01/2020', '2015-11-12 ', '11/10/2020', '03/24/2021', '2002-02-22 ', '02/22/2000', '2017-11-08 ', '2007-07-02 ', '2007-03-30 ', '06/22/2020', '02/02/2020', '2017-07-12 ', '1991-11-19 ', '1999-11-19 ', '06/26/2020', '12/02/2020', '08/01/2019']

Column

# Analysis

the above results show the problems for the data cleaning task:
    
### Latest Action Date
outliers: '06//1403'
format: 'yyyy-mm-dd' and 'mm/dd/yyyy'

### Pre- Filing Date
no problem found

### DOB Run Date
format: 'yyyy-mm-dd' and 'mm/dd/yyyy 00:00:00'

### SIGNOFF_DATE
outliers: empty value

### SPECIAL_ACTION_DATE
outliers: empty value and '11//2006'

### Paid
outliers: empty value

### Fully Paid
outliers: empty value

### Assigned
outliers: empty value

### Approved
outliers: empty value

# Data Cleaning for outliers in datetime columns

## Fixing Datetime columns format

In [120]:
datetime_column_list =  ["Filing Date","Issuance Date","Expiration Date","Job Start Date", "DOB Run Date"]

for col in datetime_column_list:
    show_vals(col)

Top 10 Filing Date:

2017-03-29     1077
2017-11-08     1064
2017-11-09     1044
2016-11-09     1020
2013-05-03     1006
2007-03-29     1003
2015-11-12     1001
2014-06-27      998
2017-03-30      991
2018-11-07      989
Name: Filing Date, dtype: int64

Top 10 Issuance Date:

NaN            20605
2017-03-29      1086
2017-11-08      1065
2017-11-09      1048
2013-05-03      1013
2016-11-09      1007
2016-11-15      1002
2017-03-30       994
2014-06-27       991
2007-03-29       991
Name: Issuance Date, dtype: int64

Top 10 Expiration Date:

2007-12-31    18713
2006-12-31    18149
2005-12-31    16438
2004-12-31    14042
NaN           11805
2009-04-01    11441
2003-12-31    10568
10/31/2020    10385
2010-04-01    10198
01/31/2021     9944
Name: Expiration Date, dtype: int64

Top 10 Job Start Date:

2008-06-27    1502
2015-06-12    1406
2015-06-10    1180
2008-06-26    1139
2008-06-25    1133
2015-06-05    1112
2008-06-24    1107
2015-06-08    1104
2015-06-11    1098
2007-07-17    1093
Na

Check to see if any columns have values in year-month-day format

In [121]:
for col in datetime_column_list:
    print(col, '\n', df.loc[df[col].str.contains('-', regex=False, na=False)][col], '\n\n')

Filing Date 
 101553     2006-03-22 
101554     2005-07-06 
101555     2002-06-17 
101556     2014-02-20 
101557     2003-03-03 
              ...     
3698860    2020-06-04 
3698861    2020-06-04 
3698862    2020-06-04 
3698863    2020-06-04 
3698864    2020-06-04 
Name: Filing Date, Length: 3596140, dtype: object 


Issuance Date 
 101565     2002-05-16 
101582     2014-10-29 
101585     2002-09-16 
101656     2007-09-19 
101668     2009-09-16 
              ...     
3698860    2020-06-05 
3698861    2020-06-05 
3698862    2020-06-05 
3698863    2020-06-05 
3698864    2020-06-05 
Name: Issuance Date, Length: 3577667, dtype: object 


Expiration Date 
 101557     2002-02-03
101559     2004-08-01
101565     2003-02-02
101567     2002-12-04
101574     2003-12-30
              ...    
3698860    2021-06-05
3698861    2021-01-26
3698862    2021-06-05
3698863    2021-06-05
3698864    2020-12-22
Name: Expiration Date, Length: 3585786, dtype: object 


Job Start Date 
 101553     2006-04-03


#### Fix the remaining Datetime columns

In [122]:
for col in datetime_column_list:
    #if (col == 'Latest Action Date') or (col == 'DOB Run Date'):
    #    continue
    #else:
    df.loc[:,col] = pd.to_datetime(df[col], errors='coerce')

These should all be proper datetime64[ns] columns now:

In [123]:
df.select_dtypes(include='datetime')

Unnamed: 0,Filing Date,Issuance Date,Expiration Date,Job Start Date,DOB Run Date
0,2020-12-11,2020-12-11,2021-11-02,2019-12-23,2020-12-12
1,2020-12-11,2020-12-11,2020-12-31,2019-08-02,2020-12-12
2,2020-06-17,2020-06-17,2021-05-10,2020-06-17,2020-06-18
3,2020-06-17,2020-06-17,2021-02-21,2020-06-17,2020-06-18
4,2020-06-17,2020-06-17,2021-03-04,2020-06-17,2020-06-18
...,...,...,...,...,...
3777839,2021-12-04,NaT,2022-12-04,2020-11-12,2021-12-06
3777840,2021-12-05,2021-12-05,2022-12-05,2008-04-09,2021-12-06
3777841,2021-12-05,2021-12-05,2022-12-05,2008-05-15,2021-12-06
3777842,2021-12-05,2021-12-05,2022-09-05,2020-07-08,2021-12-06


In [124]:
for col in datetime_column_list:
    show_vals(col)

Top 10 Filing Date:

2017-03-29     1077
2017-11-08     1064
2017-11-09     1044
2016-11-09     1020
2013-05-03     1006
2007-03-29     1003
2015-11-12     1001
2014-06-27      998
2017-03-30      991
2018-11-07      989
Name: Filing Date, dtype: int64

Top 10 Issuance Date:

NaN            20605
2017-03-29      1086
2017-11-08      1065
2017-11-09      1048
2013-05-03      1013
2016-11-09      1007
2016-11-15      1002
2017-03-30       994
2014-06-27       991
2007-03-29       991
Name: Issuance Date, dtype: int64

Top 10 Expiration Date:

2007-12-31    18713
2006-12-31    18149
2005-12-31    16438
2004-12-31    14042
NaN           11805
2009-04-01    11441
2003-12-31    10568
10/31/2020    10385
2010-04-01    10198
01/31/2021     9944
Name: Expiration Date, dtype: int64

Top 10 Job Start Date:

2008-06-27    1502
2015-06-12    1406
2015-06-10    1180
2008-06-26    1139
2008-06-25    1133
2015-06-05    1112
2008-06-24    1107
2015-06-08    1104
2015-06-11    1098
2007-07-17    1093
Na

### Check the coherence of datetime values

~These don't make sense, but it's not entirely clear if they should be swapped, or removed or what~

All these are okay!


In [125]:
df.loc[(df['Issuance Date'] < df['Filing Date'])
      &(~df['Issuance Date'].isna() & ~df['Filing Date'].isna())][['Issuance Date', 'Filing Date']]

Unnamed: 0,Issuance Date,Filing Date
119948,2008-06-03,2008-08-04
147767,2013-02-20,2013-06-25
177039,1999-10-08,2000-08-04
178838,2005-05-09,2006-05-15
186769,2005-03-04,2006-07-07
...,...,...
3204480,2008-01-31,2008-04-10
3230875,2010-08-17,2011-08-12
3248363,2011-11-18,2012-09-18
3276353,2004-09-30,2005-07-29


In [126]:
df.loc[(df['Expiration Date'] < df['Issuance Date'])
      &(~df['Expiration Date'].isna() & ~df['Issuance Date'].isna())][['Expiration Date', 'Issuance Date']]

Unnamed: 0,Expiration Date,Issuance Date
118080,2008-07-05,2008-07-23
147528,2000-04-01,2000-04-05
150989,1998-12-31,1999-11-23
164950,2008-10-13,2008-10-21
179051,1999-12-31,2000-06-06
...,...,...
2986441,1998-11-09,1998-11-13
3121146,1997-04-13,1997-05-06
3169357,1908-12-31,1998-01-15
3229956,2008-08-14,2008-10-28


Good, no expirations before issuance

Here it's not clear how a job could start after the issuance expires, but this may have actually happened

In [127]:
df.loc[(df['Job Start Date'] > df['Expiration Date'])
      &(~df['Job Start Date'].isna() & ~df['Expiration Date'].isna())][['Job Start Date', 'Expiration Date']]

Unnamed: 0,Job Start Date,Expiration Date
2501,2022-06-23,2021-03-27
4492,2018-04-12,2018-04-07
5149,2020-07-06,2020-05-30
8821,2017-04-23,2016-02-23
9607,2104-10-31,2020-12-19
...,...,...
3766702,2021-10-09,2021-10-01
3766708,2021-10-09,2021-10-01
3768009,2021-10-06,2021-06-06
3768832,2022-01-31,2022-01-04


##### These are, however a small percentage of of our total jobs

Fraction of jobs Assigned after they were already approved out of total jobs

In [128]:
df.loc[df['Job Start Date'] > df['Expiration Date']]['Job Start Date'].count()/df['Job Start Date'].count()

0.000786171221739196

# Data Profilling for City and Other Description

Find format problems and outliers in City and Description columns

Using openclean's sklearn modules to detect problems and outliers

In [129]:
from openclean.profiling.anomalies.sklearn import DBSCANOutliers

# Print the ten most frequent values for the 'Vehicle Expiration Date' column.
def findDateOutliers(column_name, eps_setting = 0.05):
    applicant_data = ds.distinct(column_name)
    print("Column: ",column_name)
    
    for rank, val in enumerate(applicant_data.most_common(10)):        
        st, freq = val
        print('{:<3} {:>8}  {:>10}'.format('{}.'.format(rank + 1), st, '{:,}'.format(freq)))

    print('\nTotal number of distinct values in {} is {}'.format(column_name, len(applicant_data)))
    print(DBSCANOutliers(eps = eps_setting).find(applicant_data))
    print('\n==================================')

In [130]:
#date_cols = ["City ", "Other Description"]

In [131]:
# date_cols = ["City ", "Other Description"]
# print("----------------------------\n")        
        
# for col in date_cols:
#     findDateOutliers(col, 0.1)

In [132]:
ds.columns

['BOROUGH',
 'Bin #',
 'House #',
 'Street Name',
 'Job #',
 'Job doc. #',
 'Job Type',
 'Self_Cert',
 'Block',
 'Lot',
 'Community Board',
 'Zip Code',
 'Bldg Type',
 'Residential',
 'Special District 1',
 'Special District 2',
 'Work Type',
 'Permit Status',
 'Filing Status',
 'Permit Type',
 'Permit Sequence #',
 'Permit Subtype',
 'Oil Gas',
 'Site Fill',
 'Filing Date',
 'Issuance Date',
 'Expiration Date',
 'Job Start Date',
 "Permittee's First Name",
 "Permittee's Last Name",
 "Permittee's Business Name",
 "Permittee's Phone #",
 "Permittee's License Type",
 "Permittee's License #",
 'Act as Superintendent',
 "Permittee's Other Title",
 'HIC License',
 "Site Safety Mgr's First Name",
 "Site Safety Mgr's Last Name",
 'Site Safety Mgr Business Name',
 'Superintendent First & Last Name',
 'Superintendent Business Name',
 "Owner's Business Type",
 'Non-Profit',
 "Owner's Business Name",
 "Owner's First Name",
 "Owner's Last Name",
 "Owner's House #",
 "Owner's House Street Name",
 '

# Analysis

the above results show the problems for the data cleaning task:
    
### For City

There are many misspellings and abbreviations for city names. We can use both clustering and Soundex to detect misspellings and abbreviations. And we can check if our cleaning is right by refer to the U.S. Cities reference datasets in openclean.


### For Other Description

Other Description can be anything, so we just care about empty value and values that are too similar and are showing exactly same things (for example 'GC' and '___GC')

In [133]:
from openclean.function.eval.base import Col, Eval
from openclean.function.eval.logic import And
from openclean.function.value.phonetic import Soundex, soundex

In [134]:
upper = ds\
    .select("Ownerâ€™s House City")\
    .update("Ownerâ€™s House City", str.upper)

In [135]:
from openclean.data.refdata import RefStore

refdata = RefStore()
city_df = refdata\
    .load('encyclopaedia_britannica:us_cities', auto_download=True)\
    .df()


In [136]:
city_list = city_df['city']
print(city_list)

0          Demopolis
1          Sylacauga
2               Troy
3             Dothan
4           Prichard
            ...     
1956          Powell
1957        Riverton
1958        Sheridan
1959    Rock Springs
1960         Buffalo
Name: city, Length: 1961, dtype: object


# An example of using soundex in openclean

However, using soundex for each of the city is too slow, the code below take nearly 4 mins for one sningle city.\
So we should use clustering first and then use hard code to clean the remianing city name that is not in the city_list.

In [137]:
brooklyn = ds\
    .select("Ownerâ€™s House City")\
    .update("Ownerâ€™s House City", str.upper)\
    .filter(And(Eval("Ownerâ€™s House City", Soundex()) == soundex('BROOKLYN'), Col("Ownerâ€™s House City") != 'BROOKLYN'))\
    .distinct()

print('RANK\tCOUNT\tNAME')
for i, entry in enumerate(brooklyn.most_common()):
    key, count = entry
    print('{}.\t{}\t{}'.format(i + 1, count, key))

RANK	COUNT	NAME
1.	1943	BRKLYN
2.	1265	BROOKYLN
3.	933	BROOKLY
4.	784	BROOKLYN,
5.	432	BROKLYN
6.	367	BRROKLYN
7.	313	BROOKLN
8.	242	BROOKLYLN
9.	165	BROOOKLYN
10.	132	BROOKLNY
11.	111	BROOKLYNN
12.	110	BROOKLYM
13.	108	BROOKLYNB
14.	86	BRO0KLYN
15.	84	BROOKLYN NY
16.	81	BROOKLLYN
17.	68	BROKKLYN
18.	67	BROOKLYN`
19.	58	BROOKKLYN
20.	54	BROOKLYNQ
21.	51	BROOKLKYN
22.	48	BROOKLUN
23.	46	BRROOKLYN
24.	38	BROOKLNYN
25.	33	BROOKLTN
26.	31	BROOKLINE
27.	29	BR00KLYN
28.	29	BROOKLYN, NY
29.	28	BROOKLY N
30.	27	BROOKLYN.
31.	25	BROOKLYN & S.I.
32.	24	BROOKLYB
33.	22	BROOKLYKN
34.	21	BROOKLYNM
35.	19	BERKELEY
36.	18	BERKLEY
37.	18	BROOKLIN
38.	16	BROOKL
39.	13	BROOKJLYN
40.	13	BROOKLYN HEIGHT
41.	13	BROOKYLYN
42.	13	BROOKLYTN
43.	12	BROOKLEN
44.	11	BROOJLYN
45.	11	BROOKLYNNY
46.	11	BRIACLIFF MANOR
47.	11	BROOKLYHN
48.	11	BR0OKLYN
49.	11	BROOKLYN `
50.	10	BREOOKLYN
51.	10	BROOKLRN
52.	9	BROOKLYN1
53.	9	BROOKL;YN
54.	8	BROOKILYN
55.	8	BROOKLYN 11207
56.	8	BROOKLYJN
57.	8	BROOKLYNS
58.	8	BEROOKLYN

# Data Cleaning for Applicant columns

* how to deal with empty values has not decided yet

# Transform all city names to upper case

### Remember that we have changed some column names:
"City ": "Owner's House City"\
"State": "Owner's House State"

In [138]:
df["Owner's House City"] = df["Owner's House City"].str.upper()

In [139]:
# Convert similar values to suggested value using kNN clustering

In [140]:
# Cluster string using kNN clusterer (with the default n-gram setting)
# using the Levenshtein distance as the similarity measure.

from openclean.cluster.knn import knn_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan

def getClusters(col, minsize = 2, preds = 0.5):
    dba = ds.select(col).distinct()
    clusters = knn_clusters(
        values=dba,
        sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(preds)),
        minsize=minsize
    )
    return clusters

def print_cluster(cnumber, cluster):
    item_count = 0
    print('Cluster {} (of size {})\n'.format(cnumber, len(cluster)))
    for val, count in cluster.items():
        item_count += 1
        if item_count <= 10:
            print('{} ({})'.format(val, count))
    if item_count>10:
        print(".......{} more items".format(item_count-10))
    print('\nSuggested value: {}\n\n'.format(cluster.suggestion()))

def updateUsingClusters(col, clusters, isPrint = False):
    
    orignal_list = []
    suggestion_list = []
    clusters.sort(key=lambda c: len(c), reverse=True)
       
    for i, cluster in enumerate(clusters):        
        suggestion = cluster.suggestion()
        orignal_list = []
        suggestion_list = []
        if isPrint and i <5:
            print_cluster(i, cluster)
        
        for val, count in cluster.items(): 
            orignal_list.append(val)
            suggestion_list.append(suggestion)
    
    df[col] = df[col].replace(orignal_list, suggestion_list)

In [141]:
date_cols = ["Owner's House City"]#,'Other Description' ]


# print("kNN cluster for ", "Owner's House City")
# col_clusters = getClusters("Owner's House City ")
# print("updating column ", "Owner's House City")
# print("----------------------\nTop 5 Cluster:\n----------------------")
# updateUsingClusters("Owner's House City", col_clusters, True)
# print("================")

print("kNN cluster for ", date_cols[0])
col_clusters = getClusters("Ownerâ€™s House City")
print("updating column ", date_cols[0])
print("----------------------\nTop 5 Cluster:\n----------------------")
updateUsingClusters(date_cols[0], col_clusters, True)
print("================")

kNN cluster for  Owner's House City
updating column  Owner's House City
----------------------
Top 5 Cluster:
----------------------
Cluster 0 (of size 165)

STATEN ISLAND (117693)
LONG ISLAND CIT (11716)
LONG ISLAND CI (133)
LONG ISLANDCITY (576)
LING ISLAND CTY (5)
LONG ISLAND CTY (972)
STATEN  ISLAND (171)
LONG ISLAND (1461)
STATEN ISLAND` (12)
STATE ISLAND (122)
.......155 more items

Suggested value: STATEN ISLAND


Cluster 1 (of size 162)

STATEN IS (49)
STATEN ISLAND` (12)
STATEN ISAND (17)
SATEN ISLAND (29)
STATEN ISLAND, (70)
SATETEN ISLAND (55)
STATEN ISALND (117)
STATEN ISL. (81)
STATEN ISLAN (50)
STATEN ISL (39)
.......152 more items

Suggested value: STATEN ISLAND


Cluster 2 (of size 161)

STATEN ISLAND (117693)
STATEN IS (49)
STATEN ISLAND` (12)
STATEN ISAND (17)
SATEN ISLAND (29)
STATEN ISLAND, (70)
SATETEN ISLAND (55)
STATEN ISALND (117)
STATEN ISL. (81)
STATEN ISLAN (50)
.......151 more items

Suggested value: STATEN ISLAND


Cluster 3 (of size 159)

STATEN ISLAND (11

In [142]:
# After clustering, find data that is not in the reference city dataset, hard code to clean them

In [143]:
upper_city_list = []
for item in city_list:
    upper_city_list.append(str(item.upper()))

outlier_cities = df.loc[(~df['Owner\'s House City'].str.upper().isin(upper_city_list)) & (~df['Owner\'s House City'].isna())]['Owner\'s House City'].drop_duplicates()
print(outlier_cities)

0                NEW YORK
1                  L.I.C.
7                     LIC
11         ROCKAWAY POINT
15                JAMAICA
                ...      
3723282            WARWCK
3723490             11412
3723694    BOLTON LANDING
3727345        GREEENWICH
3734988    WEST BRADENTON
Name: Owner's House City, Length: 10228, dtype: object


In [144]:
# Print standardized cities and found outliers

In [145]:
standardized_cities = df.loc[(df['Owner\'s House City'].str.upper().isin(upper_city_list)) & (~df['Owner\'s House City'].isna())]['Owner\'s House City'].drop_duplicates()
print(standardized_cities)

2               BROOKLYN
13             MANHATTAN
21         STATEN ISLAND
52           JERSEY CITY
57                 BRONX
               ...      
3565047           EXETER
3582952           NORMAN
3683882        FULLERTON
3697844        NAUGATUCK
3708708         WESTERLY
Name: Owner's House City, Length: 653, dtype: object


In [146]:
outlier_city_list = []
for item in outlier_cities:
    outlier_city_list.append(str(item))
    
print(outlier_city_list)

['NEW YORK', 'L.I.C.', 'LIC', 'ROCKAWAY POINT', 'JAMAICA', 'NEW HYDE PARK', 'FAR ROCKAWAY', 'NY', 'LIC NY', 'ROSLYN', 'NEW  YORK', 'BK', 'LONG ISLAND CIT', 'SECAUCUS', 'KATONAH', 'ROSLYN HEIGHTS', 'ENGLEWOOD CLIFF', 'VALLEY STREAM', 'BELLE HARBOR', 'NORTHPORT', 'LITTLE NECK', 'MOONSEY', 'HUNTINGTON STA.', 'LARCHMONT', 'GLEN HEAD', 'MASPETH', 'NORTH BABYLON', 'WOODSIDE', 'CARLSTADT', 'NEW YOKR', 'WOODMERE', 'DOUGLASTON', 'WHITETSTONE', 'JACKSON HEIGHTS', 'MANHASSET', 'FOREST HILL', 'LAKE SUCCESS', 'RED BANK', 'OZONE PARK', 'OAKLAND GARDENS', 'EAST RUTHERFORD', 'HARTSDALE', 'BRIARWOOD', 'FLORAL PARK', 'LYNBROOK', 'MONROE TOWNSHIP', 'BRON', 'BKYN', 'LASING', 'RICHMOND HILL', 'NEW', 'MT VERNON', 'ROCKAWAYS', 'BAYSIDE', 'HOWARD BEACH', 'S.OZONE PARK', 'JERICHO', 'FLSUHING', 'OLD GREENWICH', 'NORTHBOROUGH', 'NORTH BERGEN', 'LI CITY', 'UNIONDALE', 'EAST ELMHURST', 'WESTBURY, NY', 'PARAMUS', 'WOODISE', 'NYC', 'CARLE PLACE', 'BROOKYLN', 'BEDFORD HILLS', 'LONG ISLAND CI', 'RIVERDALE', 'LONG ISLA

In [147]:
# Search for similar city names in reference city dataset, and hard code to replace those outliers

In [148]:
def findCityName(str):
    print(city_df['city'].loc[city_df['city'].str.contains(str)].drop_duplicates())
    print("------------------------\n")
    
findCityName("Rich")
findCityName("Island")
findCityName("White")
findCityName("Philadelphia")
findCityName("Morris")
findCityName("Nassau")
findCityName("Westchester")
  

137       Richmond
1728    Richardson
1860      Richland
Name: city, dtype: object
------------------------

498       Rock Island
1060     Grand Island
1286    Staten Island
1294     Coney Island
Name: city, dtype: object
------------------------

363             White Springs
1266             White Plains
1881    White Sulphur Springs
Name: city, dtype: object
------------------------

997         Philadelphia
1418    New Philadelphia
Name: city, dtype: object
------------------------

1151    Morristown
Name: city, dtype: object
------------------------

Series([], Name: city, dtype: object)
------------------------

Series([], Name: city, dtype: object)
------------------------



In [149]:
outlier_city_list = ['NEW YORK', 'BKLYN', 'ROOKLYN', 'RICHMOND HILL', 'BX', 'NY', 'OLD WESTBURY', 'N.Y.', 'HOLLIS', 'MAHATTAN', 'LAKE SUCCESS', 'BROKKLYN', 'BETHESDA', 'JAMAICA', 'SECAUCUS', 'LIC', 'MASPETH', 'JAMAICA ESTATES', 'SOUTH OZONE PAR', 'BAYSIDE', 'JAM', 'PARMUS', 'KEW GARDENS', 'WOONSECKET', 'LI', 'ST. ALBANS', 'MASSAPEQUA', 'SI', 'FLORAL PARK', 'ROSLYN HEIGHTS', 'HOWARD BEACH', 'WHITEPLAINS', 'JACKSON HEIGHTS', 'REGO PARK', 'NEW HYDE PARK', 'REGO', 'ARVERNE', 'OZONE PARK', 'VALLEY STREAM', 'NEPONSIT', 'ROCKVILLE CENTR', 'BRIARWOOD', 'BRKLYN', 'MOUNT LAUREL', 'QUEEEN', 'ELMSFORD', 'NYC', 'GILLFORD', 'PARSIPPANY', 'WOODSIDE', 'LONG ISLAND CIT', 'QUEEN', 'VAALLEY STREAAM', 'BRONS', 'COLLEGE POINT', 'ROCKAWAY POINT', 'DOUGLASTON', 'ENGLEWOOD CLIFF', 'QNS', 'LYNBROOK', 'SYOSSET', 'FRESH MEADOWS', 'LITTLE NECK', 'WOODHAVEN', 'HARTSDALE', 'ATLANTIC BEACH', 'SAN JUAN CAPIST', 'CALDE PLACE', 'RIVERDALE', 'TUCKAHOE', 'SEAFORD', 'L.I.C.', 'REGO PK', "B'KLYN"]
clean_city_list = ['NEW YORK CITY', 'BROOKLYN', 'BROOKLYN', 'RICHMOND', 'BRONX', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'MANHATTAN', 'NEW YORK CITY', 'BROOKLYN', 'BETHESDA', 'NEW YORK CITY', 'SECAUCUS', 'LONG ISLAND CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'PARAMUS', 'NEW YORK CITY', 'WOONSOCKET', 'LONG ISLAND CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'STATEN ISLAND', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'WHITE PLAINS', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'BROOKLYN', 'PHILADELPHIA', 'QUEEENS', 'NEW YORK CITY', 'NEW YORK CITY', 'GILLFORD', 'MORRIS', 'NEW YORK CITY', 'LONG ISLAND CITY', 'QUEENS', 'BRONX', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'QUEENS', 'NEW YORK CITY', 'NASSAU', 'QUEENS', 'QUEENS', 'WOODHAVEN', 'NEW YORK CITY', 'NASSAU', 'SAN JUAN CAPISTRANO', 'BROOKLYN', 'RIVERDALE', 'STATEN ISLAND', 'NASSAU', 'LONG ISLAND CITY', 'QUEENS', "BROOKLYN"]

df['Owner\'s House City'] = df['Owner\'s House City'].replace(outlier_city_list, clean_city_list)

In [150]:
# Check State Column

In [151]:
state_col = 'Ownerâ€™s House State'
findDateOutliers(state_col, 0.1)

Column:  Ownerâ€™s House State
1.        NY   3,632,137
2.        NJ      54,605
3.                44,509
4.        CT       6,243
5.        FL       5,214
6.        CA       5,154
7.        PA       4,407
8.        IL       3,591
9.        MA       3,099
10.       NC       2,093

Total number of distinct values in Ownerâ€™s House State is 58
['', 'Ã¯Â¿Â½Ã¯Â¿Â½', 'NY']



In [152]:
ds.select('Ownerâ€™s House State').distinct()

Counter({'NY': 3632137,
         'NJ': 54605,
         'NV': 691,
         '': 44509,
         'MN': 504,
         'NC': 2093,
         'TN': 479,
         'CA': 5154,
         'MI': 548,
         'FL': 5214,
         'TX': 1551,
         'CT': 6243,
         'MA': 3099,
         'AZ': 603,
         'VA': 1951,
         'PA': 4407,
         'NM': 193,
         'RI': 859,
         'OH': 1941,
         'IL': 3591,
         'MO': 195,
         'CN': 27,
         'GA': 1243,
         'CO': 608,
         'DE': 193,
         'WV': 14,
         'WA': 476,
         'NE': 69,
         'IN': 200,
         'UT': 351,
         'NH': 364,
         'ON': 12,
         'MD': 1573,
         'DC': 669,
         'ME': 83,
         'WY': 24,
         'OR': 42,
         'SC': 282,
         'WI': 175,
         'VT': 101,
         'AR': 65,
         'HI': 54,
         'KY': 155,
         'IA': 82,
         'KS': 187,
         'SD': 14,
         'OK': 38,
         'AL': 50,
         'AK': 22,
         'ND': 1

In [153]:
# Find functional dependencies violations on City -> State

In [154]:
from openclean.operator.collector.count import distinct
from openclean.operator.map.violations import fd_violations

groups = fd_violations(df, lhs='Owner\'s House City', rhs='Owner\'s House State')

print('City         \t|            State')
print('=============\t|  ===============')
for key in groups:
    conflicts = distinct(groups.get(key), 'Owner\'s House State').most_common()
    state, count = conflicts[0]
    print('{:<12} \t| {} x {}'.format(key, count, state))
    for state, count in conflicts[1:]:
        print('             \t| {} x {}'.format(count, state))
    print('-------------\t|  ---------------')

City         	|            State
NEW YORK CITY 	| 1785695 x NY
             	| 1350 x NJ
             	| 598 x NC
             	| 243 x NV
             	| 56 x NH
             	| 30 x nan
             	| 26 x NE
             	| 20 x CA
             	| 14 x DC
             	| 12 x NM
             	| 11 x OH
             	| 9 x IN
             	| 8 x DE
             	| 6 x TN
             	| 5 x ND
             	| 5 x MA
             	| 3 x MN
             	| 2 x AZ
             	| 2 x PA
             	| 1 x CT
             	| 1 x MD
             	| 1 x VA
             	| 1 x VT
             	| 1 x MO
             	| 1 x GA
             	| 1 x TX
             	| 1 x FL
-------------	|  ---------------
LONG ISLAND CITY 	| 117877 x NY
             	| 27 x NC
             	| 5 x NV
-------------	|  ---------------
BROOKLYN     	| 608802 x NY
             	| 320 x NC
             	| 162 x NV
             	| 71 x nan
             	| 52 x NJ
             	| 12 x NM
             	| 8 x NH
     

GLASTONBURY  	| 286 x CT
             	| 1 x NY
-------------	|  ---------------
MAHWAH       	| 201 x NJ
             	| 10 x NY
-------------	|  ---------------
FREEPORT     	| 518 x NY
             	| 3 x NJ
-------------	|  ---------------
DIX HILLS    	| 608 x NY
             	| 3 x nan
-------------	|  ---------------
PORT WASHINGTON 	| 1868 x NY
             	| 2 x WI
-------------	|  ---------------
BEDMINSTER   	| 750 x NJ
             	| 5 x NY
-------------	|  ---------------
NEW ROCHELLE 	| 6160 x NY
             	| 6 x NC
             	| 4 x NJ
-------------	|  ---------------
PHOENIX      	| 269 x AZ
             	| 5 x NY
             	| 2 x AR
             	| 1 x NJ
-------------	|  ---------------
SCARSDALE    	| 3069 x NY
             	| 4 x NC
-------------	|  ---------------
COLUMBUS     	| 578 x OH
             	| 43 x GA
             	| 24 x NJ
             	| 15 x NY
             	| 4 x IN
             	| 3 x NC
-------------	|  ---------------
WHITE PLAINS 	| 57

PALM BEACH GARD 	| 97 x FL
             	| 36 x NY
-------------	|  ---------------
ARLINGTON    	| 299 x VA
             	| 10 x TX
             	| 5 x NY
-------------	|  ---------------
TINTON FALLS 	| 29 x NJ
             	| 3 x NY
-------------	|  ---------------
SPRING VALLEY 	| 480 x NY
             	| 5 x CA
-------------	|  ---------------
CHARLOTTE    	| 94 x NC
             	| 3 x NY
-------------	|  ---------------
TENAFLY      	| 272 x NJ
             	| 17 x NY
-------------	|  ---------------
JAMAICA ESTATE 	| 365 x NY
             	| 1 x NC
-------------	|  ---------------
JAMACIA      	| 216 x NY
             	| 1 x NC
-------------	|  ---------------
HARRISON     	| 1416 x NY
             	| 14 x NJ
-------------	|  ---------------
POMPANO BEACH 	| 110 x FL
             	| 2 x NY
-------------	|  ---------------
HICKSVILLE   	| 829 x NY
             	| 1 x NC
-------------	|  ---------------
CLIFFSIDE PARK 	| 127 x NJ
             	| 9 x NY
-------------	|  ----------

WALL         	| 234 x NJ
             	| 11 x NY
-------------	|  ---------------
LONG BEACH   	| 519 x NY
             	| 23 x CA
             	| 2 x NJ
-------------	|  ---------------
WESTPORT     	| 232 x CT
             	| 10 x NY
             	| 2 x CO
-------------	|  ---------------
OAKBROOK     	| 109 x IL
             	| 1 x NY
-------------	|  ---------------
DAYTON       	| 9 x NJ
             	| 1 x FL
-------------	|  ---------------
NORWALK      	| 206 x CT
             	| 26 x NY
             	| 2 x CO
             	| 2 x CN
-------------	|  ---------------
COLORADO SPRING 	| 25 x CO
             	| 4 x OH
-------------	|  ---------------
RIVER EDGE   	| 67 x NJ
             	| 3 x NY
-------------	|  ---------------
SPARTANBURG  	| 193 x SC
             	| 8 x NY
             	| 3 x SD
             	| 1 x NC
-------------	|  ---------------
CYPRESS      	| 12 x TX
             	| 1 x CA
-------------	|  ---------------
BELROSE      	| 79 x NY
             	| 2 x NC
---

RANCHO SANTA FE 	| 15 x CA
             	| 2 x NY
-------------	|  ---------------
DENVILLE     	| 5 x NJ
             	| 2 x NY
-------------	|  ---------------
ST. LOUIS    	| 25 x MO
             	| 4 x MS
             	| 1 x MT
-------------	|  ---------------
MOUNTAIN VIEW 	| 8 x CA
             	| 2 x NY
-------------	|  ---------------
MADISON      	| 40 x NJ
             	| 14 x WI
             	| 9 x CT
             	| 3 x NY
-------------	|  ---------------
FAIR LAWN    	| 81 x NJ
             	| 2 x NY
-------------	|  ---------------
NEW JERSEY   	| 87 x NJ
             	| 22 x NY
-------------	|  ---------------
FT LEE       	| 53 x NJ
             	| 4 x NY
-------------	|  ---------------
DUMONT       	| 49 x NJ
             	| 5 x NY
-------------	|  ---------------
FALLS CHURCH 	| 192 x VA
             	| 4 x NY
-------------	|  ---------------
RIVEREDGE    	| 22 x NJ
             	| 3 x NY
-------------	|  ---------------
SOUTH PLAINFIEL 	| 203 x NJ
             	| 27

RIDGE        	| 13 x NY
             	| 3 x NJ
-------------	|  ---------------
RICHBORO     	| 9 x PA
             	| 2 x NY
-------------	|  ---------------
HIGHLAND BEACH 	| 16 x FL
             	| 2 x NY
-------------	|  ---------------
HAMPTON      	| 18 x NH
             	| 6 x NY
             	| 1 x GA
-------------	|  ---------------
FT. LEE      	| 49 x NJ
             	| 2 x NY
-------------	|  ---------------
STRATFORD    	| 3 x CO
             	| 2 x CT
-------------	|  ---------------
EMERSON      	| 63 x NJ
             	| 1 x NY
-------------	|  ---------------
OLD TAPPAN   	| 110 x NJ
             	| 9 x NY
-------------	|  ---------------
NUTLEY       	| 43 x NJ
             	| 1 x NY
-------------	|  ---------------
TREVOSE      	| 27 x PA
             	| 1 x NY
-------------	|  ---------------
L I C        	| 346 x NY
             	| 2 x NC
-------------	|  ---------------
PARSIPPANY,  	| 86 x NJ
             	| 1 x NY
-------------	|  ---------------
LINCROFT     	|

RIDGEFIELD   	| 61 x NJ
             	| 51 x CT
             	| 18 x NY
             	| 1 x CN
-------------	|  ---------------
HARRISBURG   	| 61 x PA
             	| 8 x NY
-------------	|  ---------------
LONGBRANCH   	| 18 x NY
             	| 4 x NJ
-------------	|  ---------------
MELLVILLE    	| 19 x NY
             	| 2 x NJ
-------------	|  ---------------
WHITING      	| 4 x NJ
             	| 1 x NY
-------------	|  ---------------
REDDING      	| 14 x CT
             	| 4 x PA
-------------	|  ---------------
JERSEY       	| 28 x NJ
             	| 1 x NY
-------------	|  ---------------
HAMILTON     	| 28 x NJ
             	| 2 x NY
-------------	|  ---------------
EL SEGUNDO   	| 37 x CA
             	| 1 x NY
-------------	|  ---------------
SOUTH PLAINFEIL 	| 21 x NY
             	| 6 x NJ
-------------	|  ---------------
NEW JERSEY CITY 	| 6 x NJ
             	| 1 x NY
-------------	|  ---------------
NEW  HYDE PARK 	| 12 x NY
             	| 3 x NJ
-------------	|  --

BLOOMINGTON  	| 7 x IL
             	| 1 x MN
-------------	|  ---------------
S. PLAINFIELD 	| 28 x NJ
             	| 1 x NY
-------------	|  ---------------
ANNADALE     	| 6 x VA
             	| 1 x NJ
-------------	|  ---------------
UPPER SADDLE RI 	| 33 x NJ
             	| 1 x NY
-------------	|  ---------------
PALISADES PARK 	| 52 x NJ
             	| 4 x NY
-------------	|  ---------------
SO PLAINFIELD 	| 43 x NJ
             	| 2 x NY
-------------	|  ---------------
ORANGE LAKE  	| 8 x FL
             	| 1 x NY
-------------	|  ---------------
JERCEY CITY  	| 58 x NJ
             	| 3 x NY
-------------	|  ---------------
LEBANON      	| 20 x NJ
             	| 5 x PA
             	| 1 x NY
-------------	|  ---------------
ST           	| 48 x NY
             	| 10 x NJ
-------------	|  ---------------
VAUXHALL     	| 9 x NJ
             	| 5 x NY
-------------	|  ---------------
SAYERVILLE   	| 9 x NJ
             	| 1 x NY
-------------	|  ---------------
HILTON HEAD  	

NORTH HAVEN  	| 4 x NY
             	| 3 x CT
-------------	|  ---------------
GREENICH     	| 2 x NY
             	| 1 x CT
-------------	|  ---------------
BOGOTA       	| 4 x NJ
             	| 3 x NY
-------------	|  ---------------
PAWTUCKET    	| 2 x CT
             	| 1 x NY
-------------	|  ---------------
PALMYRA      	| 25 x NJ
             	| 1 x PA
-------------	|  ---------------
ANTHEM       	| 2 x AZ
             	| 1 x AR
-------------	|  ---------------
NEW JERSY    	| 4 x NY
             	| 1 x NJ
-------------	|  ---------------
NORWICH      	| 2 x CT
             	| 1 x NY
-------------	|  ---------------
HARRIS       	| 2 x PA
             	| 2 x NY
-------------	|  ---------------
PHILADELPIA  	| 3 x NY
             	| 1 x PA
-------------	|  ---------------
NORTH HAMPTON 	| 2 x NH
             	| 1 x MA
-------------	|  ---------------
HIGHFRAM     	| 2 x MA
             	| 1 x NY
             	| 1 x MS
-------------	|  ---------------
NORTH BENINGTON 	| 4 x VT
 

CHANDLER     	| 1 x AZ
             	| 1 x AR
-------------	|  ---------------


In [155]:
#There is a row that has "NEW YORK CITY" as city, but have "NJ" as State, fix its state to "NY"

In [156]:
index = df['Owner\'s House State'].loc[(df['Owner\'s House City'] == "NEW YORK CITY") & (df['Owner\'s House State'] == "NJ")].index[0]
df['Owner\'s House State'].update(pd.Series(['NY'], index = [index]))

In [157]:
df['Owner\'s House State'].loc[(df['Owner\'s House City'] == "NEW YORK CITY") & (df['Owner\'s House State'] == "NJ")]

2386       NJ
7318       NJ
7512       NJ
7840       NJ
13167      NJ
           ..
3717961    NJ
3721711    NJ
3725903    NJ
3731391    NJ
3734383    NJ
Name: Owner's House State, Length: 1349, dtype: object

In [158]:
# Apply similar operation on Owner's Business Name

In [159]:
bn_col = "Owner's Business Name"
findDateOutliers(bn_col)

Column:  Owner's Business Name
1.       N/A     575,253
2.        NA      99,459
3.                91,297
4.  NY SCHOOL CONSTRUCTION AUTHORITY      27,065
5.     OWNER      23,371
6.      NONE      22,619
7.   NYC SCA      20,026
8.       HPD      18,702
9.  NYC HOUSING AUTHORITY      14,394
10.  NYC HPD      13,648

Total number of distinct values in Owner's Business Name is 487031
['', '1764 E 174 LLC', '14 LLC.', 'SHOENBER,HIEBER,INC.', '242 A & A REALTY LLC', 'NÃ¯Â¿Â½A', '305 - 307 WEST 150 LLC', 'CVP11LLC', '20   EAST', '2W.67TH ST.INC.', '54 W. 16 ST. CORP. C/O C.H.G. MG', '444 C.P.W. OWNERS CORP', '101-70_WOODHAVEN CO', 'N.Y.   P.H', 'E.B.,INC', '200 PARK , L.P.', '400-408 H.D.F.C. INC.', '39-33   45TH STREET  N/A', '2616 210TH PLACE LLC', 'E 77 ST', 'PS 753 (K-BROOKLYN)', '240 N.10TH ST.', '5 & 88 CORP', 'B.B.C.3.4./CO.', '136-71/73 ROOSEVELT.', '139-25 ST. J.H. INC.', '64-02 8TH AVE. CORP.', '17-85__215 OWNERS CORP.', 'H.D.F.C., Corp.', '75 CEN.PRK W CORP C/O BRONW ----', '133

In [160]:
# Using clustering for Business Name takes too much time, we can only clean those empty data for now 

In [161]:
df[bn_col] = df[bn_col].replace(['N/A', '', 'NA','NONE'], [None,None,None,None])

In [162]:
# Data Profilling for applicant columns

#Find format problems and outliers in all applicant columns

#Using openclean's sklearn modules to detect problems and outliers

In [163]:
from openclean.profiling.anomalies.sklearn import DBSCANOutliers

# Print the ten most frequent values for the 'Vehicle Expiration Date' column.
def findDateOutliers(column_name, eps_setting = 0.05):
    applicant_data = ds.distinct(column_name)
    print("Column: ",column_name)
    
    for rank, val in enumerate(applicant_data.most_common(10)):        
        st, freq = val
        print('{:<3} {:>8}  {:>10}'.format('{}.'.format(rank + 1), st, '{:,}'.format(freq)))

    print('\nTotal number of distinct values in {} is {}'.format(column_name, len(applicant_data)))
    print(DBSCANOutliers(eps = eps_setting).find(applicant_data))
    print('\n==================================')

In [164]:
date_cols = []

print("Permittee Data columns:\n")
for col in ds.columns:
    if 'Permittee' in col:
        print(col)
        date_cols.append(col)

date_cols.remove("Permittee's Business Name")
date_cols.remove("Permittee's Phone #")
date_cols.remove("Permittee's License #")
        
print("----------------------------\n")        
        
for col in date_cols:
    findDateOutliers(col, 0.1)

Permittee Data columns:

Permittee's First Name
Permittee's Last Name
Permittee's Business Name
Permittee's Phone #
Permittee's License Type
Permittee's License #
Permittee's Other Title
----------------------------

Column:  Permittee's First Name
1.      JOHN     152,939
2.   MICHAEL     125,007
3.    ROBERT      97,467
4.    JOSEPH      90,195
5.     PETER      68,706
6.   ANTHONY      67,368
7.    THOMAS      63,838
8.     JAMES      56,913
9.   WILLIAM      51,484
10.    DAVID      51,469

Total number of distinct values in Permittee's First Name is 38826
['', 'PAUL  `', 'SANDY 212253674', 'PETER----------', 'YU       QING', 'JOHN  {', 'AP0P9NAR', 'NEAL   NEAL M.', '14B', '37-38', 'MICHAEL____JR', 'KANG 11', 'BENNY   `', 'JOHN`1', 'PAUL   ,', 'KABTOTAL ART 7', 'ROBERT718720722', 'TING-1', 'PETER', 'T.J.R', 'X.H/X.D/TING', '-', '016836', 'JOSEPH', 'GUS``', '9AL', 'ROBERT.........', '..', 'ALDO, JR.', '9999', 'M0-CHUN', 'J.J.', 'LESLIE   S', 'I1', 'JOHN {{', 'THOMAS', 'A T M', 'ONG 

In [165]:
# Analysis

#the above results show the problems for the data cleaning task:
    
### For name data

#in "Permittee's First Name", "Permittee's Last Name", "Permittee Professional Title", there are many outliers which are illegal input, and there are many similar values. We need first converts evident outliers to legal values, then use kNN clusterer to standardize similar values.


### Permittee License #

#Permittee License # is made of 6 digits, there are outliers that do not satisfy the 6-digit format. We can not use kNN clusterer to standardize because many License # are similar. 

In [166]:
# Data Cleaning for Applicant columns

#* how to deal with empty values has not decided yet

In [167]:
# mapping list to replace outliers
# outlier1 = ['', 'MR. ROSS ADAM C', 'MICHAEL', 'N. J.', 'WILLIAM 11', 'JOSEP;H``', 'DAID/11/2007', 'CHUNG   LUN', '718 9215010', 'ANTHONY', 'HSIA0-NAN', 'JOSEPH', '``````````', 'ROBERT  `', 'RAJENDRA9956700', '2', 'G.B.M.', 'EUGENE......JR', '6312100', 'CLAUDE,JR.', 'THOMAS``', 'ALAN  L', 'Nab53', 'MR. Y. B', 'J.J', 'PH8ILIP', 'I. M', 'RICHARD', 'ALBERTA S 111 D', 'P ;', 'GENECG.C. ENG &', 'J.J.', '2126202794', 'SHAW  HWA', 'HARRY         H', 'MR DOU8GLAS', '`1D', 'PAUL', 'K. T.', 'JOHN', '...NORMAN', 'EVAN   D', '7184361278BERNA', 'S.D. DON', 'KY00 SUK', 'JJ', 'YURI.`', 'MAD/Y/ARNI', 'ES ON SCH B', 'EUGENE.......JR', 'NEAL', 'F._ERIC', 'RYAN,  JR', 'AASDFASDFASDF', 'LA0-TECH', 'RODNEY   __', 'DAVID', 'G. L.', 'JAMES', 'LESLI8E', '7186054055', 'GEORGE', 'G.B.M', 'DAVID    JON', 'CHUNG---YAO', 'PETER', 'YUBUN(JACK)', 'GLEN A. L.', '1P', 'JUDE.....N.O', 'LEONARD--', 'WILLIAM', 'ANTHONY,111', 'WU(WOODY)', 'GAD/HON-AN', 'GLEN  A.L.', 'J.B. Jr.', 'LORENZO..A', 'J J', '..RAMSEY', 'HUI LI I', 'ANTONIO9', 'ROBERT', '0.BERT', 'DUMMY 2', '...JOSEPH', 'RUSSELL 111', 'THOMAS', 'H./E./CAMELLE', 'LALAL', 'M.E. P.E', 'R0OBIN VINCENT', '--young', 'AKM', 'LE1', 'IK.T.', 'LEO, JR.', 'J. Butch A. Jr.', 'WU (WOODY0', 'PAUL   N', 'CHRISTOPHER']
# mapping1 = [None, 'ROSS ADAM C', 'MICHAEL', 'N. J.', 'WILLIAM', 'JOSEPH', None, 'CHUNG LUN', None, 'ANTHONY', 'HSIA0 NAN', 'JOSEPH', None, 'ROBERT', 'RAJENDRA', None, 'G.B.M.', 'EUGENEJR', None, 'CLAUDE JR.', 'THOMAS', 'ALAN  L', 'Nab', 'MR. Y. B', 'J.J', 'PHILIP', 'I. M', 'RICHARD', 'ALBERTA', None, 'GENECG.C. ENG', 'J.J.', None, 'SHAW HWA', 'HARRYH', 'MR DOUGLAS', None, 'PAUL', 'K. T.', 'JOHN', 'NORMAN', 'EVAND', 'BERNA', 'S.D. DON', 'KY00 SUK', 'JJ', 'YURI.`', 'MADYARNI', 'ES ON SCH B', 'EUGENEJR', 'NEAL', 'FERIC', 'RYAN,  JR', 'AASDFASDFASDF', 'LA0 TECH', 'RODNEY', 'DAVID', 'G. L.', 'JAMES', 'LESLIE', None, 'GEORGE', 'G.B.M', 'DAVID JON', 'CHUNG YAO', 'PETER', 'YUBUN(JACK)', 'GLEN A. L.', None, 'JUDE N.O', 'LEONARD--', 'WILLIAM', 'ANTHONY,111', 'WU(WOODY)', 'GAD HON-AN', 'GLEN A.L.', 'J.B. Jr.', 'LORENZOA', 'J J', 'RAMSEY', 'HUI LI I', 'ANTONIO9', 'ROBERT', '0.BERT', 'DUMMY', 'JOSEPH', 'RUSSELL', 'THOMAS', 'H.E.CAMELLE', 'LALAL', 'M.E. P.E', 'R0OBIN VINCENT', 'young', 'AKM', 'LE1', 'IK.T.', 'LEO, JR.', 'J. Butch A. Jr.', 'WU (WOODY0', 'PAUL   N', 'CHRISTOPHER']

# outlier2 = ['SHARMA #0', "0'CONNOR", 'RUSHTON    UEL', 'UDDIN   Z', 'HINKLEY 1', 'O&#039;CONNOR, P.E.', '.OOK', 'SAMUELS111', 'O&#039;CONNOR', 'CALIENDO', 'SMITH   JR.', 'LO  BUE', '7AN', '+-+ETTIERI', 'SMITH, 111', 'KAMEN   1', '.EE', 'MASS, 1', '.EI', 'Zagaroli 3rd', 'RINI   II', 'KAMEN   R', 'RYAN 11', 'SPI8EZIA L S', 'MUFTIC..A.I.A', 'COSTELLO9 RA A I A', 'CALVANICO', 'LLC.', 'POEPPEL, P.E.', 'HAMA07', 'HINLEY,1', '1212', "O  ' CONNELL", 'HURT,JR.,', 'WESOLOWSKI', 'CHEN', '`ING, R.A', 'MARTARELLA 111', 'Gandhi, Ph.D., P.E.', '90I', 'ENNIS 2', 'COSTELLO R A A I A', '3UI', 'N/A', 'HURT,  JR', 'LEHR,1', 'KOHLER, 111', 'GERAZOUNIS', 'Alexander,1', 'LUBOW, R.A. LEED AP', 'RINI,111', '08CZAK', '````````````````````', 'CHAO  R.A.', 'Geier 11', '08NGEL', '08SOLOWSKI', 'I11', 'HINKLEY, 1', 'RUDIKOFF, P.E.', "O'CONNOR", 'SHAH   EZ', 'MIELE, JR., P.E.', 'RITTENHOUSE 111', 'AMADI   ISIOFIA', 'HINKLEY,1', 'RENFORE````````', "O'HARA,JR.", '73020012', 'PHAGOO   I', 'BRAY.....,', 'LLL', 'BHATHIA,1', 'GANDHI, PH. D., P.E', 'KO K', 'VASSALOTTI 11', 'HURT, JR .', '0018LKLE', 'RINI -111', 'PARIHAR', 'EE', 'L00802', 'ELISE.111', 'KING , R.A', 'CHRYSLER  P E', 'LEHR 1', 'Walters   Jr.', 'LEE', 'RINI  III', 'D&#039;ANGELO', '0UDOLPH III', 'VIEHE-NAESS 111', ',MO', '08E', '47DIKOFF', 'Yu,', '420865380', 'COPELAND', 'ZWIEFEL 3RD', 'PETERSEN', 'King, R.A.,', 'RINI, III', '7APA', 'CHEN   S', 'Hurt  Jr.', 'KATZ', 'NIZAMBAD.(P.E.)', '901BEN', '4153LOO', 'SYED-NAQVI', 'RYAN , JR.', 'K O K O R I S', 'ELISEO111', 'O&#039;CONNELL', 'ZEID61', '---Lewis', '00CHELI', 'MOHAMMAD       +++++', 'METZLER  P E', 'BAILEY', 'GANDHI, PH. D., P.E.', 'TIEMANN.111', 'SMITH.111', 'DI GER0NIMO', 'GANDHI, PH,D., P.E', 'III', 'J C', 'MAGAMI-QAIM-MAGAMI', '+M', 'LO G1UDICE', 'HOQUE', 'RUDIKOFF', 'Y10007OR', 'SMITH,111', 'KING R A FAIA', 'RYAN III, AIA', '08AN', 'STARK 1', 'MASS', 'VICTORI0, R.A', 'RIZVI   A', '21029677', "3'CONNOR", 'Wong /  Lai', 'KAPLAN 3', 'GRAICHEN.JR./DAWN/DI', 'GROSSMAN ,PE,F.A.C.I']
# mapping2 = ['SHARMA ', "CONNOR", 'RUSHTON UEL', 'UDDIN Z', 'HINKLEY ', 'CONNOR P.E.', None, 'SAMUELS', 'CONNOR', 'CALIENDO', 'SMITH JR.', 'LO BUE', None, 'ETTIERI', 'SMITH', 'KAMEN', '.EE', 'MASS', '.EI', 'Zagaroli', 'RINI', 'KAMEN R', 'RYAN', 'SPIEZIA L S', 'MUFTIC.A.I.A', 'COSTELLO9 RA A I A', 'CALVANICO', 'LLC.', 'POEPPEL P.E.', 'HAMA', 'HINLEY', None, "CONNELL", 'HURT JR.', 'WESOLOWSKI', 'CHEN', 'ING R.A', 'MARTARELLA', 'Gandhi', None, 'ENNIS ', 'COSTELLO R A A I A', None, None, 'HUR  JR', 'LEHR', 'KOHLER 111', 'GERAZOUNIS', 'Alexander', 'LUBOW R.A. LEED AP', 'RINI',None, None, 'CHAO R.A.', 'Geier', None, 'SOLOWSKI', None, 'HINKLEY', 'RUDIKOFF, P.E.', "CONNOR", 'SHAH EZ', 'MIELE JR. P.E.', 'RITTENHOUSE', 'AMADI   ISIOFIA', 'HINKLEY', 'RENFORE', "O'HARA,JR.", None, 'PHAGOO I', 'BRAY,', 'LLL', 'BHATHIA', 'GANDHI', 'KO K', 'VASSALOTTI', 'HURT JR.',None, 'RINI', 'PARIHAR', 'EE', None, 'ELISE', 'KING R.A', 'CHRYSLER  P E', 'LEHR', 'Walters Jr.', 'LEE', 'RINI  III', 'ANGELO', '0UDOLPH III', 'VIEHE-NAESS', 'MO', '08E', None, 'Yu,', None, 'COPELAND', 'ZWIEFEL 3RD', 'PETERSEN', 'King, R.A.,', 'RINI, III', '7APA', 'CHEN   S', 'Hurt  Jr.', 'KATZ', 'NIZAMBAD.(P.E.)', None, None, None, 'RYAN JR.', 'KOKORIS', 'ELISE', 'CONNELL', None, 'Lewis', 'CHELI', 'MOHAMMAD', 'METZLER  P E', 'BAILEY', 'GANDHI', 'TIEMANN', 'SMITH', 'DI GER0NIMO', 'GANDHI', 'III', 'J C', 'MAGAMI QAIM MAGAMI', None, 'LO G1UDICE', 'HOQUE', 'RUDIKOFF', None, 'SMITH', 'KING R A FAIA', 'RYAN III AIA', None, 'STARK', 'MASS', 'VICTORI0 R.A', 'RIZVIA', None, "CONNOR", 'Wong Lai', 'KAPLAN', 'GRAICHEN.JR. DAWN DI', 'GROSSMAN']

# outlier3 = ['', '....DEMO', '050069', 'DEM. CONTR.,', 'XXXXX', 'G/C 10114H9', 'CGWC10114H99', '00', 'X S000155', '082-36-1245', 'G.G', 'LESSEE', '......GC', "'", '..OWNER', 'GC 2293', '--', 'XXXXXX', 'LS 31,721', '...GC', 'gen.cont.', 'G.C TK#4592', 'PE', 'RLA - 818', '.....OWNER', 'RLA 16077', 'G C', 'X 4129892', 'G. C.', 'R.L.A', 'GC 1028350', 'WC10114H99', 'LEESEE', 'GEN.CONT.', 'SIGN..HANGER', 'DEMO 20451', 'D8615', '.X', 'P.L.L.C', '..DEMO', 'G .C', 'L A', 'G.C NY11101', '32820', '....OWNER', 'GC(DEMO)', 'C0NTRACTOR', 'EXPEDITORC99792', 'X 1341946', 'TRACK# 1390', 'EXPED.R4466', 'PLLC 9599691', 'G.C 1110101', '029649', '(CHECK)', 'DEM. CONTR,', 'EXPEDIT(H66172)', '.........GC', 'CITY OF N Y', 'GC 1170386', 'G. C', 'CO0OWNER', '(CHECKED)', 'C.C', '23392 1159774', 'DEMO {', 'RA', 'T. 31132', '....GC', 'RLA-787', 'TRACK #1390', 'D C', 'G.CONTR.', 'DEMO  CONT', '1GC', 'CC', 'demo G.C.', 'TRACK. #1390', 'M.F.S.P.C.', '...DEMO', 'DEMO G C', '13328', 'GEN  CONT', 'GC 1221073', "GC;'", 'DEMO 1341946', '11234', 'G.C.,', '.....GC', 'LIC.133668259 1', '?', '0WNER', 'C10892', 'GEN..CONT']
# mapping3 = [None, 'DEMO', None, 'DEM. CONTR', None, 'G/C', 'CGWC', None, 'X S', None, 'G.G', 'LESSEE', 'GC', None, 'OWNER', 'GC', None, None, 'LS ', 'GC', 'gen.cont.', 'G.C TK', 'PE', 'RLA ', 'OWNER', 'RLA ', 'G C', 'X', 'G. C.', 'R.L.A', 'GC', 'WC', 'LEESEE', 'GEN.CONT.', 'SIGN.HANGER', 'DEMO', None,None, 'P.L.L.C', 'DEMO', 'G.C', 'L A', 'G.C ', None, 'OWNER', 'GC(DEMO)', 'C0NTRACTOR', 'EXPEDITORC', None, 'TRACK', 'EXPED.R', 'PLLC ', 'G.C', None, None, 'DEM. CONTR,', 'EXPEDIT', 'GC', None, 'GC', 'G.C', 'CO0OWNER', None, 'C.C', None, 'DEMO', 'RA', None, 'GC', 'RLA', None, 'D C', 'G.CONTR.', 'DEMO  CONT', 'GC', 'CC', 'demo G.C.', None, 'M.F.S.P.C.', 'DEMO', 'DEMO G C', None, 'GEN  CONT', 'GC ', "GC ", 'DEMO ', None, 'G.C.', 'GC', 'LIC', None, '0WNER',None, 'GEN.CONT']

# outlier4 = ['', '0000GC', '083278', 'DD5615', '0000PB', '00ASB4', 'B81923', '99998', '000N/A', '65569+', '01827O', 'R9526', 'LP0256', 'N/A', '1964', 'ISLAND', '1609', '000PW1', '00DEMO', '0688.6', '00000', '.20929', 'LP0258', '000TOR', '0D8615', '0SWITA', '818', 'O02200', 'DEMO', '196', '1075', '0000NT', '215', '0', '00000`', "D'ALTO", '0455', '22377', 'DD8615', '050579', '226', 'SWITA', 'DD6815', 'X02689']
# mapping4 = [None, '0000GC', '083278', 'DD5615', '0000PB', '00ASB4', 'B81923', '099998', '000000', '065569', '01827O', '0R9526', 'LP0256',None, '001964',None, '001609', '000PW1', '00DEMO', '006886', '000000', '020929', 'LP0258', '000TOR', '0D8615', '0SWITA', '000818', 'O02200', None, '000196', '001075', '0000NT', '000215', '000000', '000000', None, '000455', '022377', 'DD8615', '050579', '000226', None, 'DD6815', 'X02689']

# outliers = [outlier1, outlier2, outlier3, outlier4]
# mappings = [mapping1, mapping2, mapping3, mapping4]



In [168]:
# Remove evident outliers using hard coded mapping

In [169]:
# i = 0
# for col in date_cols:
#     df[col] = df[col].replace(outliers[i], mappings[i])
#     i += 1

In [170]:
# Convert similar values to suggested value using kNN clustering

In [171]:
# Cluster string using kNN clusterer (with the default n-gram setting)
# using the Levenshtein distance as the similarity measure.

from openclean.cluster.knn import knn_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan

def getClusters(col, minsize = 2):
    dba = ds.select(col).distinct()
    clusters = knn_clusters(
        values=dba,
        sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.75)),
        minsize=minsize
    )
    return clusters

def print_cluster(cnumber, cluster):
    item_count = 0
    print('Cluster {} (of size {})\n'.format(cnumber, len(cluster)))
    for val, count in cluster.items():
        item_count += 1
        if item_count <= 10:
            print('{} ({})'.format(val, count))
    if item_count>10:
        print(".......{} more items".format(item_count-10))
    print('\nSuggested value: {}\n\n'.format(cluster.suggestion()))

def updateUsingClusters(col, clusters, isPrint = False):
    
    orignal_list = []
    suggestion_list = []
    clusters.sort(key=lambda c: len(c), reverse=True)
       
    for i, cluster in enumerate(clusters):        
        suggestion = cluster.suggestion()
        orignal_list = []
        suggestion_list = []
        if isPrint and i < 5:
            print_cluster(i, cluster)
        
        for val, count in cluster.items(): 
            orignal_list.append(val)
            suggestion_list.append(suggestion)
            
    df[col] = df[col].replace(orignal_list, suggestion_list)

In [172]:
for col in date_cols:
    print("kNN cluster for ", col)
    col_clusters = getClusters(col)
    print("updating column ", col)
    print("----------------------\nTop 5 Cluster:\n----------------------")
    updateUsingClusters(col, col_clusters, True)
    print("================")

kNN cluster for  Permittee's First Name
updating column  Permittee's First Name
----------------------
Top 5 Cluster:
----------------------
Cluster 0 (of size 92)

CONSTANTINOS (264)
CONSTANTIN (6353)
CONSTANTIAN (1594)
NONSTANTINE (3)
CONSTANTNINE (2)
CONSTANTIE (5)
KONSTANTINE (73)
CONSTANTAIN (3)
KONSTANTIN (20)
CNSTANTIN (9)
.......82 more items

Suggested value: CONSTANTIN


Cluster 1 (of size 92)

CONSTANTINE (4840)
CONSTANTINOS (264)
CONSTANTIAN (1594)
NONSTANTINE (3)
CONSTANTNINE (2)
CONSTANTIE (5)
KONSTANTINE (73)
CONSTANTAIN (3)
KONSTANTIN (20)
CNSTANTIN (9)
.......82 more items

Suggested value: CONSTANTIN


Cluster 2 (of size 85)

KONSTANTINOS (1639)
CONSTANTINE (4840)
CONSTANTINOS (264)
CONSTANTIN (6353)
CONSTANTIAN (1594)
NONSTANTINE (3)
CONSTANTNINE (2)
CONSTANTIE (5)
KONSTANTINE (73)
CONSTANTAIN (3)
.......75 more items

Suggested value: CONSTANTIN


Cluster 3 (of size 77)

CONSTANTINE (4840)
CONSTANTINOS (264)
CONSTANTIN (6353)
CONSTANTIAN (1594)
NONSTANTINE (3)
CONST

In [173]:
# Save cleaned data

In [174]:
#outputpath = 'cleaned_data.csv'
#df.to_csv(outputpath,sep=',',index=False,header=True) 

# Some discussion

We have profiled and cleaned most of the columns, we first change some of the column names so that they present right information about the data, then we look at each of these columns to detect outliers and wrong format.

However, there are still some issues, first we keep most of the empty value as NaN, and we don't know if clustering is the best way to clean the name data since it might convert similar names to one same name. And, business names are too long that we can not perform clustering on them so we only fixed empty values. Also, there are some column names in upper case, we do not know if we should convert them to lower case as other columns.

## Precision and Recall

In [175]:
cleaned_columns = ['House #', "Self Cert", "Non-Profit", "Filing Status", "Site Fill", "Act as Superintendent",
                   "Building Type", "Residential", "Oil Gas", 
                   "Owner's House #", "Owner's Business Name", "Owner's Phone #","Owner's House City", "Owner's House State",
                   
                   "Owner's House Zip Code", "Filing Date","Issuance Date","Expiration Date","Job Start Date", "DOB Run Date"
                  ]

In [176]:
for col in df.columns:
    if 'Permittee' in col:
        cleaned_columns.append(col)

In [177]:
df_sample_data = df_sample_data.rename(columns={
    
                        "Job doc. #": "Job Document #",
                        "Self_Cert": "Self Cert",
                        "Bldg Type": "Building Type",
                        "Site Safety Mgr's First Name" : "Site Safety Manager's First Name",
                        "Site Safety Mgr's Last Name" : "Site Safety Manager's Last Name",
                        "Site Safety Mgr Business Name" : "Site Safety Manager's Buisness Name",

    
                          "Owner'sPhone #": "Owner's Phone #" 
                        , "Owner'sHouse Street Name": "Owner's House Street Name"
                        , "Owner’s House City": "Owner's House City"
                        , "Owner’s House State" : "Owner's House State"
                        , "Owner’s House Zip Code" : "Owner's House Zip Code"
                        , "Paid": "Paid Date"
                        , "Fully Paid": "Fully Paid Date"
                        , "Assigned": "Assigned Date"
                        , "Approved": "Approved Date"
                        , "Job Status Descrp": "Job Status Description"
                        , "Community - Board": "Community Board"
                        , "Adult Estab": "Adult Establishment"
                        , "Pre- Filing Date": "Pre-Filing Date"
                        , "Total Est. Fee": "Total Estimated Fee"
                        , "Horizontal Enlrgmt": "Horizontal Enlargement"
                        , "Vertical Enlrgmt": "Vertical Enlargement"
                        , "ExistingNo. of Stories": "Existing # of Stories"
                        , "Proposed No. of Stories": "Proposed # of Stories"
                        , "Zoning Dist1": "Zoning District 1"
                        , "Zoning Dist2": "Zoning District 2"
                        , "Zoning Dist3": "Zoning District 3"
                        , "City ": "Owner's House City"
                        , "State": "Owner's House State"
                        , "Zip": "Owner's House Zip"
                        , "DOBRunDate": "DOB Run Date"
                       })


In [178]:
df_sample_data.columns

Index(['BOROUGH', 'Bin #', 'House #', 'Street Name', 'Job #', 'Job Document #',
       'Job Type', 'Self Cert', 'Block', 'Lot', 'Community Board', 'Zip Code',
       'Building Type', 'Residential', 'Special District 1',
       'Special District 2', 'Work Type', 'Permit Status', 'Filing Status',
       'Permit Type', 'Permit Sequence #', 'Permit Subtype', 'Oil Gas',
       'Site Fill', 'Filing Date', 'Issuance Date', 'Expiration Date',
       'Job Start Date', 'Permittee's First Name', 'Permittee's Last Name',
       'Permittee's Business Name', 'Permittee's Phone #',
       'Permittee's License Type', 'Permittee's License #',
       'Act as Superintendent', 'Permittee's Other Title', 'HIC License',
       'Site Safety Manager's First Name', 'Site Safety Manager's Last Name',
       'Site Safety Manager's Buisness Name',
       'Superintendent First & Last Name', 'Superintendent Business Name',
       'Owner's Business Type', 'Non-Profit', 'Owner's Business Name',
       'Owner's First 

In [179]:
df_sample_data = df_sample_data[cleaned_columns]

In [180]:
df_temp = df.loc[df_sample_data.index][cleaned_columns].copy()

In [181]:
for col in cleaned_columns:
    print("column: ", col)
    print("Original,\t Cleaned\n")
    for i in range(50):
        print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])
        
    print('======================\n\n')

column:  House #
Original,	 Cleaned

544 	 544
300 	 300
2469 	 2469
27 	 27
136-98 	 136-98
135 	 135
443 	 443
64 	 64
611 	 611
3 	 3
158-04 	 158-04
37 	 37
539 	 539
235 	 235
112-20 	 112-20
24 	 24
49 	 49
75 	 75
7215 	 7215
2515 	 2515
336 	 336
222-14 	 222-14
100 	 100
100 	 100
675 	 675
150 	 150
126 	 126
118-17 	 118-17
2566 	 2566
818 	 818
765 	 765
1039 	 1039
302 	 302
36-13 	 36-13
1695 	 1695
137 	 137
75 	 75
30-31 	 30-31
311 	 311
100 	 100
303 	 303
506 	 506
220 	 220
4503 	 4503
768 	 768
3952 	 3952
53-09 	 53-09
512 	 512
147-17 	 147-17
140-08 	 140-08


column:  Self Cert
Original,	 Cleaned

N 	 False
nan 	 False
N 	 False
N 	 False
Y 	 True
nan 	 False
Y 	 True
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
Y 	 True
Y 	 True
nan 	 False
Y 	 True
Y 	 True
nan 	 False
Y 	 True
nan 	 False
Y 	 True
nan 	 False
Y 	 True
Y 	 True
N 	 False
nan 	 False
Y 	 True
N 	 False
Y 	 True
Y 	 True
nan 	 False
Y 	 True
nan 	 False
Y 	 True
Y 	 T

EASTCHESTER 	 EASTCHESTER
FRESHMEADOWS 	 FRESHMEADOWS
NY 	 NEW YORK CITY
NYC 	 NEW YORK CITY
NEW YORK 	 NEW YORK CITY
LIC 	 LONG ISLAND CITY
NY 	 NEW YORK CITY
NEW YORK 	 NEW YORK CITY
NEW YORK 	 NEW YORK CITY
Jericho 	 JERICHO
BRONX 	 BRONX
NEW YORK 	 NEW YORK CITY
LIC 	 LONG ISLAND CITY
NEW YORK 	 NEW YORK CITY
NY 	 NEW YORK CITY
NEW YORK 	 NEW YORK CITY
NEW YORK 	 NEW YORK CITY
NEW YORK, 	 NEW YORK,
LAKE SUCCESS 	 NEW YORK CITY
BRONX 	 BRONX
GARDEN CITY 	 GARDEN CITY
NEW YORK 	 NEW YORK CITY
BROOKLYN 	 BROOKLYN
NY 	 NEW YORK CITY
L.I.C. 	 LONG ISLAND CITY
NEW YORK 	 NEW YORK CITY
BROOKLYN 	 BROOKLYN
GREAT NECH 	 GREAT NECH
QUEENS 	 QUEENS
NEW YORK 	 NEW YORK CITY
BRONX 	 BRONX
PARSIPPANY 	 MORRIS
NEW YORK 	 NEW YORK CITY
NEW YORK 	 NEW YORK CITY
BK 	 BK
NEW YORK 	 NEW YORK CITY
NEW YORK 	 NEW YORK CITY
QUEENS 	 QUEENS
NEW YORK 	 NEW YORK CITY
WHITESTONE 	 WHITESTONE
Rosedale 	 ROSEDALE


column:  Owner's House State
Original,	 Cleaned

NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY

2010-09-27 	 2010-09-27 00:00:00
2002-08-08 	 2002-08-08 00:00:00
2014-08-12 	 2014-08-12 00:00:00
12/09/2020 	 2020-12-09 00:00:00
2012-08-28 	 2012-08-28 00:00:00
2007-07-06 	 2007-07-06 00:00:00
2008-06-16 	 2008-06-16 00:00:00
10/25/2018 	 2018-10-25 00:00:00
2013-10-31 	 2013-10-31 00:00:00
11/12/2020 	 2020-11-12 00:00:00
2016-09-14 	 2016-09-14 00:00:00
2006-10-25 	 2006-10-25 00:00:00


column:  DOB Run Date
Original,	 Cleaned

2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2019-12-05  	 2019-12-05 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 20

In [198]:
def precision(tp, fp):
    return tp/(tp+fp)

def recall(tp, fn):
    return tp/(tp+fn)

In [183]:
cleaned_columns

['House #',
 'Self Cert',
 'Non-Profit',
 'Filing Status',
 'Site Fill',
 'Act as Superintendent',
 'Building Type',
 'Residential',
 'Oil Gas',
 "Owner's House #",
 "Owner's Business Name",
 "Owner's Phone #",
 "Owner's House City",
 "Owner's House State",
 "Owner's House Zip Code",
 'Filing Date',
 'Issuance Date',
 'Expiration Date',
 'Job Start Date',
 'DOB Run Date',
 "Permittee's First Name",
 "Permittee's Last Name",
 "Permittee's Business Name",
 "Permittee's Phone #",
 "Permittee's License Type",
 "Permittee's License #",
 "Permittee's Other Title"]

In [184]:
col_idx = 0

In [219]:
tp = 0
fp = 0


In [227]:
fn = 0

In [185]:
# House #

col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  House #
Original,	 Cleaned

544 	 544
300 	 300
2469 	 2469
27 	 27
136-98 	 136-98
135 	 135
443 	 443
64 	 64
611 	 611
3 	 3
158-04 	 158-04
37 	 37
539 	 539
235 	 235
112-20 	 112-20
24 	 24
49 	 49
75 	 75
7215 	 7215
2515 	 2515
336 	 336
222-14 	 222-14
100 	 100
100 	 100
675 	 675
150 	 150
126 	 126
118-17 	 118-17
2566 	 2566
818 	 818
765 	 765
1039 	 1039
302 	 302
36-13 	 36-13
1695 	 1695
137 	 137
75 	 75
30-31 	 30-31
311 	 311
100 	 100
303 	 303
506 	 506
220 	 220
4503 	 4503
768 	 768
3952 	 3952
53-09 	 53-09
512 	 512
147-17 	 147-17
140-08 	 140-08




In [None]:
# + 0

In [186]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Self Cert
Original,	 Cleaned

N 	 False
nan 	 False
N 	 False
N 	 False
Y 	 True
nan 	 False
Y 	 True
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
Y 	 True
Y 	 True
nan 	 False
Y 	 True
Y 	 True
nan 	 False
Y 	 True
nan 	 False
Y 	 True
nan 	 False
Y 	 True
Y 	 True
N 	 False
nan 	 False
Y 	 True
N 	 False
Y 	 True
Y 	 True
nan 	 False
Y 	 True
nan 	 False
Y 	 True
Y 	 True
N 	 False
Y 	 True
Y 	 True
nan 	 False
N 	 False
Y 	 True
Y 	 True
nan 	 False
Y 	 True
N 	 False
N 	 False
N 	 False
N 	 False
Y 	 True




In [220]:
tp += 17

In [187]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Non-Profit
Original,	 Cleaned

N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
nan 	 False
N 	 False
N 	 False
N 	 False
nan 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
Y 	 True
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
nan 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False
N 	 False




In [221]:
tp += 3

In [188]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Filing Status
Original,	 Cleaned

INITIAL 	 INITIAL
INITIAL 	 INITIAL
RENEWAL 	 RENEWAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
RENEWAL 	 RENEWAL
INITIAL 	 INITIAL
RENEWAL 	 RENEWAL
INITIAL 	 INITIAL
RENEWAL 	 RENEWAL
RENEWAL 	 RENEWAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
RENEWAL 	 RENEWAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
RENEWAL 	 RENEWAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
RENEWAL 	 RENEWAL
INITIAL 	 INITIAL
RENEWAL 	 RENEWAL
INITIAL 	 INITIAL
RENEWAL 	 RENEWAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
RENEWAL 	 RENEWAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
RENEWAL 	 RENEWAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
RENEWAL 	 RENEWAL
INITIAL 	 INITIAL
INITIAL 	 INITIAL
RENEWAL 	 RENEWAL
RENEWAL 	 RENEWAL




In [189]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Site Fill
Original,	 Cleaned

nan 	 NONE
NONE 	 NONE
NOT APPLICABLE 	 NONE
NOT APPLICABLE 	 NONE
NOT APPLICABLE 	 NONE
NONE 	 NONE
NOT APPLICABLE 	 NONE
NONE 	 NONE
NONE 	 NONE
NONE 	 NONE
ON-SITE 	 ON-SITE
NONE 	 NONE
NONE 	 NONE
NOT APPLICABLE 	 NONE
NONE 	 NONE
NONE 	 NONE
nan 	 NONE
NOT APPLICABLE 	 NONE
NONE 	 NONE
nan 	 NONE
NONE 	 NONE
NONE 	 NONE
NONE 	 NONE
NOT APPLICABLE 	 NONE
NOT APPLICABLE 	 NONE
NOT APPLICABLE 	 NONE
NONE 	 NONE
NOT APPLICABLE 	 NONE
NOT APPLICABLE 	 NONE
NOT APPLICABLE 	 NONE
NOT APPLICABLE 	 NONE
NONE 	 NONE
NONE 	 NONE
NONE 	 NONE
NOT APPLICABLE 	 NONE
nan 	 NONE
nan 	 NONE
NOT APPLICABLE 	 NONE
NOT APPLICABLE 	 NONE
NONE 	 NONE
nan 	 NONE
NOT APPLICABLE 	 NONE
NOT APPLICABLE 	 NONE
OFF-SITE 	 OFF-SITE
NOT APPLICABLE 	 NONE
NOT APPLICABLE 	 NONE
ON-SITE 	 ON-SITE
nan 	 NONE
NOT APPLICABLE 	 NONE
ON-SITE 	 ON-SITE




In [222]:
tp += 7

In [190]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Act as Superintendent
Original,	 Cleaned

nan 	 False
Y 	 True
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
Y 	 True
Y 	 True
nan 	 False
Y 	 True
Y 	 True
nan 	 False
nan 	 False
Y 	 True
Y 	 True
nan 	 False
nan 	 False
nan 	 False
nan 	 False
Y 	 True
Y 	 True
Y 	 True
nan 	 False
nan 	 False
nan 	 False
Y 	 True
nan 	 False
nan 	 False
nan 	 False
nan 	 False
Y 	 True
Y 	 True
Y 	 True
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
Y 	 True




In [223]:
tp += 35

In [192]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Building Type
Original,	 Cleaned

2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
1 	 1
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
1 	 1
1 	 1
2 	 2
1 	 1
2 	 2
2 	 2
2 	 2
1 	 1
2 	 2
1 	 1
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
2 	 2
1 	 1
2 	 2
1 	 1
1 	 1




In [193]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Residential
Original,	 Cleaned

nan 	 False
nan 	 False
YES 	 True
nan 	 False
nan 	 False
nan 	 False
YES 	 True
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
YES 	 True
YES 	 True
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
YES 	 True
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
YES 	 True
YES 	 True
YES 	 True
YES 	 True
nan 	 False
nan 	 False
nan 	 False
YES 	 True
YES 	 True
nan 	 False
YES 	 True
YES 	 True
nan 	 False
YES 	 True
nan 	 False
nan 	 False
nan 	 False
nan 	 False
YES 	 True
YES 	 True
nan 	 False
YES 	 True
YES 	 True




In [224]:
tp += 32


In [194]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Oil Gas
Original,	 Cleaned

nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE
nan 	 NONE




In [225]:
tp += 50

In [195]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Owner's House #
Original,	 Cleaned

544 	 544
1110 	 1110
1133 	 1133
27 	 27
136-98 	 136-98
750 	 750
1345 	 1345
6807 	 6807
150 	 150
3657 	 3657
164-32 	 164-32
595 	 595
539 	 539
235 	 235
35-11 	 35-11
22 	 22
49 	 49
75 	 75
500 	 500
PO BOX 	 PO BOX
336 	 336
30-30 	 30-30
100 	 100
420 	 420
22 	 22
150 	 150
ONE 	 ONE
1981 	 1981
1067 	 1067
152 	 152
885 	 885
1039 	 1039
204 	 204
36-13 	 36-13
570 	 570
137 	 137
55 	 55
30-31 	 30-31
330 	 330
2049 	 2049
14 	 14
506 	 506
220 	 220
1462 	 1462
ONE 	 ONE
902 	 902
53-09 	 53-09
512 	 512
147-17 	 147-17
241-30 	 241-30




In [228]:
fn += 3

In [196]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Owner's Business Name
Original,	 Cleaned

540 HUDSON STREET, LLC 	 540 HUDSON STREET LLC
LANDMARK O 	 LANDMARK O
FIVE STAR REALTY CO. 	 FIVE STAR REALTY CO
SOPHIE'S CUBAN CUISINE 	 SOPHIE'S CUBAN CUISINE
nan 	 nan
135 East 57th Street Associates 	 135 EAST 57TH STREET ASSOCIATES
MAUTNER GLICK CORP. 	 MAUTNER GLICK CORP
NAPCO  REALTY  LTD 	 NAPCO  REALTY  LTD
151 READE ST PARKING CORP. 	 151 READE ST PARKING CORP
RHB INVESTORS, LLC 	 RHB INVESTORS LLC
LEWIS REALTY MANAGEMENT INC 	 LEWIS REALTY MANAGEMENT INC
C/O L & B IPM OF NEW YORK 	 C/O L & B IPM OF NEW YORK
nan 	 nan
BELAIR CORPORATION 	 BELAIR CORPORATION
nan 	 nan
LAWRENCE & MELVIN FRIEDLAND 	 LAWRENCE & MELVIN FRIEDLAND
FAMURB CO. 	 FAMURB CO
RXR REALTY 	 RXR REALTY
Benjamin Beechwood Breakers LLC 	 BENJAMIN BEECHWOOD BREAKERS LLC
nan 	 nan
336 TENANTS CORPORATION 	 336 TENANTS CORPORATION
NYC SCA 	 NYC SCA
LENOX HILL HOSPITAL 	 LENOX HILL HOSPITAL
SL GREEN REALTY CORP. 	 SL GREEN REALTY CORP
FRIEDLAND PROPERTIES 	 FRIED

In [229]:
tp += 8
fn += 3

In [199]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Owner's Phone #
Original,	 Cleaned

2129893100 	 2129893100
2122421840 	 2122421840
6464351897 	 6464351897
7182430911 	 7182430911
9177038886 	 9177038886
2128381800 	 2128381800
2122881999 	 2122881999
7182366866 	 7182366866
2128881200 	 2128881200
9143375555 	 9143375555
9176812799 	 9176812799
2127551166 	 2127551166
2126941229 	 2126941229
2128641300 	 2128641300
7183920111 	 7183920111
2128797734 	 2128797734
2127309790 	 2127309790
2127156134 	 2127156134
5169355555 	 5169355555
7189315199 	 7189315199
2123502852 	 2123502852
7184729000 	 7184729000
2124392010 	 2124392010
2122161738 	 2122161738
2127443300 	 2127443300
2123700928 	 2123700928
2126766000 	 2126766000
5169445000 	 5169445000
9175602434 	 9175602434
4129995930 	 4129995930
6465027200 	 6465027200
7188338502 	 7188338502
2122880077 	 2122880077
7183261400 	 7183261400
2125841150 	 2125841150
7187570639 	 7187570639
5167640226 	 5167640226
6466421505 	 6466421505
2122475577 	 2122475577
7185432231 	 718543

In [200]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Owner's House City
Original,	 Cleaned

NEW YORK 	 NEW YORK CITY
NEW YORK 	 NEW YORK CITY
WHITE PLAINS 	 WHITE PLAINS
BROOKLYN 	 BROOKLYN
QUEENS 	 QUEENS
New York 	 NEW YORK CITY
NEW YORK 	 NEW YORK CITY
Brooklyn 	 BROOKLYN
NY 	 NEW YORK CITY
EASTCHESTER 	 EASTCHESTER
FRESHMEADOWS 	 FRESHMEADOWS
NY 	 NEW YORK CITY
NYC 	 NEW YORK CITY
NEW YORK 	 NEW YORK CITY
LIC 	 LONG ISLAND CITY
NY 	 NEW YORK CITY
NEW YORK 	 NEW YORK CITY
NEW YORK 	 NEW YORK CITY
Jericho 	 JERICHO
BRONX 	 BRONX
NEW YORK 	 NEW YORK CITY
LIC 	 LONG ISLAND CITY
NEW YORK 	 NEW YORK CITY
NY 	 NEW YORK CITY
NEW YORK 	 NEW YORK CITY
NEW YORK 	 NEW YORK CITY
NEW YORK, 	 NEW YORK,
LAKE SUCCESS 	 NEW YORK CITY
BRONX 	 BRONX
GARDEN CITY 	 GARDEN CITY
NEW YORK 	 NEW YORK CITY
BROOKLYN 	 BROOKLYN
NY 	 NEW YORK CITY
L.I.C. 	 LONG ISLAND CITY
NEW YORK 	 NEW YORK CITY
BROOKLYN 	 BROOKLYN
GREAT NECH 	 GREAT NECH
QUEENS 	 QUEENS
NEW YORK 	 NEW YORK CITY
BRONX 	 BRONX
PARSIPPANY 	 MORRIS
NEW YORK 	 NEW YORK CITY
NEW YORK 	 NEW 

In [230]:
tp += 30
fp += 2

In [201]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Owner's House State
Original,	 Cleaned

NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NV 	 NV
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NJ 	 NJ
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY
NY 	 NY




In [202]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Owner's House Zip Code
Original,	 Cleaned

10014 	 10014
10022 	 10022
10604 	 10604
11201 	 11201
11354 	 11354
10022 	 10022
10021 	 10021
11219 	 11219
10155 	 10155
10709 	 10709
11366 	 11366
10022 	 10022
10031 	 10031
10025 	 10025
11101 	 11101
10021 	 10021
10036 	 10036
10019 	 10019
11753 	 11753
10470 	 10470
10025 	 10025
11101 	 11101
10021 	 10021
10170 	 10170
10065 	 10065
10017 	 10017
10081 	 10081
11042 	 11042
10469 	 10469
11530 	 11530
10017 	 10017
11228 	 11228
10028 	 10028
11106 	 11106
10022 	 10022
11211 	 11211
11021 	 11021
11103 	 11103
10036 	 10036
10475 	 10475
07054 	 07054
10037 	 10037
10036 	 10036
11230 	 11230
10019 	 10019
10010 	 10010
11378 	 11378
10018 	 10018
11357 	 11357
11422 	 11422




In [203]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Filing Date
Original,	 Cleaned

2009-04-17  	 2009-04-17 00:00:00
1998-12-09  	 1998-12-09 00:00:00
2019-12-04  	 2019-12-04 00:00:00
2013-06-10  	 2013-06-10 00:00:00
2014-07-25  	 2014-07-25 00:00:00
2009-07-16  	 2009-07-16 00:00:00
2011-04-13  	 2011-04-13 00:00:00
2002-11-15  	 2002-11-15 00:00:00
1994-02-01  	 1994-02-01 00:00:00
2013-06-10  	 2013-06-10 00:00:00
2006-10-12  	 2006-10-12 00:00:00
1994-01-13  	 1994-01-13 00:00:00
2008-08-12  	 2008-08-12 00:00:00
2008-12-04  	 2008-12-04 00:00:00
2000-12-20  	 2000-12-20 00:00:00
1993-08-03  	 1993-08-03 00:00:00
2017-04-06  	 2017-04-06 00:00:00
2017-11-01  	 2017-11-01 00:00:00
2008-05-28  	 2008-05-28 00:00:00
2012-06-12  	 2012-06-12 00:00:00
1996-09-05  	 1996-09-05 00:00:00
2006-02-23  	 2006-02-23 00:00:00
1998-06-04  	 1998-06-04 00:00:00
2019-10-24  	 2019-10-24 00:00:00
2010-12-02  	 2010-12-02 00:00:00
2012-07-25  	 2012-07-25 00:00:00
1992-01-21  	 1992-01-21 00:00:00
2011-05-04  	 2011-05-04 00:00:00
2018-10

In [231]:
tp += 3

In [204]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Issuance Date
Original,	 Cleaned

2009-04-17  	 2009-04-17 00:00:00
1998-12-09  	 1998-12-09 00:00:00
2019-12-04  	 2019-12-04 00:00:00
2013-06-10  	 2013-06-10 00:00:00
2014-07-25  	 2014-07-25 00:00:00
2009-07-16  	 2009-07-16 00:00:00
2011-04-13  	 2011-04-13 00:00:00
2002-11-15  	 2002-11-15 00:00:00
1994-02-01  	 1994-02-01 00:00:00
2013-06-10  	 2013-06-10 00:00:00
2006-10-12  	 2006-10-12 00:00:00
1994-01-13  	 1994-01-13 00:00:00
2008-08-12  	 2008-08-12 00:00:00
2008-12-05  	 2008-12-05 00:00:00
2000-12-20  	 2000-12-20 00:00:00
1993-08-03  	 1993-08-03 00:00:00
2017-04-06  	 2017-04-06 00:00:00
2017-11-01  	 2017-11-01 00:00:00
2008-05-28  	 2008-05-28 00:00:00
2012-06-12  	 2012-06-12 00:00:00
1996-09-05  	 1996-09-05 00:00:00
2006-02-23  	 2006-02-23 00:00:00
1998-06-04  	 1998-06-04 00:00:00
2019-10-24  	 2019-10-24 00:00:00
2010-12-02  	 2010-12-02 00:00:00
2012-07-25  	 2012-07-25 00:00:00
1992-01-21  	 1992-01-21 00:00:00
2011-05-04  	 2011-05-04 00:00:00
2019-

In [232]:
tp += 3

In [205]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Expiration Date
Original,	 Cleaned

2010-02-18 	 2010-02-18 00:00:00
1999-10-29 	 1999-10-29 00:00:00
2020-12-03 	 2020-12-03 00:00:00
2014-06-10 	 2014-06-10 00:00:00
2015-07-25 	 2015-07-25 00:00:00
2010-07-16 	 2010-07-16 00:00:00
2011-12-10 	 2011-12-10 00:00:00
2003-09-21 	 2003-09-21 00:00:00
1994-06-04 	 1994-06-04 00:00:00
2014-06-10 	 2014-06-10 00:00:00
2006-12-31 	 2006-12-31 00:00:00
1994-11-10 	 1994-11-10 00:00:00
2009-02-12 	 2009-02-12 00:00:00
2009-07-01 	 2009-07-01 00:00:00
2001-06-03 	 2001-06-03 00:00:00
1994-08-04 	 1994-08-04 00:00:00
2018-04-01 	 2018-04-01 00:00:00
2018-11-01 	 2018-11-01 00:00:00
2009-05-28 	 2009-05-28 00:00:00
2013-05-01 	 2013-05-01 00:00:00
1997-04-17 	 1997-04-17 00:00:00
2006-05-01 	 2006-05-01 00:00:00
1999-03-01 	 1999-03-01 00:00:00
2020-10-23 	 2020-10-23 00:00:00
2011-07-01 	 2011-07-01 00:00:00
2013-07-01 	 2013-07-01 00:00:00
1992-11-07 	 1992-11-07 00:00:00
2012-02-18 	 2012-02-18 00:00:00
2019-03-02 	 2019-03-02 00:00:0

In [233]:
tp += 3

In [206]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Job Start Date
Original,	 Cleaned

2009-04-17 	 2009-04-17 00:00:00
1998-12-09 	 1998-12-09 00:00:00
2015-11-12 	 2015-11-12 00:00:00
2013-06-10 	 2013-06-10 00:00:00
2014-07-25 	 2014-07-25 00:00:00
2008-04-01 	 2008-04-01 00:00:00
2011-04-13 	 2011-04-13 00:00:00
2002-06-10 	 2002-06-10 00:00:00
1994-02-01 	 1994-02-01 00:00:00
2008-02-19 	 2008-02-19 00:00:00
2005-08-29 	 2005-08-29 00:00:00
1994-01-13 	 1994-01-13 00:00:00
2008-08-12 	 2008-08-12 00:00:00
2008-12-05 	 2008-12-05 00:00:00
2000-12-20 	 2000-12-20 00:00:00
1993-03-11 	 1993-03-11 00:00:00
2017-04-06 	 2017-04-06 00:00:00
2017-11-01 	 2017-11-01 00:00:00
2008-05-28 	 2008-05-28 00:00:00
2010-05-04 	 2010-05-04 00:00:00
1996-09-05 	 1996-09-05 00:00:00
2006-02-23 	 2006-02-23 00:00:00
1998-06-04 	 1998-06-04 00:00:00
2019-10-24 	 2019-10-24 00:00:00
2010-12-02 	 2010-12-02 00:00:00
2012-02-09 	 2012-02-09 00:00:00
1992-01-21 	 1992-01-21 00:00:00
2009-10-15 	 2009-10-15 00:00:00
2019-02-19 	 2019-02-19 00:00:00

In [234]:
tp += 3

In [207]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  DOB Run Date
Original,	 Cleaned

2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2019-12-05  	 2019-12-05 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2019-10-25  	 2019-10-25 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2017-11-03  	 2017-11-03 00:00:00
2019-0

In [235]:
tp += 3

In [209]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Permittee's First Name
Original,	 Cleaned

MARK 	 MARK
ROI 	 ROI
JERRY 	 JERRY
MARK 	 MARK
MORRIS 	 MORRIS
STEVEN 	 STEVEN
TASSAWAR 	 TASSAWAR
SYED 	 SYED
YORAM 	 YORAM
AHMET 	 AHMET
MUNESHWAR 	 MUNESHWAR
JAY 	 JAY
KAZIMIERZ 	 KAZIMIERZ
SLAWEK 	 SLAWEK
STEVE 	 STEVE
TODD 	 TODD
JOHN 	 JOHN
PETER 	 PETER
JOHN 	 JOHN
PRABJIT 	 PRABJIT
LARRY 	 LARRY
GREG 	 GREG
JOHN 	 JOHN
JACEK 	 JACEK
MARC 	 MARC
WILLIAMS 	 WILLIAMS
MORTI 	 MORTI
RUSSELL 	 RUSSELL
KHALIDA 	 KHALIDA
LINZHONG 	 LINZHONG
WILLIE 	 WILLIE
ARGENTINA 	 ARGENTINA
DOMINICK 	 DOMINICK
STUART 	 STUART
MICHAEL 	 MICHAEL
SINGH 	 SINGH
DAVID 	 DAVID
MISAEL 	 MISAEL
NAZMI 	 NAZMI
ANTHONY 	 ANTHONY
JAMES 	 JAMES
GUILIO 	 GUILIO
JONATHAN 	 JONATHAN
BARBARA 	 BARBARA
LAWRENCE 	 LAWRENCE
HERCULES 	 HERCULES
YOGESHWARIE 	 YOGESHWARIE
JOSE 	 JOSE
VINCENZO 	 VINCENZO
BARRY 	 BARRY




In [210]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Permittee's Last Name
Original,	 Cleaned

BICKERSTAFFE 	 BICKERSTAFFE
YANG 	 YANG
DIFALCO 	 DIFALCO
MCGOEY 	 MCGOEY
MILLER 	 MILLER
STEIN 	 STEIN
AHSAN 	 AHSAN
ELLIAS 	 ELLIAS
FINKELSTEIN 	 FINKELSTEIN
SANGIRAY 	 SANGIRAY
BUDHU 	 BUDHU
KOPT 	 KOPT
WADOLOWSKI 	 WADOLOWSKI
RATOMSKI 	 RATOMSKI
CHON 	 CHON
PHILLIPS 	 PHILLIPS
DOWLING 	 DOWLING
MIRZ 	 MIRZ
ERCOLANO 	 ERCOLANO
SINGH 	 SINGH
LANDAU 	 LANDAU
BLINN 	 BLINN
PATTEN 	 PATTEN
OSTROWSKI 	 OSTROWSKI
FERGUSON 	 FERGUSON
REYNOLDS 	 REYNOLDS
HIRSCH 	 HIRSCH
ASCH 	 ASCH
SHOUKAT 	 SHOUKAT
ZHUO 	 ZHUO
HIRSH 	 HIRSH
AMATO 	 AMATO
RUTIGLIANO 	 RUTIGLIANO
ISER 	 ISER
CALISTO 	 CALISTO
JASWANT 	 JASWANT
WEHMEYER 	 WEHMEYER
MORALES 	 MORALES
AHMETOVIC 	 AHMETOVIC
RASULO 	 RASULO
JOHANSON 	 JOHANSON
CIANCI 	 CIANCI
LEWIS 	 LEWIS
RIVERA 	 RIVERA
LEVINE 	 LEVINE
ARGYRIOU 	 ARGYRIOU
NARMA 	 NARMA
RUIZ 	 RUIZ
COSTANZA 	 COSTANZA
COBUCCI 	 COBUCCI




In [211]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Permittee's Business Name
Original,	 Cleaned

RIGHT ANGLE INC 	 RIGHT ANGLE INC
278 CONSTRUCTION, INC 	 278 CONSTRUCTION, INC
PLBG INC 	 PLBG INC
ACTIVE FIRE CONTROL, INC 	 ACTIVE FIRE CONTROL, INC
BARMOR REHAB INC 	 BARMOR REHAB INC
REEM PLBG & HTG CORP. 	 REEM PLBG & HTG CORP.
AP-TECH CONTRACTING CORP 	 AP-TECH CONTRACTING CORP
GENERAL CONTRACTORS HOME IMPROVE 	 GENERAL CONTRACTORS HOME IMPROVE
URBATCH GENL. CONTR & CONSTRU 	 URBATCH GENL. CONTR & CONSTRU
P.S.I. MECHANICAL 	 P.S.I. MECHANICAL
UNIVERSAL CONSTR & BUILD INC 	 UNIVERSAL CONSTR & BUILD INC
CMA ENTERPRISES LTD 	 CMA ENTERPRISES LTD
MZZ WADOLOWSKI CORP 	 MZZ WADOLOWSKI CORP
MK WOODWORKING INC 	 MK WOODWORKING INC
NOVA PLUMBING & HEATING INC. 	 NOVA PLUMBING & HEATING INC.
LEHR CONST.CORP. 	 LEHR CONST.CORP.
FOLOR INC 	 FOLOR INC
PACE PLUMBING CORP 	 PACE PLUMBING CORP
JME ENTERPRISES INC 	 JME ENTERPRISES INC
ROCK SCAFFOLDING CORP 	 ROCK SCAFFOLDING CORP
TRADE CONSTRUCTION CORP. 	 TRADE CONSTRUCTION CORP.
REGIONAL 

In [236]:
fn += 4

In [212]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Permittee's Phone #
Original,	 Cleaned

2126271399 	 2126271399
2129659155 	 2129659155
7182633946 	 7182633946
7187290450 	 7187290450
2125790859 	 2125790859
2126739700 	 2126739700
7189628472 	 7189628472
7188715295 	 7188715295
2127215500 	 2127215500
7186998005 	 7186998005
3476249398 	 3476249398
2129247353 	 2129247353
7182381474 	 7182381474
2124860020 	 2124860020
7188868781 	 7188868781
2123531160 	 2123531160
6464227536 	 6464227536
9172029429 	 9172029429
7187079361 	 7187079361
7182990100 	 7182990100
2126898211 	 2126898211
7188816200 	 7188816200
2125995599 	 2125995599
2129496606 	 2129496606
4804610777 	 4804610777
2127668800 	 2127668800
7188348300 	 7188348300
5165966046 	 5165966046
3473228550 	 3473228550
7184396600 	 7184396600
6465027193 	 6465027193
7188338502 	 7188338502
7185858300 	 7185858300
2129255620 	 2129255620
7183839301 	 7183839301
9172732026 	 9172732026
7182310010 	 7182310010
7184397924 	 7184397924
9144242875 	 9144242875
7183203171 	 71

In [213]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Permittee's License Type
Original,	 Cleaned

GC 	 GC
GC 	 GC
MP 	 MP
FS 	 FS
GC 	 GC
FS 	 FS
GC 	 GC
GC 	 GC
nan 	 nan
MP 	 MP
GC 	 GC
nan 	 nan
GC 	 GC
GC 	 GC
MP 	 MP
nan 	 nan
GC 	 GC
FS 	 FS
FS 	 FS
GC 	 GC
nan 	 nan
GC 	 GC
GC 	 GC
FS 	 FS
GC 	 GC
GC 	 GC
FS 	 FS
GC 	 GC
GC 	 GC
GC 	 GC
GC 	 GC
OW 	 OW
OB 	 OB
MP 	 MP
MP 	 MP
GC 	 GC
SI 	 SI
MP 	 MP
GC 	 GC
GC 	 GC
GC 	 GC
GC 	 GC
GC 	 GC
GC 	 GC
MP 	 MP
GC 	 GC
GC 	 GC
FS 	 FS
GC 	 GC
GC 	 GC




In [237]:
fn += 4

In [214]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Permittee's License #
Original,	 Cleaned

0009642 	 0009642
0011878 	 0011878
0001223 	 0001223
0000172 	 0000172
0022309 	 0022309
0000061 	 0000061
0025311 	 0025311
0010066 	 0010066
0 	 0
0001790 	 0001790
0034602 	 0034602
0 	 0
0039727 	 0039727
0601655 	 0601655
0001416 	 0001416
0 	 0
0035361 	 0035361
0000980 	 0000980
0000895 	 0000895
0602613 	 0602613
nan 	 nan
0001936 	 0001936
0001740 	 0001740
0000793 	 0000793
0026402 	 0026402
0039625 	 0039625
0000086 	 0000086
0025196 	 0025196
0616880 	 0616880
0614967 	 0614967
0036326 	 0036326
nan 	 nan
0002718 	 0002718
0008241 	 0008241
0001668 	 0001668
0034387 	 0034387
0000199 	 0000199
0002084 	 0002084
0031469 	 0031469
0001962 	 0001962
0610430 	 0610430
0013996 	 0013996
0007320 	 0007320
0027501 	 0027501
0000161 	 0000161
0603805 	 0603805
0601448 	 0601448
0000692 	 0000692
0613537 	 0613537
0009084 	 0009084




In [238]:
fn += 5

In [216]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Permittee's Other Title
Original,	 Cleaned

nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
G.C. 	 G.C.
nan 	 nan
nan 	 nan
GC 	 GC
nan 	 nan
nan 	 nan
nan 	 nan
G.C. 	 G.C.
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
GC 	 GC
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan
nan 	 nan




In [239]:
fn += 48

In [240]:
tp

197

In [241]:
fp

2

In [242]:
fn

67

In [243]:
precision(tp, fp)

0.9899497487437185

In [244]:
recall(tp,fn)

0.7462121212121212