# Data Profiling and Cleaning

We profiled and cleaned the NYC opendata `DOB Job Application Filings` data using pandas and openclean

Run all the cells in order to profile and clean the data

Robert Ronan, Sheng Tong, Jerry Lee

In [1]:
import openclean
import glob
import pandas as pd
import numpy as np
import re

# Data Downloading

Download the data using openClean

In [2]:
import gzip
import humanfriendly
import os

from openclean.data.source.socrata import Socrata

dataset = Socrata().dataset('pitm-atqc')
datafile = './pitm-atqc.tsv.gz'

if not os.path.isfile(datafile):
    with gzip.open(datafile, 'wb') as f:
        print('Downloading ...\n')
        dataset.write(f)


fsize = humanfriendly.format_size(os.stat(datafile).st_size)
print("Using '{}' in file {} of size {}".format(dataset.name, datafile, fsize))

Using 'Open Restaurant Applications' in file ./pitm-atqc.tsv.gz of size 1.51 MB


# Data Loading

Load the data into pandas and openClean dataset object

In [72]:
import pandas as pd
from openclean.pipeline import stream

df  = pd.read_csv(datafile, dtype='object', sep='\t')
ds = stream(datafile, delim='\t')

In [73]:
np.__version__

'1.21.3'

In [74]:
pd.__version__

'1.3.4'

In [75]:
import glob

In [76]:
glob.glob("*")

['DOB_Job_Application_Filings.csv',
 'DOB_Job_Cleaning.ipynb',
 'DOB_Job_Cleaning_Facades_Compliance_Filings_rr.ipynb',
 'DOB_Job_Cleaning_Open_Restaurant_Applications_rr.ipynb',
 'DOB_Job_Cleaning_Permit_Issuance_rr.ipynb',
 'ic3t-wcy2.tsv.gz',
 'ipu4-2q9a.tsv.gz',
 'pitm-atqc.tsv.gz',
 'README.md',
 'xubg-57si.tsv.gz']

### Get some basic info about the dataset columns

In [77]:
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13027 entries, 0 to 13026
Data columns (total 35 columns):
 #   Column                                    Non-Null Count  Dtype 
---  ------                                    --------------  ----- 
 0   objectid                                  13027 non-null  object
 1   globalid                                  13027 non-null  object
 2   Seating Interest (Sidewalk/Roadway/Both)  13027 non-null  object
 3   Restaurant Name                           13026 non-null  object
 4   Legal Business Name                       13026 non-null  object
 5   Doing Business As (DBA)                   13015 non-null  object
 6   Building Number                           12776 non-null  object
 7   Street                                    13027 non-null  object
 8   Borough                                   13027 non-null  object
 9   Postcode                                  13027 non-null  object
 10  Business Address                          1302

If any rows are complete duplicates, drop them

In [78]:
df = df.drop_duplicates()

Take an a look at some of the rows to get an idea of what the datset looks like

In [79]:
df = df.dropna(axis=1, thresh = 100)

In [80]:
df

Unnamed: 0,objectid,globalid,Seating Interest (Sidewalk/Roadway/Both),Restaurant Name,Legal Business Name,Doing Business As (DBA),Building Number,Street,Borough,Postcode,...,healthCompliance_terms,Time of Submission,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,5900,{3B07E4C0-07B7-4079-8333-64446CC3EE03},sidewalk,Seasoned Vegan,"Seasoned Vegan, LLC","Seasoned Vegan, LLC",55,St. Nicholas Avenue,Manhattan,10026,...,yes,06/26/2020 08:38:00 PM,40.8005,-73.952507,10,9,216,1054995,1018220052,Central Harlem South
1,13018,{137C575D-DC14-4F9D-83D9-A3FFE513B3B8},sidewalk,AMERICAS CAFE &amp; GRILL,68TH GRILL INC,AMERICAS CAFE &amp; GRILL,,1159 3 AVENUE,Manhattan,10065,...,yes,10/22/2021 11:01:00 AM,40.766845,-73.962708,8,4,118,1043896,1014220048,Lenox Hill-Roosevelt Island
2,11630,{15270732-2A78-4C24-89DD-BE8DD916F115},roadway,SUSHI SEKI,SEKI INC,SUSHI SEKI,undefined,208 WEST 23 STREET,Manhattan,10011,...,yes,12/14/2020 07:54:00 PM,40.744338,-73.99624,4,3,91,1014129,1007720056,Hudson Yards-Chelsea-Flatiron-Union Square
3,7753,{AE2F76BE-D0EC-4D10-890F-B7AFBDB87188},both,Lucky Seven Restaurant & Bar LLC,Lucky Seven Restaurant & Bar LLC,Lucky Seven Tapas Bar,1455,St Nicholas Avenue,Manhattan,10033,...,yes,07/08/2020 03:58:00 PM,40.850323,-73.933011,12,10,271,1063948,1021650038,Washington Heights North
4,5010,{07AE4461-6BB3-499D-8332-4F2966E0514F},sidewalk,El Nuevo Carribeno,El Nuevo Carribeno Inc.,El Nuevo Carribeno Inc.,1675,Lexington Avenue,Manhattan,10029,...,yes,06/24/2020 04:02:00 PM,40.79167,-73.946688,11,8,172,1051992,1016330019,East Harlem South
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13022,9799,{108C6784-54DB-480B-8627-CF0495D27A0E},both,KING OF SPADES INC.,KING OF SPADES INC.,KING OF SPADES INC.,1425,COLLEGE POINT BLVD,Queens,11356,...,yes,08/04/2020 03:27:00 PM,40.784739,-73.845776,7,19,929,4098349,4040850057,College Point
13023,8215,{7B8EC275-D470-4106-819E-03FC0E1F9717},both,LA QUEEN TEA HOUSE INC,LA QUEEN TEA HOUSE INC,LA QUEEN TEA HOUSE INC,752B,61ST ST,Brooklyn,11220,...,yes,07/13/2020 03:53:00 PM,40.635543,-74.01122,7,38,118,3143885,3057940027,Sunset Park East
13024,2895,{476f88d2-d3e8-4e83-8686-8bac2f752cf5},sidewalk,aahar indina cuisine,vidhan bhatt inc.,vidhan bhatt inc.,10,murray street,Manhattan,10007,...,yes,06/21/2020 01:46:00 PM,40.713298,-74.007773,1,1,21,1001399,1001240004,SoHo-TriBeCa-Civic Center-Little Italy
13025,607,ff0d04ef-1d9a-47d6-8f20-8bdbdbce2792,sidewalk,le cafe coffee,le cafe coffee llc,le cafe coffee llc,145,fourth av,Manhattan,10003,...,yes,06/19/2020 01:19:00 PM,40.733916,-73.989872,3,2,42,1077569,1005590009,East Village


In [81]:
# Need 382 examples for sample

In [82]:
df_sample = df.sample(382).copy()

In [83]:
df_sample

Unnamed: 0,objectid,globalid,Seating Interest (Sidewalk/Roadway/Both),Restaurant Name,Legal Business Name,Doing Business As (DBA),Building Number,Street,Borough,Postcode,...,healthCompliance_terms,Time of Submission,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
10885,12661,{4B917111-A85A-43E5-87C1-1908E3B68258},sidewalk,LE PAIN QUOTIDIEN,"MPQ 921 BROADWAY, LLC","LPQ USA, LLC - Disbursements Account",undefined,921 BROADWAY,Manhattan,10010,...,yes,07/07/2021 02:01:00 PM,40.739976,-73.989564,5,2,56,1016239,1008500001,Hudson Yards-Chelsea-Flatiron-Union Square
10052,4260,{68594DDD-8C45-4C03-81BC-AA37C70A2E6F},sidewalk,Tarachi,Tarachi NYC LLC,Tarachi NYC LLC,222,Greene Ave,Brooklyn,11238,...,yes,06/23/2020 12:19:00 PM,40.687145,-73.961988,2,35,231,3056252,3019660008,Clinton Hill
10530,2217,{d31dc454-43c9-4a71-870e-4cf6ca5fc394},both,Briciola,Briciola Corp,Briciola Corp,370,w 51st,Manhattan,10019,...,yes,06/20/2020 12:42:00 PM,40.763861,-73.987918,4,3,133,1025187,1010410061,Clinton
392,8019,{430DB8B8-C29B-4B7A-8CB9-F7FA2C88FF25},openstreets,Starbucks Reserve Roastery,Siren Retail Corporation,Siren Retail Corporation,61,9th Avenue,Manhattan,10011,...,yes,07/10/2020 01:11:00 PM,40.74144,-74.00502,4,3,83,1090217,1007120036,Hudson Yards-Chelsea-Flatiron-Union Square
5386,7910,{299B1389-19CF-4152-8EAB-B31DA0AFE60F},sidewalk,starbucks,Sstarbucks coffee company,Sstarbucks coffee company,124,eight avenue,Manhattan,10011,...,yes,07/09/2020 03:50:00 PM,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2286,8812,{2152DE7B-C389-4D2C-80D5-D6C49CD4F3BF},openstreets,An Choi,SaigonNYC Ltd,An Choi,85,Orchard,Manhattan,10002,...,yes,07/17/2020 07:21:00 PM,,,,,,,,
2749,10452,{19DE33E1-8174-4CE1-8683-BCDD963E1D5B},both,The EAR Inn,EAR Inn Inc,EAR Inn Inc,326 & 330,Spring Street,Manhattan,10013,...,yes,08/19/2020 04:47:00 PM,,,,,,,,
10222,1443,ac030024-7524-4ec0-8f5d-ede2d96af62d,sidewalk,The Castello Plan,Shakeen LLC,The Castello Plan,1213,Cortelyou Road,Brooklyn,11218,...,yes,06/19/2020 04:35:00 PM,40.640139,-73.966847,14,40,1522,3118320,3051430084,Flatbush
7284,8292,{7846E68C-B47B-421C-8523-3632944DBFEA},both,Balthazar Bakery,Humphrey's Bakery inc,Balthazar Bakery,80,Spring St,Manhattan,10012,...,yes,07/14/2020 07:34:00 AM,40.72271,-73.99811,2,1,45,1007238,1004830017,SoHo-TriBeCa-Civic Center-Little Italy


## Describe columns in groups so they fit on screen

In [84]:
df[df.columns[:20]].describe()

Unnamed: 0,objectid,globalid,Seating Interest (Sidewalk/Roadway/Both),Restaurant Name,Legal Business Name,Doing Business As (DBA),Building Number,Street,Borough,Postcode,Business Address,Food Service Establishment Permit #,Sidewalk Dimensions (Length),Sidewalk Dimensions (Width),Sidewalk Dimensions (Area),Roadway Dimensions (Length),Roadway Dimensions (Width),Roadway Dimensions (Area),Approved for Sidewalk Seating,Approved for Roadway Seating
count,13027,13027,13027,13026,13026,13015,12776,13027,13027,13027,13027,13022,10983,10983,10983,8350,8350,8350,13027,13027
unique,13020,13024,4,10678,11392,10998,4315,6495,5,245,12004,10272,200,48,831,172,45,401,2,2
top,7065,{98116B37-C5FA-4989-81B1-57246EACC2A0},both,SWEETGREEN,SWEETGREEN NEW YORK LLC,SWEETGREEN,undefined,Broadway,Manhattan,10003,"445 Albee Square West, Brooklyn, NY",0,20,8,120,20,8,160,yes,yes
freq,2,2,6670,31,30,29,2232,282,6373,547,9,28,978,1361,318,893,6594,721,10983,8350


In [85]:
# Notes:
# Building Type looks binary and has 2 values + maybe NAN
# Cluster looks binary and has 2 values + maybe NAN
# Landmarked looks binary and has 4 values + maybe NAN
# Adult Establishment looks binary and has 2 values + maybe NAN
# Loft Board looks binary and has 2 values + maybe NAN
# City Owned looks binary and has 4 values + maybe NAN
# Little e looks binary and has 5 values + maybe NAN


In [86]:
df[df.columns[20:40]].describe()

Unnamed: 0,Qualify Alcohol,SLA Serial Number,SLA License Type,Landmark District or Building,landmarkDistrict_terms,healthCompliance_terms,Time of Submission,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
count,13027,8741,8741,13027,1956,13027,13027,11731.0,11731.0,11731,11731,11731,11651,11651,11731
unique,2,5367,7,2,1,1,10166,8722.0,8661.0,19,51,874,8338,8240,186
top,yes,##############################################...,OP,no,yes,yes,10/19/2021 04:36:00 PM,40.690833,-73.983452,2,3,38,3397861,3001497501,West Village
freq,8741,47,6315,11071,1956,13027,14,18.0,18.0,1752,1386,177,18,18,685


In [87]:
# PC Filed -- Other all look binary, and have 1-2 values + maybe NAN
#
# Take a look at Other Description for weird strings

#  Lots of the same First and Last name

# Check names and titles

# APPLICATNT LICENCSE # NEEDS TO BE A STRING TO PERSERVE THE 0 ON IT (PROBABLY)

# Professional Cert looks binary and has 5 values + maybe NAN



In [88]:
#df[df.columns[40:60]].describe()

In [89]:
# Need to convert date columns to pd.datetime
# RENAME PAID TO PAID DATE
# RENAME FULLY PAID TO FULLY PAID DATE
# RENAME ASSISGNED TO ASSIGNED DATE
# RENAME APPROVED TO APPROVED DATE

# CHECK COHERENCE OF PAID DATE <= FULLY PAID DATE
# CHECK COHERENCE OF PRE FILING DATE <= PAID DATE
# CHECK COHERENCE OF ASSIGNED DATE <= APPROVED DATE
# 

# REMOVE $ FROM Initial Cost and Total Estimated Fee, and put them in column name, convert values to floats

# Check What fee status is

# Check Existing Zoning Sqft, Propsed Zoning Sqft, Enlargement SQ Footage for reasonable values
# Change either Sqft to SQ Footage or vvice-versa

# Horizontal Enlargement and Vertical Enlargement are booleans + NAN
# Change Enlrgmt to Enlargement

#Chcek ExisitngNo. of Stories and Proposed # of Stories for reasonableness
# Add space between Existing and No.
# Change either Job# to Job No. or vice versa
# maybe just change all the No./# to "number"

# Check Existing and proposed height for reasonableness. Add unit to column name

# check Existing Dwelling Units for reasonableness 



In [90]:
#df[df.columns[60:80]].describe()

In [91]:
# check Proposed Dwelling Units
## Why does Existing Occupancy have fewer cats than Proposed Occupancy. Check those.

# What is Site Fill. 

# Get list of NYC ZOning Districts and Special Districts

# Checmk Owner Typer for spelling issues

# Non Profit is binary

# Check Owners's First and last name

# Owner'sBuisness Name should not be "OWNER"

# Owner's house number, streeet name, city, state and zip have almost no values

# Why is the same phone number so common

In [92]:
#df[df.columns[80:100]].describe()

In [93]:

# That is a lot of unique job descriptions

# Add spaces to DOB Run Date name
# make DOB Run Date a datetime

# What is Job_S1_NO. It uses underscores.

# All the remaining columns have ALL CAPS NAMES WITH UNDERSCORES 
# TOTAL_CONSTRUCTION_FLOOR_AREA, WITHDRAWAL_FLAG

# SIGNOFF_DATE needs to be datetime
# SPECIAL_ACTION_STATUS
# SPECIAL_ACTION_DATE needs to be datetime
# BUILDING_CLASS
# What is JOB_NO_GOOD_COUNT
#
# maybe need GIS DATA
# GIS_LATITUDE
# GIS_LONGITUDE
# GIS_COUNCIL_DISTRICT
# GIS_CENSUS_TRACT
# GIS_NTA_NAME
# GIS_BIN
# 

In [94]:
df.columns

Index(['objectid', 'globalid', 'Seating Interest (Sidewalk/Roadway/Both)',
       'Restaurant Name', 'Legal Business Name', 'Doing Business As (DBA)',
       'Building Number', 'Street', 'Borough', 'Postcode', 'Business Address',
       'Food Service Establishment Permit #', 'Sidewalk Dimensions (Length)',
       'Sidewalk Dimensions (Width)', 'Sidewalk Dimensions (Area)',
       'Roadway Dimensions (Length)', 'Roadway Dimensions (Width)',
       'Roadway Dimensions (Area)', 'Approved for Sidewalk Seating',
       'Approved for Roadway Seating', 'Qualify Alcohol', 'SLA Serial Number',
       'SLA License Type', 'Landmark District or Building',
       'landmarkDistrict_terms', 'healthCompliance_terms',
       'Time of Submission', 'Latitude', 'Longitude', 'Community Board',
       'Council District', 'Census Tract', 'BIN', 'BBL', 'NTA'],
      dtype='object')

## Renaming columns

In [107]:
rename_list = list(df.columns)
rename_dict = dict()

for i in rename_list:
    col_name = str(i)
    
    col_name = col_name.strip().replace("_", " ").replace("’", "'").replace(".", "")
        
   
    # https://stackoverflow.com/questions/2277352/split-a-string-at-uppercase-letters
    # Split on upper case to seperate cocnatenated words:
    if (not col_name.islower()) and (not col_name.isupper()) and (col_name.find(" ") == -1):
        col_name = " ".join(re.sub("([A-Z])", r" \1", col_name).split())
    
  
    if col_name.islower(): #(not col_name.islower()) and (not col_name.isupper()):
        col_name = col_name.title()
        
    if (col_name.isupper()) and (col_name.find(" ") != -1):
        col_name = col_name.title()
        
    #col_name[0] = col_name[0].upper()
        
    # Split on underscores and make Title Case 
    col_name = col_name.replace("No", "Number")
    col_name = col_name.replace("#", "Number")

    rename_dict[i] = col_name

In [108]:
rename_dict

{'objectid': 'Objectid',
 'globalid': 'Globalid',
 'Seating Interest (Sidewalk/Roadway/Both)': 'Seating Interest (Sidewalk/Roadway/Both)',
 'Restaurant Name': 'Restaurant Name',
 'Legal Business Name': 'Legal Business Name',
 'Doing Business As (DBA)': 'Doing Business As (DBA)',
 'Building Number': 'Building Number',
 'Street': 'Street',
 'Borough': 'Borough',
 'Postcode': 'Postcode',
 'Business Address': 'Business Address',
 'Food Service Establishment Permit #': 'Food Service Establishment Permit Number',
 'Sidewalk Dimensions (Length)': 'Sidewalk Dimensions (Length)',
 'Sidewalk Dimensions (Width)': 'Sidewalk Dimensions (Width)',
 'Sidewalk Dimensions (Area)': 'Sidewalk Dimensions (Area)',
 'Roadway Dimensions (Length)': 'Roadway Dimensions (Length)',
 'Roadway Dimensions (Width)': 'Roadway Dimensions (Width)',
 'Roadway Dimensions (Area)': 'Roadway Dimensions (Area)',
 'Approved for Sidewalk Seating': 'Approved for Sidewalk Seating',
 'Approved for Roadway Seating': 'Approved for R

In [109]:
df = df.rename(columns=rename_dict)

#Lower cased but not sure if this is necessary

# df = df.rename(columns={
#                          "PERMIT_SI_NO": "Permit Si #"
#                         , "TOTAL_CONSTRUCTION_FLOOR_AREA": "Total Construction Floor Area"
#                         , "WITHDRAWAL_FLAG": "Withdrawl Flag"
#                         , "SIGNOFF_DATE": "Signoff Date"
#                         , "SPECIAL_ACTION_STATUS": "Special Action Status"
#                         , "SPECIAL_ACTION_DATE": "Special Action Date"
#                         , "BUILDING_CLASS": "Building Class"
#                         , "JOB_NO_GOOD_COUNT": "Job No Good Count"
#                         , "LATITUDE": "GIS Latitude"
#                         , "LONGITUDE": "GIS Longitude"
#                         , "COUNCIL_DISTRICT": "GIS Council District"
#                         , "CENSUS_TRACT": "GIS Census Tract"
#                         , "NTA_NAME": "GIS NTA Name"
#                         , "GIS_BIN": "GIS Bin"
#                         })


#### Method to get an idea of the top 10 values of a column

In [110]:
def show_vals(column_name, show_rows=10, df=df):
    print("Top {} {}:\n".format(show_rows, column_name))
    try:
        print(df[column_name].value_counts(dropna=False)[:show_rows])
    except KeyError as e:
        print("Could not print column: ", column_name)
    print()

### Examining ~Job #s~ Objectid

Some repition in the Job #'s, but nothing major. We will check some of the repeated Job #s to be sure they actually refer to the same jobs

In [111]:
df['Objectid'].value_counts(dropna=False)

7065     2
12921    2
7068     2
12920    2
12919    2
        ..
10215    1
4302     1
1122     1
11251    1
6641     1
Name: Objectid, Length: 13020, dtype: int64

Nothing weird looking here

## Examining and reparing house #s

House #'s appear to be mostly ints

However, there are legitimate house numbers with dashes so we'll have to make them strings

In [114]:
house_num_cols = ["Building Number"]

In [115]:
for col in house_num_cols:
    show_vals(col)

Top 10 Building Number:

undefined    2232
NaN           251
200            32
25             32
1              32
2              31
5              29
43             27
99             26
55             25
Name: Building Number, dtype: int64



In [116]:
show_vals("Building Number", show_rows=10)

Top 10 Building Number:

undefined    2232
NaN           251
200            32
25             32
1              32
2              31
5              29
43             27
99             26
55             25
Name: Building Number, dtype: int64



Replace NaN values with empty strings, then convert column to string, and make everything uppercase


In [117]:
for col in house_num_cols:
    df[col].fillna('', inplace=True)
    df[col] = df[col].astype('str')
    df[col] = df[col].str.upper()
    

In [118]:
#df['House Number'].fillna('', inplace=True)
#df['House Number'] = df['House Number'].astype('str')
#df['House Number'] = df['House Number'].str.upper()

Check for numbers spelled out as words

In [121]:
for col in house_num_cols:
    print(df.loc[(~df[col].isna())
       &(df[col].str.isalpha())][col])

2        UNDEFINED
5        UNDEFINED
8        UNDEFINED
13       UNDEFINED
17       UNDEFINED
           ...    
12997    UNDEFINED
13005    UNDEFINED
13006    UNDEFINED
13019    UNDEFINED
13021    UNDEFINED
Name: Building Number, Length: 2240, dtype: object


In [123]:
for col in house_num_cols:
    print(df.loc[(~df[col].str.contains('\\d', regex=True))][col])

1                 
2        UNDEFINED
5        UNDEFINED
8        UNDEFINED
13       UNDEFINED
           ...    
12997    UNDEFINED
13005    UNDEFINED
13006    UNDEFINED
13019    UNDEFINED
13021    UNDEFINED
Name: Building Number, Length: 2493, dtype: object


In [124]:
#df.loc[(~df['House Number'].str.contains('\\d', regex=True))]['House Number']

In [122]:
# df.loc[(~df['House Number'].isna())
#        &(df['House Number'].str.isalpha())]['House Number']

Maybe the house Number and borough were flipped in the 'manhattan' case?

Check if thses are empty strings:

In [32]:
#df.loc[(~df['House Number'].str.contains('\\d', regex=True))]['House Number']

Replace spelling of numbers with their value, and remove values 'PIER',  'MANHATTAN',  'NO NUMBER'

In [125]:
for col in house_num_cols:
    df.loc[df[col].str.strip('')=='ONE', col] = '1'
    df.loc[df[col].str.strip('')=='PIER', col] = ''
    df.loc[df[col].str.strip('')=='MANHATTAN', col] = ''
    df.loc[df[col].str.strip('')=='NO NUMBER', col] = ''

    # need to add B
    df.loc[df[col].str.strip('')=='B', col] = ''


Most of these will probably be legitimate house numbers, since house numbers can have dashes

In [35]:
#df.loc[(~df['House Number'].isna())
#       &(~df['House Number'].str.isdigit())]['House Number']

Check non-numeric house Number's that don't have dashes

In [36]:
# df.loc[(~df['House Number'].isna())
#        &(~df['House Number'].str.isdigit())
#       &(~df['House Number'].str.contains('-', regex=False))]['House Number'][:25]

We see a mix of reference to the house's garage, the rear house and single letters that likely indicate apartments in multi-occupancy venues. 

We will standardize the formatting, and maintain the reference to garage, rear, and appartment, since there is no apartment column for the job.

First split the numbers and words with a space

In [126]:
for col in house_num_cols:
    df[col] = df[col].str.replace(pat='(?P<one>\\d)(?P<two>[A-Z]+)', repl='\g<one> \g<two>', regex=True)
    df[col] = df[col].str.replace(pat='(?P<one>GAR$)', repl='GARAGE', regex=True)
    df[col] = df[col].str.replace(pat='NORTH([A-Z]+)?', repl='', regex=True)
    df[col] = df[col].str.replace(pat='EAST([A-Z]+)?', repl='', regex=True)
    df[col] = df[col].str.replace(pat='SOUTH([A-Z]+)?', repl='', regex=True)
    df[col] = df[col].str.replace(pat='WEST([A-Z]+)?', repl='', regex=True)

In [127]:
#df['House Number'] = df['House Number'].str.replace(pat='(?P<one>\\d)(?P<two>[A-Z]+)', repl='\g<one> \g<two>', regex=True)

Now we will fix the formatting for garage and 
remove references to north, south, east, west, since they should be in street Number

In [128]:
#df['House Number'] = df['House Number'].str.replace(pat='(?P<one>GAR$)', repl='GARAGE', regex=True)

In [129]:
#df['House Number'] = df['House Number'].str.replace(pat='NORTH([A-Z]+)?', repl='', regex=True)
#df['House Number'] = df['House Number'].str.replace(pat='EAST([A-Z]+)?', repl='', regex=True)
#df['House Number'] = df['House Number'].str.replace(pat='SOUTH([A-Z]+)?', repl='', regex=True)
#df['House Number'] = df['House Number'].str.replace(pat='WEST([A-Z]+)?', repl='', regex=True)

In [130]:
# ## Confirm that it worked correctly:
# df.loc[(~df['House Number'].isna())
#        &(~df['House Number'].str.isdigit())
#        &(~df['House Number'].str.contains('-', regex=False))]['House Number'][:30]

##### Looking at Binary/Pseudo-binary columns:

For these columns it's clear NaN idicates 'no', however some columns, like site-fill don't quite work

In [132]:
boolean_cols = []
for col in df.columns:
    if df[col].nunique() < 7:
        show_vals(col)
        boolean_cols.append(col)

Top 10 Seating Interest (Sidewalk/Roadway/Both):

both           6670
sidewalk       4313
roadway        1680
openstreets     364
Name: Seating Interest (Sidewalk/Roadway/Both), dtype: int64

Top 10 Borough:

Manhattan        6373
Brooklyn         3191
Queens           2565
Bronx             700
Staten Island     198
Name: Borough, dtype: int64

Top 10 Approved for Sidewalk Seating:

yes    10983
no      2044
Name: Approved for Sidewalk Seating, dtype: int64

Top 10 Approved for Roadway Seating:

yes    8350
no     4677
Name: Approved for Roadway Seating, dtype: int64

Top 10 Qualify Alcohol:

yes    8741
no     4286
Name: Qualify Alcohol, dtype: int64

Top 10 Landmark District or Building:

no     11071
yes     1956
Name: Landmark District or Building, dtype: int64

Top 10 landmarkDistrict terms:

NaN    11071
yes     1956
Name: landmarkDistrict terms, dtype: int64

Top 10 healthCompliance terms:

yes    13027
Name: healthCompliance terms, dtype: int64



In [133]:
boolean_cols

['Seating Interest (Sidewalk/Roadway/Both)',
 'Borough',
 'Approved for Sidewalk Seating',
 'Approved for Roadway Seating',
 'Qualify Alcohol',
 'Landmark District or Building',
 'landmarkDistrict terms',
 'healthCompliance terms']

In [136]:
boolean_cols.remove('Seating Interest (Sidewalk/Roadway/Both)')
boolean_cols.remove('Borough')

In [138]:
boolean_cols 

#df['Prior Status'].fillna('No Report Filed', inplace=True)
#df['Current Status'].fillna('No Report Filed', inplace=True)


['Approved for Sidewalk Seating',
 'Approved for Roadway Seating',
 'Qualify Alcohol',
 'Landmark District or Building',
 'landmarkDistrict terms',
 'healthCompliance terms']

In [None]:
for col in boolean_cols:
    df[col].fillna(False, inplace=True)
    df[col]

### This will not work, but it is what we did before:

In [141]:
for col in boolean_cols:
    df.loc[df[col]=='Y', col] = True
    df.loc[df[col]=='X', col] = True
    df.loc[df[col]=='N', col] = False
    df.loc[df[col]!=True, col] = False

In [None]:
# df['Plumbing'].fillna(False, inplace=True)

# df['Mechanical'].fillna(False, inplace=True)

# df['Boiler'].fillna(False, inplace=True)

# df['Fuel Burning'].fillna(False, inplace=True)

# df['Adult Establishment'].fillna(False, inplace=True)

# df['Fuel Storage'].fillna(False, inplace=True)

# df['Standpipe'].fillna(False, inplace=True)

# df['Sprinkler'].fillna(False, inplace=True)

# df['Fire Alarm'].fillna(False, inplace=True)

# df['Fire Suppression'].fillna(False, inplace=True)

# df['Curb Cut'].fillna(False, inplace=True)

# df['Other'].fillna(False, inplace=True)




In [None]:
# df.loc[df['Plumbing']!=False, 'Plumbing'] = True

# df.loc[df['Mechanical']!=False, 'Mechanical'] = True

# df.loc[df['Fuel Burning']!=False, 'Fuel Burning'] = True

# df.loc[df['Adult Establishment']!=False, 'Adult Establishment'] = True

# df.loc[df['Fuel Storage']!=False, 'Fuel Storage'] = True

# df.loc[df['Standpipe']!=False, 'Standpipe'] = True

# df.loc[df['Sprinkler']!=False, 'Sprinkler'] = True

# df.loc[df['Fire Alarm']!=False, 'Fire Alarm'] = True

# df.loc[df['Fire Suppression']!=False, 'Fire Suppression'] = True

# df.loc[df['Curb Cut']!=False, 'Curb Cut'] = True

# df.loc[df['Other']!=False, 'Other'] = True



In [None]:
# df['Plumbing'] = df['Plumbing'].astype('bool')

# df['Mechanical'] = df['Mechanical'].astype('bool')

# df['Boiler'] = df['Boiler'].astype('bool')

# df['Fuel Burning'] = df['Fuel Burning'].astype('bool')

# df['Adult Establishment'] = df['Adult Establishment'].astype('bool')

# df['Fuel Storage'] = df['Fuel Storage'].astype('bool')

# df['Standpipe'] = df['Standpipe'].astype('bool')

# df['Sprinkler'] = df['Sprinkler'].astype('bool')

# df['Fire Alarm'] = df['Fire Alarm'].astype('bool')

# df['Fire Suppression'] = df['Fire Suppression'].astype('bool')

# df['Curb Cut'] = df['Curb Cut'].astype('bool')

# df['Other'] = df['Other'].astype('bool')


# #Fill null values with False/'N'

# df['Landmarked'].fillna('N', inplace=True)

# df['Little e'].fillna('N', inplace=True)

# df.loc[df['Little e']=='X', 'Little e'] = 'Y' # fix this one-off value for Little e


# #For the clearly binary cases, fill 'Y' values with True, and the other values with False



In [None]:
# df.loc[df['Loft Board']=='Y', 'Loft Board'] = True

# df.loc[df['City Owned']=='Y', 'City Owned'] = True

# df.loc[df['PC Filed']=='Y', 'PC Filed'] = True

# df.loc[df['eFiling Filed']=='Y', 'eFiling Filed'] = True

# df.loc[df['Professional Cert']=='Y', 'Professional Cert'] = True

# df.loc[df['Non-Profit']=='Y', 'Non-Profit'] = True

# df.loc[df['Horizontal Enlargement']=='Y', 'Horizontal Enlargement'] = True

# df.loc[df['Vertical Enlargement']=='Y', 'Vertical Enlargement'] = True




In [None]:
# df.loc[df['Loft Board']!=True, 'Loft Board'] = False

# df.loc[df['City Owned']!=True, 'City Owned'] = False

# df.loc[df['PC Filed']!=True, 'PC Filed'] = False

# df.loc[df['eFiling Filed']!=True, 'eFiling Filed'] = False

# df.loc[df['Professional Cert']!=True, 'Professional Cert'] = False

# df.loc[df['Non-Profit']!=True, 'Non-Profit'] = False

# df.loc[df['Horizontal Enlargement']!=True, 'Horizontal Enlargement'] = False

# df.loc[df['Vertical Enlargement']!=True, 'Vertical Enlargement'] = False


In [None]:
#df['Prior Status'].fillna('No Report Filed', inplace=True)
#df['Current Status'].fillna('No Report Filed', inplace=True)


In [None]:
# show_vals('Non-Profit')
# show_vals('Self Cert')
# show_vals('Filing Status')
# show_vals('Site Fill')
# show_vals('Act as Superintendent')
# show_vals('Building Type')
# show_vals('Residential')
# show_vals('Oil Gas')

Replace the Nan values with False and replace the other values with True, and then cast the columns to be type bool

In [142]:
df.columns

Index(['Objectid', 'Globalid', 'Seating Interest (Sidewalk/Roadway/Both)',
       'Restaurant Name', 'Legal Business Name', 'Doing Business As (DBA)',
       'Building Number', 'Street', 'Borough', 'Postcode', 'Business Address',
       'Food Service Establishment Permit Number',
       'Sidewalk Dimensions (Length)', 'Sidewalk Dimensions (Width)',
       'Sidewalk Dimensions (Area)', 'Roadway Dimensions (Length)',
       'Roadway Dimensions (Width)', 'Roadway Dimensions (Area)',
       'Approved for Sidewalk Seating', 'Approved for Roadway Seating',
       'Qualify Alcohol', 'SLA Serial Number', 'SLA License Type',
       'Landmark District or Building', 'landmarkDistrict terms',
       'healthCompliance terms', 'Time of Submission', 'Latitude', 'Longitude',
       'Community Board', 'Council District', 'Census Tract', 'BIN', 'BBL',
       'NTA'],
      dtype='object')

In [None]:
# list of Owner's columns:
#owner_cols = df.columns[np.where(np.char.find(np.array(list(df.columns)), 'Owner') > -1)[0]]

In [None]:
#np.where(np.char.find(np.array(list(df.columns)), 'Owner') > -1)[0]

In [None]:
#owner_cols

In [None]:
#for c in owner_cols:
#    show_vals(c)

In [None]:
#df['Owner Bus Street Name'].value_counts()

## Fixing owner's informations

In [None]:
#df.loc[~df["Owner Bus Name"].isna() & df["Owner Bus Name"].str.contains("(?i)new york city")]["Owner Bus Name"].value_counts()

Normalizes a couple of duplicate names

In [None]:
# df["Owner Bus Name"] = df["Owner Bus Name"].str.replace("NEW YORK CITY", "NYC")
# df["Owner Bus Name"] = df["Owner Bus Name"].str.upper()
# df["Owner Bus Name"] = df["Owner Bus Name"].str.replace(".", '', regex=False)
# df["Owner Bus Name"] = df["Owner Bus Name"].str.replace(",", '', regex=False)

All these are the same thing. Uses clusters to fix

In [None]:
# may have to use fuzzy/cluster to fix this problem
#df.loc[~df["Owner Bus Name"].isna() & df["Owner Bus Name"].str.contains("(?i)HOUSING AUTHORITY")]["Owner Bus Name"].value_counts()

Used clusters to try to fix the rest of them further below

In [None]:
#df.columns

A lot of the same phone numbers

In [None]:
#df["Owner's Phone Number"] = df["Owner's Phone Number"].astype('str')

In [None]:
#df.loc[df["Owner's Phone Number"].str.contains("7184728000")][["Owner's First Name", "Owner's Last Name","Owner Bus Name", "Owner's Phone Number"]]

All from the same business name so it makes sense

In [None]:
#df.loc[df["Owner's Phone Number"]=='nan']

Nothing wrong with these jobs without an owner's phone number

In [None]:
#df.loc[~df["Owner's Phone Number"].isna() & df["Owner's Phone Number"].str.contains("-")]["Owner's Phone Number"].value_counts()

Phone numbers should not contain "-"

In [None]:
#df.loc[~df["Owner's Phone Number"].isna() & df["Owner's Phone Number"].str.contains(" ")]["Owner's Phone Number"].value_counts()

Phone numbers should not contain empty space

### Cleaning phone number

#### removes non-numeric characters

In [None]:
#df["Owner's Phone Number"] = df["Owner's Phone Number"].str.extract('(\d+)', expand=False)
#df.loc[~df["Owner's Phone Number"].isna() & df["Owner's Phone Number"].str.contains(" ")]["Owner's Phone Number"].value_counts()

#### Turns phone numbers that start with 0, 1, and does not have 10 digits into nan

In [None]:
#df["Owner's Phone Number"] = df["Owner's Phone Number"].astype('str')
#df.loc[~df["Owner's Phone Number"].isna() & ((df["Owner's Phone Number"].str[0] == "0") | (df["Owner's Phone Number"].str[0] == "1") | (df["Owner's Phone Number"].apply(len) != 10)), ["Owner's Phone Number"]] = np.nan

#### Checks to see if there are any others not of length 10

In [None]:
#df["Owner's Phone Number"] = df["Owner's Phone Number"].astype('str')
#df.loc[(df["Owner's Phone Number"].apply(len) != 10)]["Owner's Phone Number"]

#### Check for non-numeric charaters

In [None]:
#df.loc[(~df["Owner's Phone Number"].str.isnumeric()) & (~(df["Owner's Phone Number"]=='nan'))]["Owner's Phone Number"]

# Fixing Qewi columns

In [None]:
# list of Owner's columns:
#qewi_cols = df.columns[np.where(np.char.find(np.array(list(df.columns)), 'Qewi') > -1)[0]]

In [None]:
#np.where(np.char.find(np.array(list(df.columns)), 'Qewi') > -1)[0]

In [None]:
#qewi_cols

In [None]:
#for c in qewi_cols:
#    show_vals(c)

In [None]:
#df['Qewi Bus Name']

In [None]:
#df.loc[~df["Qewi Bus Name"].isna() & df["Qewi Bus Name"].str.contains("(?i)new york city")]["Qewi Bus Name"].value_counts()

Normalizes a couple of duplicate names

In [None]:
# df["Qewi Bus Name"] = df["Qewi Bus Name"].str.replace("NEW YORK CITY", "NYC")
# df["Qewi Bus Name"] = df["Qewi Bus Name"].str.upper()
# df["Qewi Bus Name"] = df["Qewi Bus Name"].str.replace(".", '', regex=False)
# df["Qewi Bus Name"] = df["Qewi Bus Name"].str.replace(",", '', regex=False)

All these are the same thing. Uses clusters to fix

In [None]:
# may have to use fuzzy/cluster to fix this problem
#df.loc[~df["Qewi Bus Name"].isna() & df["Qewi Bus Name"].str.contains("(?i)HOUSING AUTHORITY")]["Qewi Bus Name"].value_counts()

Used clusters to try to fix the rest of them further below

In [None]:
#df.columns

A lot of the same phone numbers

In [None]:
#df["Qewi Phone Number"] = df["Qewi Phone Number"].astype('str')

In [None]:
#df.loc[df["Qewi Phone Number"].str.contains("7184728000")][["Qewi First Name", "Qewi Last Name","Qewi Bus Name", "Qewi Phone Number"]]

All from the same business name so it makes sense

In [None]:
#df.loc[df["Qewi Phone Number"]=='nan']

Nothing wrong with these jobs without an Qewi phone number

In [None]:
#df.loc[~df["Qewi Phone Number"].isna() & df["Qewi Phone Number"].str.contains("-")]["Qewi Phone Number"].value_counts()

Phone numbers should not contain "-"

In [None]:
#df.loc[~df["Qewi Phone Number"].isna() & df["Qewi Phone Number"].str.contains(" ")]["Qewi Phone Number"].value_counts()

Phone numbers should not contain empty space

### Cleaning phone number

#### removes non-numeric characters

In [None]:
#df["Qewi Phone Number"] = df["Qewi Phone Number"].str.extract('(\d+)', expand=False)
#df.loc[~df["Qewi Phone Number"].isna() & df["Qewi Phone Number"].str.contains(" ")]["Qewi Phone Number"].value_counts()

#### Turns phone numbers that start with 0, 1, and does not have 10 digits into nan

In [None]:
#df["Qewi Phone Number"] = df["Qewi Phone Number"].astype('str')
#df.loc[~df["Qewi Phone Number"].isna() & ((df["Qewi Phone Number"].str[0] == "0") | (df["Qewi Phone Number"].str[0] == "1") | (df["Qewi Phone Number"].apply(len) != 10)), ["Qewi Phone Number"]] = np.nan

#### Checks to see if there are any others not of length 10

In [None]:
#df["Qewi Phone Number"] = df["Qewi Phone Number"].astype('str')
#df.loc[(df["Qewi Phone Number"].apply(len) != 10)]["Qewi Phone Number"]

#### Check for non-numeric charaters

In [None]:
#df.loc[(~df["Qewi Phone Number"].str.isnumeric()) & (~(df["Qewi Phone Number"]=='nan'))]["Qewi Phone Number"]

In [None]:
df.columns

### Checking Monetary columns

## Checking Monetary Values for consistency

In [None]:
# show_vals("Late Filing Amt")
# show_vals("Failure To File Amt")
# show_vals("Failure To Collect Amt")

In [None]:
# df['Late Filing Amt'] = df['Late Filing Amt'].astype('str')
# df['Failure To File Amt'] = df['Failure To File Amt'].astype('str')
# df['Failure To Collect Amt'] = df['Failure To File Amt'].astype('str')

Check if any values are not decimals

In [None]:
#df.loc[~df["Late Filing Amt"].str.contains(".", regex=False)]["Late Filing Amt"].value_counts()

In [None]:
#df.loc[~df["Failure To File Amt"].str.contains(".", regex=False)]["Failure To File Amt"].value_counts()

In [None]:
#df.loc[~df["Failure To Collect Amt"].str.contains(".", regex=False)]["Failure To File Amt"].value_counts()

Dollar signs should be removed and added to column name so we can treat this column as a number

In [None]:
#df["Failure To File Amt"] = df["Failure To File Amt"].str.replace("$", '', regex=False)

In [None]:
#df["Late Filing Amt"] = df["Late Filing Amt"].str.replace("$", '', regex=False)

In [None]:
#df["Failure To Collect Amt"] = df["Failure To File Amt"].str.replace("$", '', regex=False)

In this case we could ~However we can't convert this to integer dollars because there are decimals.~

In [None]:
#df.loc[df["Late Filing Amt"].str.contains("\\.[^0]", regex=True)]["Late Filing Amt"]

Convert to floats

In [None]:
#df["Late Filing Amt"] = df["Late Filing Amt"].astype('float')
#df['Failure To File Amt'] = df['Failure To File Amt'].astype('float')
#df['Failure To Collect Amt'] = df['Failure To Collect Amt'].astype('float')

In [None]:
#show_vals("Late Filing Amt")

In [None]:
#df['Failure To File Amt'].value_counts(dropna=False)

For unclear reasons, adding xticks makes this plot incredibly slow to display.

In [None]:
#df["Late Filing Amt"].plot(kind='hist', loglog=True, title='Late Filing Amt Frequncy (log-log)')

Intial costs appear to be distributed roughly exponentially 

In [None]:
#df.loc[df["Late Filing Amt"]<10**8]["Late Filing Amt"].plot(kind='hist', logy=True, bins=50, title='Late Filing Amt Frequncy, values < 10^8 (log)')

Plotting Failure To File Amt shows it is also roughly exponential in distribution

In [None]:
#df["Failure To File Amt"].plot(kind='hist', loglog=True, title='Failure To File Amt Frequncy (log-log)')

In [None]:
#df.loc[df["Failure To File Amt"]<10**7]["Failure To File Amt"].plot(kind='hist', logy=True, bins=50, title='Failure To File Amt Frequncy, values < 10^7 (log)')

In [None]:
#df["Failure To Collect Amt"].plot(kind='hist', loglog=True, title='Failure To File Collect Frequncy (log-log)')

In [None]:
#df.loc[df["Failure To Collect Amt"]<10**7]["Failure To Collect Amt"].plot(kind='hist', logy=True, bins=50, title='Failure To Collect Amt Frequncy, values < 10^7 (log)')

Now that we've mapped these to numerical values, we can identifiy additional issues

#### Fixing Late Filing Amts

##### these look good here:

In [None]:
#df["Late Filing Amt"].min()

In [None]:
#df["Failure To File Amt"].min()

In [None]:
#df["Failure To Collect Amt"].min()

In [None]:
#df["Late Filing Amt"].max()

In [None]:
#df["Failure To File Amt"].max()

In [None]:
#df["Failure To Collect Amt"].max()

In [None]:
#df.loc[df['Late Filing Amt'] < 0]['Late Filing Amt']

It's likely that these values were mistakenly entered as negative, so we'll flip them to positive instead of setting them to zero

In [None]:
#df.loc[df['Late Filing Amt'] == -33000.0, 'Late Filing Amt'] = 33000.0
#df.loc[df['Late Filing Amt'] == -100.0, 'Late Filing Amt'] = 100.0

Looks fine ~This is test data:~

In [None]:
# FOLLOWING IS NO LONGER TRUE:
#the entire line is filled with fake and false information
# The street name is BIS TEST STREET, and almost every value is either the first possible value for the column type, or empty
#df.loc[df['Late Filing Amt'] == df["Late Filing Amt"].max()]

We will drop this row

In [None]:
#df.drop(labels=df.loc[df['Late Filing Amt'] == 999999999].index[0], axis=0, inplace=True)

In [None]:
#df.loc[df['Late Filing Amt'] == 999999999]

These seem to be real probably

In [None]:
#df.loc[df['Late Filing Amt']>= 9000000]["Late Filing Amt"]

In [None]:
#df["Late Filing Amt"].max()

In [None]:
#df["Late Filing Amt"].plot(kind='hist', logy=True, bins=50, title='Late Filing Amt Frequncy (log)')

#### Fixing Failure To File Amt

In [None]:
#df.loc[df['Failure To File Amt'] < 0 ]

We'll fix this as we did with negative monetary values before

In [None]:
#df.loc[df['Failure To File Amt'] == -85.8,  'Failure To File Amt'] = 85.8

In [None]:
#df['Failure To File Amt'].max()

Also seems to have real information

In [None]:
#df.loc[df['Failure To File Amt']==31500000.0]

### Looking at Zoning districts

In [None]:
#Residence (R), Commerical (C), Manufacturing (M)
#show_vals("Zoning District 1")
#show_vals("Zoning District 2")

Some districts may contain invalid formats

In [None]:
# .4-4 looks weird
#df["Zoning District 1"].value_counts(dropna=False)

In [None]:
#Checks for irregular values (values that do not start with (R), (C), (M))
#df["Zoning District 1"] = df["Zoning District 1"].astype('str')
#df.loc[(df["Zoning District 1"] != "nan") & ~df["Zoning District 1"].str.startswith(("C", "R", "M", "PARK", "BPC", "LH"))]["Zoning District 1"].value_counts()

In [None]:
#Checks for irregular values (values that do not start with (R), (C), (M))
#df["Zoning District 2"] = df["Zoning District 2"].astype('str')
#df.loc[(df["Zoning District 2"] != "nan") & ~df["Zoning District 2"].str.startswith(("C", "R", "M", "PARK", "BPC", "LH"))]["Zoning District 2"].value_counts()

In [None]:
#turning the values above into np.nan
#df.loc[(df["Zoning District 1"] != "nan") & ~df["Zoning District 1"].str.startswith(("C", "R", "M", "PARK", "BPC", "LH")), ["Zoning District 1"]] = np.nan
#df.loc[(df["Zoning District 2"] != "nan") & ~df["Zoning District 2"].str.startswith(("C", "R", "M", "PARK", "BPC", "LH")), ["Zoning District 2"]] = np.nan

### Looking at special districts

In [None]:
#show_vals("Special District 1")
#show_vals("Special District 2")

In [None]:
#Checks to see if there are lower case values
#df.loc[~df["Special District 1"].isna() & df["Special District 1"].str.islower()]

### Analysis

Zoning districts had some zones that were invalid such as number only values (ex.31010)  and we changed those values to nan

The typical format for Zoning districts start with C, R, and N. There are also some special districts like PARK and BPC that we also checked

Special Districts didn't have any noticable values that were out of place

In [143]:
df.columns

Index(['Objectid', 'Globalid', 'Seating Interest (Sidewalk/Roadway/Both)',
       'Restaurant Name', 'Legal Business Name', 'Doing Business As (DBA)',
       'Building Number', 'Street', 'Borough', 'Postcode', 'Business Address',
       'Food Service Establishment Permit Number',
       'Sidewalk Dimensions (Length)', 'Sidewalk Dimensions (Width)',
       'Sidewalk Dimensions (Area)', 'Roadway Dimensions (Length)',
       'Roadway Dimensions (Width)', 'Roadway Dimensions (Area)',
       'Approved for Sidewalk Seating', 'Approved for Roadway Seating',
       'Qualify Alcohol', 'SLA Serial Number', 'SLA License Type',
       'Landmark District or Building', 'landmarkDistrict terms',
       'healthCompliance terms', 'Time of Submission', 'Latitude', 'Longitude',
       'Community Board', 'Council District', 'Census Tract', 'BIN', 'BBL',
       'NTA'],
      dtype='object')

### Quick look at GIS

In [145]:
show_vals("Latitude")
show_vals("Longitude")
show_vals("Council District")
show_vals("Census Tract")
show_vals("NTA")
show_vals("BIN")

Top 10 Latitude:

NaN          1296
40.690833      18
40.702763      11
40.741876       9
40.761819       8
40.687793       7
40.677135       6
40.706315       6
40.75084        6
40.775285       6
Name: Latitude, dtype: int64

Top 10 Longitude:

NaN           1296
-73.983452      18
-73.986681      12
-74.004713       9
-73.993889       8
-73.98976        7
-73.954481       6
-73.829381       6
-73.985316       6
-73.985596       6
Name: Longitude, dtype: int64

Top 10 Council District:

3      1386
NaN    1296
1      1113
2      1027
4       870
34      550
33      523
39      447
26      396
22      374
Name: Council District, dtype: int64

Top 10 Census Tract:

NaN    1296
38      177
67      139
32      132
41      132
65      127
21      109
58      102
133     101
7        99
Name: Census Tract, dtype: int64

Top 10 NTA:

NaN                                           1296
West Village                                   685
Midtown-Midtown South                          605
East V

In [146]:
# Manually looking at some of these
df[["Latitude", "Longitude", "Council District", "Census Tract", "NTA"]]

Unnamed: 0,Latitude,Longitude,Council District,Census Tract,NTA
0,40.8005,-73.952507,9,216,Central Harlem South
1,40.766845,-73.962708,4,118,Lenox Hill-Roosevelt Island
2,40.744338,-73.99624,3,91,Hudson Yards-Chelsea-Flatiron-Union Square
3,40.850323,-73.933011,10,271,Washington Heights North
4,40.79167,-73.946688,8,172,East Harlem South
...,...,...,...,...,...
13022,40.784739,-73.845776,19,929,College Point
13023,40.635543,-74.01122,38,118,Sunset Park East
13024,40.713298,-74.007773,1,21,SoHo-TriBeCa-Civic Center-Little Italy
13025,40.733916,-73.989872,2,42,East Village


In [147]:
# shouldn't be 0
df["Latitude"] = df["Latitude"].astype('float')
df["Latitude"].min()

40.511019

In [148]:
df["Latitude"].max()

40.91119

In [149]:
df.loc[df["Latitude"] == 0.0]

Unnamed: 0,Objectid,Globalid,Seating Interest (Sidewalk/Roadway/Both),Restaurant Name,Legal Business Name,Doing Business As (DBA),Building Number,Street,Borough,Postcode,...,healthCompliance terms,Time of Submission,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA


#### Removed the the rows above because its obviously a filler job

In [150]:
df = df.drop(df[df["Latitude"] == 0.0].index)

In [151]:
df["Latitude"].min()

40.511019

#### The min and max makes sense as the values range from Staten Island to the Bronx

In [152]:
df["Longitude"] = df["Longitude"].astype('float')
df["Longitude"].min()

-74.248014

In [153]:
df["Longitude"].max()

-73.702668

In [154]:
df.loc[df["Longitude"] == -73.700376]

Unnamed: 0,Objectid,Globalid,Seating Interest (Sidewalk/Roadway/Both),Restaurant Name,Legal Business Name,Doing Business As (DBA),Building Number,Street,Borough,Postcode,...,healthCompliance terms,Time of Submission,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA


#### These Longitudes and Latitudes range from Queens to Staten Island which is also consistent with our dataset

In [155]:
df["Council District"] = df["Council District"].astype('float')
df["Council District"].min()

1.0

In [156]:
df["Council District"].max()

51.0

#### 1-51 are all valid districts

In [157]:
df["Census Tract"] = df["Census Tract"].astype('float')
df["Census Tract"].min()

1.0

In [158]:
df["Census Tract"].max()

157903.0

In [159]:
df.loc[df["Census Tract"] == 157903]

Unnamed: 0,Objectid,Globalid,Seating Interest (Sidewalk/Roadway/Both),Restaurant Name,Legal Business Name,Doing Business As (DBA),Building Number,Street,Borough,Postcode,...,healthCompliance terms,Time of Submission,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
4451,2795,{d3f9a12e-362e-452a-8b22-bf5dd02cb9c9},sidewalk,NANCY'S FIRESIDE,A.V.O. REST CORP,A.V.O. REST CORP,25541,JERICHO TPKE,Queens,11001,...,False,06/21/2020 11:10:00 AM,40.727421,-73.709622,13,23.0,157903.0,,,Glen Oaks-Floral Park-New Hyde Park


#### No irregulars for census tract

In [161]:
df["BIN"] = df["BIN"].astype('float')
df["BIN"].min()

1000000.0

In [162]:
df["BIN"] = df["BIN"].astype('float')
df["BIN"].max()

5169029.0

In [163]:
df.loc[df["BIN"] == 1000000.0]

Unnamed: 0,Objectid,Globalid,Seating Interest (Sidewalk/Roadway/Both),Restaurant Name,Legal Business Name,Doing Business As (DBA),Building Number,Street,Borough,Postcode,...,healthCompliance terms,Time of Submission,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
128,10205,{734B44F0-4A16-4BB8-8693-D4DF1963880B},sidewalk,The Water Hazard,"Konkapot Entertainment, Inc","Konkapot Entertainment, Inc",59,Chelsea Piers,Manhattan,10011,...,False,08/13/2020 01:28:00 PM,40.747999,-74.008496,4,3.0,99.0,1000000.0,1006620011,Hudson Yards-Chelsea-Flatiron-Union Square
2113,12180,{F23DDE07-39E0-4DF3-88A5-7BA6A918E510},both,PORCHLIGHT,PORCHLIGHT WEST CHELSEA LLC,PORCHLIGHT,UNDEFINED,271 11TH AVE,Manhattan,10001,...,False,04/05/2021 04:46:00 PM,40.751743,-74.004941,4,3.0,99.0,1000000.0,1006730001,Hudson Yards-Chelsea-Flatiron-Union Square
2790,9765,{7C0981EC-DF23-45B4-8682-4D0B7077FF13},both,Le Pain Quotidien,"APQ E65 NY, LLC",Le Pain Quotidien,861,Lexington Ave,Manhattan,10028,...,False,08/03/2020 05:51:00 PM,40.765813,-73.965557,8,4.0,120.0,1000000.0,1013997502,Upper East Side-Carnegie Hill
2919,12247,{1C50A948-F16A-4685-864D-D7D64023876E},sidewalk,HOLE IN THE WALL,"HOLE IN THE WALL 626 1ST, LLC",Parched Hospitality Group,UNDEFINED,626 1 AVENUE,Manhattan,10016,...,False,04/14/2021 01:08:00 PM,40.744898,-73.972688,6,4.0,8601.0,1000000.0,1009670001,Turtle Bay-East Midtown
3035,10647,{709091FE-D8F0-4B1D-8BE9-80479B566390},both,TARALLUCCI E VINO,LA VECCHIA LLC,Tarallucci e Vino,UNDEFINED,44 EAST 28 STREET,Manhattan,10016,...,False,08/26/2020 11:19:00 AM,40.743777,-73.985171,5,2.0,56.0,1000000.0,1008577503,Hudson Yards-Chelsea-Flatiron-Union Square
3620,10708,{089C2F2E-3D3B-4592-85BE-EF60FEB14C8C},both,JUN-MEN RAMEN BAR,"MISSION 925,INC.",JUN-MEN RAMEN BAR,UNDEFINED,249 9 AVENUE,Manhattan,10001,...,False,08/28/2020 02:04:00 PM,40.747868,-74.000339,4,3.0,93.0,1000000.0,1007237502,Hudson Yards-Chelsea-Flatiron-Union Square
5300,8670,{BFAAF0EE-B764-472A-87C4-6EFECF67821F},openstreets,Adrienne's Pizza bar,Pizza on Stone LLC,Pizza on Stone LLC,87,Pearl Street,Manhattan,10004,...,False,07/16/2020 06:46:00 PM,40.704122,-74.010055,1,1.0,9.0,1000000.0,1000297504,Battery Park City-Lower Manhattan
7143,2121,{2601a91c-cbd3-4203-8bcb-29ac88a1d0f1},sidewalk,Sidewalk Cafe,Sidewalk Cafe,Sidewalk Cafe,1520,Park Ave,Manhattan,10006,...,False,06/20/2020 10:33:00 AM,40.79568,-73.945883,11,8.0,17402.0,1000000.0,1016160036,East Harlem South
9763,1186,de45b8d3-be89-4221-8385-0d662979be88,both,Da Claudio,Tre Monelli LLC,Da Claudio,21,Ann,Manhattan,10038,...,False,06/19/2020 03:10:00 PM,40.710778,-74.007694,1,1.0,1501.0,1000000.0,1000900017,Battery Park City-Lower Manhattan
10884,12239,{222143D3-DCD2-4B99-8DD1-ED55C0C660EC},roadway,FULGURANCES LAUNDROMAT,FULGURANCES NYC LLC,FULGURANCES LAUNDROMAT,UNDEFINED,132 FRANKLIN STREET,Brooklyn,10013,...,False,04/13/2021 11:54:00 AM,40.71916,-74.007201,1,1.0,33.0,1000000.0,1001897507,SoHo-TriBeCa-Civic Center-Little Italy


In [164]:
df.loc[df["BIN"] == df["BIN"].max()]

Unnamed: 0,Objectid,Globalid,Seating Interest (Sidewalk/Roadway/Both),Restaurant Name,Legal Business Name,Doing Business As (DBA),Building Number,Street,Borough,Postcode,...,healthCompliance terms,Time of Submission,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
1913,7642,{4F0AD928-F9D6-4918-803A-B1E7523D4C9F},sidewalk,JODY'S CLUB FOREST,415 FOREST REST INC,JODY'S CLUB FOREST,372,FOREST AVE,Staten Island,10301,...,False,07/07/2020 08:07:00 PM,40.631016,-74.101896,1,49.0,59.0,5169029.0,5002520009,New Brighton-Silver Lake
6308,620,19d82f24-262e-4a57-805f-40add873de3c,sidewalk,Jody's Club Forest,415 Forest Rest INC,Jody's Club Forest,372,Forest Ave,Staten Island,10301,...,False,06/19/2020 01:18:00 PM,40.631016,-74.101896,1,49.0,59.0,5169029.0,5002520009,New Brighton-Silver Lake
6851,9644,{11DB6F3F-ADB9-4B95-8A79-F7DB79EC1036},both,Jody's Club Forest,415 Forest Rest Inc.,Jody's Club Forest,372,Forest Ave,Staten Island,10301,...,False,07/31/2020 03:46:00 PM,40.631016,-74.101896,1,49.0,59.0,5169029.0,5002520009,New Brighton-Silver Lake


#### Nothing wrong with GIS BIN either

# Data Profilling for datetime columns


Find format problems and outliers in all datetime columns

Using openclean's sklearn modules to detect problems and outliers

In [165]:
from openclean.profiling.anomalies.sklearn import DBSCANOutliers

def findDateOutliers(column_name, eps_setting = 0.05):
    datetime_data = ds.distinct(column_name)
    print("Column: ",column_name)
    
    for rank, val in enumerate(datetime_data.most_common(10)):        
        st, freq = val
        print('{:<3} {:>8}  {:>10}'.format('{}.'.format(rank + 1), st, '{:,}'.format(freq)))

    print('\nTotal number of distinct values in {} is {}'.format(column_name, len(datetime_data)))
    print(DBSCANOutliers().find(datetime_data))
    print(DBSCANOutliers(eps = eps_setting).find(datetime_data))
    print('\n==================================')

In [168]:
date_cols = []

print("Datetime Data columns:\n")
for col in ds.columns:
    if 'date' in col.lower() or 'time' in col.lower():
        print(col)
        date_cols.append(col)

print("----------------------------\n")        
        
for col in date_cols:
    findDateOutliers(col, 0.02)

Datetime Data columns:

Time of Submission
----------------------------

Column:  Time of Submission
1.  10/19/2021 04:36:00 PM          14
2.  10/19/2021 04:31:00 PM          11
3.  06/19/2020 01:03:00 PM          11
4.  06/19/2020 02:22:00 PM          10
5.  06/19/2020 01:23:00 PM          10
6.  06/19/2020 02:45:00 PM          10
7.  06/19/2020 12:00:00 PM          10
8.  06/19/2020 02:27:00 PM           9
9.  10/19/2021 04:42:00 PM           9
10. 06/19/2020 03:11:00 PM           9

Total number of distinct values in Time of Submission is 10166
[]
['06/19/2020 02:27:00 PM', '06/19/2020 03:28:00 PM', '06/19/2020 12:00:00 PM', '06/22/2020 03:23:00 PM', '06/19/2020 04:53:00 PM', '06/26/2020 02:00:00 PM', '06/23/2020 02:00:00 PM', '06/22/2020 01:10:00 PM', '06/19/2020 11:29:00 AM', '06/19/2020 11:57:00 AM', '06/19/2020 01:10:00 PM', '06/19/2020 02:45:00 PM', '06/19/2020 01:19:00 PM', '10/19/2021 04:36:00 PM', '10/19/2021 04:34:00 PM', '06/19/2020 05:37:00 PM', '06/22/2020 02:00:00 PM',

Number Remember that after changing some of the column names, there are some columns that are also datetime data:

"Paid": "Paid Date"\
"Fully Paid": "Fully Paid Date"\
"Assigned": "Assigned Date"\
"Approved": "Approved Date"\
"Pre- Filing Date": "Pre-Filing Date"\
"DOB Run Date": "DOB Run Date"\
"SIGNOFF_DATE": "Signoff Date"\
"SPECIAL_ACTION_DATE": "Special Action Date"\

In [169]:
##date_cols = ["Filing Date","Issuance Date","Expiration Date","Job Start Date", "DOB Run Date"]

for col in date_cols:
    findDateOutliers(col, 0.02)

Column:  Time of Submission
1.  10/19/2021 04:36:00 PM          14
2.  10/19/2021 04:31:00 PM          11
3.  06/19/2020 01:03:00 PM          11
4.  06/19/2020 02:22:00 PM          10
5.  06/19/2020 01:23:00 PM          10
6.  06/19/2020 02:45:00 PM          10
7.  06/19/2020 12:00:00 PM          10
8.  06/19/2020 02:27:00 PM           9
9.  10/19/2021 04:42:00 PM           9
10. 06/19/2020 03:11:00 PM           9

Total number of distinct values in Time of Submission is 10166
[]
['06/19/2020 02:27:00 PM', '06/19/2020 03:28:00 PM', '06/19/2020 12:00:00 PM', '06/22/2020 03:23:00 PM', '06/19/2020 04:53:00 PM', '06/26/2020 02:00:00 PM', '06/23/2020 02:00:00 PM', '06/22/2020 01:10:00 PM', '06/19/2020 11:29:00 AM', '06/19/2020 11:57:00 AM', '06/19/2020 01:10:00 PM', '06/19/2020 02:45:00 PM', '06/19/2020 01:19:00 PM', '10/19/2021 04:36:00 PM', '10/19/2021 04:34:00 PM', '06/19/2020 05:37:00 PM', '06/22/2020 02:00:00 PM', '06/19/2020 11:01:00 AM', '06/19/2020 04:58:00 PM', '06/19/2020 03:57:00

# Analysis

the above results show the problems for the data cleaning task:
    
### Latest Action Date
outliers: '06//1403'
format: 'yyyy-mm-dd' and 'mm/dd/yyyy'

### Pre- Filing Date
no problem found

### DOB Run Date
format: 'yyyy-mm-dd' and 'mm/dd/yyyy 00:00:00'

### SIGNOFF_DATE
outliers: empty value

### SPECIAL_ACTION_DATE
outliers: empty value and '11//2006'

### Paid
outliers: empty value

### Fully Paid
outliers: empty value

### Assigned
outliers: empty value

### Approved
outliers: empty value

Number Data Cleaning for outliers in datetime columns

## Fixing Datetime columns format

In [170]:
# datetime_column_list = []
# for col in ds.columns:
#     if 'Date' in col or 'DATE' in col:
#         print(col)
#         datetime_column_list.append(col)

for col in date_cols:
    show_vals(rename_dict[col])

Top 10 Time of Submission:

10/19/2021 04:36:00 PM    14
06/19/2020 01:03:00 PM    11
10/19/2021 04:31:00 PM    11
06/19/2020 02:22:00 PM    10
06/19/2020 02:45:00 PM    10
06/19/2020 01:23:00 PM    10
06/19/2020 12:00:00 PM    10
06/19/2020 10:57:00 AM     9
06/19/2020 01:10:00 PM     9
06/19/2020 02:27:00 PM     9
Name: Time of Submission, dtype: int64



Check to see if any columns have values in year-month-day format

In [171]:
# for col in date_cols:
#     print(col, '\n', df.loc[df[col].str.contains('-', regex=False, na=False)][col], '\n\n')

#### Fix the remaining Datetime columns

In [172]:
for col in date_cols:
    print(col)
    df.loc[:,rename_dict[col]] = pd.to_datetime(df[rename_dict[col]])

Time of Submission


These should all be proper datetime64[ns] columns now:

In [173]:
df.select_dtypes(include='datetime')

Unnamed: 0,Time of Submission
0,2020-06-26 20:38:00
1,2021-10-22 11:01:00
2,2020-12-14 19:54:00
3,2020-07-08 15:58:00
4,2020-06-24 16:02:00
...,...
13022,2020-08-04 15:27:00
13023,2020-07-13 15:53:00
13024,2020-06-21 13:46:00
13025,2020-06-19 13:19:00


In [174]:
#for col in datetime_column_list:
#    show_vals(col)

### Check the coherence of datetime values

~These don't make sense, but it's not entirely clear if they should be swapped, or removed or what~

All these are okay!


In [175]:
# df.loc[(df['Field Inspection Completed Date'] > df['Filing Date'])
#       &(~df['Field Inspection Completed Date'].isna() & ~df['Filing Date'].isna())][['Field Inspection Completed Date', 'Filing Date']]

In [176]:
# df.loc[(df['Expiration Date'] < df['Issuance Date'])
#       &(~df['Expiration Date'].isna() & ~df['Issuance Date'].isna())][['Expiration Date', 'Issuance Date']]

Good, no expirations before issuance

Here it's not clear how a job could start after the issuance expires, but this may have actually happened

In [177]:
# df.loc[(df['Job Start Date'] > df['Expiration Date'])
#       &(~df['Job Start Date'].isna() & ~df['Expiration Date'].isna())][['Job Start Date', 'Expiration Date']]

##### These are, however a small percentage of of our total jobs

Fraction of jobs Assigned after they were already approved out of total jobs

In [178]:
#df.loc[df['Job Start Date'] > df['Expiration Date']]['Job Start Date'].count()/df['Job Start Date'].count()

Number Data Profilling for City and Other Description

Find format problems and outliers in City and Description columns

Using openclean's sklearn modules to detect problems and outliers

In [180]:
from openclean.profiling.anomalies.sklearn import DBSCANOutliers

# Print the ten most frequent values for the 'Vehicle Expiration Date' column.
def findDateOutliers(column_name, eps_setting = 0.05):
    applicant_data = ds.distinct(column_name)
    print("Column: ",column_name)
    
    for rank, val in enumerate(applicant_data.most_common(10)):        
        st, freq = val
        print('{:<3} {:>8}  {:>10}'.format('{}.'.format(rank + 1), st, '{:,}'.format(freq)))

    print('\nTotal number of distinct values in {} is {}'.format(column_name, len(applicant_data)))
    print(DBSCANOutliers(eps = eps_setting).find(applicant_data))
    print('\n==================================')

In [181]:
#date_cols = ["Comments"]

In [182]:
#date_cols = ["COMMENTS"]
print("----------------------------\n")        
        
for col in date_cols:
    findDateOutliers(col, 0.1)

----------------------------

Column:  Time of Submission
1.  10/19/2021 04:36:00 PM          14
2.  10/19/2021 04:31:00 PM          11
3.  06/19/2020 01:03:00 PM          11
4.  06/19/2020 02:22:00 PM          10
5.  06/19/2020 01:23:00 PM          10
6.  06/19/2020 02:45:00 PM          10
7.  06/19/2020 12:00:00 PM          10
8.  06/19/2020 02:27:00 PM           9
9.  10/19/2021 04:42:00 PM           9
10. 06/19/2020 03:11:00 PM           9

Total number of distinct values in Time of Submission is 10166
['10/19/2021 04:36:00 PM']



Number Analysis

the above results show the problems for the data cleaning task:
    
### For City

There are many misspellings and abbreviations for city names. We can use both clustering and Soundex to detect misspellings and abbreviations. And we can check if our cleaning is right by refer to the U.S. Cities reference datasets in openclean.


### For Other Description

Other Description can be anything, so we just care about empty value and values that are too similar and are showing exactly same things (for example 'GC' and '___GC')

In [183]:
from openclean.function.eval.base import Col, Eval
from openclean.function.eval.logic import And
from openclean.function.value.phonetic import Soundex, soundex

In [None]:
# upper = ds\
#     .select("QEWI_CITY")\
#     .update("QEWI_CITY", str.upper)

In [None]:
# from openclean.data.refdata import RefStore

# refdata = RefStore()
# city_df = refdata\
#     .load('encyclopaedia_britannica:us_cities', auto_download=True)\
#     .df()


In [None]:
# city_list = city_df['city']
# print(city_list)

Number An example of using soundex in openclean

However, using soundex for each of the city is too slow, the code below take nearly 4 mins for one sningle city.\
So we should use clustering first and then use hard code to clean the remianing city name that is not in the city_list.

In [None]:
# brooklyn = ds\
#     .select("QEWI_CITY")\
#     .update("QEWI_CITY", str.upper)\
#     .filter(And(Eval("QEWI_CITY", Soundex()) == soundex('BROOKLYN'), Col("QEWI_CITY") != 'BROOKLYN'))\
#     .distinct()

# print('RANK\tCOUNT\tNAME')
# for i, entry in enumerate(brooklyn.most_common()):
#     key, count = entry
#     print('{}.\t{}\t{}'.format(i + 1, count, key))

Number Data Cleaning for Applicant columns

* how to deal with empty values has not decided yet

Number Transform all city names to upper case

### Remember that we have changed some column names:
"City ": "Owner's House City"\
"State": "Owner's House State"

In [None]:
#df["Qewi City"] = df["Qewi City"].str.upper()

In [None]:
# Convert similar values to suggested value using kNN clustering

In [None]:
# # Cluster string using kNN clusterer (with the default n-gram setting)
# # using the Levenshtein distance as the similarity measure.

# from openclean.cluster.knn import knn_clusters
# from openclean.function.similarity.base import SimilarityConstraint
# from openclean.function.similarity.text import LevenshteinDistance
# from openclean.function.value.threshold import GreaterThan

# def getClusters(col, minsize = 2, preds = 0.5):
#     dba = ds.select(col).distinct()
#     clusters = knn_clusters(
#         values=dba,
#         sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(preds)),
#         minsize=minsize
#     )
#     return clusters

# def print_cluster(cnumber, cluster):
#     item_count = 0
#     print('Cluster {} (of size {})\n'.format(cnumber, len(cluster)))
#     for val, count in cluster.items():
#         item_count += 1
#         if item_count <= 10:
#             print('{} ({})'.format(val, count))
#     if item_count>10:
#         print(".......{} more items".format(item_count-10))
#     print('\nSuggested value: {}\n\n'.format(cluster.suggestion()))

# def updateUsingClusters(col, clusters, isPrint = False):
    
#     orignal_list = []
#     suggestion_list = []
#     clusters.sort(key=lambda c: len(c), reverse=True)
       
#     for i, cluster in enumerate(clusters):        
#         suggestion = cluster.suggestion()
#         orignal_list = []
#         suggestion_list = []
#         if isPrint and i <5:
#             print_cluster(i, cluster)
        
#         for val, count in cluster.items(): 
#             orignal_list.append(val)
#             suggestion_list.append(suggestion)
    
#     df[col] = df[col].replace(orignal_list, suggestion_list)

In [None]:
# date_cols = ["QEWI_CITY"]#,'Other Description' ]


# # print("kNN cluster for ", "Qewi City")
# # col_clusters = getClusters("Qewi City ")
# # print("updating column ", "Qewi City")
# # print("----------------------\nTop 5 Cluster:\n----------------------")
# # updateUsingClusters("Qewi City", col_clusters, True)
# print("================")

# print("kNN cluster for ", date_cols[0])
# col_clusters = getClusters(date_cols[0])
# print("updating column ", date_cols[0])
# print("----------------------\nTop 5 Cluster:\n----------------------")
# updateUsingClusters("Qewi City", col_clusters, True)
# print("================")

In [None]:
# After clustering, find data that is not in the reference city dataset, hard code to clean them

In [None]:
# upper_city_list = []
# for item in city_list:
#     upper_city_list.append(str(item.upper()))

# outlier_cities = df.loc[(~df['Qewi City'].str.upper().isin(upper_city_list)) & (~df['Qewi City'].isna())]['Qewi City'].drop_duplicates()
# print(outlier_cities)

In [None]:
# Print standardized cities and found outliers

In [None]:
# standardized_cities = df.loc[(df['Qewi City'].str.upper().isin(upper_city_list)) & (~df['Qewi City'].isna())]['Qewi City'].drop_duplicates()
# print(standardized_cities)

In [None]:
# outlier_city_list = []
# for item in outlier_cities:
#     outlier_city_list.append(str(item))
    
# print(outlier_city_list)

In [None]:
# Search for similar city names in reference city dataset, and hard code to replace those outliers

In [None]:
# def findCityName(str):
#     print(city_df['city'].loc[city_df['city'].str.contains(str)].drop_duplicates())
#     print("------------------------\n")
    
# findCityName("Rich")
# findCityName("Island")
# findCityName("White")
# findCityName("Philadelphia")
# findCityName("Morris")
# findCityName("Nassau")
# findCityName("Westchester")
  

In [None]:
# outlier_city_list = ['NEW YORK', 'BKLYN', 'ROOKLYN', 'RICHMOND HILL', 'BX', 'NY', 'OLD WESTBURY', 'N.Y.', 'HOLLIS', 'MAHATTAN', 'LAKE SUCCESS', 'BROKKLYN', 'BETHESDA', 'JAMAICA', 'SECAUCUS', 'LIC', 'MASPETH', 'JAMAICA ESTATES', 'SOUTH OZONE PAR', 'BAYSIDE', 'JAM', 'PARMUS', 'KEW GARDENS', 'WOONSECKET', 'LI', 'ST. ALBANS', 'MASSAPEQUA', 'SI', 'FLORAL PARK', 'ROSLYN HEIGHTS', 'HOWARD BEACH', 'WHITEPLAINS', 'JACKSON HEIGHTS', 'REGO PARK', 'NEW HYDE PARK', 'REGO', 'ARVERNE', 'OZONE PARK', 'VALLEY STREAM', 'NEPONSIT', 'ROCKVILLE CENTR', 'BRIARWOOD', 'BRKLYN', 'MOUNT LAUREL', 'QUEEEN', 'ELMSFORD', 'NYC', 'GILLFORD', 'PARSIPPANY', 'WOODSIDE', 'LONG ISLAND CIT', 'QUEEN', 'VAALLEY STREAAM', 'BRONS', 'COLLEGE POINT', 'ROCKAWAY POINT', 'DOUGLASTON', 'ENGLEWOOD CLIFF', 'QNS', 'LYNBROOK', 'SYOSSET', 'FRESH MEADOWS', 'LITTLE NECK', 'WOODHAVEN', 'HARTSDALE', 'ATLANTIC BEACH', 'SAN JUAN CAPIST', 'CALDE PLACE', 'RIVERDALE', 'TUCKAHOE', 'SEAFORD', 'L.I.C.', 'REGO PK', "B'KLYN"]
# clean_city_list = ['NEW YORK CITY', 'BROOKLYN', 'BROOKLYN', 'RICHMOND', 'BRONX', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'MANHATTAN', 'NEW YORK CITY', 'BROOKLYN', 'BETHESDA', 'NEW YORK CITY', 'SECAUCUS', 'LONG ISLAND CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'PARAMUS', 'NEW YORK CITY', 'WOONSOCKET', 'LONG ISLAND CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'STATEN ISLAND', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'WHITE PLAINS', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'BROOKLYN', 'PHILADELPHIA', 'QUEEENS', 'NEW YORK CITY', 'NEW YORK CITY', 'GILLFORD', 'MORRIS', 'NEW YORK CITY', 'LONG ISLAND CITY', 'QUEENS', 'BRONX', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'QUEENS', 'NEW YORK CITY', 'NASSAU', 'QUEENS', 'QUEENS', 'WOODHAVEN', 'NEW YORK CITY', 'NASSAU', 'SAN JUAN CAPISTRANO', 'BROOKLYN', 'RIVERDALE', 'STATEN ISLAND', 'NASSAU', 'LONG ISLAND CITY', 'QUEENS', "BROOKLYN"]

# df['Qewi City'] = df['Qewi City'].replace(outlier_city_list, clean_city_list)

In [None]:
# Check State Column

In [None]:
#state_col = 'QEWI_STATE'
#findDateOutliers(state_col, 0.1)

In [None]:
#ds.select('QEWI_STATE').distinct()

In [None]:
# Find functional dependencies violations on City -> State

In [None]:
# from openclean.operator.collector.count import distinct
# from openclean.operator.map.violations import fd_violations

# groups = fd_violations(df, lhs='Qewi City', rhs='Qewi State')

# print('City         \t|            State')
# print('=============\t|  ===============')
# for key in groups:
#     conflicts = distinct(groups.get(key), 'Qewi State').most_common()
#     state, count = conflicts[0]
#     print('{:<12} \t| {} x {}'.format(key, count, state))
#     for state, count in conflicts[1:]:
#         print('             \t| {} x {}'.format(count, state))
#     print('-------------\t|  ---------------')

In [None]:
#There is a row that has "NEW YORK CITY" as city, but have "NJ" as State, fix its state to "NY"

In [None]:
#index = df['Qewi State'].loc[(df['Qewi City'] == "NEW YORK CITY") & (df['Qewi State'] == "NJ")].index[0]
#df['Qewi State'].update(pd.Series(['NY'], index = [index]))

In [None]:
#df['Qewi State'].loc[(df['Qewi City'] == "NEW YORK CITY") & (df['Qewi State'] == "NJ")]

In [None]:
# Apply similar operation on Owner Bus Name

In [None]:
#bn_col = "Qewi Bus Name"
#findDateOutliers(bn_col)

In [None]:
# Using clustering for Business Name takes too much time, we can only clean those empty data for now 

In [None]:
#df[bn_col] = df[bn_col].replace(['N/A', '', 'NA','NONE'], [None,None,None,None])

### Data Profilling for applicant columns

#Find format problems and outliers in all applicant columns

#Using openclean's sklearn modules to detect problems and outliers

In [184]:
from openclean.profiling.anomalies.sklearn import DBSCANOutliers

# Print the ten most frequent values for the 'Vehicle Expiration Date' column.
def findDateOutliers(column_name, eps_setting = 0.05):
    applicant_data = ds.distinct(column_name)
    print("Column: ",column_name)
    
    for rank, val in enumerate(applicant_data.most_common(10)):        
        st, freq = val
        print('{:<3} {:>8}  {:>10}'.format('{}.'.format(rank + 1), st, '{:,}'.format(freq)))

    print('\nTotal number of distinct values in {} is {}'.format(column_name, len(applicant_data)))
    print(DBSCANOutliers(eps = eps_setting).find(applicant_data))
    print('\n==================================')

In [None]:
# date_cols = []

# print("Qewi Data columns:\n")
# for col in ds.columns:
#     if 'Applicant' in col:
#         print(col)
#         date_cols.append(col)

In [None]:
# date_cols = []

# print("Applicant Data columns:\n")
# for col in ds.columns:
#     if 'Applicant' in col:
#         print(col)
#         date_cols.append(col)

# print("----------------------------\n")        
        
# for col in date_cols:
#     findDateOutliers(col, 0.1)

In [None]:
# Analysis

#the above results show the problems for the data cleaning task:
    
### For name data

#in "Applicant's First Name", "Applicant's Last Name", "Applicant Professional Title", there are many outliers which are illegal input, and there are many similar values. We need first converts evident outliers to legal values, then use kNN clusterer to standardize similar values.


### Applicant License Number

#Applicant License Number is made of 6 digits, there are outliers that do not satisfy the 6-digit format. We can not use kNN clusterer to standardize because many License Number are similar. 

In [None]:
# Data Cleaning for Applicant columns

#* how to deal with empty values has not decided yet

In [None]:
# Number mapping list to replace outliers
# outlier1 = ['', 'MR. ROSS ADAM C', 'MICHAEL', 'N. J.', 'WILLIAM 11', 'JOSEP;H``', 'DAID/11/2007', 'CHUNG   LUN', '718 9215010', 'ANTHONY', 'HSIA0-NAN', 'JOSEPH', '``````````', 'ROBERT  `', 'RAJENDRA9956700', '2', 'G.B.M.', 'EUGENE......JR', '6312100', 'CLAUDE,JR.', 'THOMAS``', 'ALAN  L', 'Nab53', 'MR. Y. B', 'J.J', 'PH8ILIP', 'I. M', 'RICHARD', 'ALBERTA S 111 D', 'P ;', 'GENECG.C. ENG &', 'J.J.', '2126202794', 'SHAW  HWA', 'HARRY         H', 'MR DOU8GLAS', '`1D', 'PAUL', 'K. T.', 'JOHN', '...NORMAN', 'EVAN   D', '7184361278BERNA', 'S.D. DON', 'KY00 SUK', 'JJ', 'YURI.`', 'MAD/Y/ARNI', 'ES ON SCH B', 'EUGENE.......JR', 'NEAL', 'F._ERIC', 'RYAN,  JR', 'AASDFASDFASDF', 'LA0-TECH', 'RODNEY   __', 'DAVID', 'G. L.', 'JAMES', 'LESLI8E', '7186054055', 'GEORGE', 'G.B.M', 'DAVID    JON', 'CHUNG---YAO', 'PETER', 'YUBUN(JACK)', 'GLEN A. L.', '1P', 'JUDE.....N.O', 'LEONARD--', 'WILLIAM', 'ANTHONY,111', 'WU(WOODY)', 'GAD/HON-AN', 'GLEN  A.L.', 'J.B. Jr.', 'LORENZO..A', 'J J', '..RAMSEY', 'HUI LI I', 'ANTONIO9', 'ROBERT', '0.BERT', 'DUMMY 2', '...JOSEPH', 'RUSSELL 111', 'THOMAS', 'H./E./CAMELLE', 'LALAL', 'M.E. P.E', 'R0OBIN VINCENT', '--young', 'AKM', 'LE1', 'IK.T.', 'LEO, JR.', 'J. Butch A. Jr.', 'WU (WOODY0', 'PAUL   N', 'CHRISTOPHER']
# mapping1 = [None, 'ROSS ADAM C', 'MICHAEL', 'N. J.', 'WILLIAM', 'JOSEPH', None, 'CHUNG LUN', None, 'ANTHONY', 'HSIA0 NAN', 'JOSEPH', None, 'ROBERT', 'RAJENDRA', None, 'G.B.M.', 'EUGENEJR', None, 'CLAUDE JR.', 'THOMAS', 'ALAN  L', 'Nab', 'MR. Y. B', 'J.J', 'PHILIP', 'I. M', 'RICHARD', 'ALBERTA', None, 'GENECG.C. ENG', 'J.J.', None, 'SHAW HWA', 'HARRYH', 'MR DOUGLAS', None, 'PAUL', 'K. T.', 'JOHN', 'NORMAN', 'EVAND', 'BERNA', 'S.D. DON', 'KY00 SUK', 'JJ', 'YURI.`', 'MADYARNI', 'ES ON SCH B', 'EUGENEJR', 'NEAL', 'FERIC', 'RYAN,  JR', 'AASDFASDFASDF', 'LA0 TECH', 'RODNEY', 'DAVID', 'G. L.', 'JAMES', 'LESLIE', None, 'GEORGE', 'G.B.M', 'DAVID JON', 'CHUNG YAO', 'PETER', 'YUBUN(JACK)', 'GLEN A. L.', None, 'JUDE N.O', 'LEONARD--', 'WILLIAM', 'ANTHONY,111', 'WU(WOODY)', 'GAD HON-AN', 'GLEN A.L.', 'J.B. Jr.', 'LORENZOA', 'J J', 'RAMSEY', 'HUI LI I', 'ANTONIO9', 'ROBERT', '0.BERT', 'DUMMY', 'JOSEPH', 'RUSSELL', 'THOMAS', 'H.E.CAMELLE', 'LALAL', 'M.E. P.E', 'R0OBIN VINCENT', 'young', 'AKM', 'LE1', 'IK.T.', 'LEO, JR.', 'J. Butch A. Jr.', 'WU (WOODY0', 'PAUL   N', 'CHRISTOPHER']

# outlier2 = ['SHARMA Number0', "0'CONNOR", 'RUSHTON    UEL', 'UDDIN   Z', 'HINKLEY 1', 'O&Number039;CONNOR, P.E.', '.OOK', 'SAMUELS111', 'O&Number039;CONNOR', 'CALIENDO', 'SMITH   JR.', 'LO  BUE', '7AN', '+-+ETTIERI', 'SMITH, 111', 'KAMEN   1', '.EE', 'MASS, 1', '.EI', 'Zagaroli 3rd', 'RINI   II', 'KAMEN   R', 'RYAN 11', 'SPI8EZIA L S', 'MUFTIC..A.I.A', 'COSTELLO9 RA A I A', 'CALVANICO', 'LLC.', 'POEPPEL, P.E.', 'HAMA07', 'HINLEY,1', '1212', "O  ' CONNELL", 'HURT,JR.,', 'WESOLOWSKI', 'CHEN', '`ING, R.A', 'MARTARELLA 111', 'Gandhi, Ph.D., P.E.', '90I', 'ENNIS 2', 'COSTELLO R A A I A', '3UI', 'N/A', 'HURT,  JR', 'LEHR,1', 'KOHLER, 111', 'GERAZOUNIS', 'Alexander,1', 'LUBOW, R.A. LEED AP', 'RINI,111', '08CZAK', '````````````````````', 'CHAO  R.A.', 'Geier 11', '08NGEL', '08SOLOWSKI', 'I11', 'HINKLEY, 1', 'RUDIKOFF, P.E.', "O'CONNOR", 'SHAH   EZ', 'MIELE, JR., P.E.', 'RITTENHOUSE 111', 'AMADI   ISIOFIA', 'HINKLEY,1', 'RENFORE````````', "O'HARA,JR.", '73020012', 'PHAGOO   I', 'BRAY.....,', 'LLL', 'BHATHIA,1', 'GANDHI, PH. D., P.E', 'KO K', 'VASSALOTTI 11', 'HURT, JR .', '0018LKLE', 'RINI -111', 'PARIHAR', 'EE', 'L00802', 'ELISE.111', 'KING , R.A', 'CHRYSLER  P E', 'LEHR 1', 'Walters   Jr.', 'LEE', 'RINI  III', 'D&Number039;ANGELO', '0UDOLPH III', 'VIEHE-NAESS 111', ',MO', '08E', '47DIKOFF', 'Yu,', '420865380', 'COPELAND', 'ZWIEFEL 3RD', 'PETERSEN', 'King, R.A.,', 'RINI, III', '7APA', 'CHEN   S', 'Hurt  Jr.', 'KATZ', 'NIZAMBAD.(P.E.)', '901BEN', '4153LOO', 'SYED-NAQVI', 'RYAN , JR.', 'K O K O R I S', 'ELISEO111', 'O&Number039;CONNELL', 'ZEID61', '---Lewis', '00CHELI', 'MOHAMMAD       +++++', 'METZLER  P E', 'BAILEY', 'GANDHI, PH. D., P.E.', 'TIEMANN.111', 'SMITH.111', 'DI GER0NIMO', 'GANDHI, PH,D., P.E', 'III', 'J C', 'MAGAMI-QAIM-MAGAMI', '+M', 'LO G1UDICE', 'HOQUE', 'RUDIKOFF', 'Y10007OR', 'SMITH,111', 'KING R A FAIA', 'RYAN III, AIA', '08AN', 'STARK 1', 'MASS', 'VICTORI0, R.A', 'RIZVI   A', '21029677', "3'CONNOR", 'Wong /  Lai', 'KAPLAN 3', 'GRAICHEN.JR./DAWN/DI', 'GROSSMAN ,PE,F.A.C.I']
# mapping2 = ['SHARMA ', "CONNOR", 'RUSHTON UEL', 'UDDIN Z', 'HINKLEY ', 'CONNOR P.E.', None, 'SAMUELS', 'CONNOR', 'CALIENDO', 'SMITH JR.', 'LO BUE', None, 'ETTIERI', 'SMITH', 'KAMEN', '.EE', 'MASS', '.EI', 'Zagaroli', 'RINI', 'KAMEN R', 'RYAN', 'SPIEZIA L S', 'MUFTIC.A.I.A', 'COSTELLO9 RA A I A', 'CALVANICO', 'LLC.', 'POEPPEL P.E.', 'HAMA', 'HINLEY', None, "CONNELL", 'HURT JR.', 'WESOLOWSKI', 'CHEN', 'ING R.A', 'MARTARELLA', 'Gandhi', None, 'ENNIS ', 'COSTELLO R A A I A', None, None, 'HUR  JR', 'LEHR', 'KOHLER 111', 'GERAZOUNIS', 'Alexander', 'LUBOW R.A. LEED AP', 'RINI',None, None, 'CHAO R.A.', 'Geier', None, 'SOLOWSKI', None, 'HINKLEY', 'RUDIKOFF, P.E.', "CONNOR", 'SHAH EZ', 'MIELE JR. P.E.', 'RITTENHOUSE', 'AMADI   ISIOFIA', 'HINKLEY', 'RENFORE', "O'HARA,JR.", None, 'PHAGOO I', 'BRAY,', 'LLL', 'BHATHIA', 'GANDHI', 'KO K', 'VASSALOTTI', 'HURT JR.',None, 'RINI', 'PARIHAR', 'EE', None, 'ELISE', 'KING R.A', 'CHRYSLER  P E', 'LEHR', 'Walters Jr.', 'LEE', 'RINI  III', 'ANGELO', '0UDOLPH III', 'VIEHE-NAESS', 'MO', '08E', None, 'Yu,', None, 'COPELAND', 'ZWIEFEL 3RD', 'PETERSEN', 'King, R.A.,', 'RINI, III', '7APA', 'CHEN   S', 'Hurt  Jr.', 'KATZ', 'NIZAMBAD.(P.E.)', None, None, None, 'RYAN JR.', 'KOKORIS', 'ELISE', 'CONNELL', None, 'Lewis', 'CHELI', 'MOHAMMAD', 'METZLER  P E', 'BAILEY', 'GANDHI', 'TIEMANN', 'SMITH', 'DI GER0NIMO', 'GANDHI', 'III', 'J C', 'MAGAMI QAIM MAGAMI', None, 'LO G1UDICE', 'HOQUE', 'RUDIKOFF', None, 'SMITH', 'KING R A FAIA', 'RYAN III AIA', None, 'STARK', 'MASS', 'VICTORI0 R.A', 'RIZVIA', None, "CONNOR", 'Wong Lai', 'KAPLAN', 'GRAICHEN.JR. DAWN DI', 'GROSSMAN']

# outlier3 = ['', '....DEMO', '050069', 'DEM. CONTR.,', 'XXXXX', 'G/C 10114H9', 'CGWC10114H99', '00', 'X S000155', '082-36-1245', 'G.G', 'LESSEE', '......GC', "'", '..OWNER', 'GC 2293', '--', 'XXXXXX', 'LS 31,721', '...GC', 'gen.cont.', 'G.C TKNumber4592', 'PE', 'RLA - 818', '.....OWNER', 'RLA 16077', 'G C', 'X 4129892', 'G. C.', 'R.L.A', 'GC 1028350', 'WC10114H99', 'LEESEE', 'GEN.CONT.', 'SIGN..HANGER', 'DEMO 20451', 'D8615', '.X', 'P.L.L.C', '..DEMO', 'G .C', 'L A', 'G.C NY11101', '32820', '....OWNER', 'GC(DEMO)', 'C0NTRACTOR', 'EXPEDITORC99792', 'X 1341946', 'TRACKNumber 1390', 'EXPED.R4466', 'PLLC 9599691', 'G.C 1110101', '029649', '(CHECK)', 'DEM. CONTR,', 'EXPEDIT(H66172)', '.........GC', 'CITY OF N Y', 'GC 1170386', 'G. C', 'CO0OWNER', '(CHECKED)', 'C.C', '23392 1159774', 'DEMO {', 'RA', 'T. 31132', '....GC', 'RLA-787', 'TRACK Number1390', 'D C', 'G.CONTR.', 'DEMO  CONT', '1GC', 'CC', 'demo G.C.', 'TRACK. Number1390', 'M.F.S.P.C.', '...DEMO', 'DEMO G C', '13328', 'GEN  CONT', 'GC 1221073', "GC;'", 'DEMO 1341946', '11234', 'G.C.,', '.....GC', 'LIC.133668259 1', '?', '0WNER', 'C10892', 'GEN..CONT']
# mapping3 = [None, 'DEMO', None, 'DEM. CONTR', None, 'G/C', 'CGWC', None, 'X S', None, 'G.G', 'LESSEE', 'GC', None, 'OWNER', 'GC', None, None, 'LS ', 'GC', 'gen.cont.', 'G.C TK', 'PE', 'RLA ', 'OWNER', 'RLA ', 'G C', 'X', 'G. C.', 'R.L.A', 'GC', 'WC', 'LEESEE', 'GEN.CONT.', 'SIGN.HANGER', 'DEMO', None,None, 'P.L.L.C', 'DEMO', 'G.C', 'L A', 'G.C ', None, 'OWNER', 'GC(DEMO)', 'C0NTRACTOR', 'EXPEDITORC', None, 'TRACK', 'EXPED.R', 'PLLC ', 'G.C', None, None, 'DEM. CONTR,', 'EXPEDIT', 'GC', None, 'GC', 'G.C', 'CO0OWNER', None, 'C.C', None, 'DEMO', 'RA', None, 'GC', 'RLA', None, 'D C', 'G.CONTR.', 'DEMO  CONT', 'GC', 'CC', 'demo G.C.', None, 'M.F.S.P.C.', 'DEMO', 'DEMO G C', None, 'GEN  CONT', 'GC ', "GC ", 'DEMO ', None, 'G.C.', 'GC', 'LIC', None, '0WNER',None, 'GEN.CONT']

# outlier4 = ['', '0000GC', '083278', 'DD5615', '0000PB', '00ASB4', 'B81923', '99998', '000N/A', '65569+', '01827O', 'R9526', 'LP0256', 'N/A', '1964', 'ISLAND', '1609', '000PW1', '00DEMO', '0688.6', '00000', '.20929', 'LP0258', '000TOR', '0D8615', '0SWITA', '818', 'O02200', 'DEMO', '196', '1075', '0000NT', '215', '0', '00000`', "D'ALTO", '0455', '22377', 'DD8615', '050579', '226', 'SWITA', 'DD6815', 'X02689']
# mapping4 = [None, '0000GC', '083278', 'DD5615', '0000PB', '00ASB4', 'B81923', '099998', '000000', '065569', '01827O', '0R9526', 'LP0256',None, '001964',None, '001609', '000PW1', '00DEMO', '006886', '000000', '020929', 'LP0258', '000TOR', '0D8615', '0SWITA', '000818', 'O02200', None, '000196', '001075', '0000NT', '000215', '000000', '000000', None, '000455', '022377', 'DD8615', '050579', '000226', None, 'DD6815', 'X02689']

# outliers = [outlier1, outlier2, outlier3, outlier4]
# mappings = [mapping1, mapping2, mapping3, mapping4]



In [None]:
# Remove evident outliers using hard coded mapping

In [None]:
# i = 0
# for col in date_cols:
#     df[col] = df[col].replace(outliers[i], mappings[i])
#     i += 1

In [None]:
## Convert similar values to suggested value using kNN clustering

In [185]:
# Cluster string using kNN clusterer (with the default n-gram setting)
# using the Levenshtein distance as the similarity measure.

from openclean.cluster.knn import knn_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan

def getClusters(col, minsize = 2):
    dba = ds.select(col).distinct()
    clusters = knn_clusters(
        values=dba,
        sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(0.75)),
        minsize=minsize
    )
    return clusters

def print_cluster(cnumber, cluster):
    item_count = 0
    print('Cluster {} (of size {})\n'.format(cnumber, len(cluster)))
    for val, count in cluster.items():
        item_count += 1
        if item_count <= 10:
            print('{} ({})'.format(val, count))
    if item_count>10:
        print(".......{} more items".format(item_count-10))
    print('\nSuggested value: {}\n\n'.format(cluster.suggestion()))

def updateUsingClusters(col, clusters, isPrint = False):
    
    orignal_list = []
    suggestion_list = []
    clusters.sort(key=lambda c: len(c), reverse=True)
       
    for i, cluster in enumerate(clusters):        
        suggestion = cluster.suggestion()
        orignal_list = []
        suggestion_list = []
        if isPrint and i < 5:
            print_cluster(i, cluster)
        
        for val, count in cluster.items(): 
            orignal_list.append(val)
            suggestion_list.append(suggestion)
            
    df[col] = df[col].replace(orignal_list, suggestion_list)

In [186]:
cluster_cols = ['Restaurant Name', 'Legal Business Name', 'Doing Business As (DBA)',]

In [187]:
for col in cluster_cols:
    print("kNN cluster for ", col)
    col_clusters = getClusters(col)
    print("updating column ", col)
    print("----------------------\nTop 5 Cluster:\n----------------------")
    updateUsingClusters(col, col_clusters, True)
    print("================")

kNN cluster for  Restaurant Name
updating column  Restaurant Name
----------------------
Top 5 Cluster:
----------------------
Cluster 0 (of size 35)

Veranda Restaurant (1)
Frank Restaurant (1)
Lasagna Restaurant (1)
Safari Restaurant (1)
Lali Restaurant (1)
Kalina Restaurant (1)
Manaba Restaurante (1)
Salvi Restaurant (1)
Amarachi Restaurant (1)
Giano Restaurant (2)
.......25 more items

Suggested value: Giano Restaurant


Cluster 1 (of size 35)

Lasagna Restaurant (1)
Caridad Restaurant (2)
Lali Restaurant (1)
Elia Restaurant (1)
Aita Restaurant (1)
Salvi Restaurant (1)
Karczma Restaurant (1)
Jahns Restaurant (1)
Palm Restaurant (1)
Kefi Restaurant (1)
.......25 more items

Suggested value: Caridad Restaurant


Cluster 2 (of size 31)

Padishah Restaurant (1)
Lasagna Restaurant (1)
Caridad Restaurant (2)
Lali Restaurant (1)
Mimi's Restaurant (1)
Kalina Restaurant (1)
Manaba Restaurante (1)
Elia Restaurant (1)
Aita Restaurant (1)
Jahns Restaurant (1)
.......21 more items

Suggested va

In [None]:
#Number Save cleaned data

In [None]:
#Number outputpath = 'cleaned_data.csv'
#Number df.to_csv(outputpath,sep=',',index=False,header=True) 

# Some discussion

We have profiled and cleaned most of the columns, we first change some of the column names so that they present right information about the data, then we look at each of these columns to detect outliers and wrong format.

However, there are still some issues, first we keep most of the empty value as NaN, and we don't know if clustering is the best way to clean the name data since it might convert similar names to one same name. And, business names are too long that we can not perform clustering on them so we only fixed empty values. Also, there are some column names in upper case, we do not know if we should convert them to lower case as other columns.

In [None]:
df.columns

## Precision and Recall

In [188]:
cleaned_columns = ['Restaurant Name', 'Legal Business Name', 'Doing Business As (DBA)','Time of Submission', 
                   "Latitude", "Longitude", "Council District", "Census Tract", "NTA"]

cleaned_columns.extend(boolean_cols)
cleaned_columns.extend(house_num_cols)


In [189]:
#for col in df.columns:
#    if 'Permittee' in col:
#        cleaned_columns.append(col)

In [190]:
df_sample_data = df_sample.rename(columns=rename_dict)


In [191]:
df_sample_data.columns

Index(['Objectid', 'Globalid', 'Seating Interest (Sidewalk/Roadway/Both)',
       'Restaurant Name', 'Legal Business Name', 'Doing Business As (DBA)',
       'Building Number', 'Street', 'Borough', 'Postcode', 'Business Address',
       'Food Service Establishment Permit Number',
       'Sidewalk Dimensions (Length)', 'Sidewalk Dimensions (Width)',
       'Sidewalk Dimensions (Area)', 'Roadway Dimensions (Length)',
       'Roadway Dimensions (Width)', 'Roadway Dimensions (Area)',
       'Approved for Sidewalk Seating', 'Approved for Roadway Seating',
       'Qualify Alcohol', 'SLA Serial Number', 'SLA License Type',
       'Landmark District or Building', 'landmarkDistrict terms',
       'healthCompliance terms', 'Time of Submission', 'Latitude', 'Longitude',
       'Community Board', 'Council District', 'Census Tract', 'BIN', 'BBL',
       'NTA'],
      dtype='object')

In [192]:
df_sample_data = df_sample_data[cleaned_columns]

In [193]:
df_temp = df.loc[df_sample_data.index][cleaned_columns].copy()

In [194]:
# for col in cleaned_columns:
#     print("column: ", col)
#     print("Original,\t Cleaned\n")
#     for i in range(50):
#         print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])
        
#     print('======================\n\n')

In [195]:
def precision(tp, fp):
    return tp/(tp+fp)

def recall(tp, fn):
    return tp/(tp+fn)

In [196]:
cleaned_columns

['Restaurant Name',
 'Legal Business Name',
 'Doing Business As (DBA)',
 'Time of Submission',
 'Latitude',
 'Longitude',
 'Council District',
 'Census Tract',
 'NTA',
 'Approved for Sidewalk Seating',
 'Approved for Roadway Seating',
 'Qualify Alcohol',
 'Landmark District or Building',
 'landmarkDistrict terms',
 'healthCompliance terms',
 'Building Number']

In [197]:
col_idx = 0

In [198]:
tp = 0
fp = 0
fn = 0

In [199]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Restaurant Name
Original,	 Cleaned

LE PAIN QUOTIDIEN 	 LE PAIN QUOTIDIEN
Tarachi 	 Tarachi
Briciola 	 Briciola
Starbucks Reserve Roastery 	 Starbucks Reserve Roastery
starbucks 	 starbucks
Judy's Spanish Restaurant 	 Judy's Spanish Restaurant
Kung Fu Tea 	 Kung Fu Tea
Blackbird's Bar and Restaurant 	 Blackbird's Bar and Restaurant
MAYA FUSION CAFE CORP 	 MAYA FUSION CAFE CORP
LE PAIN QUOTIDIEN 	 LE PAIN QUOTIDIEN
JAPAS NY 	 JAPAS NY
Blueprint Bar 	 Blueprint Bar
Hernandez 	 Hernandez
ANGEL OF HARLEM 	 ANGEL OF HARLEM
FUJI EAST 	 FUJI EAST
ANTIKA PIZZA 	 ANTIKA PIZZA
Taboo 	 Taboo
Toad Hall 	 Toad Hall
Butler Soho LLC 	 Butler Soho LLC
Lazzara's pizza Cafe & Restaurant 	 Lazzara's pizza Cafe & Restaurant
BOHEMIAN SPIRIT RESTAURANT 	 BOHEMIAN SPIRIT RESTAURANT
FANDI MATA 	 FANDI MATA
7B Horseshoe Bar 	 7B Horseshoe Bar
Ajihei 	 Ajihei
ROSA MEXICANO 	 ROSA MEXICANO
D HACK'S HALAL 	 D HACK'S HALAL
Thai72 	 Thai72
Percy's Pizza 	 Percy's Pizza
Suited 	 Suited
EL ROCHE LEBANON 	 EL

In [200]:
fn += 31

In [201]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Legal Business Name
Original,	 Cleaned

MPQ 921 BROADWAY, LLC 	 MPQ 921 BROADWAY, LLC
Tarachi NYC LLC 	 Tarachi NYC LLC
Briciola Corp 	 Briciola Corp
Siren Retail Corporation 	 Siren Retail Corporation
Sstarbucks coffee company 	 Sstarbucks coffee company
JMF Restaurant Corp 	 JMF Restaurant Corp
TKK New York LLC 	 TKK New York LLC
Best Bar Ever LLC 	 Best Bar Ever LLC
MAYA FUSION CAFE CORP 	 MAYA FUSION CAFE CORP
APQ UNION SQUARE NY, LLC 	 APQ UNION SQUARE NY, LLC
JAY ENTERTAINMENT CORP 	 JAY ENTERTAINMENT CORP
196 Fifth LLC 	 196 Fifth LLC
CHOPT Creative Salad Company 	 CHOPT Creative Salad Company
ANGEL OF HARLEM LLC 	 ANGEL OF HARLEM LLC
FUJI EAST OF ROOSEVELT ISLAND INC 	 FUJI EAST OF ROOSEVELT ISLAND INC
DON GENNARO LLC 	 DON GENNARO LLC
Navic Invest Group LLC 	 Navic Invest Group LLC
57 Grand Street Cafe Corp 	 57 Grand Street Cafe Corp
BUTLER SOHO LLC 	 BUTLER SOHO LLC
Sebastians Pizzeria. Inc 	 Sebastians Pizzeria. Inc
AMBI NEW YORK LLC 	 AMBI NEW YORK LLC
74 BAYARD S

In [202]:
fn += 26

In [203]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Doing Business As (DBA)
Original,	 Cleaned

LPQ USA, LLC - Disbursements Account 	 LPQ USA, LLC - Disbursements Account
Tarachi NYC LLC 	 Tarachi NYC LLC
Briciola Corp 	 Briciola Corp
Siren Retail Corporation 	 Siren Retail Corporation
Sstarbucks coffee company 	 Sstarbucks coffee company
JMF Restaurant Corp 	 JMF Restaurant Corp
TKK New York LLC 	 TKK New York LLC
Blackbird's 	 Blackbird's
MAYA FUSION CAFE CORP 	 MAYA FUSION CAFE CORP
LE PAIN QUOTIDIEN 	 LE PAIN QUOTIDIEN
JAPAS NY 	 JAPAS NY
196 Fifth LLC 	 196 Fifth LLC
CHOPT 	 CHOPT
ANGEL OF HARLEM 	 ANGEL OF HARLEM
FUJI EAST 	 FUJI EAST
ANTIKA PIZZA 	 ANTIKA PIZZA
Navic Invest Group LLC 	 Navic Invest Group LLC
57 Grand Street Cafe Corp 	 57 Grand Street Cafe Corp
Butler 	 Butler
Lazzara's Pizza Cafe. & Restaurant 	 Lazzara's Pizza Cafe. & Restaurant
Bohemian Spirit Restaurant 	 Bohemian Spirit Restaurant
FANDI MATA 	 FANDI MATA
7B Horseshoe Bar 	 7B Horseshoe Bar
Ajihei 	 Ajihei
Rosa Mexicano Company, Inc. 	 Rosa Mexicano

In [204]:
fn += 37

In [205]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Time of Submission
Original,	 Cleaned

07/07/2021 02:01:00 PM 	 2021-07-07 14:01:00
06/23/2020 12:19:00 PM 	 2020-06-23 12:19:00
06/20/2020 12:42:00 PM 	 2020-06-20 12:42:00
07/10/2020 01:11:00 PM 	 2020-07-10 13:11:00
07/09/2020 03:50:00 PM 	 2020-07-09 15:50:00
06/25/2020 04:29:00 PM 	 2020-06-25 16:29:00
08/03/2020 05:56:00 PM 	 2020-08-03 17:56:00
06/19/2020 02:19:00 PM 	 2020-06-19 14:19:00
06/29/2020 11:11:00 AM 	 2020-06-29 11:11:00
09/27/2020 02:27:00 PM 	 2020-09-27 14:27:00
09/30/2020 03:50:00 PM 	 2020-09-30 15:50:00
06/22/2020 12:38:00 PM 	 2020-06-22 12:38:00
07/07/2020 09:00:00 AM 	 2020-07-07 09:00:00
11/03/2020 02:10:00 PM 	 2020-11-03 14:10:00
02/25/2021 10:26:00 PM 	 2021-02-25 22:26:00
06/19/2020 04:58:00 PM 	 2020-06-19 16:58:00
08/03/2020 04:44:00 PM 	 2020-08-03 16:44:00
06/22/2020 12:57:00 PM 	 2020-06-22 12:57:00
08/30/2021 10:36:00 AM 	 2021-08-30 10:36:00
06/22/2020 12:04:00 PM 	 2020-06-22 12:04:00
03/09/2021 02:22:00 PM 	 2021-03-09 14:22:00
12/26/2

In [206]:
tp += 50

In [207]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Latitude
Original,	 Cleaned

40.739976 	 40.739976
40.687145 	 40.687145
40.763861 	 40.763861
40.74144 	 40.74144
nan 	 nan
40.786676 	 40.786676
nan 	 nan
40.763418 	 40.763418
40.685896 	 40.685896
40.732566 	 40.732566
40.750628 	 40.750628
40.676791 	 40.676791
40.728347 	 40.728347
40.808166 	 40.808166
40.759166 	 40.759166
40.764935 	 40.764935
40.749869 	 40.749869
nan 	 nan
40.720497 	 40.720497
40.753873 	 40.753873
40.769198 	 40.769198
40.719092 	 40.719092
40.724992 	 40.724992
40.704289 	 40.704289
40.738242 	 40.738242
40.683663 	 40.683663
40.777924 	 40.777924
40.729261 	 40.729261
40.709241 	 40.709241
40.767555 	 40.767555
40.784727 	 40.784727
40.746021 	 40.746021
40.752331 	 40.752331
40.722519 	 40.722519
40.726983 	 40.726983
40.741688 	 40.741688
40.731418 	 40.731418
40.740915 	 40.740915
40.849175 	 40.849175
40.855061 	 40.855061
40.736608 	 40.736608
40.774728 	 40.774728
40.853756 	 40.853756
40.702516 	 40.702516
40.744619 	 40.744619
40.732953 

In [208]:
fn += 3

In [209]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Longitude
Original,	 Cleaned

-73.989564 	 -73.989564
-73.961988 	 -73.961988
-73.987918 	 -73.987918
-74.00502 	 -74.00502
nan 	 nan
-73.950325 	 -73.950325
nan 	 nan
-73.913966 	 -73.913966
-73.91179 	 -73.91179
-73.991265 	 -73.991265
-73.981918 	 -73.981918
-73.980218 	 -73.980218
-74.005343 	 -74.005343
-73.952625 	 -73.952625
-73.952006 	 -73.952006
-73.917231 	 -73.917231
-73.995142 	 -73.995142
nan 	 nan
-73.998662 	 -73.998662
-73.989612 	 -73.989612
-73.957169 	 -73.957169
-73.949228 	 -73.949228
-73.981283 	 -73.981283
-73.928003 	 -73.928003
-73.991152 	 -73.991152
-73.83387 	 -73.83387
-73.980029 	 -73.980029
-74.001404 	 -74.001404
-74.008 	 -74.008
-73.912025 	 -73.912025
-73.973444 	 -73.973444
-73.896275 	 -73.896275
-73.950751 	 -73.950751
-73.982625 	 -73.982625
-74.001959 	 -74.001959
-73.981076 	 -73.981076
-73.954472 	 -73.954472
-73.992083 	 -73.992083
-73.936821 	 -73.936821
-73.887997 	 -73.887997
-73.987284 	 -73.987284
-73.908698 	 -73.908698
-73.889

In [210]:
fn += 3

In [211]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Council District
Original,	 Cleaned

2 	 2.0
35 	 35.0
3 	 3.0
3 	 3.0
nan 	 nan
5 	 5.0
nan 	 nan
22 	 22.0
37 	 37.0
2 	 2.0
4 	 4.0
39 	 39.0
3 	 3.0
9 	 9.0
5 	 5.0
22 	 22.0
3 	 3.0
nan 	 nan
1 	 1.0
3 	 3.0
5 	 5.0
33 	 33.0
2 	 2.0
34 	 34.0
2 	 2.0
28 	 28.0
6 	 6.0
3 	 3.0
1 	 1.0
22 	 22.0
6 	 6.0
26 	 26.0
26 	 26.0
2 	 2.0
3 	 3.0
2 	 2.0
33 	 33.0
3 	 3.0
10 	 10.0
15 	 15.0
2 	 2.0
22 	 22.0
15 	 15.0
24 	 24.0
26 	 26.0
2 	 2.0
2 	 2.0
4 	 4.0
15 	 15.0
6 	 6.0




In [212]:
fn += 3

In [213]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Census Tract
Original,	 Cleaned

56 	 56.0
231 	 231.0
133 	 133.0
83 	 83.0
nan 	 nan
15802 	 15802.0
nan 	 nan
147 	 147.0
401 	 401.0
61 	 61.0
82 	 82.0
131 	 131.0
37 	 37.0
220 	 220.0
23801 	 23801.0
63 	 63.0
101 	 101.0
nan 	 nan
45 	 45.0
113 	 113.0
126 	 126.0
515 	 515.0
32 	 32.0
427 	 427.0
52 	 52.0
98 	 98.0
157 	 157.0
65 	 65.0
1502 	 1502.0
143 	 143.0
169 	 169.0
263 	 263.0
19 	 19.0
2202 	 2202.0
49 	 49.0
68 	 68.0
565 	 565.0
54 	 54.0
265 	 265.0
391 	 391.0
50 	 50.0
113 	 113.0
391 	 391.0
214 	 214.0
7 	 7.0
61 	 61.0
34 	 34.0
76 	 76.0
23702 	 23702.0
157 	 157.0




In [214]:
fn += 3

In [215]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  NTA
Original,	 Cleaned

Hudson Yards-Chelsea-Flatiron-Union Square 	 Hudson Yards-Chelsea-Flatiron-Union Square
Clinton Hill 	 Clinton Hill
Clinton 	 Clinton
Hudson Yards-Chelsea-Flatiron-Union Square 	 Hudson Yards-Chelsea-Flatiron-Union Square
nan 	 nan
East Harlem South 	 East Harlem South
nan 	 nan
Astoria 	 Astoria
Bushwick South 	 Bushwick South
West Village 	 West Village
Murray Hill-Kips Bay 	 Murray Hill-Kips Bay
Park Slope-Gowanus 	 Park Slope-Gowanus
SoHo-TriBeCa-Civic Center-Little Italy 	 SoHo-TriBeCa-Civic Center-Little Italy
Central Harlem South 	 Central Harlem South
Lenox Hill-Roosevelt Island 	 Lenox Hill-Roosevelt Island
Astoria 	 Astoria
Midtown-Midtown South 	 Midtown-Midtown South
nan 	 nan
SoHo-TriBeCa-Civic Center-Little Italy 	 SoHo-TriBeCa-Civic Center-Little Italy
Midtown-Midtown South 	 Midtown-Midtown South
Lenox Hill-Roosevelt Island 	 Lenox Hill-Roosevelt Island
North Side-South Side 	 North Side-South Side
East Village 	 East Village
Bushwick No

In [216]:
fn += 21

In [217]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Approved for Sidewalk Seating
Original,	 Cleaned

yes 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False




In [218]:
tp += 7
fp += 43

In [219]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Approved for Roadway Seating
Original,	 Cleaned

no 	 False
no 	 False
yes 	 False
no 	 False
no 	 False
yes 	 False
no 	 False
no 	 False
no 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
no 	 False
no 	 False
yes 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
no 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
no 	 False
no 	 False




In [220]:
tp += 18
fp += 32


In [221]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Qualify Alcohol
Original,	 Cleaned

no 	 False
yes 	 False
no 	 False
yes 	 False
no 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
no 	 False
no 	 False
no 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
no 	 False
no 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
no 	 False
yes 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False




In [222]:
tp += 17
fp += 33

In [223]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Landmark District or Building
Original,	 Cleaned

no 	 False
no 	 False
no 	 False
yes 	 False
yes 	 False
no 	 False
no 	 False
no 	 False
no 	 False
no 	 False
no 	 False
no 	 False
no 	 False
no 	 False
no 	 False
no 	 False
no 	 False
yes 	 False
yes 	 False
no 	 False
yes 	 False
no 	 False
no 	 False
no 	 False
no 	 False
no 	 False
yes 	 False
no 	 False
no 	 False
no 	 False
yes 	 False
yes 	 False
no 	 False
no 	 False
yes 	 False
no 	 False
no 	 False
yes 	 False
no 	 False
yes 	 False
no 	 False
no 	 False
no 	 False
no 	 False
no 	 False
no 	 False
yes 	 False
no 	 False
no 	 False
yes 	 False




In [224]:
tp += 37
fp += 13

In [225]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  landmarkDistrict terms
Original,	 Cleaned

nan 	 False
nan 	 False
nan 	 False
yes 	 False
yes 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
yes 	 False
yes 	 False
nan 	 False
yes 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
yes 	 False
nan 	 False
nan 	 False
nan 	 False
yes 	 False
yes 	 False
nan 	 False
nan 	 False
yes 	 False
nan 	 False
nan 	 False
yes 	 False
nan 	 False
yes 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
nan 	 False
yes 	 False
nan 	 False
nan 	 False
yes 	 False




In [226]:
tp += 37
fp += 13

In [227]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  healthCompliance terms
Original,	 Cleaned

yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False
yes 	 False




In [228]:
fp += 50

In [229]:
col = cleaned_columns[col_idx]
print("column: ", col)
print("Original,\t Cleaned\n")
for i in range(50):
    print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

print('======================\n\n')
col_idx += 1

column:  Building Number
Original,	 Cleaned

undefined 	 UNDEFINED
222 	 222
370 	 370
61 	 61
124 	 124
1505 	 1505
115 	 115
41-19 	 41-19
1337 	 1337
undefined 	 UNDEFINED
undefined 	 UNDEFINED
196 	 196
200 	 200
undefined 	 UNDEFINED
undefined 	 UNDEFINED
3608 	 3608
406 	 406
57 	 57
undefined 	 UNDEFINED
221 	 221
undefined 	 UNDEFINED
undefined 	 UNDEFINED
108 	 108
156 	 156
undefined 	 UNDEFINED
undefined 	 UNDEFINED
128 	 128
190 	 190
45 	 45
undefined 	 UNDEFINED
483 	 483
4004 	 4004
44051 	 44051
210 	 210
undefined 	 UNDEFINED
384 	 384
941 	 941
48 	 48
undefined 	 UNDEFINED
2373 	 2373
123 	 123
35-19 	 35-19
2307 	 2307
138-11 	 138-11
undefined 	 UNDEFINED
64 	 64
162 	 162
43 W 	 43 W
2462 	 2462
undefined 	 UNDEFINED




In [230]:
fn += 15

In [232]:
# col = cleaned_columns[col_idx]
# print("column: ", col)
# print("Original,\t Cleaned\n")
# for i in range(50):
#     print(df_sample_data[col].iloc[i], '\t', df_temp[col].iloc[i])

# print('======================\n\n')
# col_idx += 1

In [233]:
tp

166

In [234]:
fp

184

In [235]:
fn

142

In [236]:
precision(tp, fp)

0.4742857142857143

In [237]:
recall(tp,fn)

0.538961038961039