# Data Downloading

Download the data using openClean

In [1]:
import gzip
import humanfriendly
import os

from openclean.data.source.socrata import Socrata

dataset = Socrata().dataset('ic3t-wcy2')
datafile = './ic3t-wcy2.tsv.gz'

if not os.path.isfile(datafile):
    with gzip.open(datafile, 'wb') as f:
        print('Downloading ...\n')
        dataset.write(f)


fsize = humanfriendly.format_size(os.stat(datafile).st_size)
print("Using '{}' in file {} of size {}".format(dataset.name, datafile, fsize))

Using 'DOB Job Application Filings' in file ./ic3t-wcy2.tsv.gz of size 257.98 MB


# Data Loading

Load the data into pandas and openClean dataset object

In [2]:
import pandas as pd
from openclean.pipeline import stream

df  = pd.read_csv(datafile, dtype='object', sep='\t')
ds = stream(datafile)

# Data Profilling for City and Other Description

Find format problems and outliers in City and Description columns

Using openclean's sklearn modules to detect problems and outliers

In [3]:
from openclean.profiling.anomalies.sklearn import DBSCANOutliers

# Print the ten most frequent values for the 'Vehicle Expiration Date' column.
def findDateOutliers(column_name, eps_setting = 0.05):
    applicant_data = ds.distinct(column_name)
    print("Column: ",column_name)
    
    for rank, val in enumerate(applicant_data.most_common(10)):        
        st, freq = val
        print('{:<3} {:>8}  {:>10}'.format('{}.'.format(rank + 1), st, '{:,}'.format(freq)))

    print('\nTotal number of distinct values in {} is {}'.format(column_name, len(applicant_data)))
    print(DBSCANOutliers(eps = eps_setting).find(applicant_data))
    print('\n==================================')

In [13]:
date_cols = ["City ", "Other Description"]

In [14]:
date_cols = ["City ", "Other Description"]
print("----------------------------\n")        
        
for col in date_cols:
    findDateOutliers(col, 0.1)

----------------------------

Column:  City 
1.             1,773,830
2.  BROOKLYN         309
3.  NEW YORK         273
4.    QUEENS          56
5.        NY          37
6.     BRONX          32
7.  STATEN ISLAND          19
8.       LIC          18
9.  New York          18
10. FLUSHING          15

Total number of distinct values in City  is 115
['', 'WILLIAMSBURG', 'L.I.C.', 'NEW YORK CITY', 'REGO PARK', 'New York City', 'HILLSBOROUGH', 'E. ELMHURST', 'NEW HYDE PARK', 'ST. ALBANS', 'REGO PK', 'New York', "B'KLYN", 'MANHATTAN', 'N.Y.', 'NEW YORK', 'FORT LEE']

Column:  Other Description
1.               734,887
2.  GEN. CONSTR.     316,603
3.        GC     116,243
4.  GEN. CONSTR      87,351
5.  STRUCTURAL      55,576
6.       BPP      50,685
7.  GEN CONST      42,153
8.  SOLAR TAX      33,561
9.  GEN.CONSTRUCTN.      26,544
10.   FACADE      22,633

Total number of distinct values in Other Description is 16629
['', 'S.O.E.', 'FD & S.O.E.', 'ENL17000', 'L15/73', 'B S A', 'LOCAL LAW 11

# Analysis

the above results show the problems for the data cleaning task:
    
### For City

There are many misspellings and abbreviations for city names. We can use both clustering and Soundex to detect misspellings and abbreviations. And we can check if our cleaning is right by refer to the U.S. Cities reference datasets in openclean.


### For Other Description

Other Description can be anything, so we just care about empty value and values that are too similar and are showing exactly same things (for example 'GC' and '___GC')

In [24]:
from openclean.function.eval.base import Col, Eval
from openclean.function.eval.logic import And
from openclean.function.value.phonetic import Soundex, soundex

In [27]:
upper = ds\
    .select('City ')\
    .update('City ', str.upper)

In [18]:
print('RANK\tCOUNT\tNAME')
for i, entry in enumerate(brooklyn.most_common()):
    key, count = entry
    print('{}.\t{}\t{}'.format(i + 1, count, key))

RANK	COUNT	NAME
1.	2	BROKKLYN
2.	1	BRKLYN


In [20]:
from openclean.data.refdata import RefStore

refdata = RefStore()
city_df = refdata\
    .load('encyclopaedia_britannica:us_cities', auto_download=True)\
    .df()


            city    state
0      Demopolis  Alabama
1      Sylacauga  Alabama
2           Troy  Alabama
3         Dothan  Alabama
4       Prichard  Alabama
5     Scottsboro  Alabama
6   Guntersville  Alabama
7         Auburn  Alabama
8     Huntsville  Alabama
9      Chickasaw  Alabama
10     Tuscumbia  Alabama
11      Bessemer  Alabama
12       Eufaula  Alabama
13       Opelika  Alabama
14    Fort Payne  Alabama
15     Andalusia  Alabama
16       Decatur  Alabama
17        Mobile  Alabama
18    Enterprise  Alabama
19        Jasper  Alabama


In [50]:
city_list = city_df['city']
print(city_list)

0          Demopolis
1          Sylacauga
2               Troy
3             Dothan
4           Prichard
            ...     
1956          Powell
1957        Riverton
1958        Sheridan
1959    Rock Springs
1960         Buffalo
Name: city, Length: 1961, dtype: object


# An example of using soundex in openclean

However, using soundex for each of the city is too slow, the code below take nearly 4 mins for one sningle city.\
So we should use clustering first and then use hard code to clean the remianing city name that is not in the city_list.

In [38]:
brooklyn = ds\
    .select('City ')\
    .update('City ', str.upper)\
    .filter(And(Eval('City ', Soundex()) == soundex('BROOKLYN'), Col('City ') != 'BROOKLYN'))\
    .distinct()

print('RANK\tCOUNT\tNAME')
for i, entry in enumerate(brooklyn.most_common()):
    key, count = entry
    print('{}.\t{}\t{}'.format(i + 1, count, key))

RANK	COUNT	NAME
1.	2	BROKKLYN
2.	1	BRKLYN


# Data Cleaning for Applicant columns

* how to deal with empty values has not decided yet

# Transform all city names to upper case

In [69]:
df['City '] = df['City '].str.upper()

# Convert similar values to suggested value using kNN clustering

In [7]:
# Cluster string using kNN clusterer (with the default n-gram setting)
# using the Levenshtein distance as the similarity measure.

from openclean.cluster.knn import knn_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan

def getClusters(col, minsize = 2, preds = 0.5):
    dba = ds.select(col).distinct()
    clusters = knn_clusters(
        values=dba,
        sim=SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(preds)),
        minsize=minsize
    )
    return clusters

def print_cluster(cnumber, cluster):
    print('Cluster {} (of size {})\n'.format(cnumber, len(cluster)))
    for val, count in cluster.items():
        print('{} ({})'.format(val, count))
    print('\nSuggested value: {}\n\n'.format(cluster.suggestion()))

def updateUsingClusters(col, clusters, isPrint = False):
    
    orignal_list = []
    suggestion_list = []
    clusters.sort(key=lambda c: len(c), reverse=True)
       
    for i, cluster in enumerate(clusters):        
        suggestion = cluster.suggestion()
        orignal_list = []
        suggestion_list = []
        if isPrint and i <5:
            print_cluster(i, cluster)
        
        for val, count in cluster.items(): 
            orignal_list.append(val)
            suggestion_list.append(suggestion)
    
    df[col] = df[col].replace(orignal_list, suggestion_list)

In [49]:
for col in date_cols:
    print("kNN cluster for ", col)
    col_clusters = getClusters(col)
    print("updating column ", col)
    print("----------------------\nTop 5 Cluster:\n----------------------")
    updateUsingClusters(col, col_clusters, True)
    print("================")

kNN cluster for  City 
updating column  City 
----------------------
Top 5 Cluster:
----------------------
Cluster 0 (of size 2)

NEW YORK CITY (2)
NEW YORK (273)

Suggested value: NEW YORK


Cluster 1 (of size 2)

ROOKLYN (1)
BROOKLYN (309)

Suggested value: BROOKLYN


Cluster 2 (of size 2)

MANHATTAN (8)
MAHATTAN (2)

Suggested value: MANHATTAN


Cluster 3 (of size 2)

OZONE PARK (1)
SOUTH OZONE PAR (2)

Suggested value: SOUTH OZONE PAR


Cluster 4 (of size 2)

JACKSON HEIGHTS (1)
ROSLYN HEIGHTS (1)

Suggested value: JACKSON HEIGHTS


kNN cluster for  Other Description
updating column  Other Description
----------------------
Top 5 Cluster:
----------------------
Cluster 0 (of size 773)

GEN. CONSTR. (316603)
GENERAL CONSTR (283)
GEN. CONSTR (87351)
GN CONSTRUCTION (1)
GEN CONSTRUCT (3644)
GEN.CONSTRUCTN. (26544)
GENERAL CONSTRU (2192)
CONSTR (2510)
GENERAL CONSTRT (117)
GEN. CONSTRUCT. (36)
GEN'L CONSTRUCT (343)
GENCONSTRUCTION (54)
GEN.CONSTRUCT. (345)
GEN. CONSTRUCTI (36)
GEN CONS

GEN CONSTRUT (11)
INT. CONSTRUC (4)
GENREAL CONSTRU (1)
CONSTRX (3)
GEN CONSTRXZ (1)
GEN. CONSTRN. (2)
ST,GENER CONSTR (1)
GENL CONSTRUCTI (3)
DEMO/GEN CONSTR (1)
GEN CONSTRUTION (1)
GEN. CONSTRUCTN (2)
GENR'L CONSTR (1)
GEN.CONSTRUC (2)
GEN.CONSTRUCION (1)
GEN CONSTR/ENL (1)
GEN CONSTRUCTON (1)
TEMP.CONSTR (1)
LEG. CONSTRUCTI (2)
MINOR CONSTRUTI (3)
NON CONSTRUCTIO (1)
TEMP CONSTRUCT (6)
GEN.CONSTRCTION (1)
OT-GEN. CONSTR. (1)
OT-GEN. CONSTRU (1)
GEN CONSTRUCT` (6)
ARCH/GEN CONSTR (1)
MARQUEE CONSTRU (1)
GNRL CONSTRUCTI (1)
TEMP CONSTR OFF (6)
GEN  CONSTRU (1)
DEMO.& CONSTRCT (1)
GEN CONSTR/ STR (2)
GEN CONSTR,STRU (1)
ENLGEN.CONSTRU (1)
GEN.CONSTRUC6TN (1)
GENERAL CONSTRC (4)
ENLGEN.CONSTRN. (7)
GEN. CONSTRUCTR (1)
GENCONSTR/STRUC (1)
GEN CONSTR/ST (1)
GEN.CONSTRCTN. (4)
ZGEN CONSTRUCT (1)
GENCONSTR (4)
GEN.CONSTRUCTI (27)
GEN.CONSTR/GATE (1)
GEN CONSTRU CT (8)
GEN  CONSTRUCT (6)
GEN CONSTRUCYT (1)
GEN CONSTRUC T (3)
GEN CONSTREUCT (1)
GEN CONSTRUC TG (1)
GEN NCONSTRUCT (1)
GEN CONST



# After clustering, find data that is not in the reference city dataset, hard code to clean them

In [76]:
upper_city_list = []
for item in city_list:
    upper_city_list.append(str(item.upper()))

outlier_cities = df.loc[(~df['City '].str.upper().isin(upper_city_list)) & (~df['City '].isna())]['City '].drop_duplicates()
print(outlier_cities)

2348          NEW YORK
10332            BKLYN
10466          ROOKLYN
31713    RICHMOND HILL
38548               BX
             ...      
76789         TUCKAHOE
77178          SEAFORD
78833           L.I.C.
80175          REGO PK
80367           B'KLYN
Name: City , Length: 73, dtype: object


# Print standardized cities and found outliers

In [81]:
standardized_cities = df.loc[(df['City '].str.upper().isin(upper_city_list)) & (~df['City '].isna())]['City '].drop_duplicates()
print(standardized_cities)

4040            BROOKLYN
52605              BRONX
69921      STATEN ISLAND
73527             QUEENS
73910            ASTORIA
74098             ALBANY
74206            HOBOKEN
74402       NEW ROCHELLE
74427    PORT WASHINGTON
74632           FLUSHING
74784       FOREST HILLS
74853          MANHATTAN
76042          HEMPSTEAD
76043        LOS ANGELES
76087       HILLSBOROUGH
76105             BOSTON
76107         GREAT NECK
76171          RIDGEWOOD
76187        GARDEN CITY
76219       WILLIAMSBURG
76295             CORONA
76300      NEW YORK CITY
76332          RIVERSIDE
76496           ELMHURST
76499         MORRISTOWN
76601           FORT LEE
76647          TARRYTOWN
76661           LAKEWOOD
76688         LOUISVILLE
76728            ROANOKE
76757            YONKERS
76771       WHITE PLAINS
76773         HUNTINGTON
76967        KANSAS CITY
Name: City , dtype: object


In [78]:
outlier_city_list = []
for item in outlier_cities:
    outlier_city_list.append(str(item))
    
print(outlier_city_list)

['NEW YORK', 'BKLYN', 'ROOKLYN', 'RICHMOND HILL', 'BX', 'NY', 'OLD WESTBURY', 'N.Y.', 'HOLLIS', 'MAHATTAN', 'LAKE SUCCESS', 'BROKKLYN', 'BETHESDA', 'JAMAICA', 'SECAUCUS', 'LIC', 'MASPETH', 'JAMAICA ESTATES', 'SOUTH OZONE PAR', 'BAYSIDE', 'JAM', 'PARMUS', 'KEW GARDENS', 'WOONSECKET', 'LI', 'ST. ALBANS', 'MASSAPEQUA', 'SI', 'FLORAL PARK', 'ROSLYN HEIGHTS', 'HOWARD BEACH', 'WHITEPLAINS', 'JACKSON HEIGHTS', 'REGO PARK', 'NEW HYDE PARK', 'REGO', 'ARVERNE', 'OZONE PARK', 'VAALLEY STREAAM', 'NEPONSIT', 'ROCKVILLE CENTR', 'BRIARWOOD', 'BRKLYN', 'MOUNT LAUREL', 'QUEEEN', 'ELMSFORD', 'NYC', 'GILLFORD', 'PARSIPPANY', 'WOODSIDE', 'LONG ISLAND CIT', 'QUEEN', 'BRONS', 'COLLEGE POINT', 'ROCKAWAY POINT', 'DOUGLASTON', 'ENGLEWOOD CLIFF', 'QNS', 'LYNBROOK', 'SYOSSET', 'FRESH MEADOWS', 'LITTLE NECK', 'WOODHAVEN', 'HARTSDALE', 'ATLANTIC BEACH', 'SAN JUAN CAPIST', 'CALDE PLACE', 'RIVERDALE', 'TUCKAHOE', 'SEAFORD', 'L.I.C.', 'REGO PK', "B'KLYN"]


# Search for similar city names in reference city dataset, and hard code to replace those outliers

In [123]:
def findCityName(str):
    print(city_df['city'].loc[city_df['city'].str.contains(str)].drop_duplicates())
    print("------------------------\n")
    
findCityName("Rich")
findCityName("Island")
findCityName("White")
findCityName("Philadelphia")
findCityName("Morris")
findCityName("Nassau")
findCityName("Westchester")
  

137       Richmond
1728    Richardson
1860      Richland
Name: city, dtype: object
------------------------

498       Rock Island
1060     Grand Island
1286    Staten Island
1294     Coney Island
Name: city, dtype: object
------------------------

363             White Springs
1266             White Plains
1881    White Sulphur Springs
Name: city, dtype: object
------------------------

997         Philadelphia
1418    New Philadelphia
Name: city, dtype: object
------------------------

1151    Morristown
Name: city, dtype: object
------------------------

Series([], Name: city, dtype: object)
------------------------

Series([], Name: city, dtype: object)
------------------------



In [125]:
clean_city_list = ['NEW YORK CITY', 'BROOKLYN', 'BROOKLYN', 'RICHMOND', 'BRONX', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'MANHATTAN', 'NEW YORK CITY', 'BROOKLYN', 'BETHESDA', 'NEW YORK CITY', 'SECAUCUS', 'LONG ISLAND CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'PARAMUS', 'NEW YORK CITY', 'WOONSOCKET', 'LONG ISLAND CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'STATEN ISLAND', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'WHITE PLAINS', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'BROOKLYN', 'PHILADELPHIA', 'QUEEENS', 'NEW YORK CITY', 'NEW YORK CITY', 'GILLFORD', 'MORRIS', 'NEW YORK CITY', 'LONG ISLAND CITY', 'QUEENS', 'BRONX', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'NEW YORK CITY', 'QUEENS', 'NEW YORK CITY', 'NASSAU', 'QUEENS', 'QUEENS', 'WOODHAVEN', 'NEW YORK CITY', 'NASSAU', 'SAN JUAN CAPISTRANO', 'BROOKLYN', 'RIVERDALE', 'STATEN ISLAND', 'NASSAU', 'LONG ISLAND CITY', 'QUEENS', "BROOKLYN"]

df['City '] = df['City '].replace(outlier_city_list, clean_city_list)

# Check State Column

In [152]:
state_col = 'State'
findDateOutliers(state_col, 0.1)

Column:  State
1.             1,773,830
2.        NY         981
3.        NJ          11
4.        CA           3
5.        CT           3
6.        MD           1
7.        MA           1
8.        KY           1
9.        VA           1
10.       MO           1

Total number of distinct values in State is 10
['']



In [153]:
ds.select('State').distinct()

Counter({'': 1773830,
         'NY': 981,
         'MD': 1,
         'NJ': 11,
         'CA': 3,
         'MA': 1,
         'CT': 3,
         'KY': 1,
         'VA': 1,
         'MO': 1})

# Find functional dependencies violations on City -> State

In [138]:
from openclean.operator.collector.count import distinct
from openclean.operator.map.violations import fd_violations

groups = fd_violations(df, lhs='City ', rhs='State')

print('City         \t|            State')
print('=============\t|  ===============')
for key in groups:
    conflicts = distinct(groups.get(key), 'State').most_common()
    state, count = conflicts[0]
    print('{:<12} \t| {} x {}'.format(key, count, state))
    for state, count in conflicts[1:]:
        print('             \t| {} x {}'.format(count, state))
    print('-------------\t|  ---------------')

City         	|            State
NEW YORK CITY 	| 421 x NY
             	| 1 x NJ
-------------	|  ---------------


There is a row that has "NEW YORK CITY" as city, but have "NJ" as State, fix its state to "NY"

In [149]:
index = df['State'].loc[(df['City '] == "NEW YORK CITY") & (df['State'] == "NJ")].index[0]
df['State'].update(pd.Series(['NY'], index = [index]))

In [150]:
df['State'].loc[(df['City '] == "NEW YORK CITY") & (df['State'] == "NJ")]

Series([], Name: State, dtype: object)

# Apply similar operation on Owner's Business Name

In [5]:
bn_col = "Owner's Business Name"
findDateOutliers(bn_col)

Column:  Owner's Business Name
1.       N/A     260,197
2.                44,184
3.        NA      42,506
4.     OWNER      11,053
5.  NY SCHOOL CONSTRUCTION AUTHORITY       9,758
6.      NONE       9,559
7.   NYC SCA       9,228
8.  VORNADO OFFICE MANAGEMENT       6,990
9.     NYCHA       6,652
10. SL GREEN REALTY CORP.       5,696

Total number of distinct values in Owner's Business Name is 412716



# Using clustering for Business Name takes too much time, we can only clean those empty data for now 

In [9]:
df[bn_col] = df[bn_col].replace(['N/A', '', 'NA','NONE'], ['','','',''])