In [64]:
import openclean
import glob
import pandas as pd
import numpy as np


# Data Downloading

Download the data using openClean

In [65]:
import gzip
import humanfriendly
import os

from openclean.data.source.socrata import Socrata

dataset = Socrata().dataset('hg8x-zxpr')
datafile = './hg8x-zxpr.tsv.gz'

if not os.path.isfile(datafile):
    with gzip.open(datafile, 'wb') as f:
        print('Downloading ...\n')
        dataset.write(f)


fsize = humanfriendly.format_size(os.stat(datafile).st_size)
print("Using '{}' in file {} of size {}".format(dataset.name, datafile, fsize))

Using 'Housing New York Units by Building' in file ./hg8x-zxpr.tsv.gz of size 256.84 KB


# Data Loading

Load the data into pandas and openClean dataset object

In [66]:
import pandas as pd
from openclean.pipeline import stream

df  = pd.read_csv(datafile, dtype='object', sep='\t')
ds = stream(datafile)

In [67]:
np.__version__

'1.18.5'

In [68]:
pd.__version__

'1.2.4'

In [69]:
import glob

In [70]:
glob.glob("*")

['Buildings-Selected-for-the-Alternative-Enforcement-hcir-3275.ipynb',
 'DOB-NOW-Build-Approved-Permits-rbx6-tga4.ipynb',
 'hcir-3275.tsv.gz',
 'hcir-3275_cleaned_data.csv',
 'hg8x-zxpr.tsv.gz',
 'hg8x-zxpr_cleaned_data.csv',
 'Housing-New-York-Units-by-Building-hg8x-zxpr.ipynb',
 'ic3t-wcy2.tsv.gz',
 'rbx6-tga4.tsv.gz',
 'rbx6-tga4_cleaned_data.csv',
 'Untitled.ipynb']

### Get some basic info about the dataset columns

In [71]:
df.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5362 entries, 0 to 5361
Data columns (total 41 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Project ID                          5362 non-null   object
 1   Project Name                        5362 non-null   object
 2   Project Start Date                  5362 non-null   object
 3   Project Completion Date             3754 non-null   object
 4   Building ID                         4341 non-null   object
 5   Number                              5362 non-null   object
 6   Street                              5362 non-null   object
 7   Borough                             5362 non-null   object
 8   Postcode                            4279 non-null   object
 9   BBL                                 4208 non-null   object
 10  BIN                                 2778 non-null   object
 11  Community Board                     5362 non-null   obje

  df.info(verbose=True, null_counts=True)


If any rows are complete duplicates, drop them

In [72]:
df = df.drop_duplicates()

Take an a look at some of the rows to get an idea of what the datset looks like

In [73]:
df

Unnamed: 0,Project ID,Project Name,Project Start Date,Project Completion Date,Building ID,Number,Street,Borough,Postcode,BBL,...,2-BR Units,3-BR Units,4-BR Units,5-BR Units,6-BR+ Units,Unknown-BR Units,Counted Rental Units,Counted Homeownership Units,All Counted Units,Total Units
0,44223,ROCHESTER SUYDAM PHASE 1,06/30/2021,,927737,335,RALPH AVENUE,Brooklyn,11233,3015560003,...,11,0,0,0,0,0,0,13,13,13
1,44223,ROCHESTER SUYDAM PHASE 1,06/30/2021,,969695,35,ROCHESTER AVENUE,Brooklyn,11233,3017090010,...,6,0,0,0,0,0,0,8,8,8
2,44223,ROCHESTER SUYDAM PHASE 1,06/30/2021,,975702,18-22,SUYDAM PLACE,Brooklyn,11233,3017090027,...,1,0,0,0,0,0,0,15,15,15
3,44223,ROCHESTER SUYDAM PHASE 1,06/30/2021,,977564,329/331,RALPH AVENUE,Brooklyn,,,...,7,0,0,0,0,0,0,10,10,10
4,58871,SHERMAN CLUSTER HDFC.YR15.FY21,06/30/2021,,64740,149,EAST 165 STREET,Bronx,10452,2024720001,...,10,0,0,0,0,0,20,0,20,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5357,55697,CONFIDENTIAL,01/14/2014,01/14/2014,,----,----,Brooklyn,,,...,0,0,0,0,0,1,0,1,1,1
5358,55773,CONFIDENTIAL,01/10/2014,01/10/2014,,----,----,Staten Island,,,...,0,0,0,0,0,1,0,1,1,1
5359,57341,CONFIDENTIAL,01/10/2014,01/10/2014,,----,----,Staten Island,,,...,0,0,0,0,0,1,0,1,1,1
5360,55647,CONFIDENTIAL,01/07/2014,01/07/2014,,----,----,Brooklyn,,,...,0,0,0,0,0,1,0,1,1,1


## Describe columns in groups so they fit on screen

In [74]:
df[df.columns[:20]].describe()

Unnamed: 0,Project ID,Project Name,Project Start Date,Project Completion Date,Building ID,Number,Street,Borough,Postcode,BBL,BIN,Community Board,Council District,Census Tract,NTA - Neighborhood Tabulation Area,Latitude,Longitude,Latitude (Internal),Longitude (Internal),Building Completion Date
count,5362,5362,5362,3754,4341,5362,5362,5362,4279,4208,2778,5362,5354,4285,4285,4279.0,4279.0,4201.0,4201.0,3970
unique,2817,1797,1255,1134,4300,2102,1143,5,143,3539,2715,61,51,643,158,4135.0,4128.0,3221.0,3495.0,1231
top,53017,CONFIDENTIAL,06/30/2021,06/20/2019,967819,----,----,Brooklyn,11221,2051410120,1077585,BK-03,42,1070,BK82,40.837941,-73.895254,40.87501,-73.828362,06/20/2019
freq,83,1021,134,103,3,1021,1021,2194,184,77,6,422,457,116,250,8.0,8.0,77.0,77.0,103


In [75]:
# Notes:
# Building Type looks binary and has 2 values + maybe NAN
# Cluster looks binary and has 2 values + maybe NAN
# Landmarked looks binary and has 4 values + maybe NAN
# Adult Establishment looks binary and has 2 values + maybe NAN
# Loft Board looks binary and has 2 values + maybe NAN
# City Owned looks binary and has 4 values + maybe NAN
# Little e looks binary and has 5 values + maybe NAN


In [76]:
df[df.columns[20:]].describe()

Unnamed: 0,Reporting Construction Type,Extended Affordability Only,Prevailing Wage Status,Extremely Low Income Units,Very Low Income Units,Low Income Units,Moderate Income Units,Middle Income Units,Other Income Units,Studio Units,...,2-BR Units,3-BR Units,4-BR Units,5-BR Units,6-BR+ Units,Unknown-BR Units,Counted Rental Units,Counted Homeownership Units,All Counted Units,Total Units
count,5362,5362,5362,5362,5362,5362,5362,5362,5362,5362,...,5362,5362,5362,5362,5362,5362,5362,5362,5362,5362
unique,2,2,2,146,176,200,89,107,4,123,...,151,78,29,9,6,20,271,118,295,351
top,Preservation,No,Non Prevailing Wage,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
freq,3172,4414,5259,4165,3338,1938,4485,4262,4533,3809,...,1942,3065,4959,5317,5342,4989,1576,3680,1025,1016


In [77]:
df.columns

Index(['Project ID', 'Project Name', 'Project Start Date',
       'Project Completion Date', 'Building ID', 'Number', 'Street', 'Borough',
       'Postcode', 'BBL', 'BIN', 'Community Board', 'Council District',
       'Census Tract', 'NTA - Neighborhood Tabulation Area', 'Latitude',
       'Longitude', 'Latitude (Internal)', 'Longitude (Internal)',
       'Building Completion Date', 'Reporting Construction Type',
       'Extended Affordability Only', 'Prevailing Wage Status',
       'Extremely Low Income Units', 'Very Low Income Units',
       'Low Income Units', 'Moderate Income Units', 'Middle Income Units',
       'Other Income Units', 'Studio Units', '1-BR Units', '2-BR Units',
       '3-BR Units', '4-BR Units', '5-BR Units', '6-BR+ Units',
       'Unknown-BR Units', 'Counted Rental Units',
       'Counted Homeownership Units', 'All Counted Units', 'Total Units'],
      dtype='object')

#### Method to get an idea of the top 10 values of a column

In [78]:
def show_vals(column_name, show_rows=10, df=df):
    print("Top {} {}:\n".format(show_rows, column_name))
    print(df[column_name].value_counts(dropna=False)[:show_rows])
    print()

### Examining Project IDs

Some repition in the Project ID's, but nothing major. We will check some of the repeated Project IDs to be sure they actually refer to the same jobs

In [79]:
df['Project ID'].value_counts(dropna=False)

53017    83
68530    79
66670    79
53157    75
52756    56
         ..
66983     1
70797     1
61713     1
61842     1
60020     1
Name: Project ID, Length: 2817, dtype: int64

Nothing weird looking here

In [80]:
df['Project ID'].min()

'44218'

In [81]:
df['Project ID'].max()

'71173'

No Project IDs starting with 0, so there are integer values, unlike Doc # which has values 01, 02, etc.

In [82]:
df.loc[df['Project ID'].str.startswith('0')]

Unnamed: 0,Project ID,Project Name,Project Start Date,Project Completion Date,Building ID,Number,Street,Borough,Postcode,BBL,...,2-BR Units,3-BR Units,4-BR Units,5-BR Units,6-BR+ Units,Unknown-BR Units,Counted Rental Units,Counted Homeownership Units,All Counted Units,Total Units


Fraction of Rows with unique job numbers

In [83]:
df['Project ID'].nunique()/df['Project ID'].count()

0.5253636702722865

Group by Project ID and check if latitude and longitude are the same all the same for the job, which would indicate different instances of the Project ID all refer to the same Job.

In [84]:
df_temp = df.sort_values(['Project ID'])

Most of these are just missing lat and long values.

The others look to be Jobs that manage multiple houses/lots in a small area, so are probably correct

In [85]:
df_temp

Unnamed: 0,Project ID,Project Name,Project Start Date,Project Completion Date,Building ID,Number,Street,Borough,Postcode,BBL,...,2-BR Units,3-BR Units,4-BR Units,5-BR Units,6-BR+ Units,Unknown-BR Units,Counted Rental Units,Counted Homeownership Units,All Counted Units,Total Units
1810,44218,MEC E. 125TH ST. PARCEL B WEST,12/31/2018,,987329,2319,3 AVENUE,Manhattan,10035,1017900046,...,129,15,0,0,0,0,297,0,297,404
3,44223,ROCHESTER SUYDAM PHASE 1,06/30/2021,,977564,329/331,RALPH AVENUE,Brooklyn,,,...,7,0,0,0,0,0,0,10,10,10
0,44223,ROCHESTER SUYDAM PHASE 1,06/30/2021,,927737,335,RALPH AVENUE,Brooklyn,11233,3015560003,...,11,0,0,0,0,0,0,13,13,13
1,44223,ROCHESTER SUYDAM PHASE 1,06/30/2021,,969695,35,ROCHESTER AVENUE,Brooklyn,11233,3017090010,...,6,0,0,0,0,0,0,8,8,8
2,44223,ROCHESTER SUYDAM PHASE 1,06/30/2021,,975702,18-22,SUYDAM PLACE,Brooklyn,11233,3017090027,...,1,0,0,0,0,0,0,15,15,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,71168,3405 FARRAGUT ROAD,06/30/2021,,995382,3405,FARRAGUT ROAD,Brooklyn,11210,3049980008,...,2,0,0,0,0,0,3,0,3,10
131,71169,1527 NEW YORK AVENUE,06/30/2021,,992764,1527,NEW YORK AVENUE,Brooklyn,11210,,...,1,0,0,0,0,0,3,0,3,8
132,71170,255 GRAFTON STREET,06/30/2021,,995443,255,GRAFTON STREET,Brooklyn,11212,3035670016,...,3,0,0,0,0,0,3,0,3,8
133,71171,133 BROWN PLACE,06/30/2021,,994447,133,BROWN PLACE,Bronx,10454,2022780053,...,1,0,0,0,0,0,5,0,5,16


Check if any Project IDs have non-digit values

In [86]:
df['Project ID'] = df['Project ID'].astype('str')

In [87]:
df.loc[(~df['Project ID'].isna())
       &(~df['Project ID'].str.isdigit())]['Project ID']

Series([], Name: Project ID, dtype: object)

All Project IDs entirely composed of digits, so we cast them to ints

In [88]:
df['Project ID'] = df['Project ID'].astype('str')

In [89]:
df['Project ID'].describe()

count      5362
unique     2817
top       53017
freq         83
Name: Project ID, dtype: object

## Examining and reparing Building IDs

Building ID's appear to be mostly ints

However, there are legitimate house numbers with dashes so we'll have to make them strings

In [90]:
show_vals('Building ID', show_rows=10)

Top 10 Building ID:

NaN       1021
967819       3
307342       2
358498       2
990327       2
322293       2
927213       2
299448       2
953668       2
331152       2
Name: Building ID, dtype: int64



Replace NaN values with empty strings, then convert column to string, and make everything uppercase


In [91]:
df['Building ID'].fillna('', inplace=True)
df['Building ID'] = df['Building ID'].astype('str')
df['Building ID'] = df['Building ID'].str.upper()

Check for numbers spelled out as words

In [92]:
df.loc[(~df['Building ID'].isna())
       &(df['Building ID'].str.isalpha())]['Building ID']

Series([], Name: Building ID, dtype: object)

Maybe the Building ID and borough were flipped in the 'manhattan' case?

In [93]:
# Nope:
df.loc[df['Building ID'].str.strip('')=='MANHATTAN'][['Building ID', 'Borough']]

Unnamed: 0,Building ID,Borough


Check if thses are empty strings:

In [94]:
df.loc[(~df['Building ID'].str.contains('\\d', regex=True))]['Building ID']

84       
111      
161      
162      
165      
       ..
5357     
5358     
5359     
5360     
5361     
Name: Building ID, Length: 1021, dtype: object

Replace spelling of numbers with their value, and remove values 'PIER',  'MANHATTAN',  'NO NUMBER'

In [95]:
df.loc[df['Building ID'].str.strip('')=='ONE', 'Building ID'] = '1'
df.loc[df['Building ID'].str.strip('')=='PIER', 'Building ID'] = ''
df.loc[df['Building ID'].str.strip('')=='MANHATTAN', 'Building ID'] = ''
df.loc[df['Building ID'].str.strip('')=='NO NUMBER', 'Building ID'] = ''


Most of these will probably be legitimate house numbers, since house numbers can have dashes

In [96]:
df.loc[(~df['Building ID'].isna())
       &(~df['Building ID'].str.isdigit())]['Building ID']

84       
111      
161      
162      
165      
       ..
5357     
5358     
5359     
5360     
5361     
Name: Building ID, Length: 1021, dtype: object

Check non-numeric Building ID's that don't have dashes

In [97]:
df.loc[(~df['Building ID'].isna())
       &(~df['Building ID'].str.isdigit())
      &(~df['Building ID'].str.contains('-', regex=False))]['Building ID'][:25]

84     
111    
161    
162    
165    
212    
213    
251    
286    
297    
298    
303    
306    
307    
333    
334    
344    
346    
349    
351    
360    
361    
366    
367    
377    
Name: Building ID, dtype: object

We see a mix of reference to the house's garage, the rear house and single letters that likely indicate apartments in multi-occupancy venues. 

We will standardize the formatting, and maintain the reference to garage, rear, and appartment, since there is no apartment column for the job.

First split the numbers and words with a space

In [98]:
df['Building ID'] = df['Building ID'].str.replace(pat='(?P<one>\\d)(?P<two>[A-Z]+)', repl='\g<one> \g<two>', regex=True)

Now we will fix the formatting for garage and 
remove references to north, south, east, west, since they should be in street #

In [99]:
df['Building ID'] = df['Building ID'].str.replace(pat='(?P<one>GAR$)', repl='GARAGE', regex=True)

In [100]:
df['Building ID'] = df['Building ID'].str.replace(pat='NORTH([A-Z]+)?', repl='', regex=True)
df['Building ID'] = df['Building ID'].str.replace(pat='EAST([A-Z]+)?', repl='', regex=True)
df['Building ID'] = df['Building ID'].str.replace(pat='SOUTH([A-Z]+)?', repl='', regex=True)
df['Building ID'] = df['Building ID'].str.replace(pat='WEST([A-Z]+)?', repl='', regex=True)

In [101]:
# Confirm that it worked correctly:
df.loc[(~df['Building ID'].isna())
       &(~df['Building ID'].str.isdigit())
       &(~df['Building ID'].str.contains('-', regex=False))]['Building ID'][:30]

84     
111    
161    
162    
165    
212    
213    
251    
286    
297    
298    
303    
306    
307    
333    
334    
344    
346    
349    
351    
360    
361    
366    
367    
377    
384    
385    
393    
447    
456    
Name: Building ID, dtype: object

### Checking additional numerical columns for coherency

In [102]:
numerical_columns = ['Extremely Low Income Units', 'Very Low Income Units',
       'Low Income Units', 'Moderate Income Units', 'Middle Income Units',
       'Other Income Units', 'Studio Units', '1-BR Units', '2-BR Units',
       '3-BR Units', '4-BR Units', '5-BR Units', '6-BR+ Units',
       'Unknown-BR Units', 'Counted Rental Units',
       'Counted Homeownership Units', 'All Counted Units', 'Total Units']

Try to convert these all to numeric values

In [103]:
for col in numerical_columns:
    df[col] = df[col].astype('str')

In [104]:
for col in numerical_columns:
    print(col)
    print(df.loc[(~df[col].str.isdigit())
      &(df[col]!='nan')][col])
    print()

Extremely Low Income Units
Series([], Name: Extremely Low Income Units, dtype: object)

Very Low Income Units
Series([], Name: Very Low Income Units, dtype: object)

Low Income Units
Series([], Name: Low Income Units, dtype: object)

Moderate Income Units
Series([], Name: Moderate Income Units, dtype: object)

Middle Income Units
Series([], Name: Middle Income Units, dtype: object)

Other Income Units
Series([], Name: Other Income Units, dtype: object)

Studio Units
Series([], Name: Studio Units, dtype: object)

1-BR Units
Series([], Name: 1-BR Units, dtype: object)

2-BR Units
Series([], Name: 2-BR Units, dtype: object)

3-BR Units
Series([], Name: 3-BR Units, dtype: object)

4-BR Units
Series([], Name: 4-BR Units, dtype: object)

5-BR Units
Series([], Name: 5-BR Units, dtype: object)

6-BR+ Units
Series([], Name: 6-BR+ Units, dtype: object)

Unknown-BR Units
Series([], Name: Unknown-BR Units, dtype: object)

Counted Rental Units
Series([], Name: Counted Rental Units, dtype: object)



##### Try to convert to numerical values now:

In [105]:
for col in numerical_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')

In [106]:
for col in numerical_columns:
    show_vals(col)

Top 10 Extremely Low Income Units:

0     4165
1      207
2      142
3       78
4       53
6       48
7       43
5       35
8       34
10      22
Name: Extremely Low Income Units, dtype: int64

Top 10 Very Low Income Units:

0     3338
1      232
2      206
3      164
6      138
4      136
5       93
7       70
8       63
12      58
Name: Very Low Income Units, dtype: int64

Top 10 Low Income Units:

0     1938
1      910
2      277
3      223
4      188
8      118
6      116
5       98
7       89
10      72
Name: Low Income Units, dtype: int64

Top 10 Moderate Income Units:

0    4485
1     280
2     124
3      91
4      57
6      33
8      25
5      23
7      16
9      12
Name: Moderate Income Units, dtype: int64

Top 10 Middle Income Units:

0    4262
3     305
1     145
2     108
5      58
6      57
4      50
7      36
8      33
9      30
Name: Middle Income Units, dtype: int64

Top 10 Other Income Units:

0    4533
1     814
2      14
3       1
Name: Other Income Units, dtype: int

Lets replace NaN values with 0 here and convert to ints

In [107]:
for col in numerical_columns:
    df[col].fillna(0, inplace=True)

In [108]:
for col in numerical_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')

In [109]:
for col in numerical_columns:
    print(col)
    print("Minimum: ", df[col].min())
    print("Maximum: ", df[col].max())
    print()

Extremely Low Income Units
Minimum:  0
Maximum:  429

Very Low Income Units
Minimum:  0
Maximum:  352

Low Income Units
Minimum:  0
Maximum:  536

Moderate Income Units
Minimum:  0
Maximum:  454

Middle Income Units
Minimum:  0
Maximum:  448

Other Income Units
Minimum:  0
Maximum:  3

Studio Units
Minimum:  0
Maximum:  569

1-BR Units
Minimum:  0
Maximum:  312

2-BR Units
Minimum:  0
Maximum:  305

3-BR Units
Minimum:  0
Maximum:  193

4-BR Units
Minimum:  0
Maximum:  52

5-BR Units
Minimum:  0
Maximum:  13

6-BR+ Units
Minimum:  0
Maximum:  28

Unknown-BR Units
Minimum:  0
Maximum:  536

Counted Rental Units
Minimum:  0
Maximum:  917

Counted Homeownership Units
Minimum:  0
Maximum:  489

All Counted Units
Minimum:  0
Maximum:  917

Total Units
Minimum:  1
Maximum:  1175



## Street

### Looking at Zoning Street

In [110]:
#Residence (R), Commerical (C), Manufacturing (M)
show_vals("Street")
show_vals("Street")

Top 10 Street:

----                       1021
SCHROEDERS AVENUE            74
FULTON STREET                43
ADAM C POWELL BOULEVARD      39
BRISTOL STREET               30
BEDFORD AVENUE               28
WEST 135 STREET              28
AMSTERDAM AVENUE             27
LAFAYETTE AVENUE             26
3 AVENUE                     25
Name: Street, dtype: int64

Top 10 Street:

----                       1021
SCHROEDERS AVENUE            74
FULTON STREET                43
ADAM C POWELL BOULEVARD      39
BRISTOL STREET               30
BEDFORD AVENUE               28
WEST 135 STREET              28
AMSTERDAM AVENUE             27
LAFAYETTE AVENUE             26
3 AVENUE                     25
Name: Street, dtype: int64



Some Street may contain invalid formats

In [111]:
#.4-4 looks weird
df["Street"].value_counts(dropna=False)

----                       1021
SCHROEDERS AVENUE            74
FULTON STREET                43
ADAM C POWELL BOULEVARD      39
BRISTOL STREET               30
                           ... 
VAN DUZER STREET              1
GOLD STREET                   1
WALLACE AVENUE                1
CARROLL STREET                1
GUY R BREWER BOULEVARD        1
Name: Street, Length: 1143, dtype: int64

In [112]:
#Checks for irregular values (values that do not start with (R), (C), (M))
df["Street"] = df["Street"].astype('str')
df.loc[(df["Street"] != "nan") & ~df["Street"].str.startswith(("C", "R", "M", "PARK", "BPC", "LH"))]["Street"].value_counts()

----                       1021
SCHROEDERS AVENUE            74
FULTON STREET                43
ADAM C POWELL BOULEVARD      39
BRISTOL STREET               30
                           ... 
FOSTER AVENUE                 1
SOUTH 4 STREET                1
ONDERDONK AVENUE              1
SCHERMERHORN STREET           1
27 STREET                     1
Name: Street, Length: 979, dtype: int64

In [113]:
#Checks for irregular values (values that do not start with (R), (C), (M))
df["Street"] = df["Street"].astype('str')
df.loc[(df["Street"] != "nan") & ~df["Street"].str.startswith(("C", "R", "M", "PARK", "BPC", "LH"))]["Street"].value_counts()

----                       1021
SCHROEDERS AVENUE            74
FULTON STREET                43
ADAM C POWELL BOULEVARD      39
BRISTOL STREET               30
                           ... 
FOSTER AVENUE                 1
SOUTH 4 STREET                1
ONDERDONK AVENUE              1
SCHERMERHORN STREET           1
27 STREET                     1
Name: Street, Length: 979, dtype: int64

### Analysis

Zoning Street had some zones that were invalid such as number only values (ex.31010)  and we changed those values to nan

The typical format for Zoning Street start with C, R, and N. There are also some special Street like PARK and BPC that we also checked

Special Street didn't have any noticable values that were out of place

## Quick look at GIS

In [114]:
for col in [ 'Latitude','Longitude', 'Latitude (Internal)', 'Longitude (Internal)']:
    show_vals(col)


Top 10 Latitude:

NaN          1083
40.837941       8
40.871873       5
40.686032       4
40.856665       3
40.828165       3
40.686914       2
40.811856       2
40.686938       2
40.658302       2
Name: Latitude, dtype: int64

Top 10 Longitude:

NaN           1083
-73.895254       8
-73.827965       5
-73.949199       3
-73.929736       3
-73.927466       3
-73.828139       3
-73.868258       3
-73.857542       3
-73.915752       2
Name: Longitude, dtype: int64

Top 10 Latitude (Internal):

NaN         1161
40.87501      77
40.73172      35
40.67613      21
40.73487      21
40.67796      18
40.67887      17
40.66291      15
40.67574      13
40.68584      13
Name: Latitude (Internal), dtype: int64

Top 10 Longitude (Internal):

NaN           1161
-73.828362      77
-73.977896      35
-73.976982      21
-73.773084      20
-73.929283      18
-73.928799      17
-73.91128       15
-73.8724        13
-73.910943      13
Name: Longitude (Internal), dtype: int64



# Data Profilling for datetime columns


Find format problems and outliers in all datetime columns

Using openclean's sklearn modules to detect problems and outliers

In [115]:
from openclean.profiling.anomalies.sklearn import DBSCANOutliers

def findDateOutliers(column_name, eps_setting = 0.05):
    print("Column: ",column_name)
    datetime_data = df[column_name]
    
    show_vals(column_name)

    print('\nTotal number of distinct values in {} is {}'.format(column_name, len(datetime_data)))
    print(DBSCANOutliers().find(datetime_data))
    print(DBSCANOutliers(eps = eps_setting).find(datetime_data))
    print('\n==================================')

In [116]:
data_cols = []


print("Datetime Data columns:\n")
for col in ds.columns:
    if 'Date' in col or 'DATE' in col:
        print(col)
        data_cols.append(col)

Datetime Data columns:

Project Start Date
Project Completion Date
Building Completion Date


In [117]:
date_cols = ["Project Start Date",
"Project Completion Date",
"Building Completion Date"]
print("----------------------------\n")        
        
for col in date_cols:
    findDateOutliers(col, 0.01)

----------------------------

Column:  Project Start Date
Top 10 Project Start Date:

06/30/2021    134
06/20/2019    108
06/27/2019     97
06/25/2015     90
06/27/2016     87
03/30/2020     79
06/26/2017     79
10/27/2015     76
06/29/2018     70
06/30/2017     66
Name: Project Start Date, dtype: int64


Total number of distinct values in Project Start Date is 5362
[]
['05/25/2018', '06/26/2014', '03/31/2021', '06/13/2019', '10/27/2015', '06/28/2018', '06/29/2017', '04/18/2019', '10/07/2015', '12/28/2016', '06/30/2020', '12/16/2014', '03/08/2016', '06/19/2018', '06/28/2016', '07/01/2016', '06/29/2015', '06/28/2017', '10/31/2017', '06/30/2014', '06/28/2021', '06/24/2016', '06/30/2017', '06/19/2015', '06/25/2014', '06/30/2021', '06/26/2018', '06/30/2015', '06/29/2016', '03/31/2020', '06/13/2017', '12/18/2015', '06/27/2016', '12/28/2017', '06/18/2014', '06/28/2019', '06/27/2018', '12/30/2015', '12/04/2014', '03/30/2020', '06/27/2014', '02/26/2015', '10/21/2020', '07/02/2018', '06/19/2014

In [118]:
df['Building Completion Date'] = df['Building Completion Date'].replace('nan', None)

In [119]:
df['Project Completion Date'] = df['Project Completion Date'].replace('nan', None)

## Fixing Datetime columns format

In [120]:
for col in date_cols:
    show_vals(col)

Top 10 Project Start Date:

06/30/2021    134
06/20/2019    108
06/27/2019     97
06/25/2015     90
06/27/2016     87
03/30/2020     79
06/26/2017     79
10/27/2015     76
06/29/2018     70
06/30/2017     66
Name: Project Start Date, dtype: int64

Top 10 Project Completion Date:

NaN           1608
06/20/2019     103
05/28/2019      84
08/03/2018      76
06/26/2017      70
12/21/2016      59
06/23/2017      58
12/18/2015      57
05/25/2018      54
10/31/2017      43
Name: Project Completion Date, dtype: int64

Top 10 Building Completion Date:

NaN           1392
06/20/2019     103
12/31/2017      73
06/26/2017      70
12/21/2016      59
12/18/2015      57
05/25/2018      54
10/31/2017      43
06/13/2017      42
07/02/2018      39
Name: Building Completion Date, dtype: int64



Check to see if any columns have values in year-month-day format

In [121]:
for col in date_cols:
    print(col, '\n', df.loc[df[col].str.contains('-', regex=False, na=False)][col], '\n\n')

Project Start Date 
 Series([], Name: Project Start Date, dtype: object) 


Project Completion Date 
 Series([], Name: Project Completion Date, dtype: object) 


Building Completion Date 
 Series([], Name: Building Completion Date, dtype: object) 




### Check the coherence of datetime values

These don't make sense, but it's not entirely clear if they should be swapped, or removed or what

In [122]:
df.loc[(pd.to_datetime(df['Project Start Date']) > pd.to_datetime(df['Project Completion Date']))
      &(~df['Project Start Date'].isna() & ~df['Project Completion Date'].isna())][['Project Start Date', 'Project Completion Date']]

Unnamed: 0,Project Start Date,Project Completion Date
529,10/29/2020,10/09/2020


# Data Profilling for City and Other Description

Find format problems and outliers in City and Description columns

Using openclean's sklearn modules to detect problems and outliers

In [123]:
from openclean.profiling.anomalies.sklearn import DBSCANOutliers

# Print the ten most frequent values for the 'Vehicle Expiration Date' column.
def findDateOutliers(column_name, eps_setting = 0.05):
    applicant_data = df[column_name]
    print("Column: ",column_name)
    
    show_vals(column_name)

    print('\nTotal number of distinct values in {} is {}'.format(column_name, len(applicant_data)))
    print(DBSCANOutliers(eps = eps_setting).find(applicant_data))
    print('\n==================================')

In [124]:
date_cols = ["Borough",'Street']

In [125]:
print("----------------------------\n")        
        
for col in date_cols:
    findDateOutliers(col, 0.1)

----------------------------

Column:  Borough
Top 10 Borough:

Brooklyn         2194
Bronx            1291
Manhattan         983
Queens            654
Staten Island     240
Name: Borough, dtype: int64


Total number of distinct values in Borough is 5362
['Brooklyn', 'Queens', 'Staten Island', 'Bronx', 'Manhattan']

Column:  Street
Top 10 Street:

----                       1021
SCHROEDERS AVENUE            74
FULTON STREET                43
ADAM C POWELL BOULEVARD      39
BRISTOL STREET               30
BEDFORD AVENUE               28
WEST 135 STREET              28
AMSTERDAM AVENUE             27
LAFAYETTE AVENUE             26
3 AVENUE                     25
Name: Street, dtype: int64


Total number of distinct values in Street is 5362
['----', '31 ROAD', '62 DRIVE', 'AVENUE A', 'E. 138TH STREET', 'BROADWAY', 'WEST 137TH ST', '30 DRIVE', 'DR M L KING JR BOULEVARD']



# Save cleaned data

In [126]:
outputpath = datafile[2:11]+'_cleaned_data.csv'
df.to_csv(outputpath,sep=',',index=False,header=True) 

# Precision and Recall

In [130]:
df2  = pd.read_csv(datafile, dtype='object', sep='\t')
df2 = df2.drop_duplicates()

sample_size = 50

df_sample_data =  df2.sample(sample_size).copy()

In [131]:
cleaned_columns = ['Project ID','Building ID','Extremely Low Income Units', 'Very Low Income Units',
       'Low Income Units', 'Moderate Income Units', 'Middle Income Units',
       'Other Income Units', 'Studio Units', '1-BR Units', '2-BR Units',
       '3-BR Units', '4-BR Units', '5-BR Units', '6-BR+ Units',
       'Unknown-BR Units', 'Counted Rental Units',
       'Counted Homeownership Units', 'All Counted Units', 'Total Units',"Street",'Latitude','Longitude',
        'Latitude (Internal)', 'Longitude (Internal)','Project Start Date','Project Completion Date','Building Completion Date',
        "Borough",'Street'
                   
]

In [132]:
df_sample_data = df_sample_data[cleaned_columns]

In [133]:
def precision(tp, fp):
    return tp/(tp+fp)

def recall(tp, fn):
    return tp/(tp+fn)

In [134]:
print('sample size: ',sample_size)
print('total size: ',sample_size * len(cleaned_columns))
print('======================\n\n')

same = 0

for col in cleaned_columns:
    print("column: ", col)
    print("Original,\t Cleaned\n")
    for i in range(sample_size):
        if df[col].iloc[i]== df2[col].iloc[i]:
            same += 1
        else:
            print(df[col].iloc[i], '\t', df2[col].iloc[i], '\t*')
    
    print('.......', same, ' same records ......\n')
    same = 0
    
    print('======================\n\n')

sample size:  50
total size:  1500


column:  Project ID
Original,	 Cleaned

....... 50  same records ......



column:  Building ID
Original,	 Cleaned

....... 50  same records ......



column:  Extremely Low Income Units
Original,	 Cleaned

0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
3 	 3 	*
15 	 15 	*
7 	 7 	*
27 	 27 	*
6 	 6 	*
10 	 10 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
1 	 1 	*
1 	 1 	*
2 	 2 	*
7 	 7 	*
0 	 0 	*
33 	 33 	*
12 	 12 	*
11 	 11 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
1 	 1 	*
0 	 0 	*
2 	 2 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
....... 0  same records ......



column:  Very Low Income Units
Original,	 Cleaned

0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
17 	 17 	*
9 	 9 	*
12 	 12 	*
36 	 36 	*
30 	 30 	*
12 	 12 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0

In [135]:
print('sample size: ',sample_size)
print('total size: ',sample_size * len(cleaned_columns))
print('======================\n\n')

same = 0

for col in cleaned_columns:
    print("column: ", col)
    print("Original,\t Cleaned\n")
    for i in range(sample_size):
        if df[col].iloc[i]== df2[col].iloc[i]:
            print(df[col].iloc[i], '\t', df2[col].iloc[i], '\t')
        else:
            print(df[col].iloc[i], '\t', df2[col].iloc[i], '\t*')

    print('======================\n\n')

sample size:  50
total size:  1500


column:  Project ID
Original,	 Cleaned

44223 	 44223 	
44223 	 44223 	
44223 	 44223 	
44223 	 44223 	
58871 	 58871 	
58871 	 58871 	
58871 	 58871 	
58871 	 58871 	
58871 	 58871 	
58871 	 58871 	
64543 	 64543 	
64543 	 64543 	
64543 	 64543 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65315 	 65315 	
65358 	 65358 	
65358 	 65358 	
65358 	 65358 	
65358 	 65358 	
65358 	 65358 	
65358 	 65358 	
65358 	 65358 	
65358 	 65358 	
66909 	 66909 	
67910 	 67910 	
67910 	 67910 	
67910 	 67910 	
67910 	 67910 	
67910 	 67910 	
67910 	 67910 	
69280 	 69280 	
69428 	 69428 	
69763 	 69763 	
69763 	 69763 	
69763 	 69763 	
69763 	 69763 	
69763 	 69763 	
69763 	 69763 	
69763 	 69763 	
69765 	 69765 	
69765 	 69765 	


column:  Building ID
Original,	 Cleaned

927737 	 927737 	
969695 	 969695 	
975702 	 975702 	
977564 	 977564 	
64740 	 

0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*


column:  Counted Rental Units
Original,	 Cleaned

0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
20 	 20 	*
24 	 24 	*
20 	 20 	*
64 	 64 	*
37 	 37 	*
22 	 22 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
2 	 2 	*
2 	 2 	*
2 	 2 	*
2 	 2 	*
2 	 2 	*
2 	 2 	*
1 	 1 	*
2 	 2 	*
1 	 1 	*
2 	 2 	*
2 	 2 	*
9 	 9 	*
8 	 8 	*
9 	 9 	*
15 	 15 	*
15 	 15 	*
16 	 16 	*
20 	 20 	*
18 	 18 	*
133 	 133 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
54 	 54 	*
207 	 207 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*


column:  Counted Homeownership Units
Original,	 Cleaned

13 	 13 	*
8 	 8 	*
15 	 15 	*
10 	 10 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
0 	 0 	*
9 	 9 	*
21 	 21 	*
14 	 14 	*
1 	 1 	*
1 	 1 	*
1 	 1 	*
1 	 1 	*
1 	 1 	*
1 	 1 	*
1 	 1 	*
1

Bronx 	 Bronx 	
Queens 	 Queens 	
Queens 	 Queens 	
Queens 	 Queens 	
Queens 	 Queens 	
Queens 	 Queens 	
Queens 	 Queens 	
Queens 	 Queens 	
Queens 	 Queens 	
Queens 	 Queens 	
Queens 	 Queens 	


column:  Street
Original,	 Cleaned

RALPH AVENUE 	 RALPH AVENUE 	
ROCHESTER AVENUE 	 ROCHESTER AVENUE 	
SUYDAM PLACE 	 SUYDAM PLACE 	
RALPH AVENUE 	 RALPH AVENUE 	
EAST 165 STREET 	 EAST 165 STREET 	
EAST 166 STREET 	 EAST 166 STREET 	
MORRIS AVENUE 	 MORRIS AVENUE 	
SHERMAN AVENUE 	 SHERMAN AVENUE 	
SHERMAN AVENUE 	 SHERMAN AVENUE 	
TELLER AVENUE 	 TELLER AVENUE 	
AVENUE C 	 AVENUE C 	
EAST 10 STREET 	 EAST 10 STREET 	
EAST 11 STREET 	 EAST 11 STREET 	
HANCOCK STREET 	 HANCOCK STREET 	
WILLOUGHBY AVENUE 	 WILLOUGHBY AVENUE 	
BAINBRIDGE STREET 	 BAINBRIDGE STREET 	
BERGEN STREET 	 BERGEN STREET 	
JEFFERSON AVENUE 	 JEFFERSON AVENUE 	
MALCOLM X BOULEVARD 	 MALCOLM X BOULEVARD 	
CHAUNCEY STREET 	 CHAUNCEY STREET 	
CHAUNCEY STREET 	 CHAUNCEY STREET 	
BUFFALO AVENUE 	 BUFFALO AVENUE 	
ATLANTIC A

In [142]:
tp = 56
fp = 4

fn = 17
tn = 1500 - 56 - 4 - 17

In [143]:
precision(tp, fp)

0.9333333333333333

In [144]:
recall(tp,fn)

0.7671232876712328