In [10]:
#Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import sklearn
from shapely.geometry import multipolygon
import shapely.wkt
import geopandas
import sodapy as Socrata
import requests

In [96]:
# Download the data from NYC open data
# This dataset containts a list of addresses in NYC.
NYC = pd.read_csv('insight/NYC_WaterConnections.csv')
NYC.head()

Unnamed: 0,BBL,the_geom,OBJECTID,Address,Material_G,Record_Typ,CityOwned,Shape_Leng,Shape_Area
0,1000010010,MULTIPOLYGON (((-74.01690582604903 40.69335343...,1,1 GOVERNORS ISLAND,Not Lead,OBSERVATION,No,12277.830501,7550344.0
1,1000010101,MULTIPOLYGON (((-74.04396208819837 40.69006636...,2,1 LIBERTY ISLAND,Unknown,UNAVAILABLE,No,3940.838665,501897.2
2,1000010201,MULTIPOLYGON (((-74.04001513069849 40.70084115...,3,1 ELLIS ISLAND,Unknown,UNAVAILABLE,No,6306.267382,1148541.0
3,1000020001,MULTIPOLYGON (((-74.01202751441055 40.70003725...,4,4 SOUTH STREET,Not Lead,HISTORICAL DATA,Yes,2721.059553,100825.1
4,1000020002,MULTIPOLYGON (((-74.01111163437284 40.70102458...,5,10 SOUTH STREET,Unknown,HISTORICAL DATA,Yes,2411.870163,87244.13


In [97]:
total_addresses = NYC.shape[0]
percent_not_lead = NYC.loc[NYC['Material_G']=='Not Lead'].shape[0]/total_addresses
percent_lead = NYC.loc[NYC['Material_G']=='Potential Lead'].shape[0]/total_addresses
percent_unknown = NYC.loc[NYC['Material_G']=='Unknown'].shape[0]/total_addresses
percent_NA = NYC.loc[NYC['Material_G']=='Non-Applicable'].shape[0]/total_addresses

print('The NYC dataset contains', total_addresses, 'addresses with labels: Not Lead, Unknown, Lead, Non-Applicable.')
print('The percentage of addresses labeled Not Lead:', percent_not_lead*100)
print('The percentage of addresses labeled Lead:', percent_lead*100)
print('The percentage of addresses labeled Unknown:', percent_unknown*100)
print('The percentage of addresses labeled Non-applicable:', percent_NA*100)

The NYC dataset contains 857536 addresses with labels: Not Lead, Unknown, Lead, Non-Applicable.
The percentage of addresses labeled Not Lead: 56.081727181132926
The percentage of addresses labeled Lead: 16.133550078364056
The percentage of addresses labeled Unknown: 27.69248171505336
The percentage of addresses labeled Non-applicable: 0.09224102544966042


In [98]:
# Rename "Material_G" as Pipe Material
# Drop unknown labels
NYC.rename(columns={"Material_G": "Pipe Material"}, inplace=True)
NYC_lead=NYC[(NYC['Pipe Material']=='Lead') | (NYC['Pipe Material']=='Not Lead')  ]


In [None]:
# New York City Open Data
# https://data.cityofnewyork.us/City-Government/Property-Valuation-and-Assessment-Data/yjxr-fw8i
socrata_domain = 'data.cityofnewyork.us'
socrata_dataset_identifier = 'yjxr-fw8i'
app_token = 'ZK2rqIcpP6zuUNuyUQA0Qe6Mr'

client = Socrata.Socrata(socrata_domain, app_token)



In [None]:
# Here we match the list of labeled addresses with Property and Tax Information updated in January 2020. 
# This includes building value, dimensions, location, census tract, tax class (building type).


df_2019 = pd.DataFrame()
# Use maximum query limit (50000)
loop_size = 50000
num_loops = 151

# can maybe do where in list(bbl.astype(str))
for i in range(num_loops):
    results = client.get(socrata_dataset_identifier,
                         select = 'avland, avland2, avtot, avtot2, bble, bin, blddepth, bldfront, census_tract, community_board, council_district, easement, exland,exland2, extot, extot2, fullval, ltdepth, ltfront, nta, owner, stories, taxclass, valtype, zip',
                         limit=loop_size,
                         offset=loop_size * i)
    df_query = pd.DataFrame.from_dict(results)
    df_query.drop_duplicates(subset='bble', keep='first', inplace=True)
    df_2019 = df_2019.append(df_query[df_query['bble'].isin(list(bbl.astype(str)))],sort=False, ignore_index=False)

In [6]:
# Every building in NYC is identified by BBL code (borough, building, lot)

# The building data set includes construction year for each building by BBL
building = pd.read_csv("insight/NYC_building.csv")

# Data from API
df_2019 = pd.read_csv("insight/df_2019_unique.csv")

temp_lead = NYC_lead
temp_building = building
temp_property = df_2019
print('Shape lead:',temp_lead.shape )
print('Shape building:', temp_building.shape)
print('Shape property:', temp_property.shape)

Shape lead: (480921, 9)
Shape building: (1084844, 15)
Shape property: (619185, 26)


In [112]:
# Rename all BBL
temp_building = temp_building.rename(columns={"BASE_BBL": "BBL"})
temp_property = temp_property.rename(columns={"bble":"BBL"})

In [116]:
# Merge by BBL on labeled lead dataset.
m = pd.merge(temp_lead,temp_property,on='BBL',how='left')
n = pd.merge(m,temp_building,on='BBL',how='left')

merged_data = n.drop_duplicates(subset='BBL',keep='first')
#Building data may have multiple entries for units in buildings but each have same BBL and construction year.
merged_data.shape

(480921, 48)

In [15]:
# Census Data From the American Community Survey at the census tract level. Pulled Income, poverty, and racial demographics. 
# The datasets were grouped by county, within each by census tract

Manhattan_income = pd.read_csv('insight/Manhattan_ACS_median_income.csv')
Manhattan_poverty = pd.read_csv('insight/Manhattan_ACS_poverty.csv')
Manhattan_race = pd.read_csv('insight/Manhattan_ACS_race.csv')
Manhattan_hispanic = pd.read_csv('insight/Manhattan_hispanic.csv')

Bronx_income = pd.read_csv('insight/Bronx_ACS_median_income.csv')
Bronx_poverty = pd.read_csv('insight/Bronx_ACS_poverty.csv')
Bronx_race = pd.read_csv('insight/Bronx_ACS_race.csv')
Bronx_hispanic = pd.read_csv('insight/Bronx_hispanic.csv')

Brooklyn_income = pd.read_csv('insight/Brooklyn_ACS_median_income.csv')
Brooklyn_poverty = pd.read_csv('insight/Brooklyn_ACS_poverty.csv')
Brooklyn_race = pd.read_csv('insight/Brooklyn_ACS_race.csv')
Brooklyn_hispanic = pd.read_csv('insight/Brooklyn_hispanic.csv')

Queens_income = pd.read_csv('insight/Queens_ACS_median_income.csv')
Queens_poverty = pd.read_csv('insight/Queens_ACS_poverty.csv')
Queens_race = pd.read_csv('insight/Queens_ACS_race.csv')
Queens_hispanic = pd.read_csv('insight/Queens_hispanic.csv')

StatenIsland_income = pd.read_csv('insight/StatenIsland_ACS_median_income.csv')
StatenIsland_poverty = pd.read_csv('insight/StatenIsland_ACS_poverty.csv')
StatenIsland_race = pd.read_csv('insight/StatenIsland_ACS_race.csv')
StatenIsland_hispanic = pd.read_csv('insight/StatenIsland_hispanic.csv')

In [16]:
#Pull relevant columns and rename according to descriptions in dataset

# INCOME
Manhattan_income = Manhattan_income[['GEO.display-label','HC02_EST_VC02']]
Manhattan_income.rename(columns={"HC02_EST_VC02": "Median_Income", 'GEO.display-label':'census_tract'},inplace=True)

Bronx_income = Bronx_income[['GEO.display-label','HC02_EST_VC02']]
Bronx_income.rename(columns={"HC02_EST_VC02": "Median_Income", 'GEO.display-label':'census_tract'},inplace=True)

Brooklyn_income = Brooklyn_income[['GEO.display-label','HC02_EST_VC02']]
Brooklyn_income.rename(columns={"HC02_EST_VC02": "Median_Income", 'GEO.display-label':'census_tract'},inplace=True)

Queens_income = Queens_income[['GEO.display-label','HC02_EST_VC02']]
Queens_income.rename(columns={"HC02_EST_VC02": "Median_Income", 'GEO.display-label':'census_tract'},inplace=True)

StatenIsland_income = StatenIsland_income[['GEO.display-label','HC02_EST_VC02']]
StatenIsland_income.rename(columns={"HC02_EST_VC02": "Median_Income", 'GEO.display-label':'census_tract'},inplace=True)

# Poverty

Manhattan_poverty = Manhattan_poverty[['GEO.display-label', 'HC03_EST_VC01']]
Manhattan_poverty.rename(columns={"HC03_EST_VC01": "percent_below_poverty", 'GEO.display-label':'census_tract'},inplace=True)

Bronx_poverty = Bronx_poverty[['GEO.display-label', 'HC03_EST_VC01']]
Bronx_poverty.rename(columns={"HC03_EST_VC01": "percent_below_poverty", 'GEO.display-label':'census_tract'},inplace=True)

Brooklyn_poverty = Brooklyn_poverty[['GEO.display-label', 'HC03_EST_VC01']]
Brooklyn_poverty.rename(columns={"HC03_EST_VC01": "percent_below_poverty", 'GEO.display-label':'census_tract'},inplace=True)

Queens_poverty = Queens_poverty[['GEO.display-label', 'HC03_EST_VC01']]
Queens_poverty.rename(columns={"HC03_EST_VC01": "percent_below_poverty", 'GEO.display-label':'census_tract'},inplace=True)

StatenIsland_poverty = StatenIsland_poverty[['GEO.display-label', 'HC03_EST_VC01']]
StatenIsland_poverty.rename(columns={"HC03_EST_VC01": "percent_below_poverty", 'GEO.display-label':'census_tract'},inplace=True)


# Racial Demographics

Manhattan_race = Manhattan_race[['GEO.display-label','HD01_VD01','HD01_VD02','HD01_VD03','HD01_VD05','HD01_VD08']]
Manhattan_race.rename(columns={'HD01_VD01':'Total_pop','HD01_VD02':'white','HD01_VD03':'black','HD01_VD05':'asian','HD01_VD08':"mixed", 'GEO.display-label':'census_tract'},inplace=True)

Bronx_race = Bronx_race[['GEO.display-label','HD01_VD01','HD01_VD02','HD01_VD03','HD01_VD05','HD01_VD08']]
Bronx_race.rename(columns={'HD01_VD01':'Total_pop','HD01_VD02':'white','HD01_VD03':'black','HD01_VD05':'asian','HD01_VD08':"mixed", 'GEO.display-label':'census_tract'},inplace=True)

Brooklyn_race = Brooklyn_race[['GEO.display-label','HD01_VD01','HD01_VD02','HD01_VD03','HD01_VD05','HD01_VD08']]
Brooklyn_race.rename(columns={'HD01_VD01':'Total_pop','HD01_VD02':'white','HD01_VD03':'black','HD01_VD05':'asian','HD01_VD08':"mixed", 'GEO.display-label':'census_tract'},inplace=True)

Queens_race = Queens_race[['GEO.display-label','HD01_VD01','HD01_VD02','HD01_VD03','HD01_VD05','HD01_VD08']]
Queens_race.rename(columns={'HD01_VD01':'Total_pop','HD01_VD02':'white','HD01_VD03':'black','HD01_VD05':'asian','HD01_VD08':"mixed", 'GEO.display-label':'census_tract'},inplace=True)

StatenIsland_race = StatenIsland_race[['GEO.display-label','HD01_VD01','HD01_VD02','HD01_VD03','HD01_VD05','HD01_VD08']]
StatenIsland_race.rename(columns={'HD01_VD01':'Total_pop','HD01_VD02':'white','HD01_VD03':'black','HD01_VD05':'asian','HD01_VD08':"mixed", 'GEO.display-label':'census_tract'},inplace=True)

# Hispanic in separate dataset

Manhattan_hispanic = Manhattan_hispanic[['GEO.display-label','HD01_VD01','HD01_VD02','HD01_VD03']]
Manhattan_hispanic.rename(columns={'HD01_VD01':'total_pop_hisp_data','HD01_VD02':'hispanic','HD01_VD03':'non_hispanic', 'GEO.display-label':'census_tract'},inplace=True)

Bronx_hispanic = Bronx_hispanic[['GEO.display-label','HD01_VD01','HD01_VD02','HD01_VD03']]
Bronx_hispanic.rename(columns={'HD01_VD01':'total_pop_hisp_data','HD01_VD02':'hispanic','HD01_VD03':'non_hispanic', 'GEO.display-label':'census_tract'},inplace=True)

Broklyn_hispanic = Brooklyn_hispanic[['GEO.display-label','HD01_VD01','HD01_VD02','HD01_VD03']]
Brooklyn_hispanic.rename(columns={'HD01_VD01':'total_pop_hisp_data','HD01_VD02':'hispanic','HD01_VD03':'non_hispanic', 'GEO.display-label':'census_tract'},inplace=True)

Queens_hispanic = Queens_hispanic[['GEO.display-label','HD01_VD01','HD01_VD02','HD01_VD03']]
Queens_hispanic.rename(columns={'HD01_VD01':'total_pop_hisp_data','HD01_VD02':'hispanic','HD01_VD03':'non_hispanic', 'GEO.display-label':'census_tract'},inplace=True)

StatenIsland_hispanic = StatenIsland_hispanic[['GEO.display-label','HD01_VD01','HD01_VD02','HD01_VD03']]
StatenIsland_hispanic.rename(columns={'HD01_VD01':'total_pop_hisp_data','HD01_VD02':'hispanic','HD01_VD03':'non_hispanic', 'GEO.display-label':'census_tract'},inplace=True)


In [74]:
# Merge on Census Tract

#Manhattan
x_man = pd.merge(Manhattan_poverty, Manhattan_income
,how='outer',on='census_tract')

x_man =pd.merge(x_man, Manhattan_race
,how='outer',on='census_tract')

x_man = pd.merge(x_man, Manhattan_hispanic
,how='outer',on='census_tract')

# Bronx

x_brx = pd.merge(Bronx_poverty,Bronx_income
,how='outer',on='census_tract')

x_brx =pd.merge(x_brx, Bronx_race
,how='outer',on='census_tract')

x_brx = pd.merge(x_brx,Bronx_hispanic
,how='outer',on='census_tract')

# Brooklyn

x_bkn = pd.merge(Brooklyn_poverty,Brooklyn_income
,how='outer',on='census_tract')

x_bkn =pd.merge(x_bkn, Brooklyn_race
,how='outer',on='census_tract')

x_bkn = pd.merge(x_bkn,Brooklyn_hispanic
,how='outer',on='census_tract')

# Queens

x_qns = pd.merge(Queens_poverty,Queens_income
,how='outer',on='census_tract')

x_qns =pd.merge(x_qns, Queens_race
,how='outer',on='census_tract')

x_qns = pd.merge(x_qns,Queens_hispanic
,how='outer',on='census_tract')

#Staten Island 

x_st = pd.merge(StatenIsland_poverty,StatenIsland_income
,how='outer',on='census_tract')

x_st =pd.merge(x_st, StatenIsland_race
,how='outer',on='census_tract')

x_st = pd.merge(x_st,StatenIsland_hispanic
,how='outer',on='census_tract')

In [75]:
ACS_data = pd.DataFrame()
ACS_data = x_man.append(x_brx,ignore_index=True, sort=False)
ACS_data = ACS_data.append(x_bkn,ignore_index=True, sort=False)
ACS_data = ACS_data.append(x_qns,ignore_index=True, sort=False)
ACS_data = ACS_data.append(x_st,ignore_index=True, sort=False)
ACS_data = ACS_data.drop(columns = ['GEO.id','GEO.id2','HD02_VD01','HD02_VD02','HD02_VD03'])
ACS_data = ACS_data.drop(ACS_data.index[0]) # Row for describing column names

In [76]:
# Census Tract mus be reformatted before combining with NYC dats
ACS_data.census_tract.head()

1       Census Tract 1, New York County, New York
2    Census Tract 2.01, New York County, New York
3    Census Tract 2.02, New York County, New York
4       Census Tract 5, New York County, New York
5       Census Tract 6, New York County, New York
Name: census_tract, dtype: object

In [94]:
#ACS_data['census_tract'] = ACS_data.census_tract.str.partition(',')[0]
#ACS_data['census_tract'] = ACS_data.census_tract.str.partition('t')[2]
ACS_data['census_tract'] = pd.to_numeric(ACS_data['census_tract'],errors='coerce')
#ACS_data['census_tract'] = ACS_data['census_tract'].astype(float)
#ACS_data['census_tract']=ACS_data['census_tract'].str.strip()

#ACS_data['census_tract'] = ACS_data['census_tract'].astype(float)

ACS_data['census_tract'] = np.floor(ACS_data['census_tract']).astype(int)
#ACS_data['census_tract'] = ACS_data.census_tract.str[:-2]
ACS_data.census_tract.head()

1    1
2    2
3    2
4    5
5    6
Name: census_tract, dtype: int32

## Feature Engineering

In [None]:
# Engineering spatial features. 
# Frist the lead index. Find proportion of lead labels within 0.25 km of each building.

In [122]:
from shapely import wkt
import geopandas as gp
from shapely.geometry import Polygon, Point

locations = merged_data['the_geom_x']
merged = merged_data.copy()
merged['the_geom'] = locations.apply(wkt.loads)
merged['longitude'] = gp.GeoSeries(merged['the_geom']).centroid.x
merged['latitude'] = gp.GeoSeries(merged['the_geom']).centroid.y

gdf = geopandas.GeoDataFrame(
    merged, geometry=[Point(x, y) for x, y in zip(merged.longitude, merged.latitude)])


In [None]:
# Find proportion of lead in 0.25 km radius for each building

def lead_index_quarter(p):
    # Lat/lon radius of 0.25 km in NYC
    delta_lat_quarter = .0225 
    delta_lon_quarter = .002965
    px = p.x
    py=p.y
    cts = gdf[(gdf['latitude']<py+delta_lat_quarter) & (gdf['longitude']<px+delta_lon_quarter) & (gdf['latitude']>py-delta_lat_quarter) & (gdf['longitude']>px-delta_lon_quarter)]['Pipe Material'].value_counts(normalize=True)
    if (cts.shape[0]==2): return(cts[1])
    elif (cts.index[0]=='Lead'): return(1)
    else: return(0)

gdf = gdf.geometry.apply(lead_index_quarter)


In [None]:
# Next we compute nearest neighbor comparison. 

In [None]:
# The distance between two coordinates varies by location on Earth. Need to convert to km.

def haversine(coord1, coord2):
    import math
    
    # Coordinates in decimal degrees (e.g. 40.2, -73.9)
    lon1, lat1 = coord1.x, coord1.y
    lon2, lat2 = coord2.x, coord2.y
    R = 6371000  # radius of Earth in meters
    phi_1 = math.radians(lat1)
    phi_2 = math.radians(lat2)

    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2.0) ** 2 + math.cos(phi_1) * math.cos(phi_2) * math.sin(delta_lambda / 2.0) ** 2

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    meters = R * c  # output distance in meters
    km = meters / 1000.0  # output distance in kilometers

    return(km)


In [None]:
# Compute Nearest neighbor comparison. Dist(Closest Nonlead) - Dist(closest lead)

%%time
# Check in small radius around each building, expand if necessary
delta_lat = .0009
delta_lon = .00012


temp = geo_frame


    for index, row in geo_frame.iterrows():
    
        point = row.geometry
        px = point.x
        py = point.y
        temp = geo_frame[(geo_frame['latitude']<py+delta_lat) & (geo_frame['longitude']<px+delta_lon) & (geo_frame['latitude']>py-delta_lat) & (geo_frame['longitude']>px-delta_lon)]
    
        while (len(temp.PipeMaterial.drop(index, axis=0).value_counts()) != 2):
            delta_lat= 2*delta_lat
            delta_lon= 2*delta_lon
            temp = geo_frame[(geo_frame['latitude']<py+delta_lat) & (geo_frame['longitude']<px+delta_lon) & (geo_frame['latitude']>py-delta_lat) & (geo_frame['longitude']>px-delta_lon)]
    


        if (row.PipeMaterial == 'Lead'):
            multipoint = temp[temp['PipeMaterial'] == 'Lead'].drop(index, axis=0).geometry.unary_union
            queried_geom, nearest_geom = nearest_points(point, multipoint)
            geo_frame.loc[index, 'nearest_LSL_dist'] = haversine(point,nearest_geom)
    
            multipoint = temp[temp['PipeMaterial'] == 'Not Lead'].geometry.unary_union
            queried_geom, nearest_geom = nearest_points(point, multipoint)
            geo_frame.loc[index, 'nearest_NLSL_dist'] = haversine(point,nearest_geom)
        elif (row.PipeMaterial == 'Not Lead'):
            multipoint = temp[temp['PipeMaterial'] == 'Lead'].geometry.unary_union
            queried_geom, nearest_geom = nearest_points(point, multipoint)
            geo_frame.loc[index, 'nearest_LSL_dist'] = haversine(point,nearest_geom)
    
            multipoint = temp[temp['PipeMaterial'] == 'Not Lead'].drop(index, axis=0).geometry.unary_union
            queried_geom, nearest_geom = nearest_points(point, multipoint)
            geo_frame.loc[index, 'nearest_NLSL_dist'] = haversine(point,nearest_geom)

    
