In [1]:
#Import necessary libraries
import math
import os

import geopandas
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import shapely.wkt
import sklearn
import sodapy as Socrata
from shapely.geometry import multipolygon
from shapely.ops import nearest_points

In [3]:
# Load the NYC water service line records from a local CSV file (an export of the
# NYC Open Data dataset) and report how the addresses split across the four
# 'Material_G' labels.
NYC = pd.read_csv('insight/NYC_WaterConnections.csv')

total_addresses = NYC.shape[0]

# Count every label in one pass instead of filtering the full frame once per label.
# `.get(label, 0)` reports a label that is absent from the file as 0 rather than
# raising KeyError; `int(...)` keeps the results plain Python numbers.
label_counts = NYC['Material_G'].value_counts()

# NOTE: despite the `percent_` prefix these hold fractions in [0, 1]; they are only
# scaled by 100 when printed below.
percent_not_lead = int(label_counts.get('Not Lead', 0))/total_addresses
percent_lead = int(label_counts.get('Potential Lead', 0))/total_addresses
percent_unknown = int(label_counts.get('Unknown', 0))/total_addresses
percent_NA = int(label_counts.get('Non-Applicable', 0))/total_addresses

# The label stored in the data is 'Potential Lead' (it is the value counted above),
# so the summary line names it that way.
print('The NYC dataset contains', total_addresses, 'addresses with labels: Not Lead, Unknown, Potential Lead, Non-Applicable.')
print('The percentage of addresses labeled Not Lead:', percent_not_lead*100)
print('The percentage of addresses labeled Lead:', percent_lead*100)
print('The percentage of addresses labeled Unknown:', percent_unknown*100)
print('The percentage of addresses labeled Non-applicable:', percent_NA*100)

The NYC dataset contains 857536 addresses with labels: Not Lead, Unknown, Potential Lead, Non-Applicable.
The percentage of addresses labeled Not Lead: 56.081727181132926
The percentage of addresses labeled Lead: 16.133550078364056
The percentage of addresses labeled Unknown: 27.69248171505336
The percentage of addresses labeled Non-applicable: 0.09224102544966042


In [15]:
# Keep only the addresses whose pipe material is known, dropping the 'Unknown' and
# 'Non-Applicable' labels.
NYC = NYC.rename(columns={"Material_G": "Pipe Material"})

# The raw file labels lead service lines 'Potential Lead', while the later
# nearest-neighbour cell compares the material against 'Lead'. Normalise the label
# before filtering; comparing the raw column against 'Lead' would match no lead rows
# and silently drop them all.
pipe_material = NYC['Pipe Material'].replace({'Potential Lead': 'Lead'})
known_material = pipe_material.isin(['Lead', 'Not Lead'])

# `.copy()` makes NYC_lead an independent frame, so writing the normalised label
# into it neither touches NYC nor triggers SettingWithCopyWarning.
NYC_lead = NYC.loc[known_material].copy()
NYC_lead['Pipe Material'] = pipe_material.loc[known_material]

NYC_lead.shape


(619272, 9)

In [None]:
# New York City Open Data
# https://data.cityofnewyork.us/City-Government/Property-Valuation-and-Assessment-Data/yjxr-fw8i
socrata_domain = 'data.cityofnewyork.us'
socrata_dataset_identifier = 'yjxr-fw8i'

# The app token is a credential and must not be committed with the notebook.
# Export it before starting Jupyter, e.g. `export SOCRATA_APP_TOKEN=...`.
# When the variable is unset the token is None and the client is built without one.
# NOTE(review): a token was previously hardcoded in this cell; treat it as leaked
# and revoke it in the Socrata developer settings.
app_token = os.environ.get('SOCRATA_APP_TOKEN')

client = Socrata.Socrata(socrata_domain, app_token)



In [None]:
# Here we match the list of labeled addresses with Property and Tax Information updated in January 2020.
# This includes building value, dimensions, location, census tract, tax class (building type).
#
# The dataset is fetched page by page; from each page only the rows whose 'bble'
# appears in `bbl` are kept.
# NOTE(review): `bbl` is not defined in the visible cells of this notebook; it is
# presumably the BBL column of the labeled addresses -- confirm it exists before
# this cell runs on a fresh kernel.

loop_size = 50000  # rows requested per page
num_loops = 151    # upper bound on the number of pages requested

selected_columns = 'avland, avland2, avtot, avtot2, bble, bin, blddepth, bldfront, census_tract, community_board, council_district, easement, exland,exland2, extot, extot2, fullval, ltdepth, ltfront, nta, owner, stories, taxclass, valtype, zip'

# Build the lookup once, outside the loop; a set makes each `isin` membership test cheap.
wanted_bbles = set(bbl.astype(str))

# Collect the matching rows of every page and concatenate once at the end. Growing a
# DataFrame with `append` inside the loop re-copies all previous rows on every
# iteration, and `DataFrame.append` no longer exists in pandas 2.x.
matched_pages = []
for i in range(num_loops):
    results = client.get(socrata_dataset_identifier,
                         select=selected_columns,
                         order=':id',  # fixed ordering so limit/offset pages neither overlap nor skip rows
                         limit=loop_size,
                         offset=loop_size * i)
    if not results:
        # Past the last page: nothing more to fetch. An empty page would also have
        # no 'bble' column for `drop_duplicates` to work on.
        break
    df_query = pd.DataFrame.from_dict(results)
    df_query = df_query.drop_duplicates(subset='bble', keep='first')
    matched_pages.append(df_query[df_query['bble'].isin(wanted_bbles)])

# `pd.concat` raises on an empty list, so fall back to an empty frame in that case.
df_2019 = (pd.concat(matched_pages, sort=False, ignore_index=False)
           if matched_pages else pd.DataFrame())

In [17]:
# Buildings in NYC are keyed by their BBL code (borough, block, lot).
# The building file carries the construction year of each building, per BBL.
building = pd.read_csv("insight/NYC_building.csv")

# Property/tax records that were already matched to the labeled addresses.
df_2019 = pd.read_csv("insight/df_2019_unique.csv")

# Working aliases: these are the same objects as the frames above, not copies.
temp_lead, temp_building, temp_property = NYC_lead, building, df_2019

print('Shape lead:', temp_lead.shape)
print('Shape building:', temp_building.shape)
print('Shape property:', temp_property.shape)

Shape lead: (619272, 9)
Shape building: (1084844, 15)
Shape property: (619185, 26)


## Feature Engineering

In [None]:
# Distance between two latitude/longitude coordinates depends on location on Earth.

def haversine(coord1, coord2):
    """Return the great-circle distance between two points, in kilometers.

    Parameters
    ----------
    coord1, coord2 : objects with ``.x`` and ``.y`` attributes
        ``.x`` is read as the longitude and ``.y`` as the latitude, both in
        decimal degrees (e.g. x=-73.9, y=40.2).

    Returns
    -------
    float
        Distance in kilometers on a sphere of radius 6371 km.
    """
    lon1, lat1 = coord1.x, coord1.y
    lon2, lat2 = coord2.x, coord2.y
    R = 6371000  # radius of Earth in meters

    phi_1 = math.radians(lat1)
    phi_2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2.0) ** 2 + math.cos(phi_1) * math.cos(phi_2) * math.sin(delta_lambda / 2.0) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    meters = R * c
    return meters / 1000.0


In [None]:
# Compute Nearest

%%time
delta_lat = .0009
delta_lon = .00012


temp = geo_frame


    for index, row in geo_frame.iterrows():
    
        point = row.geometry
        px = point.x
        py = point.y
        temp = geo_frame[(geo_frame['latitude']<py+delta_lat) & (geo_frame['longitude']<px+delta_lon) & (geo_frame['latitude']>py-delta_lat) & (geo_frame['longitude']>px-delta_lon)]
    
        while (len(temp.PipeMaterial.drop(index, axis=0).value_counts()) != 2):
            delta_lat= 2*delta_lat
            delta_lon= 2*delta_lon
            temp = geo_frame[(geo_frame['latitude']<py+delta_lat) & (geo_frame['longitude']<px+delta_lon) & (geo_frame['latitude']>py-delta_lat) & (geo_frame['longitude']>px-delta_lon)]
    


        if (row.PipeMaterial == 'Lead'):
            multipoint = temp[temp['PipeMaterial'] == 'Lead'].drop(index, axis=0).geometry.unary_union
            queried_geom, nearest_geom = nearest_points(point, multipoint)
            geo_frame.loc[index, 'nearest_LSL_dist'] = haversine(point,nearest_geom)
    
            multipoint = temp[temp['PipeMaterial'] == 'Not Lead'].geometry.unary_union
            queried_geom, nearest_geom = nearest_points(point, multipoint)
            geo_frame.loc[index, 'nearest_NLSL_dist'] = haversine(point,nearest_geom)
        elif (row.PipeMaterial == 'Not Lead'):
            multipoint = temp[temp['PipeMaterial'] == 'Lead'].geometry.unary_union
            queried_geom, nearest_geom = nearest_points(point, multipoint)
            geo_frame.loc[index, 'nearest_LSL_dist'] = haversine(point,nearest_geom)
    
            multipoint = temp[temp['PipeMaterial'] == 'Not Lead'].drop(index, axis=0).geometry.unary_union
            queried_geom, nearest_geom = nearest_points(point, multipoint)
            geo_frame.loc[index, 'nearest_NLSL_dist'] = haversine(point,nearest_geom)

    
