In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Execute functions.py inside this notebook's interactive namespace (-i), so
# the names it defines become available to the cells below.
# NOTE(review): bin_data, used later, is presumably defined there — confirm.
%run -i ../notebooks/functions.py

In [3]:
# Load the NTA centroid coordinates: only the latitude/longitude columns of
# the geocoded population table are kept.
NTA = pd.read_csv('../processed_data/geocoded_population.csv')[['latitude', 'longitude']]
print(NTA.shape)
NTA.head(n=5)

(195, 2)


Unnamed: 0,latitude,longitude
0,40.768352,-73.809546
1,40.763352,-73.868396
2,40.734894,-73.783716
3,40.755734,-73.983503
4,40.818055,-73.856188


## Data Binning

In [4]:
# Read every processed input table. The paths are listed in the same order as
# the names they are unpacked into.
(
    population_df,
    traffic_df,
    pedestrian_df,
    library_df,
    parks_df,
    hotels_df,
    facilities_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../processed_data/geocoded_population.csv',
        '../processed_data/geocoded_traffic.csv',
        '../processed_data/geocoded_pedestrian.csv',
        '../processed_data/libraries.csv',
        '../processed_data/parks.csv',
        '../processed_data/hotels.csv',
        '../processed_data/facilities.csv',
    )
)

In [5]:
# Preview the first rows of the population table
population_df.head(n=5)

Unnamed: 0,latitude,longitude,Population
0,40.768352,-73.809546,51739
1,40.763352,-73.868396,23150
2,40.734894,-73.783716,17812
3,40.755734,-73.983503,28630
4,40.818055,-73.856188,53686


In [6]:
# Bin each simple table with bin_data, pairing every frame with the
# aggregation name passed as the second argument ('sum' or 'count').
# Calls happen in the listed order, and results are unpacked in that order.
(
    population_bin,
    traffic_bin,
    pedestrian_bin,
    library_bin,
    parks_bin,
    hotels_bin,
) = (
    bin_data(frame, aggregation)
    for frame, aggregation in (
        (population_df, 'sum'),
        (traffic_df, 'sum'),
        (pedestrian_df, 'sum'),
        (library_df, 'count'),
        (parks_df, 'sum'),
        (hotels_df, 'count'),
    )
)

In [7]:
# Sanity check: dimensions and first rows of the binned traffic table
print(traffic_bin.shape)
traffic_bin.head(n=5)

(195, 3)


Unnamed: 0,NTA_lat,NTA_long,traffic
0,40.525528,-74.233554,104.606481
1,40.528645,-74.187734,446.743056
2,40.540334,-74.207828,572.143519
3,40.55186,-74.15089,576.06713
4,40.55266,-74.188482,647.849537


#### Facilities DF

In [8]:
# Bin the facilities table, then count rows for every combination of
# facility type and close_NTA_lat / close_NTA_long coordinates.
facilities_count = (
    bin_data(facilities_df)
    .value_counts(subset=['factype', 'close_NTA_lat', 'close_NTA_long'])
)

In [9]:
# Build one wide table of facility counts: one row per neighborhood
# coordinate pair, one column per facility type.

# Facility types present in the counts (level 0 of the value_counts MultiIndex)
factypes = facilities_count.index.get_level_values(0).unique()
# Coordinate columns that identify a neighborhood in the binned facilities data
facility_keys = ['close_NTA_lat', 'close_NTA_long']

# One frame per facility type with columns close_NTA_lat, close_NTA_long, count.
# The Series is named explicitly so the value column is called 'count' no
# matter what name value_counts assigned to it.
facility_bin = {
    factype: facilities_count[factype].rename('count').reset_index()
    for factype in factypes
}
# Final column names: neighborhood coordinates followed by one column per type
fac_columns = ['NTA_lat', 'NTA_long', *factypes]

#check how many subgroups there are
print(len(facility_bin))

# Outer-merge every per-type frame on the coordinates. Each 'count' column is
# renamed to its facility type BEFORE merging, so the merges never produce
# colliding 'count_x' / 'count_y' suffixes, and the loop works for any number
# of facility types instead of exactly eight.
facilities_all = None
for factype in factypes:
    typed_counts = facility_bin[factype].rename(columns={'count': factype})
    if facilities_all is None:
        facilities_all = typed_counts
    else:
        facilities_all = facilities_all.merge(typed_counts, how='outer', on=facility_keys)

if facilities_all is None:
    # No facility types at all: keep an empty table with the key columns
    facilities_all = pd.DataFrame(columns=fac_columns)
else:
    # Switch the key columns to the NTA_lat / NTA_long names used by the other bins
    facilities_all.columns = fac_columns


8


  return merge(


In [10]:
#check merged dataframe
print(facilities_all.shape)
facilities_all.head()

(193, 10)


Unnamed: 0,NTA_lat,NTA_long,PARKING LOTS AND GARAGES,NON-PUBLIC K-12 SCHOOLS,HOSPITALS AND CLINICS,PUBLIC K-12 SCHOOLS,"STREETSCAPES, PLAZAS, AND MALLS",BUS DEPOTS AND TERMINALS,MUSEUMS,COLLEGES OR UNIVERSITIES
0,40.754313,-73.969018,115.0,9.0,13.0,2,8.0,,4.0,3.0
1,40.764964,-73.955088,102.0,4.0,31.0,9,2.0,,2.0,4.0
2,40.755734,-73.983503,85.0,,3.0,2,6.0,,7.0,9.0
3,40.744221,-73.977515,85.0,5.0,19.0,8,2.0,,,3.0
4,40.747581,-73.99902,82.0,8.0,17.0,15,1.0,,5.0,5.0


In [11]:
#add 0 for missing
# Every NTA must appear in the facilities table; neighborhoods without any
# facility get a row whose counts are all zero.

# Coordinate pairs already present in the merged facilities table
known_pairs = set(zip(facilities_all['NTA_lat'], facilities_all['NTA_long']))

# Collect the missing neighborhoods first and concatenate once, instead of
# growing the frame row by row (DataFrame.append was removed in pandas 2.0).
missing_rows = []
for lat, long in NTA.values:
    # A neighborhood is identified by its (lat, long) PAIR. Testing the two
    # coordinates independently would skip a missing NTA whenever its latitude
    # or longitude happened to match any other neighborhood.
    if (lat, long) not in known_pairs:
        missing_rows.append({'NTA_lat': lat, 'NTA_long': long})
        # Remember the pair so a duplicated NTA row is only added once
        known_pairs.add((lat, long))

if missing_rows:
    facilities_all = pd.concat(
        [facilities_all, pd.DataFrame(missing_rows)],
        ignore_index=True,
    )

# Counts absent for a neighborhood (NaN after the outer merges / new rows) become 0
facilities_final = facilities_all.fillna(0)

In [12]:
# Sanity check: dimensions and first rows after zero-filling
print(facilities_final.shape)
facilities_final.head(n=5)

(195, 10)


Unnamed: 0,NTA_lat,NTA_long,PARKING LOTS AND GARAGES,NON-PUBLIC K-12 SCHOOLS,HOSPITALS AND CLINICS,PUBLIC K-12 SCHOOLS,"STREETSCAPES, PLAZAS, AND MALLS",BUS DEPOTS AND TERMINALS,MUSEUMS,COLLEGES OR UNIVERSITIES
0,40.754313,-73.969018,115.0,9.0,13.0,2.0,8.0,0.0,4.0,3.0
1,40.764964,-73.955088,102.0,4.0,31.0,9.0,2.0,0.0,2.0,4.0
2,40.755734,-73.983503,85.0,0.0,3.0,2.0,6.0,0.0,7.0,9.0
3,40.744221,-73.977515,85.0,5.0,19.0,8.0,2.0,0.0,0.0,3.0
4,40.747581,-73.99902,82.0,8.0,17.0,15.0,1.0,0.0,5.0,5.0


## Merging the binned tables into one master table

In [13]:
#list_to_zip = [population_bin, traffic_bin, pedestrian_bin, library_bin, parks_bin, hotels_bin, facilities_final]
#df_list = llz_set(list_to_zip)

In [14]:
# Left-join every binned table onto the population bins, one after another,
# matching rows on the shared neighborhood coordinate columns.
# NOTE(review): library_bin and hotels_bin both contribute a 'count' column,
# which is why the result contains 'count_x' (libraries) and 'count_y'
# (hotels); consider giving them descriptive names if consumers allow it.
merge_keys = ['NTA_lat', 'NTA_long']
master_df = population_bin
for binned in (traffic_bin, pedestrian_bin, library_bin, parks_bin, hotels_bin, facilities_final):
    master_df = master_df.merge(binned, how='left', on=merge_keys)

In [15]:
# Display the merged master table (the rendered output is truncated for long frames)
master_df

Unnamed: 0,NTA_lat,NTA_long,Population,traffic,ped_traffic,count_x,acres,count_y,PARKING LOTS AND GARAGES,NON-PUBLIC K-12 SCHOOLS,HOSPITALS AND CLINICS,PUBLIC K-12 SCHOOLS,"STREETSCAPES, PLAZAS, AND MALLS",BUS DEPOTS AND TERMINALS,MUSEUMS,COLLEGES OR UNIVERSITIES
0,40.525528,-74.233554,23313,104.606481,0.000000,1.0,443.237,2.0,19.0,3.0,0.0,3.0,1.0,1.0,1.0,0.0
1,40.528645,-74.187734,27770,446.743056,0.000000,1.0,697.744,1.0,3.0,2.0,2.0,3.0,1.0,0.0,0.0,0.0
2,40.540334,-74.207828,20763,572.143519,0.000000,0.0,315.840,1.0,1.0,1.0,0.0,4.0,0.0,1.0,1.0,0.0
3,40.545779,-74.128351,0,0.000000,0.000000,0.0,24.137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,40.551860,-74.150890,40720,576.067130,0.000000,1.0,419.592,0.0,2.0,5.0,0.0,6.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,40.880937,-73.836641,34517,152.939815,0.000000,1.0,50.192,4.0,11.0,7.0,3.0,10.0,0.0,0.0,0.0,0.0
191,40.882156,-73.858949,61321,1059.756944,0.000000,1.0,211.042,1.0,2.0,6.0,6.0,10.0,2.0,1.0,0.0,0.0
192,40.882406,-73.910667,30161,234.555556,1852.666667,2.0,185.228,0.0,15.0,2.0,5.0,11.0,4.0,0.0,0.0,0.0
193,40.897931,-73.852216,42483,624.671296,0.000000,1.0,1.224,5.0,9.0,4.0,7.0,7.0,1.0,0.0,0.0,0.0


In [19]:
# Persist the merged master table as UTF-8 CSV, without writing the row index
master_df.to_csv(
    '../processed_data/master.csv',
    encoding='utf-8',
    index=False,
)