In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%run -i ../notebooks/functions.py

In [3]:
#get NTA centroid list
NTA = pd.read_csv('../processed_data/geocoded_population.csv').loc[:, ['latitude', 'longitude']]
print(NTA.shape)
NTA.head()

(195, 2)


Unnamed: 0,latitude,longitude
0,40.768352,-73.809546
1,40.763352,-73.868396
2,40.734894,-73.783716
3,40.755734,-73.983503
4,40.818055,-73.856188


## Data Binning

In [4]:
#-- ORIGINAL DATA -- 
#create list of frames to pull in (facilities treated seperately)
wanted_df = ['geocoded_population', 'geocoded_traffic', 'geocoded_pedestrian', 
             'hotels', 'parks', 'libraries']
#list of whether each frame should have 'sum' or 'count'
metrics = ['sum', 'sum', 'sum', 'count', 'count', 'count']

In [5]:
#--- EXPERIMENT --- 

#create list of frames to pull in (facilities treated seperately)
wanted_df = ['geocoded_population', 'geocoded_traffic']
#list of whether each frame should have 'sum' or 'count'
metrics = ['sum', 'sum', 'sum', 'count']

In [6]:
#empty dict for binned dataframes
df_bins = {}

#import every wanted dataframe
for i, frame in enumerate(wanted_df):
    #set path to current frame
    path = f"../processed_data/{frame}.csv"
    #create temporary dataframe out of loaded csv
    temp_df = pd.read_csv(path)
    
    #save binned dataframe to dictionary under frame name
    df_bins[frame] = bin_data(temp_df, metrics[i], wanted_df[i])

In [7]:
#check it works
print(df_bins['geocoded_population'].shape)
df_bins['geocoded_population'].head()

(195, 3)


Unnamed: 0,NTA_lat,NTA_long,Population
0,40.525528,-74.233554,23313
1,40.528645,-74.187734,27770
2,40.540334,-74.207828,20763
3,40.545779,-74.128351,0
4,40.55186,-74.15089,40720


#### Facilities DF

In [8]:
facilities_df = pd.read_csv("../processed_data/facilities.csv")

#pull value counts for each category for each neighborhood
facilities_count = bin_data(facilities_df).value_counts(['factype', 'close_NTA_lat', 'close_NTA_long'])

In [9]:
#get the index corresponding to the subgroup names
factypes = facilities_count.index.get_level_values(0).unique()
#empty dict to save facility dataframes
facility_bin = {}
#column list to create new column names
fac_columns = ['NTA_lat', 'NTA_long']

#create a dataframe for each subgroup and append name to column list
for factype in factypes:
    facility_bin[factype] = pd.DataFrame(facilities_count[factype], columns=['count']).reset_index()
    #add factype name to the column names
    fac_columns.append(factype)

#create initial dataframe out of merged first two factypes
facilities_all = pd.merge(facility_bin[factypes[0]], facility_bin[factypes[1]],  how='left', 
                          left_on=['close_NTA_lat','close_NTA_long'], 
                          right_on = ['close_NTA_lat','close_NTA_long'])

#merge rest of the factype dataframes in
for i in range(2, len(facility_bin)):
    facilities_all = pd.merge(facilities_all, facility_bin[factypes[i]],  how='left', 
                              left_on=['close_NTA_lat','close_NTA_long'], 
                              right_on = ['close_NTA_lat','close_NTA_long'])
    

#rename the columns
facilities_all.columns = fac_columns


In [10]:
#add 0 for missing NTA values
#for each NTA value
for lat, long in NTA.values:
    #check that it is not in the grouped dataframe
    if lat not in facilities_all['NTA_lat'].values and long not in facilities_all['NTA_long'].values:
            #if not then append a row with that NTA lat long
            neighborhood = {'NTA_lat': lat, 'NTA_long': long}
            facilities_all = facilities_all.append(neighborhood, ignore_index=True)
#replace all NaN with 0           
facilities_final = facilities_all.fillna(0)

In [11]:
print(facilities_final.shape)
facilities_final.head()

(195, 5)


Unnamed: 0,NTA_lat,NTA_long,DAY CARE,HOSPITALS AND CLINICS,PUBLIC K-12 SCHOOLS
0,40.63095,-73.988661,68.0,11.0,6.0
1,40.703917,-73.958597,49.0,17.0,6.0
2,40.641889,-74.004646,43.0,12.0,14.0
3,40.79038,-73.973908,42.0,9.0,16.0
4,40.671228,-73.983903,36.0,9.0,15.0


## Zipping

In [12]:
#create initial dataframe out of merged first two frames
master_df = pd.merge(df_bins[wanted_df[0]], df_bins[wanted_df[1]],  
                     how='left', left_on=['NTA_lat','NTA_long'], 
                     right_on = ['NTA_lat','NTA_long'])

#merge all subsequent dataframes
for i in range(2, len(wanted_df)):
    master_df = pd.merge(master_df, df_bins[wanted_df[i]],  
                         how='left', left_on=['NTA_lat','NTA_long'], 
                         right_on = ['NTA_lat','NTA_long'])

#add on facilities
master_df = pd.merge(master_df, facilities_final,  
                     how='left', left_on=['NTA_lat','NTA_long'], 
                     right_on = ['NTA_lat','NTA_long'])

In [13]:
#add on binned building data
#import data
building_count = pd.read_csv('../processed_data/building_count.csv')
building_stories = pd.read_csv( '../processed_data/building_stories.csv')

#merge together
building_df = pd.merge(building_count, building_stories,  
                     how='left', left_on=['NTA_lat','NTA_long'], 
                     right_on = ['NTA_lat','NTA_long'])

#merge into master
master_df = pd.merge(master_df, building_df,  
                     how='left', left_on=['NTA_lat','NTA_long'], 
                     right_on = ['NTA_lat','NTA_long'])

In [14]:
#add yelp 
yelp_df = pd.read_csv('../processed_data/yelp.csv')

master_df = pd.concat([master_df, yelp_df], axis=1)

In [15]:
#check it works
master_df.head()

Unnamed: 0,NTA_lat,NTA_long,Population,traffic,DAY CARE,HOSPITALS AND CLINICS,PUBLIC K-12 SCHOOLS,buildings,legalstories,rating,price
0,40.525528,-74.233554,23313,104.606481,6.0,0.0,3.0,68.0,1.970588,3.23,1.96
1,40.528645,-74.187734,27770,446.743056,5.0,2.0,3.0,0.0,0.0,2.67,1.9
2,40.540334,-74.207828,20763,572.143519,3.0,0.0,4.0,0.0,0.0,2.7,1.92
3,40.545779,-74.128351,0,0.0,0.0,0.0,0.0,0.0,0.0,2.57,2.0
4,40.55186,-74.15089,40720,576.06713,9.0,0.0,6.0,46.0,1.978261,2.94,1.9


In [16]:
#save to csv
master_df.to_csv('../processed_data/master.csv', index=False, encoding='utf-8')

In [17]:
master_df.describe()

Unnamed: 0,NTA_lat,NTA_long,Population,traffic,DAY CARE,HOSPITALS AND CLINICS,PUBLIC K-12 SCHOOLS,buildings,legalstories,rating,price
count,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0
mean,40.721129,-73.92045,41923.758974,525.978286,11.610256,6.107692,7.769231,256.410256,2.677669,3.401641,1.831179
std,0.091022,0.101162,22282.970575,662.928438,8.741529,5.746688,4.738859,474.397736,2.77251,0.594936,0.165561
min,40.525528,-74.233554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.48,1.44
25%,40.652996,-73.974395,26172.0,0.0,6.0,2.0,4.0,0.0,0.0,2.81,1.72
50%,40.718338,-73.916821,36891.0,343.050926,10.0,4.0,7.0,70.0,2.41784,3.46,1.84
75%,40.789101,-73.855079,53896.0,670.030671,15.5,9.0,11.0,152.5,3.464321,3.99,1.95
max,40.899535,-73.711025,132378.0,3993.351852,68.0,31.0,21.0,2946.0,26.875,4.29,2.4
