In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Execute the shared helper script; the -i flag runs it in this notebook's
# interactive namespace, so the names it defines are usable in later cells.
# NOTE(review): bin_data, called further down, is presumably defined there — confirm.
%run -i ../notebooks/functions.py

In [12]:
# Build the NTA centroid list: read the geocoded population table and keep
# only its two coordinate columns.
NTA = pd.read_csv(
    '../processed_data/density/geocoded_population.csv'
)[['latitude', 'longitude']]

# Persist the centroid list as a standalone CSV.
NTA.to_csv('../processed_data/NTA_coords.csv')

# Show the dimensions, then the first rows.
print(NTA.shape)
NTA.head()

(195, 2)


Unnamed: 0,latitude,longitude
0,40.768352,-73.809546
1,40.763352,-73.868396
2,40.734894,-73.783716
3,40.755734,-73.983503
4,40.818055,-73.856188


## Data Binning

In [4]:
#-- ORIGINAL DATA --
# Each pair is (frame to pull in, aggregation used when binning it: 'sum' or
# 'count'). Facilities are treated separately. zip() regroups the pairs into
# the two parallel lists: the frame names first, their metrics second.
wanted_df, metrics = map(list, zip(
    ('geocoded_population', 'sum'),
    ('geocoded_traffic', 'sum'),
    ('geocoded_pedestrian', 'sum'),
    ('hotels', 'count'),
    ('parks', 'count'),
    ('libraries', 'count'),
))

In [14]:
#--- EXPERIMENT ---

# Each pair is (frame to pull in, binning metric). Facilities are treated
# separately. A metric of 'sum' or 'count' means the frame gets binned with
# that aggregation; False means the loading loop keeps the frame as loaded.
# zip() regroups the pairs into the two parallel lists.
wanted_df, metrics = map(list, zip(
    ('density/geocoded_population', 'sum'),
    ('density/geocoded_traffic', 'sum'),
    ('buildings/building_count', False),
    ('buildings/building_stories', False),
))

In [15]:
# Maps each wanted frame name to its (optionally binned) dataframe.
df_bins = {}

# Load every wanted frame from ../processed_data/<name>.csv.
for idx, name in enumerate(wanted_df):
    loaded = pd.read_csv(f"../processed_data/{name}.csv")
    metric = metrics[idx]

    # A truthy metric means "bin by this metric"; a falsy one keeps the frame
    # exactly as it was loaded.
    df_bins[name] = bin_data(loaded, metric, name) if metric else loaded

In [17]:
# Sanity check: dimensions and first rows of the binned population frame.
population_bins = df_bins['density/geocoded_population']
print(population_bins.shape)
population_bins.head()

(195, 3)


Unnamed: 0,NTA_lat,NTA_long,Population
0,40.525528,-74.233554,23313
1,40.528645,-74.187734,27770
2,40.540334,-74.207828,20763
3,40.545779,-74.128351,0
4,40.55186,-74.15089,40720


#### Facilities DF

In [None]:
facilities_df = pd.read_csv("../processed_data/facilities.csv")

# Run the facilities through bin_data with its default arguments, then count
# how many rows share each (factype, close_NTA_lat, close_NTA_long)
# combination, i.e. facilities per category per neighborhood.
facilities_binned = bin_data(facilities_df)
facilities_count = facilities_binned.value_counts(
    subset=['factype', 'close_NTA_lat', 'close_NTA_long']
)

In [None]:
#get the index corresponding to the subgroup names
factypes = facilities_count.index.get_level_values(0).unique()
#empty dict to save facility dataframes
facility_bin = {}
#coordinate columns shared by every per-factype dataframe
merge_keys = ['close_NTA_lat', 'close_NTA_long']
#final column names: the NTA coordinates followed by one count column per factype
fac_columns = ['NTA_lat', 'NTA_long', *factypes]

#create a dataframe for each subgroup (columns: close_NTA_lat, close_NTA_long, count)
for factype in factypes:
    facility_bin[factype] = facilities_count[factype].rename('count').reset_index()

#give every count column its factype name before merging, so repeated merges
#never have to disambiguate several columns that are all called 'count'
typed_counts = [facility_bin[factype].rename(columns={'count': factype})
                for factype in factypes]
if not typed_counts:
    raise ValueError("facilities_count contains no facility types to merge")

#start from the first factype and merge the others in one by one;
#how='outer' keeps neighborhoods that lack some factypes (a left join on the
#first factype would silently drop every neighborhood without that type),
#and the seed-and-loop form also works when there is only a single factype
facilities_all = typed_counts[0]
for typed in typed_counts[1:]:
    facilities_all = pd.merge(facilities_all, typed, how='outer', on=merge_keys)

#rename the columns (the merge order above follows factypes, so positions match)
facilities_all.columns = fac_columns


In [None]:
#add 0 for missing NTA values
#(lat, long) pairs that already have a row in the facilities table
known_pairs = set(zip(facilities_all['NTA_lat'], facilities_all['NTA_long']))

#collect every NTA centroid whose exact (lat, long) pair is absent; comparing
#pairs avoids skipping a neighborhood that merely shares a latitude or a
#longitude with some other row
missing_rows = []
for lat, long in NTA[['latitude', 'longitude']].itertuples(index=False, name=None):
    if (lat, long) not in known_pairs:
        #remember the pair so a repeated centroid is only added once
        known_pairs.add((lat, long))
        missing_rows.append({'NTA_lat': lat, 'NTA_long': long})

#add all missing neighborhoods in one concat: DataFrame.append was removed in
#pandas 2.0, and growing a frame row by row is quadratic
if missing_rows:
    facilities_all = pd.concat([facilities_all, pd.DataFrame(missing_rows)],
                               ignore_index=True)
#replace all NaN with 0
facilities_final = facilities_all.fillna(0)

In [None]:
#check the zero-filled facilities table: dimensions, then the first rows
print(facilities_final.shape)
facilities_final.head()

## Zipping

In [None]:
# Every frame is joined on the NTA centroid coordinates.
nta_keys = ['NTA_lat', 'NTA_long']

# Seed the master table by left-joining the second wanted frame onto the first.
master_df = pd.merge(df_bins[wanted_df[0]], df_bins[wanted_df[1]],
                     how='left', on=nta_keys)

# Left-join each remaining wanted frame in order.
for frame_name in wanted_df[2:]:
    master_df = pd.merge(master_df, df_bins[frame_name], how='left', on=nta_keys)

# Finally attach the per-neighborhood facility counts.
master_df = pd.merge(master_df, facilities_final, how='left', on=nta_keys)

In [None]:
#check the merge worked: display the first rows of the combined table
master_df.head()

In [None]:
#save the combined table to ../processed_data/master.csv as UTF-8,
#leaving the dataframe index out of the file (index=False)
master_df.to_csv('../processed_data/master.csv', index=False, encoding='utf-8')

In [None]:
#display summary statistics for the combined table
master_df.describe()