In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%run -i ../notebooks/functions/functions.py

In [3]:
#get NTA centroid list
NTA = pd.read_csv('../processed_data/NTA_coords.csv')

print(NTA.shape)
NTA.head()

(195, 2)


Unnamed: 0,NTA_lat,NTA_long
0,40.768352,-73.809546
1,40.763352,-73.868396
2,40.734894,-73.783716
3,40.755734,-73.983503
4,40.818055,-73.856188


In [4]:
NTA.dtypes

NTA_lat     float64
NTA_long    float64
dtype: object

## Data Binning

#-- ORIGINAL DATA -- 
#create list of frames to pull in (facilities treated seperately)
wanted_df = ['density/geocoded_population', 'density/geocoded_traffic', 'density/geocoded_pedestrian', 
             'wellbeing/hotels', 'wellbeing/parks', 'wellbeing/libraries']
#list of whether each frame should have 'sum' or 'count'
metrics = [False, 'sum', 'sum', 'count', 'sum', 'count']

In [5]:
#--- EXPERIMENT --- 

#create list of frames to pull in (facilities treated seperately)
wanted_df = ['density/geocoded_population', 'density/geocoded_traffic', 
             'buildings/building_count','buildings/building_stories', ]
#list of whether each frame should have 'sum' or 'count'
metrics = [False, 'sum', False, False]

In [6]:
#empty dict for binned dataframes
df_bins = {}

#import every wanted dataframe
for i, frame in enumerate(wanted_df):
    #set path to current frame
    path = f"../processed_data/{frame}.csv"
    #create temporary dataframe out of loaded csv
    temp_df = pd.read_csv(path)
    
    #save name of csv to save binned dataframe under
    df_name = frame.split('/')[1]
    
    #if metrics are true then bin data by the metric
    if bool(metrics[i]):         
        #save binned dataframe to dictionary under frame name
        df_bins[df_name] = bin_data(temp_df, metrics[i], wanted_df[i])    
    else:
        df_bins[df_name] = temp_df

In [7]:
df_bins['geocoded_traffic'].replace({0: 290}, inplace = True)

In [8]:
#check it works
print(df_bins['geocoded_traffic'].shape)
df_bins['geocoded_traffic'].tail()

(195, 3)


Unnamed: 0,NTA_lat,NTA_long,traffic
190,40.78333,-73.785962,290.0
191,40.588299,-73.941511,290.0
192,40.67209,-73.773039,290.0
193,40.690302,-73.832763,290.0
194,40.812089,-73.885547,290.0


#### Facilities DF

For generation of `facilities.csv` see `notebooks/buildings_wellbeing.ipynb`

In [9]:
#get data
facilities_df = pd.read_csv("../processed_data/wellbeing/facilities.csv")

#pull value counts for each category for each neighborhood
facilities_count = bin_data(facilities_df).value_counts(['factype', 'NTA_lat', 'NTA_long'])

In [10]:
#get the index corresponding to the subgroup names
factypes = facilities_count.index.get_level_values(0).unique()
#empty dict to save facility dataframes
facility_bin = {}
#column list to create new column names
fac_columns = ['NTA_lat', 'NTA_long']

#create a dataframe for each subgroup and append name to column list
for factype in factypes:
    facility_bin[factype] = pd.DataFrame(facilities_count[factype], columns=['count']).reset_index()
    #add factype name to the column names
    fac_columns.append(factype)

#create initial dataframe out of merged first two factypes
facilities_all = pd.merge(facility_bin[factypes[0]], facility_bin[factypes[1]],  how='left', 
                          left_on=['NTA_lat','NTA_long'], 
                          right_on = ['NTA_lat','NTA_long'])

#merge rest of the factype dataframes in
for i in range(2, len(facility_bin)):
    facilities_all = pd.merge(facilities_all, facility_bin[factypes[i]],  how='left', 
                              left_on=['NTA_lat','NTA_long'], 
                              right_on = ['NTA_lat','NTA_long'])
    

#rename the columns
facilities_all.columns = fac_columns


  facilities_all = pd.merge(facilities_all, facility_bin[factypes[i]],  how='left',


In [11]:
#add 0 for missing NTA values
#for each NTA value
for lat, long in NTA.values:
    #check that it is not in the grouped dataframe
    if lat not in facilities_all['NTA_lat'].values and long not in facilities_all['NTA_long'].values:
            #if not then append a row with that NTA lat long
            neighborhood = {'NTA_lat': lat, 'NTA_long': long}
            facilities_all = facilities_all.append(neighborhood, ignore_index=True)
#replace all NaN with 0           
facilities_final = facilities_all.fillna(0)

In [12]:
print(facilities_final.shape)
facilities_final.head()

(195, 6)


Unnamed: 0,NTA_lat,NTA_long,DAY CARE,NON-PUBLIC K-12 SCHOOLS,HOSPITALS AND CLINICS,PUBLIC K-12 SCHOOLS
0,40.63095,-73.988661,68.0,59.0,11.0,6.0
1,40.703917,-73.958597,49.0,46.0,17.0,6.0
2,40.641889,-74.004646,43.0,16.0,12.0,14.0
3,40.79038,-73.973908,42.0,24.0,9.0,16.0
4,40.671228,-73.983903,36.0,7.0,9.0,15.0


## Zipping

In [13]:
#create initial dataframe out of merged first two frames
master_df = pd.merge(df_bins[wanted_df[0].split('/')[1]], df_bins[wanted_df[1].split('/')[1]],  
                     how='left', left_on=['NTA_lat','NTA_long'], 
                     right_on = ['NTA_lat','NTA_long'])

#merge all subsequent dataframes
for i in range(2, len(wanted_df)):
    name = wanted_df[i].split('/')[1]
    
    master_df = pd.merge(master_df, df_bins[name],  
                         how='left', left_on=['NTA_lat','NTA_long'], 
                         right_on = ['NTA_lat','NTA_long'])

#add on facilities
master_df = pd.merge(master_df, facilities_final,  
                     how='left', left_on=['NTA_lat','NTA_long'], 
                     right_on = ['NTA_lat','NTA_long'])

In [14]:
#check it works
master_df.head()

Unnamed: 0,NTA_lat,NTA_long,Population,traffic,buildings,legalstories,DAY CARE,NON-PUBLIC K-12 SCHOOLS,HOSPITALS AND CLINICS,PUBLIC K-12 SCHOOLS
0,40.768352,-73.809546,51739.0,244.032407,70.0,1.0,13.0,4.0,1.0,4.0
1,40.763352,-73.868396,23150.0,290.0,62.0,2.193548,3.0,0.0,0.0,3.0
2,40.734894,-73.783716,17812.0,618.412037,23.0,2.086957,4.0,5.0,5.0,4.0
3,40.755734,-73.983503,28630.0,3993.351852,8.0,26.875,9.0,0.0,3.0,2.0
4,40.818055,-73.856188,53686.0,290.0,355.0,2.754237,4.0,2.0,5.0,15.0


In [15]:
#save to csv
master_df.to_csv('../processed_data/master.csv', index=False, encoding='utf-8')