### Data preprocessing and preparation of the 2018 Public Life Study. The prepared data will be used to assess people staying in public space (e.g. those experiencing homelessness) during 2018 and will be compared with FIFI requests from 2018.

In [1]:
#getting and working with data
import os

import numpy as np
import pandas as pd

#notebook display settings: show up to 500 rows / 500 columns and use a
#1000-character display width so wide frames are not truncated or wrapped
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

#silence all warnings for the session and print numpy floats in fixed-point notation
import warnings
warnings.simplefilter(action='ignore')
np.set_printoptions(suppress=True)

In [7]:
#directory holding the 2018 Public Life Study CSV exports
#defaults to the original local folder; set the FIFI_DATA_DIR environment variable to point elsewhere
#(a trailing slash/backslash on the directory is removed so the joined paths stay well-formed)
DATA_DIR = os.environ.get(
    'FIFI_DATA_DIR',
    'C:/Users/Schindler/Documents/ProgrammingFun/FIFI/data_resources',
).rstrip('/\\')

#path for 2018 Public Life Study of "People staying"
path_staying = DATA_DIR + '/Public_Life_Data_2018_-_People_Staying.csv'

#path for 2018 Public Life Study location info (zip code added manually)
path_location = DATA_DIR + '/Public_Life_Data_2018_-_Locations.csv'

In [37]:
#load the "People staying" observations (read_csv already returns a DataFrame)
people_staying_data = pd.read_csv(path_staying)
print(people_staying_data.shape)

#load the per-location attributes
location_data = pd.read_csv(path_location)
print(location_data.shape)

#attach the location attributes to every observation through the shared 'Location_ID' key
#(default inner join: observations whose Location_ID is absent from the location table are dropped)
people_staying_data_final = people_staying_data.merge(location_data, on='Location_ID', sort=False)
print(people_staying_data_final.shape)

people_staying_data_final.head()

(6537, 58)
(108, 13)
(6502, 70)


Unnamed: 0,Unique_ID,Study_ID,Location_ID,Survey_ID,Day_of_Week,Time_of_Day,Date,Start_time,End_time,Conditions,Weather,Temperature,Surveyor,Row_ID,Row_Total,Group_Size,Male,Female,Gender_Unknown,White,BlackAfricanAmerican,AmericanIndian_AlaskanNative,Asian,NativeHawaiian_PacificIslander,Latino,Multiple_Races,Race_Unknown,Age_0-4,Age_5-14,Age_15-24,Age_25-44,Age_45-64,Age_65+,Standing,Leaning,SittingFormal_Public,SittingFormal_Commercial,SittingFormal_Private,SittingInformal,Lying,Mobility_Assistance_Device,CommercialActivity_Providing,CommercialActivity_Buying,CommercialActivity_Observing,Eating/Drinking,Talking,CulturalActivity,UsingElectronics,ActiveRecreation,PassiveRecreation,WaitingTransportation,CivicWork,Smoking,DisruptiveActivity_Aggressive,DisruptiveActivity_Intoxicated,LivingActivities,Soliciting,Notes,Location_Name_Primary,Location_Name_Secondary,Location_Subdivision,Zip,Element_Key,Transit_Stop_Present,Neighborhood_Type,Neighborhood,Location_Character,line_typology_vehicular,Average_number_Commercial_Seats,Average_number_Public_Seats
0,0,2018 People Staying Counts,MAP1,Weekday1_Morning,Weekday,Morning,20180802,820,830,,Heavy Clouds,58,UD4H,1,1,3.0,,1.0,,,,,,,1.0,,,,,,,1.0,,,,1.0,,,,,,,1.0,,1.0,1.0,,,,,,,,,,,,,E MADISON ST,BETWEEN 41ST AVE E AND 42ND AVE E,NW,98112,14441,,Outside,Madison Park,Commercial,Neighborhood Corridor,0,16.0
1,1,2018 People Staying Counts,MAP1,Weekday1_Morning,Weekday,Morning,20180802,820,830,,Heavy Clouds,58,UD4H,2,1,3.0,,1.0,,,,,,,1.0,,,,,,,1.0,,,,1.0,,,,,,,1.0,,1.0,1.0,,,,,,,,,,,,,E MADISON ST,BETWEEN 41ST AVE E AND 42ND AVE E,NW,98112,14441,,Outside,Madison Park,Commercial,Neighborhood Corridor,0,16.0
2,2,2018 People Staying Counts,MAP1,Weekday1_Morning,Weekday,Morning,20180802,820,830,,Heavy Clouds,58,UD4H,3,1,3.0,,,1.0,,,,,,,,1.0,1.0,,,,,,,,,,,1.0,,,,,,,,,,,1.0,,,,,,,,,E MADISON ST,BETWEEN 41ST AVE E AND 42ND AVE E,NW,98112,14441,,Outside,Madison Park,Commercial,Neighborhood Corridor,0,16.0
3,3,2018 People Staying Counts,MAP1,Weekday1_Morning,Weekday,Morning,20180802,820,830,,Heavy Clouds,58,UD4H,4,1,1.0,1.0,,,1.0,,,,,,,,,,,1.0,,,1.0,,,,,,,,,,,,,,1.0,,,,,,,,,,,E MADISON ST,BETWEEN 41ST AVE E AND 42ND AVE E,NW,98112,14441,,Outside,Madison Park,Commercial,Neighborhood Corridor,0,16.0
4,4,2018 People Staying Counts,MAP1,Weekday1_Morning,Weekday,Morning,20180802,820,830,,Heavy Clouds,58,UD4H,5,1,2.0,1.0,,,1.0,,,,,,,,,,,1.0,,,1.0,,,,,,,,,,,1.0,,,1.0,,,,,,,,,,,E MADISON ST,BETWEEN 41ST AVE E AND 42ND AVE E,NW,98112,14441,,Outside,Madison Park,Commercial,Neighborhood Corridor,0,16.0


In [38]:
#each neighborhood can have multiple locations but we are interested in the neighborhood as a whole
#derive the neighborhood code by removing the numeric suffix of Location_ID (e.g. 'BAL3' -> 'BAL')
#the regex removes ANY run of trailing digits; rstrip('12345') would leave suffixes containing 0 or 6-9 untouched
#NOTE: this overwrites the 'Neighborhood' column that came in from the location table during the merge
print(people_staying_data_final['Location_ID'].value_counts().sort_index().head())
people_staying_data_final['Neighborhood'] = people_staying_data_final['Location_ID'].str.replace(r'\d+$', '', regex=True)
print(people_staying_data_final['Neighborhood'].value_counts().sort_index().head())

ALK1     74
ALK2     77
BAL1     21
BAL2     28
BAL3    121
Name: Location_ID, dtype: int64
ALK    151
BAL    298
BEA     66
BLT    539
BLV     16
Name: Neighborhood, dtype: int64


In [39]:
#want to determine percentage of time each selected parameter (e.g those related to homelessness) was observed,
#summarised per Location_ID, per Zip and per Neighborhood
params_of_interest = ['Location_ID',
                      'Zip',
                      'Neighborhood',
                      'SittingInformal',
                      'Lying',
                      'DisruptiveActivity_Aggressive',
                      'DisruptiveActivity_Intoxicated',
                      'LivingActivities',
                      'Soliciting']


def _percent_observed(frame, key, columns):
    """Count and percentage of non-null entries per group.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding ``key`` and every name in ``columns``.
    key : str
        Column to group by; it must also be listed in ``columns`` because its
        own count is used as the denominator.
    columns : list of str
        Columns whose non-null entries are counted.

    Returns
    -------
    (counts, percentages) : tuple of pandas.DataFrame
        ``counts`` is the number of non-null entries per group and column.
        ``percentages`` is ``counts`` divided row-wise by the count of ``key``
        (i.e. the number of rows in the group) times 100, so the ``key``
        column itself is 100 in every row.
    """
    counts = frame.groupby(key)[columns].count()
    #divide every column by the group's own row count; axis=0 aligns the divisor on the group index
    percentages = counts.div(counts[key], axis=0) * 100
    return counts, percentages


location_grouped, location_grouped_mean = _percent_observed(
    people_staying_data_final, 'Location_ID', params_of_interest)

zip_grouped, zip_grouped_mean = _percent_observed(
    people_staying_data_final, 'Zip', params_of_interest)

#the neighborhood percentages must be derived from the neighborhood counts: dividing the
#per-location counts by the per-neighborhood totals misaligns the two indices and yields NaN
neighborhood_grouped, neighborhood_grouped_mean = _percent_observed(
    people_staying_data_final, 'Neighborhood', params_of_interest)

In [54]:
#save as csv, to be combined with demographic info for each neighborhood
#each summary table is written to the working directory, one file per grouping level

location_grouped_mean.to_csv(path_or_buf='location_grouped_mean.csv')
zip_grouped_mean.to_csv(path_or_buf='zip_grouped_mean.csv')
neighborhood_grouped_mean.to_csv(path_or_buf='neighborhood_grouped_mean.csv')