In [22]:
import pandas as pd 
import os
import warnings
warnings.filterwarnings("ignore")

In [23]:
os.getcwd()

'/Users/zenn/Documents/GitHub/NYCclean_ML4C'

In [24]:
# Load the combined socio-economic table (path is relative to the working directory).
df = pd.read_csv('./Data/ses_initial_combined.csv')
# The file is expected to hold exactly 59 rows; check it here so a wrong or
# truncated input fails immediately instead of producing a wrong-sized result later.
assert len(df) == 59, f"expected 59 rows, got {len(df)}"
df.shape

(59, 205)

In [25]:
# Search terms used to look for candidate predictor columns by name.
relevant_var = [
    'pop',
    'homeless',
    'poverty',
    # age-related terms
    'over',
    'under',
]

#### Exploring possible variables to use. 

In [26]:
# For every search term, gather the column names that contain it.
# Matching is plain substring containment, so a term can also hit names it was
# not meant for (e.g. 'over' is contained in 'poverty').
maybe_cols = [
    [name for name in df.columns if term in name]
    for term in relevant_var
]

maybe_cols

[['pop_acs',
  'pop_2000',
  'pop_2010',
  'pop_change_00_10',
  'fp_100_pop',
  'fp_500_pop'],
 [],
 ['poverty_rate',
  'moe_poverty_rate',
  'poverty_rate_boro',
  'poverty_rate_nyc',
  'v_poverty'],
 ['female_85_over',
  'male_85_over',
  'moe_over65_rate',
  'over65_rate',
  'moe_over65_rate_boro',
  'over65_rate_boro',
  'moe_over65_rate_nyc',
  'over65_rate_nyc',
  'poverty_rate',
  'moe_poverty_rate',
  'poverty_rate_boro',
  'poverty_rate_nyc',
  'v_poverty'],
 ['female_under_5',
  'male_under_5',
  'moe_under18_rate',
  'under18_rate',
  'moe_under18_rate_boro',
  'under18_rate_boro',
  'moe_under18_rate_nyc',
  'under18_rate_nyc']]

In [27]:
# List the columns whose name contains 'white'.
[name for name in df if 'white' in name]

['pct_white_nh']

In [28]:
# List the columns whose name contains 'rent'.
[name for name in df if 'rent' in name]

['moe_hh_rent_burd',
 'pct_hh_rent_burd',
 'moe_hh_rent_burd_boro',
 'pct_hh_rent_burd_boro',
 'moe_hh_rent_burd_nyc',
 'pct_hh_rent_burd_nyc',
 'fp_100_rent_burden',
 'fp_500_rent_burden',
 'fp_100_rent_burden_value',
 'fp_500_rent_burden_value']

In [29]:
# List the columns whose name contains 'commute'.
[name for name in df if 'commute' in name]

['moe_mean_commute',
 'mean_commute',
 'moe_mean_commute_boro',
 'mean_commute_boro',
 'moe_mean_commute_nyc',
 'mean_commute_nyc']

#### Selecting the relevant variables

In [30]:
# Keep only the district identifier and the selected predictor columns.
# `.copy()` makes df1 an independent frame, so columns added to it in later cells
# are written to df1 itself rather than to a temporary selection of `df`
# (the situation pandas reports with SettingWithCopyWarning).
df1 = df[['borocd', 
    # population -- 
    'pop_2010',  # pop acs does not match entirely to cd, using pop_2010 instead
    'area_sqmi', # for calculating pop density
    # education --
    'pct_bach_deg',
    # age --
    'over65_rate',
    'under18_rate',
    # others -- 
    'pct_white_nh', # proxy for race 
    'poverty_rate',
    'pct_hh_rent_burd', # rent burden, proxy to combine income + rent cost
    'mean_commute', # mean commute time to work
    'unemployment',  
    'crime_per_1000']].copy()

##### Calculating population density

In [31]:
# Population density: 2010 population divided by the district area (area_sqmi).
# pop_2010 and area_sqmi are deliberately kept rather than dropped; they are needed for normalization later.
df1['pop_density'] = df1['pop_2010'].div(df1['area_sqmi'])

#### Adding temporal dimension

In [32]:
# Month-start timestamps for every month of the period of interest:
# 2016-11-01 through 2021-10-01 inclusive, i.e. 60 months.
# freq='MS' produces month starts directly, instead of generating month ends with
# the 'M' alias (deprecated in recent pandas) and shifting them back with MonthBegin.
list_of_months = pd.date_range('2016-11-01', '2021-10-01', freq='MS')
assert len(list_of_months) == 60, len(list_of_months)
max(list_of_months)

Timestamp('2021-10-01 00:00:00')

In [33]:
# Build the district-by-month panel: one copy of the static district table per
# month, each tagged with its month, stacked with a single concat.
# - `assign` returns a new frame, so df1 itself is not modified by this cell.
# - concatenating once avoids re-copying the growing result on every iteration
#   and avoids starting from an empty frame.
# The row index of df1 is kept as-is, so it repeats once per month.
final_df = pd.concat([df1.assign(month=month) for month in list_of_months])

final_df.shape

(3540, 14)

In [34]:
final_df['month'].nunique()

60

In [35]:
final_df.head()

Unnamed: 0,borocd,pop_2010,area_sqmi,pct_bach_deg,over65_rate,under18_rate,pct_white_nh,poverty_rate,pct_hh_rent_burd,mean_commute,unemployment,crime_per_1000,pop_density,month
0,201,91497,2.166633,11.3,9.0,29.7,2.1,32.3,49.4,42.5,6.4,21.7,42230.049029,2016-11-01
1,202,52246,2.218793,14.2,9.2,27.5,1.9,32.3,53.8,44.9,6.4,12.3,23547.039187,2016-11-01
2,203,79762,1.607575,14.1,9.2,29.1,1.2,31.4,48.0,43.9,6.6,13.6,49616.354945,2016-11-01
3,206,83268,1.530401,12.6,9.6,28.3,6.1,31.4,55.1,44.4,8.4,12.0,54409.272313,2016-11-01
4,204,146441,1.99204,15.8,9.7,27.7,1.9,34.7,54.0,41.9,6.4,14.6,73513.072409,2016-11-01


In [36]:
# Split the combined borough/district code into its two parts.
# NOTE: `df` is bound to the same object as final_df (no copy is made), so the
# columns added below also appear on final_df.
df = final_df
df['borocd'] = df['borocd'].astype(str)
df['borough_no'] = df['borocd'].str.slice(0, 1)  # first character
df['cd'] = df['borocd'].str.slice(1)             # everything after the first character

# borough number -> two-letter borough abbreviation
borough_abbrev = {'1': 'MN', '2': 'BX', '3': 'BK', '4': 'QN', '5': 'SI'}
df['borough_code'] = df['borough_no'].replace(borough_abbrev)

In [37]:
# District label = borough abbreviation followed by the district part of the code (e.g. 'BX01').
df['community_district'] = df['borough_code'].add(df['cd'])

In [38]:
df['community_district'].nunique() # sanity check

59

In [39]:
# Replace the month timestamp with separate calendar-year and month-number columns.
# 'year' is derived first because 'month' is overwritten with the month number.
timestamps = df['month']
df['year'] = timestamps.dt.year
df['month'] = timestamps.dt.month

In [40]:
# Remove the intermediate columns that were only used to build 'community_district'.
df.drop(columns=['borough_no', 'cd', 'borough_code'], inplace=True)
df.head()

Unnamed: 0,borocd,pop_2010,area_sqmi,pct_bach_deg,over65_rate,under18_rate,pct_white_nh,poverty_rate,pct_hh_rent_burd,mean_commute,unemployment,crime_per_1000,pop_density,month,community_district,year
0,201,91497,2.166633,11.3,9.0,29.7,2.1,32.3,49.4,42.5,6.4,21.7,42230.049029,11,BX01,2016
1,202,52246,2.218793,14.2,9.2,27.5,1.9,32.3,53.8,44.9,6.4,12.3,23547.039187,11,BX02,2016
2,203,79762,1.607575,14.1,9.2,29.1,1.2,31.4,48.0,43.9,6.6,13.6,49616.354945,11,BX03,2016
3,206,83268,1.530401,12.6,9.6,28.3,6.1,31.4,55.1,44.4,8.4,12.0,54409.272313,11,BX06,2016
4,204,146441,1.99204,15.8,9.7,27.7,1.9,34.7,54.0,41.9,6.4,14.6,73513.072409,11,BX04,2016


In [41]:
# Save the predictor table to CSV in the working directory (row index included, the to_csv default).
df.to_csv(path_or_buf='ses_predictors_v3.csv')

In [42]:
df.head()

Unnamed: 0,borocd,pop_2010,area_sqmi,pct_bach_deg,over65_rate,under18_rate,pct_white_nh,poverty_rate,pct_hh_rent_burd,mean_commute,unemployment,crime_per_1000,pop_density,month,community_district,year
0,201,91497,2.166633,11.3,9.0,29.7,2.1,32.3,49.4,42.5,6.4,21.7,42230.049029,11,BX01,2016
1,202,52246,2.218793,14.2,9.2,27.5,1.9,32.3,53.8,44.9,6.4,12.3,23547.039187,11,BX02,2016
2,203,79762,1.607575,14.1,9.2,29.1,1.2,31.4,48.0,43.9,6.6,13.6,49616.354945,11,BX03,2016
3,206,83268,1.530401,12.6,9.6,28.3,6.1,31.4,55.1,44.4,8.4,12.0,54409.272313,11,BX06,2016
4,204,146441,1.99204,15.8,9.7,27.7,1.9,34.7,54.0,41.9,6.4,14.6,73513.072409,11,BX04,2016
