In [27]:
import pandas as pd 
import os
import warnings
warnings.filterwarnings("ignore")

In [28]:
# Show the current working directory; the relative './Data/combined.csv' path
# read in the next cell is resolved against it.
# NOTE(review): the rendered output of this cell exposes an absolute local path;
# consider clearing it before sharing the notebook.
os.getcwd()

'/Users/zenn/Documents/School/Machine Learning/Final Project'

In [29]:
# Load the combined dataset; the path is relative to the working directory.
df = pd.read_csv('./Data/combined.csv')
# Enforce the expectation that previously lived only in a comment: 59 rows.
assert len(df) == 59, f"expected 59 rows in combined.csv, found {len(df)}"
df.shape

(59, 205)

In [30]:
# Search terms used to look for candidate predictor columns by name.
relevant_var = [
    'pop',
    'homeless',
    'poverty',
    # age-related terms
    'over',
    'under',
]

#### Exploring possible variables to use. 

In [31]:
# For each search term, gather every column whose name contains it
# (one inner list per term, in the same order as relevant_var).
# Matching is by substring, so a term can also hit unrelated columns,
# e.g. 'over' is contained in 'poverty_rate'.
maybe_cols = [
    [name for name in df.columns if term in name]
    for term in relevant_var
]

maybe_cols

[['pop_acs',
  'pop_2000',
  'pop_2010',
  'pop_change_00_10',
  'fp_100_pop',
  'fp_500_pop'],
 [],
 ['poverty_rate',
  'moe_poverty_rate',
  'poverty_rate_boro',
  'poverty_rate_nyc',
  'v_poverty'],
 ['female_85_over',
  'male_85_over',
  'moe_over65_rate',
  'over65_rate',
  'moe_over65_rate_boro',
  'over65_rate_boro',
  'moe_over65_rate_nyc',
  'over65_rate_nyc',
  'poverty_rate',
  'moe_poverty_rate',
  'poverty_rate_boro',
  'poverty_rate_nyc',
  'v_poverty'],
 ['female_under_5',
  'male_under_5',
  'moe_under18_rate',
  'under18_rate',
  'moe_under18_rate_boro',
  'under18_rate_boro',
  'moe_under18_rate_nyc',
  'under18_rate_nyc']]

In [32]:
# List the columns whose name contains 'white'.
list(filter(lambda name: 'white' in name, df.columns))

['pct_white_nh']

In [33]:
# List the columns whose name contains 'rent'.
list(filter(lambda name: 'rent' in name, df.columns))

['moe_hh_rent_burd',
 'pct_hh_rent_burd',
 'moe_hh_rent_burd_boro',
 'pct_hh_rent_burd_boro',
 'moe_hh_rent_burd_nyc',
 'pct_hh_rent_burd_nyc',
 'fp_100_rent_burden',
 'fp_500_rent_burden',
 'fp_100_rent_burden_value',
 'fp_500_rent_burden_value']

In [34]:
# List the columns whose name contains 'commute'.
list(filter(lambda name: 'commute' in name, df.columns))

['moe_mean_commute',
 'mean_commute',
 'moe_mean_commute_boro',
 'mean_commute_boro',
 'moe_mean_commute_nyc',
 'mean_commute_nyc']

#### Selecting the relevant variables

In [35]:
# Keep only the district identifier and the chosen predictor columns.
# .copy() gives df1 its own data, so the column edits made to df1 in later
# cells do not operate on a slice of df (which raises SettingWithCopyWarning,
# currently hidden by warnings.filterwarnings("ignore")).
df1 = df[['borocd', 
    # population -- 
    'pop_2010',  # pop acs does not match entirely to cd, using pop_2010 instead
    'area_sqmi', # for calculating pop density
    # education --
    'pct_bach_deg',
    # age --
    'over65_rate',
    'under18_rate',
    # others -- 
    'pct_white_nh', # proxy for race 
    'poverty_rate',
    'pct_hh_rent_burd', # rent burden, proxy to combine income + rent cost
    'mean_commute', # mean commute time to work
    'unemployment',  
    'crime_per_1000']].copy()

##### Calculating population density

In [36]:
# Population density = 2010 population divided by area in square miles.
# The two input columns are only needed for the ratio, so they are dropped.
# assign/drop return a new frame that df1 is rebound to, instead of writing
# into a slice of df and dropping columns in place.
df1 = (
    df1
    .assign(pop_density=df1['pop_2010'] / df1['area_sqmi'])
    .drop(columns=['pop_2010', 'area_sqmi'])
)

#### Adding temporal dimension

In [37]:
# Month-start timestamps for every month in the period of interest
# (2016-11 through 2021-10, inclusive).
# 'MS' generates month starts directly; the previous version generated
# month ends with the '1M' alias (deprecated in recent pandas) and then
# subtracted a MonthBegin offset to get the same dates.
list_of_months = pd.date_range('2016-11-01', '2021-10-01', freq='MS')
assert len(list_of_months) == 60  # 5 years x 12 months
list_of_months.max()

Timestamp('2021-10-01 00:00:00')

In [38]:
# Build a district-by-month panel: one copy of the district-level predictors
# for every month in list_of_months, tagged with that month in a 'month' column.
# Collecting the pieces and concatenating once avoids re-copying the growing
# frame on every iteration, leaves df1 itself unmodified, and does not depend
# on how concat treats an empty seed frame when inferring dtypes.
# The row index of df1 is kept as-is, so index labels repeat once per month.
final_df = pd.concat([df1.assign(month=month) for month in list_of_months])

final_df.shape

(3540, 12)

In [39]:
# Count the distinct months in the panel; it should equal len(list_of_months).
final_df['month'].nunique(dropna=True)

60

In [41]:
# Split the borough/district code into its parts: the first character becomes
# 'borough_no', the remaining characters become 'cd', and the borough number
# is translated to a two-letter abbreviation in 'borough_code'.
# NOTE(review): the execution counter jumps from In [39] to In [41], so a cell
# that ran in between is not part of this notebook. These steps modify `df`
# (the frame loaded from combined.csv), not the month-expanded `final_df` —
# confirm that is intended.
borocd_text = df['borocd'].astype(str)
df['borocd'] = borocd_text
df['borough_no'] = borocd_text.str[0:1]
df['cd'] = borocd_text.str[1:]

borough_abbrev = {
    '1': 'MN',
    '2': 'BX',
    '3': 'BK',
    '4': 'QN',
    '5': 'SI',
}
df['borough_code'] = df['borough_no'].replace(borough_abbrev)

In [42]:
# Concatenate the borough abbreviation and the district part of the code
# into a single label per row.
df['community_district'] = df['borough_code'] + df['cd']

In [43]:
# Sanity check: count the distinct community-district labels.
df['community_district'].nunique(dropna=True)

59

In [44]:
# Persist the frame as CSV in the working directory; the row index is written
# as the first column.
# NOTE(review): this writes `df`, not the month-expanded `final_df` built
# earlier — confirm which frame the output file is meant to contain.
df.to_csv('ses_predictors_v2.csv', index=True)