In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
crime_with_proximity = pd.read_csv('../../data/pre_training/crime_with_proximity.csv')

In [3]:
clean_bike_trips = pd.read_csv('../../data/processed/clean_bike_trips.csv')
agg_public_healthindicator = pd.read_csv('../../data/processed/agg_public_healthindicator.csv')
clean_police_districts = pd.read_csv('../../data/processed/clean_police_districts.csv')
clean_train_ridership = pd.read_csv('../../data/processed/clean_train_ridership.csv')
clean_areas = pd.read_csv('../../data/processed/clean_areas.csv')
clean_crime = pd.read_csv('../../data/processed/clean_crime.csv')

In [4]:
crime_with_proximity = crime_with_proximity.merge(clean_crime[['id','areas']], on='id',how='inner')
crime_with_proximity = crime_with_proximity.rename(columns={'areas':'area_id'})

In [5]:
crime_with_proximity['date'] = pd.to_datetime(crime_with_proximity['date'])
crime_with_proximity['hour'] = crime_with_proximity['date'].dt.hour
crime_with_proximity['day'] = crime_with_proximity['date'].dt.date
crime_with_proximity['date_hour'] = crime_with_proximity['date'].dt.floor('h')
crime_with_proximity.sort_values('date', inplace=True)

In [6]:
district_crimes_over_hours = crime_with_proximity.groupby(['district', pd.Grouper(key='date', freq='h')])['id'].count().reset_index().rename(columns={'id': 'district_crimes_this_hour'})
district_crimes_over_hours

Unnamed: 0,district,date,district_crimes_this_hour
0,1,2016-01-01 00:00:00,7
1,1,2016-01-01 01:00:00,1
2,1,2016-01-01 02:00:00,3
3,1,2016-01-01 03:00:00,2
4,1,2016-01-01 04:00:00,2
...,...,...,...
638468,25,2020-12-30 20:00:00,2
638469,25,2020-12-30 21:00:00,3
638470,25,2020-12-30 22:00:00,2
638471,25,2020-12-30 23:00:00,1


In [7]:
time_windows = [1, 3, 6, 12, 24]

for window in time_windows:
    district_crimes_over_hours[f'district_crimes_{window}_hours_prev'] = district_crimes_over_hours.groupby('district')['district_crimes_this_hour'].rolling(window=window, min_periods=1).sum().shift(1).reset_index(level=0, drop=True)

district_crimes_over_hours = pd.merge(left=crime_with_proximity[['id', 'district', 'date_hour', 'hour', 'day']], right=district_crimes_over_hours, left_on=['district', 'date_hour'], right_on=['district', 'date'], how='left')

In [8]:
district_crimes_over_hours = district_crimes_over_hours[['id', 'district', 'date_hour', 'district_crimes_this_hour', 'district_crimes_1_hours_prev', 'district_crimes_3_hours_prev', 'district_crimes_6_hours_prev', 'district_crimes_12_hours_prev', 'district_crimes_24_hours_prev']]

In [9]:
area_crimes_over_hours = crime_with_proximity.groupby(['area_id', pd.Grouper(key='date', freq='h')])['id'].count().reset_index().rename(columns={'id': 'area_crimes_this_hour'})
area_crimes_over_hours

Unnamed: 0,area_id,date,area_crimes_this_hour
0,1,2016-01-01 00:00:00,2
1,1,2016-01-01 01:00:00,2
2,1,2016-01-01 03:00:00,2
3,1,2016-01-01 12:00:00,1
4,1,2016-01-02 00:00:00,1
...,...,...,...
897943,77,2020-12-29 10:00:00,2
897944,77,2020-12-29 11:00:00,1
897945,77,2020-12-30 17:00:00,1
897946,77,2020-12-30 19:00:00,1


In [10]:
time_windows = [1, 3, 6, 12, 24]

for window in time_windows:
    area_crimes_over_hours[f'area_crimes_{window}_hours_prev'] = area_crimes_over_hours.groupby('area_id')['area_crimes_this_hour'].rolling(window=window, min_periods=1).sum().shift(1).reset_index(level=0, drop=True)

area_crimes_over_hours = pd.merge(left=crime_with_proximity[['id', 'area_id', 'date_hour', 'hour', 'day']], right=area_crimes_over_hours, left_on=['area_id', 'date_hour'], right_on=['area_id', 'date'], how='left')

In [11]:
area_crimes_over_hours = area_crimes_over_hours[['id', 'area_id', 'date_hour', 'area_crimes_this_hour', 'area_crimes_1_hours_prev', 'area_crimes_3_hours_prev', 'area_crimes_6_hours_prev', 'area_crimes_12_hours_prev', 'area_crimes_24_hours_prev']]

In [12]:
clean_bike_trips['date'] = pd.to_datetime(clean_bike_trips['date'])
clean_bike_trips['hour'] = clean_bike_trips['date'].dt.hour
clean_bike_trips['date'] = clean_bike_trips['date'].dt.date

In [13]:
grouped_bike_trips = clean_bike_trips.groupby(['date', 'hour', 'district'])['station_id'].agg('count').reset_index().rename(columns={'station_id':'hourly_bike_rides'})
grouped_bike_trips['date'] = pd.to_datetime(grouped_bike_trips['date'])

In [14]:
final_df = pd.merge(left=crime_with_proximity, right=grouped_bike_trips, on=['date','hour','district'], how='left').drop(['hour','day'], axis=1).fillna(0)

In [15]:
agg_public_healthindicator.columns = ['district_' + col if col != 'district' else 'district' for col in agg_public_healthindicator.columns ]
final_df = pd.merge(left=final_df, right=agg_public_healthindicator, on='district', how='left')

In [16]:
final_df = pd.merge(left=final_df, right=clean_police_districts[['district','disadvantaged_score']], on='district', how='left')

In [17]:
clean_train_ridership['date'] = pd.to_datetime(clean_train_ridership['date'])

In [18]:
grouped_train_ridership = clean_train_ridership.groupby(['date','district'])['rides'].agg('sum').reset_index()

In [19]:
final_df = pd.merge(left=final_df, right=grouped_train_ridership, on=['date', 'district'], how='left').fillna(0)

In [20]:
final_df = pd.merge(left=final_df, right=district_crimes_over_hours.drop('district', axis=1), on='id', how='inner')
final_df = pd.merge(left=final_df, right=area_crimes_over_hours.drop('area_id', axis=1), on='id', how='inner')

In [21]:
final_df.columns

Index(['id', 'date', 'type', 'lat', 'long', 'district',
       'police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
       'alleylights_distance_0.1', 'alleylights_distance_0.3',
       'alleylights_distance_0.5', 'alleylights_distance_1',
       'alleylights_distance_3', 'alleylights_

In [22]:
date_range = pd.date_range(start='2016-01-01 00:00:00', end='2020-12-31 23:00:00', freq='h')
areas = np.arange(1, 78)
area_base_df = pd.DataFrame([(area, date) for area in areas for date in date_range], columns=['area_id', 'date_hour'])
area_base_df['day'] = area_base_df['date_hour'].dt.day
area_base_df['hour'] = area_base_df['date_hour'].dt.hour

In [23]:
area_base_df

Unnamed: 0,area_id,date_hour,day,hour
0,1,2016-01-01 00:00:00,1,0
1,1,2016-01-01 01:00:00,1,1
2,1,2016-01-01 02:00:00,1,2
3,1,2016-01-01 03:00:00,1,3
4,1,2016-01-01 04:00:00,1,4
...,...,...,...,...
3376291,77,2020-12-31 19:00:00,31,19
3376292,77,2020-12-31 20:00:00,31,20
3376293,77,2020-12-31 21:00:00,31,21
3376294,77,2020-12-31 22:00:00,31,22


In [24]:
area_base_df['year'] = area_base_df['date_hour'].dt.year
area_base_df['month'] = area_base_df['date_hour'].dt.month
area_base_df['day'] = area_base_df['date_hour'].dt.day
area_base_df['day_of_week'] = area_base_df['date_hour'].dt.dayofweek
area_base_df

Unnamed: 0,area_id,date_hour,day,hour,year,month,day_of_week
0,1,2016-01-01 00:00:00,1,0,2016,1,4
1,1,2016-01-01 01:00:00,1,1,2016,1,4
2,1,2016-01-01 02:00:00,1,2,2016,1,4
3,1,2016-01-01 03:00:00,1,3,2016,1,4
4,1,2016-01-01 04:00:00,1,4,2016,1,4
...,...,...,...,...,...,...,...
3376291,77,2020-12-31 19:00:00,31,19,2020,12,3
3376292,77,2020-12-31 20:00:00,31,20,2020,12,3
3376293,77,2020-12-31 21:00:00,31,21,2020,12,3
3376294,77,2020-12-31 22:00:00,31,22,2020,12,3


In [25]:
clean_public_healthindicator = pd.read_csv('../../data/processed/clean_public_healthindicator.csv')
disadvantaged_areas_within_areas = pd.read_csv('../../data/processed/disadvantaged_areas_within_areas.csv')

In [26]:
clean_public_healthindicator

Unnamed: 0,id,unemployment,per_capita_income,no_hs_dip,gov_depend,crowded_housing,below_pov
0,1,0.075,23714,0.181,0.288,0.079,0.227
1,2,0.079,21375,0.196,0.383,0.070,0.151
2,3,0.077,32355,0.136,0.222,0.046,0.227
3,4,0.068,35503,0.125,0.256,0.031,0.095
4,5,0.045,51615,0.054,0.255,0.002,0.071
...,...,...,...,...,...,...,...
72,73,0.183,19709,0.156,0.424,0.011,0.157
73,74,0.069,34221,0.045,0.370,0.011,0.031
74,75,0.149,26185,0.109,0.394,0.008,0.137
75,76,0.047,29402,0.110,0.265,0.019,0.095


In [27]:
base_df_with_area_stats = area_base_df.merge(right=clean_public_healthindicator, left_on='area_id', right_on='id', how='left')
base_df_with_area_stats = base_df_with_area_stats.merge(right=disadvantaged_areas_within_areas, left_on='area_id', right_on='areas', how='left')

In [28]:
filtered_df = ['police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
       'alleylights_distance_0.1', 'alleylights_distance_0.3',
       'alleylights_distance_0.5', 'alleylights_distance_1',
       'alleylights_distance_3', 'alleylights_distance_5',
       'streetlights_allout_distance_0.1', 'streetlights_allout_distance_0.3',
       'streetlights_allout_distance_0.5', 'streetlights_allout_distance_1',
       'streetlights_allout_distance_3', 'streetlights_allout_distance_5',
       'streetlights_oneout_distance_0.1', 'streetlights_oneout_distance_0.3',
       'streetlights_oneout_distance_0.5', 'streetlights_oneout_distance_1',
       'streetlights_oneout_distance_3', 'streetlights_oneout_distance_5',
       'bike_rides_within_0.1_and_5_min', 'bike_rides_within_0.3_and_5_min',
       'bike_rides_within_0.5_and_5_min', 'bike_rides_within_0.1_and_10_min',
       'bike_rides_within_0.3_and_10_min', 'bike_rides_within_0.5_and_10_min',
       'bike_rides_within_0.1_and_15_min', 'bike_rides_within_0.3_and_15_min',
       'bike_rides_within_0.5_and_15_min',
       'hourly_bike_rides', 'district_unemployment',
       'district_per_capita_income', 'district_no_hs_dip',
       'district_gov_depend', 'district_crowded_housing', 'district_below_pov',
       'disadvantaged_score', 'rides', 'district_crimes_this_hour',
       'district_crimes_1_hours_prev', 'district_crimes_3_hours_prev',
       'district_crimes_6_hours_prev', 'district_crimes_12_hours_prev',
       'district_crimes_24_hours_prev', 'area_crimes_this_hour',
       'area_crimes_1_hours_prev', 'area_crimes_3_hours_prev',
       'area_crimes_6_hours_prev', 'area_crimes_12_hours_prev',
       'area_crimes_24_hours_prev']

In [29]:
final_df_area_avg = final_df.groupby(['area_id','date_hour'])[filtered_df].agg('mean').reset_index()
final_df_district_avg = final_df.groupby(['district','date_hour'])[filtered_df].agg('mean').reset_index()

In [30]:
base_df_with_area_stats.rename(columns={
    'unemployment': 'area_unemployment',
    'per_capita_income': 'area_per_capita_income',
    'no_hs_dip': 'area_no_hs_dip',
    'gov_depend': 'area_gov_depend',
    'crowded_housing': 'area_crowded_housing',
    'below_pov': 'area_below_pov'
}, inplace=True)

In [31]:
base_df_with_area_stats

Unnamed: 0,area_id,date_hour,day,hour,year,month,day_of_week,id,area_unemployment,area_per_capita_income,area_no_hs_dip,area_gov_depend,area_crowded_housing,area_below_pov,poly,district,areas
0,1,2016-01-01 00:00:00,1,0,2016,1,4,1,0.075,23714,0.181,0.288,0.079,0.227,POLYGON ((42.01939800001747 -87.66368000002285...,24.0,1.0
1,1,2016-01-01 00:00:00,1,0,2016,1,4,1,0.075,23714,0.181,0.288,0.079,0.227,POLYGON ((42.01937400002132 -87.67335799998406...,24.0,1.0
2,1,2016-01-01 00:00:00,1,0,2016,1,4,1,0.075,23714,0.181,0.288,0.079,0.227,POLYGON ((41.998169000012545 -87.6624109999806...,24.0,1.0
3,1,2016-01-01 00:00:00,1,0,2016,1,4,1,0.075,23714,0.181,0.288,0.079,0.227,POLYGON ((42.00497899998903 -87.66572099996036...,24.0,1.0
4,1,2016-01-01 01:00:00,1,1,2016,1,4,1,0.075,23714,0.181,0.288,0.079,0.227,POLYGON ((42.01939800001747 -87.66368000002285...,24.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13329787,77,2020-12-31 23:00:00,31,23,2020,12,3,77,0.090,33364,0.090,0.234,0.039,0.166,POLYGON ((41.983635999999635 -87.6593119999893...,20.0,77.0
13329788,77,2020-12-31 23:00:00,31,23,2020,12,3,77,0.090,33364,0.090,0.234,0.039,0.166,POLYGON ((41.98368300002446 -87.65660900000702...,20.0,77.0
13329789,77,2020-12-31 23:00:00,31,23,2020,12,3,77,0.090,33364,0.090,0.234,0.039,0.166,POLYGON ((41.998316999982876 -87.6550000000143...,24.0,77.0
13329790,77,2020-12-31 23:00:00,31,23,2020,12,3,77,0.090,33364,0.090,0.234,0.039,0.166,POLYGON ((41.99662699998114 -87.65029099995624...,24.0,77.0


In [32]:
districts = pd.read_csv('../../data/processed/clean_police_districts.csv')

In [33]:
date_range = pd.date_range(start='2016-01-01 00:00:00', end='2020-12-31 23:00:00', freq='h')
districts = list(set(districts['district'].values))
district_base_df = pd.DataFrame([(district, date) for district in districts for date in date_range], columns=['district', 'date_hour'])
district_base_df['day'] = district_base_df['date_hour'].dt.day
district_base_df['hour'] = district_base_df['date_hour'].dt.hour

In [34]:
district_base_df

Unnamed: 0,district,date_hour,day,hour
0,1,2016-01-01 00:00:00,1,0
1,1,2016-01-01 01:00:00,1,1
2,1,2016-01-01 02:00:00,1,2
3,1,2016-01-01 03:00:00,1,3
4,1,2016-01-01 04:00:00,1,4
...,...,...,...,...
1008499,31,2020-12-31 19:00:00,31,19
1008500,31,2020-12-31 20:00:00,31,20
1008501,31,2020-12-31 21:00:00,31,21
1008502,31,2020-12-31 22:00:00,31,22


In [35]:
district_base_df['year'] = district_base_df['date_hour'].dt.year
district_base_df['month'] = district_base_df['date_hour'].dt.month
district_base_df['day'] = district_base_df['date_hour'].dt.day
district_base_df['day_of_week'] = district_base_df['date_hour'].dt.dayofweek
district_base_df

Unnamed: 0,district,date_hour,day,hour,year,month,day_of_week
0,1,2016-01-01 00:00:00,1,0,2016,1,4
1,1,2016-01-01 01:00:00,1,1,2016,1,4
2,1,2016-01-01 02:00:00,1,2,2016,1,4
3,1,2016-01-01 03:00:00,1,3,2016,1,4
4,1,2016-01-01 04:00:00,1,4,2016,1,4
...,...,...,...,...,...,...,...
1008499,31,2020-12-31 19:00:00,31,19,2020,12,3
1008500,31,2020-12-31 20:00:00,31,20,2020,12,3
1008501,31,2020-12-31 21:00:00,31,21,2020,12,3
1008502,31,2020-12-31 22:00:00,31,22,2020,12,3


In [36]:
district_df = disadvantaged_areas_within_areas.groupby('district')['areas'].apply(set).reset_index()
district_df.columns = ['district', 'areas']
district_df

Unnamed: 0,district,areas
0,1,{35}
1,2,"{35, 36, 38, 39, 40, 41}"
2,3,"{40, 42, 43, 69}"
3,4,"{43, 46, 47, 48, 51, 52}"
4,5,"{49, 50, 53, 54}"
5,6,"{44, 69, 71}"
6,7,"{67, 68, 69}"
7,8,"{56, 66, 58}"
8,9,"{34, 37, 58, 59, 60, 61, 63}"
9,10,"{29, 30}"


In [37]:
base_df_with_district_stats = district_base_df.merge(right=district_df, on='district', how='left')
base_df_with_district_stats = base_df_with_district_stats.explode('areas')
base_df_with_district_stats = base_df_with_district_stats.merge(right=clean_public_healthindicator, left_on='areas', right_on='id', how='left')

In [38]:
base_df_with_district_stats

Unnamed: 0,district,date_hour,day,hour,year,month,day_of_week,areas,id,unemployment,per_capita_income,no_hs_dip,gov_depend,crowded_housing,below_pov
0,1,2016-01-01 00:00:00,1,0,2016,1,4,35,35.0,0.167,23098.0,0.169,0.31,0.016,0.261
1,1,2016-01-01 01:00:00,1,1,2016,1,4,35,35.0,0.167,23098.0,0.169,0.31,0.016,0.261
2,1,2016-01-01 02:00:00,1,2,2016,1,4,35,35.0,0.167,23098.0,0.169,0.31,0.016,0.261
3,1,2016-01-01 03:00:00,1,3,2016,1,4,35,35.0,0.167,23098.0,0.169,0.31,0.016,0.261
4,1,2016-01-01 04:00:00,1,4,2016,1,4,35,35.0,0.167,23098.0,0.169,0.31,0.016,0.261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3113203,31,2020-12-31 19:00:00,31,19,2020,12,3,,,,,,,,
3113204,31,2020-12-31 20:00:00,31,20,2020,12,3,,,,,,,,
3113205,31,2020-12-31 21:00:00,31,21,2020,12,3,,,,,,,,
3113206,31,2020-12-31 22:00:00,31,22,2020,12,3,,,,,,,,


In [39]:
base_df_with_district_stats = base_df_with_district_stats.groupby(['district', 'date_hour', 'day', 'hour', 'year', 'month', 'day_of_week'])[['unemployment', 'per_capita_income', 'no_hs_dip', 'gov_depend', 'crowded_housing', 'below_pov']].agg('mean').reset_index()
base_df_with_district_stats

Unnamed: 0,district,date_hour,day,hour,year,month,day_of_week,unemployment,per_capita_income,no_hs_dip,gov_depend,crowded_housing,below_pov
0,1,2016-01-01 00:00:00,1,0,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
1,1,2016-01-01 01:00:00,1,1,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
2,1,2016-01-01 02:00:00,1,2,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
3,1,2016-01-01 03:00:00,1,3,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
4,1,2016-01-01 04:00:00,1,4,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1008499,31,2020-12-31 19:00:00,31,19,2020,12,3,,,,,,
1008500,31,2020-12-31 20:00:00,31,20,2020,12,3,,,,,,
1008501,31,2020-12-31 21:00:00,31,21,2020,12,3,,,,,,
1008502,31,2020-12-31 22:00:00,31,22,2020,12,3,,,,,,


In [40]:
base_df_with_district_stats.rename(columns={
    'unemployment': 'area_unemployment',
    'per_capita_income': 'area_per_capita_income',
    'no_hs_dip': 'area_no_hs_dip',
    'gov_depend': 'area_gov_depend',
    'crowded_housing': 'area_crowded_housing',
    'below_pov': 'area_below_pov'
}, inplace=True)

In [41]:
base_df_with_district_stats

Unnamed: 0,district,date_hour,day,hour,year,month,day_of_week,area_unemployment,area_per_capita_income,area_no_hs_dip,area_gov_depend,area_crowded_housing,area_below_pov
0,1,2016-01-01 00:00:00,1,0,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
1,1,2016-01-01 01:00:00,1,1,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
2,1,2016-01-01 02:00:00,1,2,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
3,1,2016-01-01 03:00:00,1,3,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
4,1,2016-01-01 04:00:00,1,4,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1008499,31,2020-12-31 19:00:00,31,19,2020,12,3,,,,,,
1008500,31,2020-12-31 20:00:00,31,20,2020,12,3,,,,,,
1008501,31,2020-12-31 21:00:00,31,21,2020,12,3,,,,,,
1008502,31,2020-12-31 22:00:00,31,22,2020,12,3,,,,,,


In [42]:
area_final_df = pd.merge(base_df_with_area_stats, final_df_area_avg, on=['area_id','date_hour'], how='left')
district_final_df = pd.merge(base_df_with_district_stats, final_df_district_avg, on=['district','date_hour'], how='left')

In [43]:
area_final_df['temp_id'] = area_final_df.index
district_final_df['temp_id'] = district_final_df.index

In [44]:
area_null_districts = area_final_df[area_final_df['district'].isna()][['temp_id', 'area_id']]
area_null_districts = area_null_districts.merge(clean_areas[['id', 'district']], left_on='area_id', right_on='id', how='inner')
area_final_df.loc[area_final_df['district'].isna(), 'district'] = area_final_df['temp_id'].map(area_null_districts.set_index('temp_id')['district'])

In [45]:
area_final_df = area_final_df.drop(['id', 'poly', 'areas', 'temp_id', 'district_crimes_this_hour'], axis=1)
district_final_df = district_final_df.drop(['temp_id', 'area_crimes_this_hour'], axis=1)

In [46]:
area_final_df.columns

Index(['area_id', 'date_hour', 'day', 'hour', 'year', 'month', 'day_of_week',
       'area_unemployment', 'area_per_capita_income', 'area_no_hs_dip',
       'area_gov_depend', 'area_crowded_housing', 'area_below_pov', 'district',
       'police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_

In [47]:
district_final_df.columns

Index(['district', 'date_hour', 'day', 'hour', 'year', 'month', 'day_of_week',
       'area_unemployment', 'area_per_capita_income', 'area_no_hs_dip',
       'area_gov_depend', 'area_crowded_housing', 'area_below_pov',
       'police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
       

#### Normalize Columns

In [48]:
area_columns_to_normalize = [
    'area_unemployment', 'area_per_capita_income', 'area_no_hs_dip',
    'area_gov_depend', 'area_crowded_housing', 'area_below_pov',
    'police_stations_distance_0.1', 'police_stations_distance_0.3',
    'police_stations_distance_0.5', 'police_stations_distance_1',
    'police_stations_distance_3', 'police_stations_distance_5',
    'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
    'bike_stations_distance_0.5', 'bike_stations_distance_1',
    'bike_stations_distance_3', 'bike_stations_distance_5',
    'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
    'bus_stops_distance_0.5', 'bus_stops_distance_1',
    'bus_stops_distance_3', 'bus_stops_distance_5',
    'train_stations_distance_0.1', 'train_stations_distance_0.3',
    'train_stations_distance_0.5', 'train_stations_distance_1',
    'train_stations_distance_3', 'train_stations_distance_5',
    'alleylights_distance_0.1', 'alleylights_distance_0.3',
    'alleylights_distance_0.5', 'alleylights_distance_1',
    'alleylights_distance_3', 'alleylights_distance_5',
    'streetlights_allout_distance_0.1', 'streetlights_allout_distance_0.3',
    'streetlights_allout_distance_0.5', 'streetlights_allout_distance_1',
    'streetlights_allout_distance_3', 'streetlights_allout_distance_5',
    'streetlights_oneout_distance_0.1', 'streetlights_oneout_distance_0.3',
    'streetlights_oneout_distance_0.5', 'streetlights_oneout_distance_1',
    'streetlights_oneout_distance_3', 'streetlights_oneout_distance_5',
    'bike_rides_within_0.1_and_5_min', 'bike_rides_within_0.3_and_5_min',
    'bike_rides_within_0.5_and_5_min', 'bike_rides_within_0.1_and_10_min',
    'bike_rides_within_0.3_and_10_min', 'bike_rides_within_0.5_and_10_min',
    'bike_rides_within_0.1_and_15_min', 'bike_rides_within_0.3_and_15_min',
    'bike_rides_within_0.5_and_15_min', 'hourly_bike_rides',
    'district_unemployment', 'district_per_capita_income',
    'district_no_hs_dip', 'district_gov_depend', 'district_crowded_housing',
    'district_below_pov', 'disadvantaged_score', 'rides', 'district_crimes_1_hours_prev',
    'district_crimes_3_hours_prev', 'district_crimes_6_hours_prev',
    'district_crimes_12_hours_prev', 'district_crimes_24_hours_prev',
    'area_crimes_this_hour', 'area_crimes_1_hours_prev',
    'area_crimes_3_hours_prev', 'area_crimes_6_hours_prev',
    'area_crimes_12_hours_prev', 'area_crimes_24_hours_prev'
]

In [49]:
district_columns_to_normalize = area_columns_to_normalize.copy()
district_columns_to_normalize.remove('area_crimes_this_hour') 
district_columns_to_normalize.append('district_crimes_this_hour')

In [50]:
scaler = MinMaxScaler()
area_final_df[area_columns_to_normalize] = scaler.fit_transform(area_final_df[area_columns_to_normalize])
district_final_df[district_columns_to_normalize] = scaler.fit_transform(district_final_df[district_columns_to_normalize])

In [51]:
print([f'{col}: {area_final_df[col].isna().sum()}' for col in area_final_df.columns])

['area_id: 0', 'date_hour: 0', 'day: 0', 'hour: 0', 'year: 0', 'month: 0', 'day_of_week: 0', 'area_unemployment: 0', 'area_per_capita_income: 0', 'area_no_hs_dip: 0', 'area_gov_depend: 0', 'area_crowded_housing: 0', 'area_below_pov: 0', 'district: 0', 'police_stations_distance_0.1: 8056753', 'police_stations_distance_0.3: 8056753', 'police_stations_distance_0.5: 8056753', 'police_stations_distance_1: 8056753', 'police_stations_distance_3: 8056753', 'police_stations_distance_5: 8056753', 'bike_stations_distance_0.1: 8056753', 'bike_stations_distance_0.3: 8056753', 'bike_stations_distance_0.5: 8056753', 'bike_stations_distance_1: 8056753', 'bike_stations_distance_3: 8056753', 'bike_stations_distance_5: 8056753', 'bus_stops_distance_0.1: 8056753', 'bus_stops_distance_0.3: 8056753', 'bus_stops_distance_0.5: 8056753', 'bus_stops_distance_1: 8056753', 'bus_stops_distance_3: 8056753', 'bus_stops_distance_5: 8056753', 'train_stations_distance_0.1: 8056753', 'train_stations_distance_0.3: 805675

In [52]:
print([f'{col}: {district_final_df[col].isna().sum()}' for col in district_final_df.columns])

['district: 0', 'date_hour: 0', 'day: 0', 'hour: 0', 'year: 0', 'month: 0', 'day_of_week: 0', 'area_unemployment: 131544', 'area_per_capita_income: 131544', 'area_no_hs_dip: 131544', 'area_gov_depend: 131544', 'area_crowded_housing: 131544', 'area_below_pov: 131544', 'police_stations_distance_0.1: 370031', 'police_stations_distance_0.3: 370031', 'police_stations_distance_0.5: 370031', 'police_stations_distance_1: 370031', 'police_stations_distance_3: 370031', 'police_stations_distance_5: 370031', 'bike_stations_distance_0.1: 370031', 'bike_stations_distance_0.3: 370031', 'bike_stations_distance_0.5: 370031', 'bike_stations_distance_1: 370031', 'bike_stations_distance_3: 370031', 'bike_stations_distance_5: 370031', 'bus_stops_distance_0.1: 370031', 'bus_stops_distance_0.3: 370031', 'bus_stops_distance_0.5: 370031', 'bus_stops_distance_1: 370031', 'bus_stops_distance_3: 370031', 'bus_stops_distance_5: 370031', 'train_stations_distance_0.1: 370031', 'train_stations_distance_0.3: 370031', 

In [53]:
area_final_df.fillna(0, inplace=True)
district_final_df.fillna(0, inplace=True)

In [55]:
area_final_df.to_csv('../../data/pre_training/area_pre_feature_selection.csv', index=False)
district_final_df.to_csv('../../data/pre_training/district_pre_feature_selection.csv', index=False)