In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler



In [2]:
crime_with_proximity = pd.read_csv('../../data/pre_training/crime_with_proximity.csv')

In [3]:
clean_bike_trips = pd.read_csv('../../data/processed/clean_bike_trips.csv')
agg_public_healthindicator = pd.read_csv('../../data/processed/agg_public_healthindicator.csv')
clean_police_districts = pd.read_csv('../../data/processed/clean_police_districts.csv')
clean_train_ridership = pd.read_csv('../../data/processed/clean_train_ridership.csv')

In [None]:
crime_with_proximity['date'] = pd.to_datetime(crime_with_proximity['date'])
crime_with_proximity['hour'] = crime_with_proximity['date'].dt.hour
crime_with_proximity['day'] = crime_with_proximity['date'].dt.date
crime_with_proximity['date_hour'] = crime_with_proximity['date'].dt.floor('h')
crime_with_proximity.sort_values('date', inplace=True)

In [None]:
crimes_over_hours = crime_with_proximity.groupby(['district', pd.Grouper(key='date', freq='h')])['id'].count().reset_index().rename(columns={'id': 'crimes_this_hour'})
crimes_over_hours

Unnamed: 0,district,date,crimes_this_hour
0,1,2016-01-01 00:00:00,7
1,1,2016-01-01 01:00:00,1
2,1,2016-01-01 02:00:00,3
3,1,2016-01-01 03:00:00,2
4,1,2016-01-01 04:00:00,2
...,...,...,...
638876,31,2020-12-15 21:00:00,1
638877,31,2020-12-20 09:00:00,1
638878,31,2020-12-24 19:00:00,1
638879,31,2020-12-25 03:00:00,1


In [None]:
time_windows = [1, 3, 6, 12, 24]

for window in time_windows:
    crimes_over_hours[f'crimes_{window}_hours_prev'] = crimes_over_hours.groupby('district')['crimes_this_hour'].rolling(window=window, min_periods=1).sum().shift(1).reset_index(level=0, drop=True)

crimes_over_hours = pd.merge(left=crime_with_proximity[['id', 'district', 'date_hour', 'hour', 'day']], right=crimes_over_hours, left_on=['district', 'date_hour'], right_on=['district', 'date'], how='left')

In [None]:
crimes_over_hours = crimes_over_hours[['id', 'crimes_this_hour', 'crimes_1_hours_prev', 'crimes_3_hours_prev', 'crimes_6_hours_prev', 'crimes_12_hours_prev', 'crimes_24_hours_prev']]

In [None]:
clean_bike_trips['date'] = pd.to_datetime(clean_bike_trips['date'])
clean_bike_trips['hour'] = clean_bike_trips['date'].dt.hour
clean_bike_trips['date'] = clean_bike_trips['date'].dt.date

In [None]:
grouped_bike_trips = clean_bike_trips.groupby(['date', 'hour', 'district'])['station_id'].agg('count').reset_index().rename(columns={'station_id':'hourly_bike_rides'})
grouped_bike_trips['date'] = pd.to_datetime(grouped_bike_trips['date'])

In [None]:
final_df = pd.merge(left=crime_with_proximity, right=grouped_bike_trips, on=['date','hour','district'], how='left').drop(['hour','day'], axis=1).fillna(0)

In [None]:
agg_public_healthindicator.columns = ['district_' + col if col != 'district' else 'district' for col in agg_public_healthindicator.columns ]
final_df = pd.merge(left=final_df, right=agg_public_healthindicator, on='district', how='left')

In [None]:
final_df = pd.merge(left=final_df, right=clean_police_districts[['district','disadvantaged_score']], on='district', how='left')

In [None]:
clean_train_ridership['date'] = pd.to_datetime(clean_train_ridership['date'])

In [None]:
clean_train_ridership

Unnamed: 0,date,line,station_name,lat,long,district,areas
0,2017-12-22,Blue Line,Jefferson Park,41.970642,-87.760898,16,11
1,2017-12-18,Red Line,Cermak-Chinatown,41.853214,-87.630974,9,34
2,2017-12-07,Orange Line,35th/Archer,41.829568,-87.680593,9,59
3,2017-12-07,"Brown, Purple (Express)",Wellington,41.936040,-87.653272,19,6
4,2017-12-22,"Purple, Red Line",Wilson,41.964262,-87.657740,19,3
...,...,...,...,...,...,...,...
116345,2017-08-06,Blue Line,UIC-Halsted,41.875524,-87.649648,12,28
116346,2016-11-04,Brown Line,Rockwell,41.966218,-87.694117,19,4
116347,2016-09-16,Green Line,43rd,41.816471,-87.619042,2,38
116348,2017-11-15,"Brown, Purple (Express)",Armitage,41.918224,-87.652650,18,7


In [None]:
grouped_train_ridership = clean_train_ridership.groupby(['date','district'])['rides'].agg('sum').reset_index()

KeyError: 'Column not found: rides'

In [None]:
final_df = pd.merge(left=final_df, right=grouped_train_ridership, on=['date', 'district'], how='left').fillna(0)

In [None]:
final_df = pd.merge(left=final_df, right=crimes_over_hours, on='id', how='inner')

In [None]:
final_df.columns

Index(['id', 'date', 'type', 'lat', 'long', 'district',
       'police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
       'alleylights_distance_0.1', 'alleylights_distance_0.3',
       'alleylights_distance_0.5', 'alleylights_distance_1',
       'alleylights_distance_3', 'alleylights_

In [21]:
date_range = pd.date_range(start='2016-01-01 00:00:00', end='2020-12-31 23:00:00', freq='h')
areas = np.arange(1, 78)
base_df = pd.DataFrame([(area, date) for area in areas for date in date_range], columns=['area_id', 'date_hour'])
base_df['day'] = base_df['date_hour'].dt.day
base_df['hour'] = base_df['date_hour'].dt.hour

In [22]:
base_df

Unnamed: 0,area_id,date_hour,day,hour
0,1,2016-01-01 00:00:00,1,0
1,1,2016-01-01 01:00:00,1,1
2,1,2016-01-01 02:00:00,1,2
3,1,2016-01-01 03:00:00,1,3
4,1,2016-01-01 04:00:00,1,4
...,...,...,...,...
3376291,77,2020-12-31 19:00:00,31,19
3376292,77,2020-12-31 20:00:00,31,20
3376293,77,2020-12-31 21:00:00,31,21
3376294,77,2020-12-31 22:00:00,31,22


In [23]:
base_df['day_of_week'] = base_df['date_hour'].dt.dayofweek
base_df['day_of_month'] = base_df['date_hour'].dt.day
base_df['month'] = base_df['date_hour'].dt.month
base_df

Unnamed: 0,area_id,date_hour,day,hour,day_of_week,day_of_month,month
0,1,2016-01-01 00:00:00,1,0,4,1,1
1,1,2016-01-01 01:00:00,1,1,4,1,1
2,1,2016-01-01 02:00:00,1,2,4,1,1
3,1,2016-01-01 03:00:00,1,3,4,1,1
4,1,2016-01-01 04:00:00,1,4,4,1,1
...,...,...,...,...,...,...,...
3376291,77,2020-12-31 19:00:00,31,19,3,31,12
3376292,77,2020-12-31 20:00:00,31,20,3,31,12
3376293,77,2020-12-31 21:00:00,31,21,3,31,12
3376294,77,2020-12-31 22:00:00,31,22,3,31,12


In [None]:
clean_public_healthindicator = pd.read_csv('../../data/processed/clean_public_healthindicator.csv')
disadvantaged_areas_within_areas = pd.read_csv('../../data/processed/disadvantaged_areas_within_areas.csv')

In [None]:
base_df_with_area_stats = base_df.merge(right=clean_public_healthindicator, left_on='area_id', right_on='id', how='left')
base_df_with_area_stats = base_df_with_area_stats.merge(right=disadvantaged_areas_within_areas, left_on='area_id', right_on='areas', how='left')

In [None]:
filtered_df = ['police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
       'alleylights_distance_0.1', 'alleylights_distance_0.3',
       'alleylights_distance_0.5', 'alleylights_distance_1',
       'alleylights_distance_3', 'alleylights_distance_5',
       'streetlights_allout_distance_0.1', 'streetlights_allout_distance_0.3',
       'streetlights_allout_distance_0.5', 'streetlights_allout_distance_1',
       'streetlights_allout_distance_3', 'streetlights_allout_distance_5',
       'streetlights_oneout_distance_0.1', 'streetlights_oneout_distance_0.3',
       'streetlights_oneout_distance_0.5', 'streetlights_oneout_distance_1',
       'streetlights_oneout_distance_3', 'streetlights_oneout_distance_5',
       'hourly_bike_rides', 'district_unemployment', 'district_per_capita_income',
       'district_no_hs_dip', 'district_gov_depend', 'district_crowded_housing', 
       'district_below_pov', 'crimes_this_hour', 'crimes_1_hours_prev', 'crimes_3_hours_prev',
       'crimes_6_hours_prev', 'crimes_12_hours_prev', 'crimes_24_hours_prev']

In [None]:
crime_counts = final_df.groupby(['area', 'date_hour'])['id'].agg('nunique').reset_index()
final_df_area_avg = final_df.groupby(['area','date_hour'])[filtered_df].agg('mean').reset_index()
final_df = pd.merge(crime_counts, final_df_area_avg, on=['area','date_hour'], how='inner')

In [None]:
final_df = pd.merge(base_df_with_area_stats, final_df, on=['area','date_hour'], how='left')

In [None]:
final_df

#### Normalize Columns

In [None]:
columns_to_normalize = []

In [None]:
scaler = MinMaxScaler()
final_df[columns_to_normalize] = scaler.fit_transform(final_df[columns_to_normalize])

In [None]:
final_df

In [None]:
final_df.to_csv('../../data/pre_training/pre_feature_selection.csv', index=False)