In [22]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler



In [2]:
crime_with_proximity = pd.read_csv('../../data/pre_training/crime_with_proximity.csv')

In [3]:
clean_bike_trips = pd.read_csv('../../data/processed/clean_bike_trips.csv')
clean_public_healthindicator = pd.read_csv('../../data/processed/clean_public_healthindicator.csv')
clean_police_districts = pd.read_csv('../../data/processed/clean_police_districts.csv')
clean_train_ridership = pd.read_csv('../../data/processed/clean_train_ridership.csv')

In [4]:
clean_bike_trips['date'] = pd.to_datetime(clean_bike_trips['date'])
clean_bike_trips['hour'] = clean_bike_trips['date'].dt.hour
clean_bike_trips['date'] = clean_bike_trips['date'].dt.date

In [5]:
grouped_bike_trips = clean_bike_trips.groupby(['date', 'hour', 'district'])['station_id'].agg('count').reset_index().rename(columns={'station_id':'hourly_bike_rides'})
grouped_bike_trips['date'] = pd.to_datetime(grouped_bike_trips['date'])

In [6]:
crime_with_proximity['date'] = pd.to_datetime(crime_with_proximity['date'])
crime_with_proximity['hour'] = crime_with_proximity['date'].dt.hour
crime_with_proximity['day'] = crime_with_proximity['date'].dt.date
crime_with_proximity['date_hour'] = crime_with_proximity['date'].dt.floor('h')
crime_with_proximity.sort_values('date', inplace=True)

In [7]:
crimes_over_hours = crime_with_proximity.groupby(['district', pd.Grouper(key='date', freq='h')])['id'].count().reset_index().rename(columns={'id': 'crimes_this_hour'})
crimes_over_hours

Unnamed: 0,district,date,crimes_this_hour
0,1,2016-01-01 00:00:00,7
1,1,2016-01-01 01:00:00,1
2,1,2016-01-01 02:00:00,3
3,1,2016-01-01 03:00:00,2
4,1,2016-01-01 04:00:00,2
...,...,...,...
638876,31,2020-12-15 21:00:00,1
638877,31,2020-12-20 09:00:00,1
638878,31,2020-12-24 19:00:00,1
638879,31,2020-12-25 03:00:00,1


In [8]:
time_windows = [1, 3, 6, 12, 24]

for window in time_windows:
    crimes_over_hours[f'crimes_{window}_hours_prev'] = crimes_over_hours.groupby('district')['crimes_this_hour'].rolling(window=window, min_periods=1).sum().shift(1).reset_index(level=0, drop=True)

crimes_over_hours = pd.merge(left=crime_with_proximity[['id', 'district', 'date_hour']], right=crimes_over_hours, left_on=['district', 'date_hour'], right_on=['district', 'date'], how='left')

In [9]:
crimes_over_hours = crimes_over_hours[['id', 'crimes_this_hour', 'crimes_1_hours_prev', 'crimes_3_hours_prev', 'crimes_6_hours_prev', 'crimes_12_hours_prev', 'crimes_24_hours_prev']]

In [10]:
final_df = pd.merge(left=crime_with_proximity, right=grouped_bike_trips, on=['date','hour','district'], how='left').drop(['hour','day'], axis=1).fillna(0)

In [11]:
final_df = pd.merge(left=final_df, right=clean_public_healthindicator, on='district', how='left')

In [12]:
final_df = pd.merge(left=final_df, right=clean_police_districts[['district','disadvantaged_score']], on='district', how='left')

In [13]:
clean_train_ridership['date'] = pd.to_datetime(clean_train_ridership['date'])

In [14]:
grouped_train_ridership = clean_train_ridership.groupby(['date','district'])['rides'].agg('sum').reset_index()

In [15]:
final_df = pd.merge(left=final_df, right=grouped_train_ridership, on=['date', 'district'], how='left').fillna(0)

In [16]:
final_df = pd.merge(left=final_df, right=crimes_over_hours, on='id', how='inner')

In [19]:
final_df.columns

Index(['id', 'date', 'type', 'lat', 'long', 'district',
       'police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
       'alleylights_distance_0.1', 'alleylights_distance_0.3',
       'alleylights_distance_0.5', 'alleylights_distance_1',
       'alleylights_distance_3', 'alleylights_

#### Normalize Columns

In [20]:
columns_to_normalize = [
    'police_stations_distance_0.1', 'police_stations_distance_0.3',
    'bike_stations_distance_0.1', 'bike_stations_distance_0.3', 'bike_stations_distance_0.5', 
    'bike_stations_distance_1', 'bike_stations_distance_3', 'bike_stations_distance_5',
    'bus_stops_distance_0.1', 'bus_stops_distance_0.3', 'bus_stops_distance_0.5', 
    'bus_stops_distance_1', 'bus_stops_distance_3', 'bus_stops_distance_5',
    'train_stations_distance_0.1', 'train_stations_distance_0.3', 'train_stations_distance_0.5', 
    'train_stations_distance_1', 'train_stations_distance_3', 'train_stations_distance_5',
    'alleylights_distance_0.1', 'alleylights_distance_0.3', 'alleylights_distance_0.5', 
    'alleylights_distance_1', 'alleylights_distance_3', 'alleylights_distance_5',
    'streetlights_allout_distance_0.1', 'streetlights_allout_distance_0.3', 
    'streetlights_allout_distance_0.5', 'streetlights_allout_distance_1', 
    'streetlights_allout_distance_3', 'streetlights_allout_distance_5',
    'streetlights_oneout_distance_0.1', 'streetlights_oneout_distance_0.3', 
    'streetlights_oneout_distance_0.5', 'streetlights_oneout_distance_1', 
    'streetlights_oneout_distance_3', 'streetlights_oneout_distance_5',
    'per_capita_income', 'rides', 'unemployment', 'no_hs_dip', 
    'gov_depend', 'crowded_housing', 'below_pov', 'disadvantaged_score', 
    'crimes_this_hour', 'crimes_1_hours_prev', 'crimes_3_hours_prev', 
    'crimes_6_hours_prev', 'crimes_12_hours_prev', 'crimes_24_hours_prev']

In [23]:
scaler = MinMaxScaler()
final_df[columns_to_normalize] = scaler.fit_transform(final_df[columns_to_normalize])

In [24]:
final_df

Unnamed: 0,id,date,type,lat,long,district,police_stations_distance_0.1,police_stations_distance_0.3,police_stations_distance_0.5,police_stations_distance_1,...,crowded_housing,below_pov,disadvantaged_score,rides,crimes_this_hour,crimes_1_hours_prev,crimes_3_hours_prev,crimes_6_hours_prev,crimes_12_hours_prev,crimes_24_hours_prev
0,HZ262045,2016-01-01,OFFENSE INVOLVING CHILDREN,41.923311,-87.710436,14,0.0,0.0,0,1,...,0.383523,0.423623,0.096774,0.020799,0.161290,0.000000,0.012195,0.027397,0.071429,0.150943
1,JC492397,2016-01-01,OFFENSE INVOLVING CHILDREN,41.945919,-87.763575,16,0.0,0.0,0,0,...,0.209416,0.204009,0.000000,0.022505,0.145161,0.000000,0.018293,0.027397,0.058824,0.116981
2,HZ290771,2016-01-01,OFFENSE INVOLVING CHILDREN,41.951464,-87.675547,19,0.0,0.0,0,0,...,0.170455,0.357904,0.064516,0.073845,0.048387,0.000000,0.024390,0.031963,0.063025,0.139623
3,HZ157088,2016-01-01,SEX OFFENSE,41.777626,-87.611961,3,0.0,0.0,0,1,...,0.252841,0.758437,0.741935,0.001862,0.241935,0.016129,0.018293,0.031963,0.063025,0.105660
4,HZ237276,2016-01-01,DECEPTIVE PRACTICE,41.907110,-87.674665,14,0.0,0.0,0,0,...,0.383523,0.423623,0.096774,0.020799,0.161290,0.000000,0.012195,0.027397,0.071429,0.150943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259991,JE122096,2020-12-31,CRIMINAL DAMAGE,41.760603,-87.584783,3,0.0,0.0,0,0,...,0.252841,0.758437,0.741935,0.000993,0.064516,0.000000,0.018293,0.027397,0.063025,0.132075
1259992,JD474274,2020-12-31,MOTOR VEHICLE THEFT,41.784822,-87.618972,3,0.0,0.0,0,0,...,0.252841,0.758437,0.741935,0.000993,0.064516,0.000000,0.018293,0.027397,0.063025,0.132075
1259993,JE114345,2020-12-31,DECEPTIVE PRACTICE,41.867130,-87.720990,11,0.0,0.0,0,0,...,0.784091,1.000000,0.967742,0.002655,0.000000,0.000000,0.012195,0.036530,0.075630,0.143396
1259994,JD474657,2020-12-31,DECEPTIVE PRACTICE,42.013976,-87.812707,16,0.0,0.0,0,0,...,0.209416,0.204009,0.000000,0.011709,0.000000,0.000000,0.018293,0.031963,0.075630,0.162264


In [25]:
final_df.to_csv('../../data/pre_training/pre_feature_selection.csv', index=False)