In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from shapely.geometry import Point, Polygon
from shapely import geometry

In [2]:
crime_with_proximity = pd.read_csv('../../data/pre_training/crime_with_proximity.csv')

In [3]:
clean_bike_trips = pd.read_csv('../../data/processed/clean_bike_trips.csv')
agg_public_healthindicator = pd.read_csv('../../data/processed/agg_public_healthindicator.csv')
clean_police_districts = pd.read_csv('../../data/processed/clean_police_districts.csv')
clean_train_ridership = pd.read_csv('../../data/processed/clean_train_ridership.csv')
clean_areas = pd.read_csv('../../data/processed/clean_areas.csv')
clean_crime = pd.read_csv('../../data/processed/clean_crime.csv')
clean_dis_areas = pd.read_csv('../../data/processed/disadvantaged_areas_within_areas.csv')

In [4]:
crime_with_proximity = crime_with_proximity.merge(clean_crime[['id','areas']], on='id',how='inner')
crime_with_proximity = crime_with_proximity.rename(columns={'areas':'area_id'})

In [5]:
crime_with_proximity['date'] = pd.to_datetime(crime_with_proximity['date'])
crime_with_proximity['hour'] = crime_with_proximity['date'].dt.hour
crime_with_proximity['day'] = crime_with_proximity['date'].dt.date
crime_with_proximity['date_hour'] = crime_with_proximity['date'].dt.floor('h')
crime_with_proximity.sort_values('date', inplace=True)

In [6]:
district_crimes_over_hours = crime_with_proximity.groupby(['district', pd.Grouper(key='date', freq='h')])['id'].count().reset_index().rename(columns={'id': 'district_crimes_this_hour'})
district_crimes_over_hours

Unnamed: 0,district,date,district_crimes_this_hour
0,1,2016-01-01 00:00:00,7
1,1,2016-01-01 01:00:00,1
2,1,2016-01-01 02:00:00,3
3,1,2016-01-01 03:00:00,2
4,1,2016-01-01 04:00:00,2
...,...,...,...
638468,25,2020-12-30 20:00:00,2
638469,25,2020-12-30 21:00:00,3
638470,25,2020-12-30 22:00:00,2
638471,25,2020-12-30 23:00:00,1


In [7]:
time_windows = [1, 3, 6, 12, 24]

for window in time_windows:
    district_crimes_over_hours[f'district_crimes_{window}_hours_prev'] = district_crimes_over_hours.groupby('district')['district_crimes_this_hour'].rolling(window=window, min_periods=1).sum().shift(1).reset_index(level=0, drop=True)

district_crimes_over_hours = pd.merge(left=crime_with_proximity[['id', 'district', 'date_hour', 'hour', 'day']], right=district_crimes_over_hours, left_on=['district', 'date_hour'], right_on=['district', 'date'], how='left')

In [8]:
district_crimes_over_hours = district_crimes_over_hours[['id', 'district', 'date_hour', 'district_crimes_this_hour', 'district_crimes_1_hours_prev', 'district_crimes_3_hours_prev', 'district_crimes_6_hours_prev', 'district_crimes_12_hours_prev', 'district_crimes_24_hours_prev']]

In [9]:
area_crimes_over_hours = crime_with_proximity.groupby(['area_id', pd.Grouper(key='date', freq='h')])['id'].count().reset_index().rename(columns={'id': 'area_crimes_this_hour'})
area_crimes_over_hours

Unnamed: 0,area_id,date,area_crimes_this_hour
0,1,2016-01-01 00:00:00,2
1,1,2016-01-01 01:00:00,2
2,1,2016-01-01 03:00:00,2
3,1,2016-01-01 12:00:00,1
4,1,2016-01-02 00:00:00,1
...,...,...,...
897943,77,2020-12-29 10:00:00,2
897944,77,2020-12-29 11:00:00,1
897945,77,2020-12-30 17:00:00,1
897946,77,2020-12-30 19:00:00,1


In [10]:
time_windows = [1, 3, 6, 12, 24]

for window in time_windows:
    area_crimes_over_hours[f'area_crimes_{window}_hours_prev'] = area_crimes_over_hours.groupby('area_id')['area_crimes_this_hour'].rolling(window=window, min_periods=1).sum().shift(1).reset_index(level=0, drop=True)

area_crimes_over_hours = pd.merge(left=crime_with_proximity[['id', 'area_id', 'date_hour', 'hour', 'day']], right=area_crimes_over_hours, left_on=['area_id', 'date_hour'], right_on=['area_id', 'date'], how='left')

In [11]:
area_crimes_over_hours = area_crimes_over_hours[['id', 'area_id', 'date_hour', 'area_crimes_this_hour', 'area_crimes_1_hours_prev', 'area_crimes_3_hours_prev', 'area_crimes_6_hours_prev', 'area_crimes_12_hours_prev', 'area_crimes_24_hours_prev']]

##### Adding Disadvantaged Areas

In [12]:
crime_with_proximity

Unnamed: 0,id,date,type,lat,long,district,police_stations_distance_0.1,police_stations_distance_0.3,police_stations_distance_0.5,police_stations_distance_1,...,bike_rides_within_0.1_and_10_min,bike_rides_within_0.3_and_10_min,bike_rides_within_0.5_and_10_min,bike_rides_within_0.1_and_15_min,bike_rides_within_0.3_and_15_min,bike_rides_within_0.5_and_15_min,area_id,hour,day,date_hour
680,HZ262045,2016-01-01,OFFENSE INVOLVING CHILDREN,41.923311,-87.710436,14,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,22,0,2016-01-01,2016-01-01
742,HZ445076,2016-01-01,SEX OFFENSE,41.883660,-87.669368,12,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,28,0,2016-01-01,2016-01-01
741,JC492397,2016-01-01,OFFENSE INVOLVING CHILDREN,41.945919,-87.763575,16,0,0,0,0,...,0.0,1.0,1.0,0.0,2.0,2.0,15,0,2016-01-01,2016-01-01
740,HZ290771,2016-01-01,OFFENSE INVOLVING CHILDREN,41.951464,-87.675547,19,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,5,0,2016-01-01,2016-01-01
739,HZ157088,2016-01-01,SEX OFFENSE,41.777626,-87.611961,3,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,42,0,2016-01-01,2016-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259944,JE122096,2020-12-31,CRIMINAL DAMAGE,41.760603,-87.584783,3,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,43,0,2020-12-31,2020-12-31
1259943,JD474274,2020-12-31,MOTOR VEHICLE THEFT,41.784822,-87.618972,3,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,40,0,2020-12-31,2020-12-31
1259942,JE114345,2020-12-31,DECEPTIVE PRACTICE,41.867130,-87.720990,11,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,29,0,2020-12-31,2020-12-31
1259949,JD474657,2020-12-31,DECEPTIVE PRACTICE,42.013976,-87.812707,16,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,9,0,2020-12-31,2020-12-31


In [13]:
def parse_polygon(polygon_string):
    points = polygon_string.strip('POLYGON ((').strip('))').split(', ')
    points = [tuple(map(float, point.split())) for point in points]
    return Polygon(points)

In [14]:
def swap_coordinates(polygon):
    if polygon.is_empty:
        return polygon
    swapped_coords = [(y, x) for x, y in polygon.exterior.coords]
    return Polygon(swapped_coords)

In [15]:
clean_dis_areas['poly'] = clean_dis_areas['poly'].apply(parse_polygon)
clean_dis_areas['poly'] = clean_dis_areas['poly'].apply(swap_coordinates)
clean_dis_areas['id'] = clean_dis_areas.index
clean_dis_areas.head()

Unnamed: 0,poly,district,areas,id
0,POLYGON ((-87.57366299996758 41.70796600003142...,4,51,0
1,POLYGON ((-87.60151899995586 41.68632499999862...,5,50,1
2,POLYGON ((-87.60036199996543 41.69284299996604...,5,50,2
3,POLYGON ((-87.62032800000847 41.69942700001048...,5,49,3
4,POLYGON ((-87.66036699995497 41.75771799998948...,6,71,4


In [16]:
clean_dis_areas['poly'] = clean_dis_areas['poly'].apply(lambda x: Polygon(x))

In [17]:
dis_areas_to_areas = {}
for idx, row in clean_dis_areas.iterrows():
    if row['areas'] in dis_areas_to_areas.keys():
        dis_areas_to_areas[row['areas']].append((row['id'], row['poly']))
    else:
        dis_areas_to_areas[row['areas']] = [(row['id'], row['poly'])]

In [18]:
def determine_dis_area_for_crimes(df, perc):
    dis_areas = []
    perc_cnt = perc

    for i in range(len(df)):
        point = geometry.Point(df.loc[i, 'long'], df.loc[i, 'lat'])

        curr_dis_area = None
        if df.loc[i, 'area_id'] in dis_areas_to_areas:
            for (area, geom) in dis_areas_to_areas[df.loc[i, 'area_id']]:
                if (geom.contains(point)):
                    curr_dis_area = area 
                    break
        dis_areas.append(curr_dis_area)

        if i > 0 and i % (round(len(df) * (perc_cnt/100))) == 0:
            print(f"{perc_cnt}%- Row {i}/{len(df)} completed")
            perc_cnt += perc

    df['dis_area_id'] = dis_areas

    return df

In [19]:
crime_with_proximity = determine_dis_area_for_crimes(crime_with_proximity, 2)

2%- Row 25199/1259960 completed
4%- Row 50398/1259960 completed
6%- Row 75598/1259960 completed
8%- Row 100797/1259960 completed
10%- Row 125996/1259960 completed
12%- Row 151195/1259960 completed
14%- Row 176394/1259960 completed
16%- Row 201594/1259960 completed
18%- Row 226793/1259960 completed
20%- Row 251992/1259960 completed
22%- Row 277191/1259960 completed
24%- Row 302390/1259960 completed
26%- Row 327590/1259960 completed
28%- Row 352789/1259960 completed
30%- Row 377988/1259960 completed
32%- Row 403187/1259960 completed
34%- Row 428386/1259960 completed
36%- Row 453586/1259960 completed
38%- Row 478785/1259960 completed
40%- Row 503984/1259960 completed
42%- Row 529183/1259960 completed
44%- Row 554382/1259960 completed
46%- Row 579582/1259960 completed
48%- Row 604781/1259960 completed
50%- Row 629980/1259960 completed
52%- Row 655179/1259960 completed
54%- Row 680378/1259960 completed
56%- Row 705578/1259960 completed
58%- Row 730777/1259960 completed
60%- Row 755976/12599

In [20]:
dis_area_crimes_over_hours = crime_with_proximity.groupby(['dis_area_id', pd.Grouper(key='date', freq='h')])['id'].count().reset_index().rename(columns={'id': 'dis_area_crimes_this_hour'})
dis_area_crimes_over_hours

Unnamed: 0,dis_area_id,date,dis_area_crimes_this_hour
0,0.0,2016-01-01 00:00:00,1
1,0.0,2016-01-01 02:00:00,1
2,0.0,2016-01-03 17:00:00,1
3,0.0,2016-01-04 12:00:00,1
4,0.0,2016-01-04 14:00:00,1
...,...,...,...
531858,274.0,2020-12-13 00:00:00,1
531859,274.0,2020-12-13 09:00:00,2
531860,274.0,2020-12-13 16:00:00,1
531861,274.0,2020-12-23 13:00:00,1


In [21]:
crime_with_proximity_dis_area = crime_with_proximity.dropna(subset=['dis_area_id'], axis=0)

In [22]:
time_windows = [1, 3, 6, 12, 24]

for window in time_windows:
    dis_area_crimes_over_hours[f'dis_area_crimes_{window}_hours_prev'] = dis_area_crimes_over_hours.groupby('dis_area_id')['dis_area_crimes_this_hour'].rolling(window=window, min_periods=1).sum().shift(1).reset_index(level=0, drop=True)

dis_area_crimes_over_hours = pd.merge(left=crime_with_proximity_dis_area[['id', 'dis_area_id', 'date_hour', 'hour', 'day']], right=dis_area_crimes_over_hours, left_on=['dis_area_id', 'date_hour'], right_on=['dis_area_id', 'date'], how='left')

In [23]:
dis_area_crimes_over_hours = dis_area_crimes_over_hours[['id', 'dis_area_id', 'date_hour', 'dis_area_crimes_this_hour', 'dis_area_crimes_1_hours_prev', 'dis_area_crimes_3_hours_prev', 'dis_area_crimes_6_hours_prev', 'dis_area_crimes_12_hours_prev', 'dis_area_crimes_24_hours_prev']]

In [24]:
clean_bike_trips['date'] = pd.to_datetime(clean_bike_trips['date'])
clean_bike_trips['hour'] = clean_bike_trips['date'].dt.hour
clean_bike_trips['date'] = clean_bike_trips['date'].dt.date

In [25]:
grouped_bike_trips = clean_bike_trips.groupby(['date', 'hour', 'district'])['station_id'].agg('count').reset_index().rename(columns={'station_id':'hourly_bike_rides'})
grouped_bike_trips['date'] = pd.to_datetime(grouped_bike_trips['date'])

In [26]:
final_df = pd.merge(left=crime_with_proximity, right=grouped_bike_trips, on=['date','hour','district'], how='left').drop(['hour','day'], axis=1).fillna(0)

In [27]:
agg_public_healthindicator.columns = ['district_' + col if col != 'district' else 'district' for col in agg_public_healthindicator.columns ]
final_df = pd.merge(left=final_df, right=agg_public_healthindicator, on='district', how='left')

In [28]:
final_df = pd.merge(left=final_df, right=clean_police_districts[['district','disadvantaged_score']], on='district', how='left')

In [29]:
clean_train_ridership['date'] = pd.to_datetime(clean_train_ridership['date'])

In [30]:
grouped_train_ridership = clean_train_ridership.groupby(['date','district'])['rides'].agg('sum').reset_index()

In [31]:
final_df = pd.merge(left=final_df, right=grouped_train_ridership, on=['date', 'district'], how='left').fillna(0)

In [32]:
final_df = final_df.drop_duplicates(subset=['id'])
dis_area_crimes_over_hours = dis_area_crimes_over_hours.drop_duplicates(subset=['id'])

In [33]:
final_df = pd.merge(left=final_df, right=district_crimes_over_hours[['id','district_crimes_this_hour',
       'district_crimes_1_hours_prev', 'district_crimes_3_hours_prev',
       'district_crimes_6_hours_prev', 'district_crimes_12_hours_prev',
       'district_crimes_24_hours_prev']], on='id', how='inner')

In [34]:
final_df = pd.merge(left=final_df, right=area_crimes_over_hours[['id', 'area_crimes_this_hour',
       'area_crimes_1_hours_prev', 'area_crimes_3_hours_prev',
       'area_crimes_6_hours_prev', 'area_crimes_12_hours_prev',
       'area_crimes_24_hours_prev']], on='id', how='inner')

In [35]:
final_df = pd.merge(left=final_df, right=dis_area_crimes_over_hours[['id', 'dis_area_crimes_this_hour',
       'dis_area_crimes_1_hours_prev', 'dis_area_crimes_3_hours_prev',
       'dis_area_crimes_6_hours_prev', 'dis_area_crimes_12_hours_prev',
       'dis_area_crimes_24_hours_prev']], on='id', how='inner')

In [36]:
final_df.columns

Index(['id', 'date', 'type', 'lat', 'long', 'district',
       'police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
       'alleylights_distance_0.1', 'alleylights_distance_0.3',
       'alleylights_distance_0.5', 'alleylights_distance_1',
       'alleylights_distance_3', 'alleylights_

In [37]:
date_range = pd.date_range(start='2016-01-01 00:00:00', end='2020-12-31 23:00:00', freq='h')
areas = np.arange(1, 78)
area_base_df = pd.DataFrame([(area, date) for area in areas for date in date_range], columns=['area_id', 'date_hour'])
area_base_df['day'] = area_base_df['date_hour'].dt.day
area_base_df['hour'] = area_base_df['date_hour'].dt.hour

In [38]:
area_base_df

Unnamed: 0,area_id,date_hour,day,hour
0,1,2016-01-01 00:00:00,1,0
1,1,2016-01-01 01:00:00,1,1
2,1,2016-01-01 02:00:00,1,2
3,1,2016-01-01 03:00:00,1,3
4,1,2016-01-01 04:00:00,1,4
...,...,...,...,...
3376291,77,2020-12-31 19:00:00,31,19
3376292,77,2020-12-31 20:00:00,31,20
3376293,77,2020-12-31 21:00:00,31,21
3376294,77,2020-12-31 22:00:00,31,22


In [39]:
area_base_df['year'] = area_base_df['date_hour'].dt.year
area_base_df['month'] = area_base_df['date_hour'].dt.month
area_base_df['day'] = area_base_df['date_hour'].dt.day
area_base_df['day_of_week'] = area_base_df['date_hour'].dt.dayofweek
area_base_df

Unnamed: 0,area_id,date_hour,day,hour,year,month,day_of_week
0,1,2016-01-01 00:00:00,1,0,2016,1,4
1,1,2016-01-01 01:00:00,1,1,2016,1,4
2,1,2016-01-01 02:00:00,1,2,2016,1,4
3,1,2016-01-01 03:00:00,1,3,2016,1,4
4,1,2016-01-01 04:00:00,1,4,2016,1,4
...,...,...,...,...,...,...,...
3376291,77,2020-12-31 19:00:00,31,19,2020,12,3
3376292,77,2020-12-31 20:00:00,31,20,2020,12,3
3376293,77,2020-12-31 21:00:00,31,21,2020,12,3
3376294,77,2020-12-31 22:00:00,31,22,2020,12,3


In [40]:
clean_public_healthindicator = pd.read_csv('../../data/processed/clean_public_healthindicator.csv')
disadvantaged_areas_within_areas = pd.read_csv('../../data/processed/disadvantaged_areas_within_areas.csv')

In [41]:
clean_public_healthindicator

Unnamed: 0,id,unemployment,per_capita_income,no_hs_dip,gov_depend,crowded_housing,below_pov
0,1,0.075,23714,0.181,0.288,0.079,0.227
1,2,0.079,21375,0.196,0.383,0.070,0.151
2,3,0.077,32355,0.136,0.222,0.046,0.227
3,4,0.068,35503,0.125,0.256,0.031,0.095
4,5,0.045,51615,0.054,0.255,0.002,0.071
...,...,...,...,...,...,...,...
72,73,0.183,19709,0.156,0.424,0.011,0.157
73,74,0.069,34221,0.045,0.370,0.011,0.031
74,75,0.149,26185,0.109,0.394,0.008,0.137
75,76,0.047,29402,0.110,0.265,0.019,0.095


In [42]:
base_df_with_area_stats = area_base_df.merge(right=clean_public_healthindicator, left_on='area_id', right_on='id', how='left')
base_df_with_area_stats = base_df_with_area_stats.merge(right=disadvantaged_areas_within_areas, left_on='area_id', right_on='areas', how='left')

In [52]:
filtered_df = ['police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
       'alleylights_distance_0.1', 'alleylights_distance_0.3',
       'alleylights_distance_0.5', 'alleylights_distance_1',
       'alleylights_distance_3', 'alleylights_distance_5',
       'streetlights_allout_distance_0.1', 'streetlights_allout_distance_0.3',
       'streetlights_allout_distance_0.5', 'streetlights_allout_distance_1',
       'streetlights_allout_distance_3', 'streetlights_allout_distance_5',
       'streetlights_oneout_distance_0.1', 'streetlights_oneout_distance_0.3',
       'streetlights_oneout_distance_0.5', 'streetlights_oneout_distance_1',
       'streetlights_oneout_distance_3', 'streetlights_oneout_distance_5',
       'bike_rides_within_0.1_and_5_min', 'bike_rides_within_0.3_and_5_min',
       'bike_rides_within_0.5_and_5_min', 'bike_rides_within_0.1_and_10_min',
       'bike_rides_within_0.3_and_10_min', 'bike_rides_within_0.5_and_10_min',
       'bike_rides_within_0.1_and_15_min', 'bike_rides_within_0.3_and_15_min',
       'bike_rides_within_0.5_and_15_min',
       'hourly_bike_rides', 'district_unemployment',
       'district_per_capita_income', 'district_no_hs_dip',
       'district_gov_depend', 'district_crowded_housing', 'district_below_pov',
       'disadvantaged_score', 'rides', 'district_crimes_this_hour',
       'district_crimes_1_hours_prev', 'district_crimes_3_hours_prev',
       'district_crimes_6_hours_prev', 'district_crimes_12_hours_prev',
       'district_crimes_24_hours_prev', 'area_crimes_this_hour',
       'area_crimes_1_hours_prev', 'area_crimes_3_hours_prev',
       'area_crimes_6_hours_prev', 'area_crimes_12_hours_prev',
       'area_crimes_24_hours_prev', 'dis_area_crimes_this_hour',
       'dis_area_crimes_1_hours_prev', 'dis_area_crimes_3_hours_prev',
       'dis_area_crimes_6_hours_prev', 'dis_area_crimes_12_hours_prev',
       'dis_area_crimes_24_hours_prev']

In [44]:
final_df.columns

Index(['id', 'date', 'type', 'lat', 'long', 'district',
       'police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
       'alleylights_distance_0.1', 'alleylights_distance_0.3',
       'alleylights_distance_0.5', 'alleylights_distance_1',
       'alleylights_distance_3', 'alleylights_

In [45]:
final_df_area_avg = final_df.groupby(['area_id','date_hour'])[filtered_df].agg('mean').reset_index()
final_df_district_avg = final_df.groupby(['district','date_hour'])[filtered_df].agg('mean').reset_index()

In [46]:
base_df_with_area_stats.rename(columns={
    'unemployment': 'area_unemployment',
    'per_capita_income': 'area_per_capita_income',
    'no_hs_dip': 'area_no_hs_dip',
    'gov_depend': 'area_gov_depend',
    'crowded_housing': 'area_crowded_housing',
    'below_pov': 'area_below_pov'
}, inplace=True)

In [47]:
base_df_with_area_stats

Unnamed: 0,area_id,date_hour,day,hour,year,month,day_of_week,id,area_unemployment,area_per_capita_income,area_no_hs_dip,area_gov_depend,area_crowded_housing,area_below_pov,poly,district,areas
0,1,2016-01-01 00:00:00,1,0,2016,1,4,1,0.075,23714,0.181,0.288,0.079,0.227,POLYGON ((42.01939800001747 -87.66368000002285...,24.0,1.0
1,1,2016-01-01 00:00:00,1,0,2016,1,4,1,0.075,23714,0.181,0.288,0.079,0.227,POLYGON ((42.01937400002132 -87.67335799998406...,24.0,1.0
2,1,2016-01-01 00:00:00,1,0,2016,1,4,1,0.075,23714,0.181,0.288,0.079,0.227,POLYGON ((41.998169000012545 -87.6624109999806...,24.0,1.0
3,1,2016-01-01 00:00:00,1,0,2016,1,4,1,0.075,23714,0.181,0.288,0.079,0.227,POLYGON ((42.00497899998903 -87.66572099996036...,24.0,1.0
4,1,2016-01-01 01:00:00,1,1,2016,1,4,1,0.075,23714,0.181,0.288,0.079,0.227,POLYGON ((42.01939800001747 -87.66368000002285...,24.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13329787,77,2020-12-31 23:00:00,31,23,2020,12,3,77,0.090,33364,0.090,0.234,0.039,0.166,POLYGON ((41.983635999999635 -87.6593119999893...,20.0,77.0
13329788,77,2020-12-31 23:00:00,31,23,2020,12,3,77,0.090,33364,0.090,0.234,0.039,0.166,POLYGON ((41.98368300002446 -87.65660900000702...,20.0,77.0
13329789,77,2020-12-31 23:00:00,31,23,2020,12,3,77,0.090,33364,0.090,0.234,0.039,0.166,POLYGON ((41.998316999982876 -87.6550000000143...,24.0,77.0
13329790,77,2020-12-31 23:00:00,31,23,2020,12,3,77,0.090,33364,0.090,0.234,0.039,0.166,POLYGON ((41.99662699998114 -87.65029099995624...,24.0,77.0


##### Creating Disadvantaged Areas Base Df

In [48]:
date_range = pd.date_range(start='2016-01-01 00:00:00', end='2020-12-31 23:00:00', freq='h')
dis_areas = np.arange(1, 78)
dis_area_base_df = pd.DataFrame([(dis_area, date) for dis_area in dis_areas for date in date_range], columns=['dis_area_id', 'date_hour'])
dis_area_base_df['day'] = dis_area_base_df['date_hour'].dt.day
dis_area_base_df['hour'] = dis_area_base_df['date_hour'].dt.hour

In [49]:
dis_area_base_df

Unnamed: 0,dis_area_id,date_hour,day,hour
0,1,2016-01-01 00:00:00,1,0
1,1,2016-01-01 01:00:00,1,1
2,1,2016-01-01 02:00:00,1,2
3,1,2016-01-01 03:00:00,1,3
4,1,2016-01-01 04:00:00,1,4
...,...,...,...,...
3376291,77,2020-12-31 19:00:00,31,19
3376292,77,2020-12-31 20:00:00,31,20
3376293,77,2020-12-31 21:00:00,31,21
3376294,77,2020-12-31 22:00:00,31,22


In [50]:
dis_area_base_df['year'] = dis_area_base_df['date_hour'].dt.year
dis_area_base_df['month'] = dis_area_base_df['date_hour'].dt.month
dis_area_base_df['day'] = dis_area_base_df['date_hour'].dt.day
dis_area_base_df['day_of_week'] = dis_area_base_df['date_hour'].dt.dayofweek
dis_area_base_df

Unnamed: 0,dis_area_id,date_hour,day,hour,year,month,day_of_week
0,1,2016-01-01 00:00:00,1,0,2016,1,4
1,1,2016-01-01 01:00:00,1,1,2016,1,4
2,1,2016-01-01 02:00:00,1,2,2016,1,4
3,1,2016-01-01 03:00:00,1,3,2016,1,4
4,1,2016-01-01 04:00:00,1,4,2016,1,4
...,...,...,...,...,...,...,...
3376291,77,2020-12-31 19:00:00,31,19,2020,12,3
3376292,77,2020-12-31 20:00:00,31,20,2020,12,3
3376293,77,2020-12-31 21:00:00,31,21,2020,12,3
3376294,77,2020-12-31 22:00:00,31,22,2020,12,3


In [51]:
base_df_with_dis_area_stats = dis_area_base_df.merge(right=clean_public_healthindicator, left_on='dis_area_id', right_on='id', how='left')
base_df_with_dis_area_stats = base_df_with_dis_area_stats.merge(right=disadvantaged_areas_within_areas, left_on='dis_area_id', right_on='areas', how='left')

In [53]:
final_df_dis_area_avg = final_df.groupby(['dis_area_id','date_hour'])[filtered_df].agg('mean').reset_index()
final_df_district_avg = final_df.groupby(['district','date_hour'])[filtered_df].agg('mean').reset_index()

In [54]:
base_df_with_dis_area_stats.rename(columns={
    'unemployment': 'dis_area_unemployment',
    'per_capita_income': 'dis_area_per_capita_income',
    'no_hs_dip': 'dis_area_no_hs_dip',
    'gov_depend': 'dis_area_gov_depend',
    'crowded_housing': 'dis_area_crowded_housing',
    'below_pov': 'dis_area_below_pov'
}, inplace=True)

In [57]:
base_df_with_dis_area_stats.drop('poly', axis=1, inplace=True)

##### Creating Districts Base Df

In [58]:
districts = pd.read_csv('../../data/processed/clean_police_districts.csv')

In [59]:
date_range = pd.date_range(start='2016-01-01 00:00:00', end='2020-12-31 23:00:00', freq='h')
districts = list(set(districts['district'].values))
district_base_df = pd.DataFrame([(district, date) for district in districts for date in date_range], columns=['district', 'date_hour'])
district_base_df['day'] = district_base_df['date_hour'].dt.day
district_base_df['hour'] = district_base_df['date_hour'].dt.hour

In [60]:
district_base_df

Unnamed: 0,district,date_hour,day,hour
0,1,2016-01-01 00:00:00,1,0
1,1,2016-01-01 01:00:00,1,1
2,1,2016-01-01 02:00:00,1,2
3,1,2016-01-01 03:00:00,1,3
4,1,2016-01-01 04:00:00,1,4
...,...,...,...,...
1008499,31,2020-12-31 19:00:00,31,19
1008500,31,2020-12-31 20:00:00,31,20
1008501,31,2020-12-31 21:00:00,31,21
1008502,31,2020-12-31 22:00:00,31,22


In [61]:
district_base_df['year'] = district_base_df['date_hour'].dt.year
district_base_df['month'] = district_base_df['date_hour'].dt.month
district_base_df['day'] = district_base_df['date_hour'].dt.day
district_base_df['day_of_week'] = district_base_df['date_hour'].dt.dayofweek
district_base_df

Unnamed: 0,district,date_hour,day,hour,year,month,day_of_week
0,1,2016-01-01 00:00:00,1,0,2016,1,4
1,1,2016-01-01 01:00:00,1,1,2016,1,4
2,1,2016-01-01 02:00:00,1,2,2016,1,4
3,1,2016-01-01 03:00:00,1,3,2016,1,4
4,1,2016-01-01 04:00:00,1,4,2016,1,4
...,...,...,...,...,...,...,...
1008499,31,2020-12-31 19:00:00,31,19,2020,12,3
1008500,31,2020-12-31 20:00:00,31,20,2020,12,3
1008501,31,2020-12-31 21:00:00,31,21,2020,12,3
1008502,31,2020-12-31 22:00:00,31,22,2020,12,3


In [62]:
district_df = disadvantaged_areas_within_areas.groupby('district')['areas'].apply(set).reset_index()
district_df.columns = ['district', 'areas']
district_df

Unnamed: 0,district,areas
0,1,{35}
1,2,"{35, 36, 38, 39, 40, 41}"
2,3,"{40, 42, 43, 69}"
3,4,"{43, 46, 47, 48, 51, 52}"
4,5,"{49, 50, 53, 54}"
5,6,"{44, 69, 71}"
6,7,"{67, 68, 69}"
7,8,"{56, 66, 58}"
8,9,"{34, 37, 58, 59, 60, 61, 63}"
9,10,"{29, 30}"


In [63]:
base_df_with_district_stats = district_base_df.merge(right=district_df, on='district', how='left')
base_df_with_district_stats = base_df_with_district_stats.explode('areas')
base_df_with_district_stats = base_df_with_district_stats.merge(right=clean_public_healthindicator, left_on='areas', right_on='id', how='left')

In [64]:
base_df_with_district_stats

Unnamed: 0,district,date_hour,day,hour,year,month,day_of_week,areas,id,unemployment,per_capita_income,no_hs_dip,gov_depend,crowded_housing,below_pov
0,1,2016-01-01 00:00:00,1,0,2016,1,4,35,35.0,0.167,23098.0,0.169,0.31,0.016,0.261
1,1,2016-01-01 01:00:00,1,1,2016,1,4,35,35.0,0.167,23098.0,0.169,0.31,0.016,0.261
2,1,2016-01-01 02:00:00,1,2,2016,1,4,35,35.0,0.167,23098.0,0.169,0.31,0.016,0.261
3,1,2016-01-01 03:00:00,1,3,2016,1,4,35,35.0,0.167,23098.0,0.169,0.31,0.016,0.261
4,1,2016-01-01 04:00:00,1,4,2016,1,4,35,35.0,0.167,23098.0,0.169,0.31,0.016,0.261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3113203,31,2020-12-31 19:00:00,31,19,2020,12,3,,,,,,,,
3113204,31,2020-12-31 20:00:00,31,20,2020,12,3,,,,,,,,
3113205,31,2020-12-31 21:00:00,31,21,2020,12,3,,,,,,,,
3113206,31,2020-12-31 22:00:00,31,22,2020,12,3,,,,,,,,


In [65]:
base_df_with_district_stats = base_df_with_district_stats.groupby(['district', 'date_hour', 'day', 'hour', 'year', 'month', 'day_of_week'])[['unemployment', 'per_capita_income', 'no_hs_dip', 'gov_depend', 'crowded_housing', 'below_pov']].agg('mean').reset_index()
base_df_with_district_stats

Unnamed: 0,district,date_hour,day,hour,year,month,day_of_week,unemployment,per_capita_income,no_hs_dip,gov_depend,crowded_housing,below_pov
0,1,2016-01-01 00:00:00,1,0,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
1,1,2016-01-01 01:00:00,1,1,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
2,1,2016-01-01 02:00:00,1,2,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
3,1,2016-01-01 03:00:00,1,3,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
4,1,2016-01-01 04:00:00,1,4,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1008499,31,2020-12-31 19:00:00,31,19,2020,12,3,,,,,,
1008500,31,2020-12-31 20:00:00,31,20,2020,12,3,,,,,,
1008501,31,2020-12-31 21:00:00,31,21,2020,12,3,,,,,,
1008502,31,2020-12-31 22:00:00,31,22,2020,12,3,,,,,,


In [66]:
base_df_with_district_stats.rename(columns={
    'unemployment': 'area_unemployment',
    'per_capita_income': 'area_per_capita_income',
    'no_hs_dip': 'area_no_hs_dip',
    'gov_depend': 'area_gov_depend',
    'crowded_housing': 'area_crowded_housing',
    'below_pov': 'area_below_pov'
}, inplace=True)

In [67]:
base_df_with_district_stats

Unnamed: 0,district,date_hour,day,hour,year,month,day_of_week,area_unemployment,area_per_capita_income,area_no_hs_dip,area_gov_depend,area_crowded_housing,area_below_pov
0,1,2016-01-01 00:00:00,1,0,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
1,1,2016-01-01 01:00:00,1,1,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
2,1,2016-01-01 02:00:00,1,2,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
3,1,2016-01-01 03:00:00,1,3,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
4,1,2016-01-01 04:00:00,1,4,2016,1,4,0.167,23098.0,0.169,0.31,0.016,0.261
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1008499,31,2020-12-31 19:00:00,31,19,2020,12,3,,,,,,
1008500,31,2020-12-31 20:00:00,31,20,2020,12,3,,,,,,
1008501,31,2020-12-31 21:00:00,31,21,2020,12,3,,,,,,
1008502,31,2020-12-31 22:00:00,31,22,2020,12,3,,,,,,


In [70]:
area_final_df = pd.merge(base_df_with_area_stats, final_df_area_avg, on=['area_id','date_hour'], how='left')
dis_area_final_df = pd.merge(base_df_with_dis_area_stats, final_df_dis_area_avg, on=['dis_area_id','date_hour'], how='left')
district_final_df = pd.merge(base_df_with_district_stats, final_df_district_avg, on=['district','date_hour'], how='left')

In [71]:
area_final_df['temp_id'] = area_final_df.index
dis_area_final_df['temp_id'] = dis_area_final_df.index
district_final_df['temp_id'] = district_final_df.index

In [72]:
area_null_districts = area_final_df[area_final_df['district'].isna()][['temp_id', 'area_id']]
area_null_districts = area_null_districts.merge(clean_areas[['id', 'district']], left_on='area_id', right_on='id', how='inner')
area_final_df.loc[area_final_df['district'].isna(), 'district'] = area_final_df['temp_id'].map(area_null_districts.set_index('temp_id')['district'])

In [73]:
dis_area_null_districts = dis_area_final_df[dis_area_final_df['district'].isna()][['temp_id', 'dis_area_id']]
dis_area_null_districts = dis_area_null_districts.merge(clean_areas[['id', 'district']], left_on='dis_area_id', right_on='id', how='inner')
dis_area_final_df.loc[dis_area_final_df['district'].isna(), 'district'] = dis_area_final_df['temp_id'].map(dis_area_null_districts.set_index('temp_id')['district'])

In [76]:
area_final_df = area_final_df.drop(['id', 'poly', 'areas', 'temp_id', 'district_crimes_this_hour'], axis=1)
dis_area_final_df = dis_area_final_df.drop(['id', 'areas', 'temp_id', 'district_crimes_this_hour'], axis=1)
district_final_df = district_final_df.drop(['temp_id', 'area_crimes_this_hour'], axis=1)

In [77]:
area_final_df.columns

Index(['area_id', 'date_hour', 'day', 'hour', 'year', 'month', 'day_of_week',
       'area_unemployment', 'area_per_capita_income', 'area_no_hs_dip',
       'area_gov_depend', 'area_crowded_housing', 'area_below_pov', 'district',
       'police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_

In [78]:
dis_area_final_df.columns

Index(['dis_area_id', 'date_hour', 'day', 'hour', 'year', 'month',
       'day_of_week', 'dis_area_unemployment', 'dis_area_per_capita_income',
       'dis_area_no_hs_dip', 'dis_area_gov_depend', 'dis_area_crowded_housing',
       'dis_area_below_pov', 'district', 'police_stations_distance_0.1',
       'police_stations_distance_0.3', 'police_stations_distance_0.5',
       'police_stations_distance_1', 'police_stations_distance_3',
       'police_stations_distance_5', 'bike_stations_distance_0.1',
       'bike_stations_distance_0.3', 'bike_stations_distance_0.5',
       'bike_stations_distance_1', 'bike_stations_distance_3',
       'bike_stations_distance_5', 'bus_stops_distance_0.1',
       'bus_stops_distance_0.3', 'bus_stops_distance_0.5',
       'bus_stops_distance_1', 'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3

In [79]:
district_final_df.columns

Index(['district', 'date_hour', 'day', 'hour', 'year', 'month', 'day_of_week',
       'area_unemployment', 'area_per_capita_income', 'area_no_hs_dip',
       'area_gov_depend', 'area_crowded_housing', 'area_below_pov',
       'police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
       

##### Save Pre-Normalized Datasets

In [80]:
area_final_df.fillna(0, inplace=True)
dis_area_final_df.fillna(0, inplace=True)
district_final_df.fillna(0, inplace=True)

In [81]:
area_final_df.to_csv('../../data/pre_training/area_pre_feature_selection_nonnormalized.csv', index=False)
dis_area_final_df.to_csv('../../data/pre_training/dis_area_pre_feature_selection_nonnormalized.csv', index=False)
district_final_df.to_csv('../../data/pre_training/district_pre_feature_selection_nonnormalized.csv', index=False)

#### Normalize Columns

In [48]:
area_columns_to_normalize = [
    'area_unemployment', 'area_per_capita_income', 'area_no_hs_dip',
    'area_gov_depend', 'area_crowded_housing', 'area_below_pov',
    'police_stations_distance_0.1', 'police_stations_distance_0.3',
    'police_stations_distance_0.5', 'police_stations_distance_1',
    'police_stations_distance_3', 'police_stations_distance_5',
    'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
    'bike_stations_distance_0.5', 'bike_stations_distance_1',
    'bike_stations_distance_3', 'bike_stations_distance_5',
    'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
    'bus_stops_distance_0.5', 'bus_stops_distance_1',
    'bus_stops_distance_3', 'bus_stops_distance_5',
    'train_stations_distance_0.1', 'train_stations_distance_0.3',
    'train_stations_distance_0.5', 'train_stations_distance_1',
    'train_stations_distance_3', 'train_stations_distance_5',
    'alleylights_distance_0.1', 'alleylights_distance_0.3',
    'alleylights_distance_0.5', 'alleylights_distance_1',
    'alleylights_distance_3', 'alleylights_distance_5',
    'streetlights_allout_distance_0.1', 'streetlights_allout_distance_0.3',
    'streetlights_allout_distance_0.5', 'streetlights_allout_distance_1',
    'streetlights_allout_distance_3', 'streetlights_allout_distance_5',
    'streetlights_oneout_distance_0.1', 'streetlights_oneout_distance_0.3',
    'streetlights_oneout_distance_0.5', 'streetlights_oneout_distance_1',
    'streetlights_oneout_distance_3', 'streetlights_oneout_distance_5',
    'bike_rides_within_0.1_and_5_min', 'bike_rides_within_0.3_and_5_min',
    'bike_rides_within_0.5_and_5_min', 'bike_rides_within_0.1_and_10_min',
    'bike_rides_within_0.3_and_10_min', 'bike_rides_within_0.5_and_10_min',
    'bike_rides_within_0.1_and_15_min', 'bike_rides_within_0.3_and_15_min',
    'bike_rides_within_0.5_and_15_min', 'hourly_bike_rides',
    'district_unemployment', 'district_per_capita_income',
    'district_no_hs_dip', 'district_gov_depend', 'district_crowded_housing',
    'district_below_pov', 'disadvantaged_score', 'rides', 'district_crimes_1_hours_prev',
    'district_crimes_3_hours_prev', 'district_crimes_6_hours_prev',
    'district_crimes_12_hours_prev', 'district_crimes_24_hours_prev',
    'area_crimes_this_hour', 'area_crimes_1_hours_prev',
    'area_crimes_3_hours_prev', 'area_crimes_6_hours_prev',
    'area_crimes_12_hours_prev', 'area_crimes_24_hours_prev'
]

In [49]:
district_columns_to_normalize = area_columns_to_normalize.copy()
district_columns_to_normalize.remove('area_crimes_this_hour') 
district_columns_to_normalize.append('district_crimes_this_hour')

In [50]:
scaler = MinMaxScaler()
area_final_df[area_columns_to_normalize] = scaler.fit_transform(area_final_df[area_columns_to_normalize])
district_final_df[district_columns_to_normalize] = scaler.fit_transform(district_final_df[district_columns_to_normalize])

In [None]:
print([f'{col}: {area_final_df[col].isna().sum()}' for col in area_final_df.columns])

In [None]:
print([f'{col}: {district_final_df[col].isna().sum()}' for col in district_final_df.columns])

In [53]:
area_final_df.fillna(0, inplace=True)
district_final_df.fillna(0, inplace=True)

In [55]:
area_final_df.to_csv('../../data/pre_training/area_pre_feature_selection.csv', index=False)
district_final_df.to_csv('../../data/pre_training/district_pre_feature_selection.csv', index=False)