In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler



In [2]:
crime_with_proximity = pd.read_csv('../../data/pre_training/crime_with_proximity.csv')

In [3]:
clean_bike_trips = pd.read_csv('../../data/processed/clean_bike_trips.csv')
agg_public_healthindicator = pd.read_csv('../../data/processed/agg_public_healthindicator.csv')
clean_police_districts = pd.read_csv('../../data/processed/clean_police_districts.csv')
clean_train_ridership = pd.read_csv('../../data/processed/clean_train_ridership.csv')
clean_areas = pd.read_csv('../../data/processed/clean_areas.csv')

In [4]:
crime_with_proximity['date'] = pd.to_datetime(crime_with_proximity['date'])
crime_with_proximity['hour'] = crime_with_proximity['date'].dt.hour
crime_with_proximity['day'] = crime_with_proximity['date'].dt.date
crime_with_proximity['date_hour'] = crime_with_proximity['date'].dt.floor('h')
crime_with_proximity.sort_values('date', inplace=True)

In [5]:
crimes_over_hours = crime_with_proximity.groupby(['district', pd.Grouper(key='date', freq='h')])['id'].count().reset_index().rename(columns={'id': 'district_crimes_this_hour'})
crimes_over_hours

Unnamed: 0,district,date,district_crimes_this_hour
0,1,2016-01-01 00:00:00,7
1,1,2016-01-01 01:00:00,1
2,1,2016-01-01 02:00:00,3
3,1,2016-01-01 03:00:00,2
4,1,2016-01-01 04:00:00,2
...,...,...,...
638876,31,2020-12-15 21:00:00,1
638877,31,2020-12-20 09:00:00,1
638878,31,2020-12-24 19:00:00,1
638879,31,2020-12-25 03:00:00,1


In [6]:
time_windows = [1, 3, 6, 12, 24]

for window in time_windows:
    crimes_over_hours[f'crimes_{window}_hours_prev'] = crimes_over_hours.groupby('district')['district_crimes_this_hour'].rolling(window=window, min_periods=1).sum().shift(1).reset_index(level=0, drop=True)

crimes_over_hours = pd.merge(left=crime_with_proximity[['id', 'district', 'date_hour', 'hour', 'day']], right=crimes_over_hours, left_on=['district', 'date_hour'], right_on=['district', 'date'], how='left')

In [7]:
crimes_over_hours = crimes_over_hours[['id', 'district_crimes_this_hour', 'crimes_1_hours_prev', 'crimes_3_hours_prev', 'crimes_6_hours_prev', 'crimes_12_hours_prev', 'crimes_24_hours_prev']]

In [8]:
clean_bike_trips['date'] = pd.to_datetime(clean_bike_trips['date'])
clean_bike_trips['hour'] = clean_bike_trips['date'].dt.hour
clean_bike_trips['date'] = clean_bike_trips['date'].dt.date

In [9]:
grouped_bike_trips = clean_bike_trips.groupby(['date', 'hour', 'district'])['station_id'].agg('count').reset_index().rename(columns={'station_id':'hourly_bike_rides'})
grouped_bike_trips['date'] = pd.to_datetime(grouped_bike_trips['date'])

In [10]:
final_df = pd.merge(left=crime_with_proximity, right=grouped_bike_trips, on=['date','hour','district'], how='left').drop(['hour','day'], axis=1).fillna(0)

In [11]:
agg_public_healthindicator.columns = ['district_' + col if col != 'district' else 'district' for col in agg_public_healthindicator.columns ]
final_df = pd.merge(left=final_df, right=agg_public_healthindicator, on='district', how='left')

In [12]:
final_df = pd.merge(left=final_df, right=clean_police_districts[['district','disadvantaged_score']], on='district', how='left')

In [13]:
clean_train_ridership['date'] = pd.to_datetime(clean_train_ridership['date'])

In [14]:
grouped_train_ridership = clean_train_ridership.groupby(['date','district'])['rides'].agg('sum').reset_index()

In [15]:
final_df = pd.merge(left=final_df, right=grouped_train_ridership, on=['date', 'district'], how='left').fillna(0)

In [16]:
final_df = pd.merge(left=final_df, right=crimes_over_hours, on='id', how='inner')

In [17]:
final_df.columns

Index(['id', 'date', 'type', 'lat', 'long', 'district',
       'police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
       'alleylights_distance_0.1', 'alleylights_distance_0.3',
       'alleylights_distance_0.5', 'alleylights_distance_1',
       'alleylights_distance_3', 'alleylights_

In [18]:
clean_crime = pd.read_csv('../../data/processed/clean_crime.csv')

In [19]:
clean_crime = clean_crime[['id','areas']]

In [20]:
final_df = final_df.merge(clean_crime, on='id',how='inner')

In [21]:
final_df = final_df.rename(columns={'areas':'area_id'})

In [22]:
date_range = pd.date_range(start='2016-01-01 00:00:00', end='2020-12-31 23:00:00', freq='h')
areas = np.arange(1, 78)
base_df = pd.DataFrame([(area, date) for area in areas for date in date_range], columns=['area_id', 'date_hour'])
base_df['day'] = base_df['date_hour'].dt.day
base_df['hour'] = base_df['date_hour'].dt.hour

In [23]:
base_df

Unnamed: 0,area_id,date_hour,day,hour
0,1,2016-01-01 00:00:00,1,0
1,1,2016-01-01 01:00:00,1,1
2,1,2016-01-01 02:00:00,1,2
3,1,2016-01-01 03:00:00,1,3
4,1,2016-01-01 04:00:00,1,4
...,...,...,...,...
3376291,77,2020-12-31 19:00:00,31,19
3376292,77,2020-12-31 20:00:00,31,20
3376293,77,2020-12-31 21:00:00,31,21
3376294,77,2020-12-31 22:00:00,31,22


In [24]:
base_df['year'] = base_df['date_hour'].dt.year
base_df['month'] = base_df['date_hour'].dt.month
base_df['day'] = base_df['date_hour'].dt.day
base_df['day_of_week'] = base_df['date_hour'].dt.dayofweek
base_df

Unnamed: 0,area_id,date_hour,day,hour,year,month,day_of_week
0,1,2016-01-01 00:00:00,1,0,2016,1,4
1,1,2016-01-01 01:00:00,1,1,2016,1,4
2,1,2016-01-01 02:00:00,1,2,2016,1,4
3,1,2016-01-01 03:00:00,1,3,2016,1,4
4,1,2016-01-01 04:00:00,1,4,2016,1,4
...,...,...,...,...,...,...,...
3376291,77,2020-12-31 19:00:00,31,19,2020,12,3
3376292,77,2020-12-31 20:00:00,31,20,2020,12,3
3376293,77,2020-12-31 21:00:00,31,21,2020,12,3
3376294,77,2020-12-31 22:00:00,31,22,2020,12,3


In [25]:
clean_public_healthindicator = pd.read_csv('../../data/processed/clean_public_healthindicator.csv')
disadvantaged_areas_within_areas = pd.read_csv('../../data/processed/disadvantaged_areas_within_areas.csv')

In [26]:
clean_public_healthindicator

Unnamed: 0,id,unemployment,per_capita_income,no_hs_dip,gov_depend,crowded_housing,below_pov
0,1,0.075,23714,0.181,0.288,0.079,0.227
1,2,0.079,21375,0.196,0.383,0.070,0.151
2,3,0.077,32355,0.136,0.222,0.046,0.227
3,4,0.068,35503,0.125,0.256,0.031,0.095
4,5,0.045,51615,0.054,0.255,0.002,0.071
...,...,...,...,...,...,...,...
72,73,0.183,19709,0.156,0.424,0.011,0.157
73,74,0.069,34221,0.045,0.370,0.011,0.031
74,75,0.149,26185,0.109,0.394,0.008,0.137
75,76,0.047,29402,0.110,0.265,0.019,0.095


In [27]:
base_df_with_area_stats = base_df.merge(right=clean_public_healthindicator, left_on='area_id', right_on='id', how='left')
base_df_with_area_stats = base_df_with_area_stats.merge(right=disadvantaged_areas_within_areas, left_on='area_id', right_on='areas', how='left')

In [28]:
filtered_df = ['police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
       'alleylights_distance_0.1', 'alleylights_distance_0.3',
       'alleylights_distance_0.5', 'alleylights_distance_1',
       'alleylights_distance_3', 'alleylights_distance_5',
       'streetlights_allout_distance_0.1', 'streetlights_allout_distance_0.3',
       'streetlights_allout_distance_0.5', 'streetlights_allout_distance_1',
       'streetlights_allout_distance_3', 'streetlights_allout_distance_5',
       'streetlights_oneout_distance_0.1', 'streetlights_oneout_distance_0.3',
       'streetlights_oneout_distance_0.5', 'streetlights_oneout_distance_1',
       'streetlights_oneout_distance_3', 'streetlights_oneout_distance_5',
       'hourly_bike_rides', 'district_unemployment', 'district_per_capita_income',
       'district_no_hs_dip', 'district_gov_depend', 'district_crowded_housing', 
       'district_below_pov', 'district_crimes_this_hour', 'crimes_1_hours_prev', 'crimes_3_hours_prev',
       'crimes_6_hours_prev', 'crimes_12_hours_prev', 'crimes_24_hours_prev']

In [29]:
crime_counts = final_df.groupby(['area_id', 'date_hour'])['district'].agg('count').reset_index().rename(columns={'district':'area_crimes_this_hour'})
final_df_area_avg = final_df.groupby(['area_id','date_hour'])[filtered_df].agg('mean').reset_index()
final_df = pd.merge(crime_counts, final_df_area_avg, on=['area_id','date_hour'], how='inner')

In [30]:
crime_counts

Unnamed: 0,area_id,date_hour,area_crimes_this_hour
0,1,2016-01-01 00:00:00,2
1,1,2016-01-01 01:00:00,2
2,1,2016-01-01 03:00:00,2
3,1,2016-01-01 12:00:00,1
4,1,2016-01-02 00:00:00,1
...,...,...,...
897943,77,2020-12-29 10:00:00,2
897944,77,2020-12-29 11:00:00,1
897945,77,2020-12-30 17:00:00,1
897946,77,2020-12-30 19:00:00,1


In [31]:
final_df_area_avg

Unnamed: 0,area_id,date_hour,police_stations_distance_0.1,police_stations_distance_0.3,police_stations_distance_0.5,police_stations_distance_1,police_stations_distance_3,police_stations_distance_5,bike_stations_distance_0.1,bike_stations_distance_0.3,...,district_no_hs_dip,district_gov_depend,district_crowded_housing,district_below_pov,district_crimes_this_hour,crimes_1_hours_prev,crimes_3_hours_prev,crimes_6_hours_prev,crimes_12_hours_prev,crimes_24_hours_prev
0,1,2016-01-01 00:00:00,0.0,0.0,0.5,0.5,2.0,3.5,0.0,1.5,...,0.1885,0.3355,0.0745,0.1890,4.0,1.0,5.0,16.0,23.0,37.0
1,1,2016-01-01 01:00:00,0.0,0.0,0.0,0.5,1.5,3.0,0.0,2.0,...,0.1885,0.3355,0.0745,0.1890,2.0,4.0,4.0,4.0,4.0,4.0
2,1,2016-01-01 03:00:00,0.0,0.5,1.0,1.0,2.0,3.5,0.5,1.5,...,0.1885,0.3355,0.0745,0.1890,2.0,1.0,7.0,7.0,7.0,7.0
3,1,2016-01-01 12:00:00,0.0,0.0,0.0,0.0,1.0,2.0,0.0,2.0,...,0.1885,0.3355,0.0745,0.1890,1.0,1.0,4.0,10.0,10.0,10.0
4,1,2016-01-02 00:00:00,0.0,0.0,0.0,1.0,2.0,3.0,0.0,2.0,...,0.1885,0.3355,0.0745,0.1890,1.0,1.0,3.0,7.0,13.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
897943,77,2020-12-29 10:00:00,0.0,0.0,0.0,0.0,3.0,4.5,0.0,1.5,...,0.1075,0.2450,0.0350,0.1305,2.0,1.0,3.0,6.0,14.0,33.0
897944,77,2020-12-29 11:00:00,0.0,1.0,1.0,1.0,2.0,4.0,0.0,1.0,...,0.1885,0.3355,0.0745,0.1890,2.0,2.0,4.0,7.0,16.0,36.0
897945,77,2020-12-30 17:00:00,0.0,0.0,0.0,0.0,3.0,3.0,0.0,2.0,...,0.1075,0.2450,0.0350,0.1305,3.0,2.0,5.0,8.0,15.0,29.0
897946,77,2020-12-30 19:00:00,0.0,0.0,0.0,0.0,3.0,4.0,0.0,1.0,...,0.1075,0.2450,0.0350,0.1305,1.0,3.0,6.0,10.0,17.0,31.0


In [32]:
final_df = pd.merge(base_df_with_area_stats, final_df, on=['area_id','date_hour'], how='left')

In [33]:
final_df['temp_id'] = final_df.index

In [34]:
null_districts = final_df[final_df['district'].isna()][['temp_id', 'area_id']]
null_districts = null_districts.merge(clean_areas[['id', 'district']], left_on='area_id', right_on='id', how='inner')
final_df.loc[final_df['district'].isna(), 'district'] = final_df['temp_id'].map(null_districts.set_index('temp_id')['district'])

In [35]:
final_df = final_df.drop(['id', 'poly', 'areas', 'temp_id'], axis=1)

In [36]:
final_df.columns

Index(['area_id', 'date_hour', 'day', 'hour', 'year', 'month', 'day_of_week',
       'unemployment', 'per_capita_income', 'no_hs_dip', 'gov_depend',
       'crowded_housing', 'below_pov', 'district', 'area_crimes_this_hour',
       'police_stations_distance_0.1', 'police_stations_distance_0.3',
       'police_stations_distance_0.5', 'police_stations_distance_1',
       'police_stations_distance_3', 'police_stations_distance_5',
       'bike_stations_distance_0.1', 'bike_stations_distance_0.3',
       'bike_stations_distance_0.5', 'bike_stations_distance_1',
       'bike_stations_distance_3', 'bike_stations_distance_5',
       'bus_stops_distance_0.1', 'bus_stops_distance_0.3',
       'bus_stops_distance_0.5', 'bus_stops_distance_1',
       'bus_stops_distance_3', 'bus_stops_distance_5',
       'train_stations_distance_0.1', 'train_stations_distance_0.3',
       'train_stations_distance_0.5', 'train_stations_distance_1',
       'train_stations_distance_3', 'train_stations_distance_5',
 

#### Normalize Columns

In [37]:
columns_to_normalize = [
    'unemployment', 'per_capita_income', 'no_hs_dip', 'gov_depend',
    'crowded_housing', 'below_pov','area_crimes_this_hour', 'police_stations_distance_0.1',
    'police_stations_distance_0.3', 'police_stations_distance_0.5',
    'police_stations_distance_1', 'police_stations_distance_3',
    'police_stations_distance_5', 'bike_stations_distance_0.1',
    'bike_stations_distance_0.3', 'bike_stations_distance_0.5',
    'bike_stations_distance_1', 'bike_stations_distance_3',
    'bike_stations_distance_5', 'bus_stops_distance_0.1',
    'bus_stops_distance_0.3', 'bus_stops_distance_0.5',
    'bus_stops_distance_1', 'bus_stops_distance_3', 'bus_stops_distance_5',
    'train_stations_distance_0.1', 'train_stations_distance_0.3',
    'train_stations_distance_0.5', 'train_stations_distance_1',
    'train_stations_distance_3', 'train_stations_distance_5',
    'alleylights_distance_0.1', 'alleylights_distance_0.3',
    'alleylights_distance_0.5', 'alleylights_distance_1',
    'alleylights_distance_3', 'alleylights_distance_5',
    'streetlights_allout_distance_0.1', 'streetlights_allout_distance_0.3',
    'streetlights_allout_distance_0.5', 'streetlights_allout_distance_1',
    'streetlights_allout_distance_3', 'streetlights_allout_distance_5',
    'streetlights_oneout_distance_0.1', 'streetlights_oneout_distance_0.3',
    'streetlights_oneout_distance_0.5', 'streetlights_oneout_distance_1',
    'streetlights_oneout_distance_3', 'streetlights_oneout_distance_5',
    'hourly_bike_rides', 'district_unemployment', 'district_per_capita_income', 
    'district_no_hs_dip', 'district_gov_depend', 'district_crowded_housing', 
    'district_below_pov', 'district_crimes_this_hour', 'crimes_1_hours_prev',
    'crimes_3_hours_prev', 'crimes_6_hours_prev', 'crimes_12_hours_prev',
    'crimes_24_hours_prev'
    ]

In [38]:
scaler = MinMaxScaler()
final_df[columns_to_normalize] = scaler.fit_transform(final_df[columns_to_normalize])

In [39]:
clean_areas

Unnamed: 0,id,poly,district
0,35,POLYGON ((41.84469250265398 -87.60914087617894...,2
1,36,POLYGON ((41.81692934626684 -87.59215283879394...,2
2,37,POLYGON ((41.80189303368919 -87.62879823733725...,9
3,38,"POLYGON ((41.81681377057218 -87.6067081256125,...",2
4,39,POLYGON ((41.81692934626684 -87.59215283879394...,2
...,...,...,...
72,74,POLYGON ((41.70714491233857 -87.69645961375822...,22
73,75,POLYGON ((41.685082119670845 -87.6421520465139...,22
74,76,POLYGON ((41.986396111591276 -87.8365808787436...,16
75,77,POLYGON ((41.99816614970252 -87.65455590025104...,20


In [40]:
final_df

Unnamed: 0,area_id,date_hour,day,hour,year,month,day_of_week,unemployment,per_capita_income,no_hs_dip,...,district_no_hs_dip,district_gov_depend,district_crowded_housing,district_below_pov,district_crimes_this_hour,crimes_1_hours_prev,crimes_3_hours_prev,crimes_6_hours_prev,crimes_12_hours_prev,crimes_24_hours_prev
0,1,2016-01-01 00:00:00,1,0,2016,1,4,0.092179,0.193048,0.272401,...,0.369307,0.59824,0.603701,0.376315,0.048387,0.000000,0.018405,0.064220,0.088608,0.132576
1,1,2016-01-01 00:00:00,1,0,2016,1,4,0.092179,0.193048,0.272401,...,0.369307,0.59824,0.603701,0.376315,0.048387,0.000000,0.018405,0.064220,0.088608,0.132576
2,1,2016-01-01 00:00:00,1,0,2016,1,4,0.092179,0.193048,0.272401,...,0.369307,0.59824,0.603701,0.376315,0.048387,0.000000,0.018405,0.064220,0.088608,0.132576
3,1,2016-01-01 00:00:00,1,0,2016,1,4,0.092179,0.193048,0.272401,...,0.369307,0.59824,0.603701,0.376315,0.048387,0.000000,0.018405,0.064220,0.088608,0.132576
4,1,2016-01-01 01:00:00,1,1,2016,1,4,0.092179,0.193048,0.272401,...,0.369307,0.59824,0.603701,0.376315,0.016129,0.048387,0.012270,0.009174,0.008439,0.007576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13329787,77,2020-12-31 23:00:00,31,23,2020,12,3,0.134078,0.315778,0.109319,...,,,,,,,,,,
13329788,77,2020-12-31 23:00:00,31,23,2020,12,3,0.134078,0.315778,0.109319,...,,,,,,,,,,
13329789,77,2020-12-31 23:00:00,31,23,2020,12,3,0.134078,0.315778,0.109319,...,,,,,,,,,,
13329790,77,2020-12-31 23:00:00,31,23,2020,12,3,0.134078,0.315778,0.109319,...,,,,,,,,,,


In [41]:
print([f'{col}: {final_df[col].isna().sum()}' for col in final_df.columns])

['area_id: 0', 'date_hour: 0', 'day: 0', 'hour: 0', 'year: 0', 'month: 0', 'day_of_week: 0', 'unemployment: 0', 'per_capita_income: 0', 'no_hs_dip: 0', 'gov_depend: 0', 'crowded_housing: 0', 'below_pov: 0', 'district: 0', 'area_crimes_this_hour: 8056753', 'police_stations_distance_0.1: 8056753', 'police_stations_distance_0.3: 8056753', 'police_stations_distance_0.5: 8056753', 'police_stations_distance_1: 8056753', 'police_stations_distance_3: 8056753', 'police_stations_distance_5: 8056753', 'bike_stations_distance_0.1: 8056753', 'bike_stations_distance_0.3: 8056753', 'bike_stations_distance_0.5: 8056753', 'bike_stations_distance_1: 8056753', 'bike_stations_distance_3: 8056753', 'bike_stations_distance_5: 8056753', 'bus_stops_distance_0.1: 8056753', 'bus_stops_distance_0.3: 8056753', 'bus_stops_distance_0.5: 8056753', 'bus_stops_distance_1: 8056753', 'bus_stops_distance_3: 8056753', 'bus_stops_distance_5: 8056753', 'train_stations_distance_0.1: 8056753', 'train_stations_distance_0.3: 80

In [45]:
final_df.fillna(0, inplace=True)

In [46]:
final_df

Unnamed: 0,area_id,date_hour,day,hour,year,month,day_of_week,unemployment,per_capita_income,no_hs_dip,...,district_no_hs_dip,district_gov_depend,district_crowded_housing,district_below_pov,district_crimes_this_hour,crimes_1_hours_prev,crimes_3_hours_prev,crimes_6_hours_prev,crimes_12_hours_prev,crimes_24_hours_prev
0,1,2016-01-01 00:00:00,1,0,2016,1,4,0.092179,0.193048,0.272401,...,0.369307,0.59824,0.603701,0.376315,0.048387,0.000000,0.018405,0.064220,0.088608,0.132576
1,1,2016-01-01 00:00:00,1,0,2016,1,4,0.092179,0.193048,0.272401,...,0.369307,0.59824,0.603701,0.376315,0.048387,0.000000,0.018405,0.064220,0.088608,0.132576
2,1,2016-01-01 00:00:00,1,0,2016,1,4,0.092179,0.193048,0.272401,...,0.369307,0.59824,0.603701,0.376315,0.048387,0.000000,0.018405,0.064220,0.088608,0.132576
3,1,2016-01-01 00:00:00,1,0,2016,1,4,0.092179,0.193048,0.272401,...,0.369307,0.59824,0.603701,0.376315,0.048387,0.000000,0.018405,0.064220,0.088608,0.132576
4,1,2016-01-01 01:00:00,1,1,2016,1,4,0.092179,0.193048,0.272401,...,0.369307,0.59824,0.603701,0.376315,0.016129,0.048387,0.012270,0.009174,0.008439,0.007576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13329787,77,2020-12-31 23:00:00,31,23,2020,12,3,0.134078,0.315778,0.109319,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
13329788,77,2020-12-31 23:00:00,31,23,2020,12,3,0.134078,0.315778,0.109319,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
13329789,77,2020-12-31 23:00:00,31,23,2020,12,3,0.134078,0.315778,0.109319,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
13329790,77,2020-12-31 23:00:00,31,23,2020,12,3,0.134078,0.315778,0.109319,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [47]:
final_df.to_csv('../../data/pre_training/pre_feature_selection.csv', index=False)