In [1]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
sns.set(style="white", color_codes=True)
sns.set_context(rc={'font.family': 'sans', 'font.size': 24, 'axes.titlesize':24, 'axes.labelsize':24})
#if you want to know current working dir
os.chdir('../..')

from src.utils import *
from src.utility import merge_speed_events
import src.data as data
import src.utility as utils
from src.preprocessing.other_features import avg_speed_for_roadtype_event

%matplotlib inline

In [2]:
X_df = data.base_dataset()
speeds = data.speeds()

caching base dataset train
caching resources/dataset/preprocessed/speeds_train_imputed_time.csv.gz


In [3]:
X_df.describe()

Unnamed: 0,KEY,KM,event_index,EMERGENCY_LANE,LANES,ROAD_TYPE,EVENT_DETAIL,SPEED_AVG_-10,SPEED_AVG_-9,SPEED_AVG_-8,...,DISTANCE_-10,DISTANCE_-9,DISTANCE_-8,DISTANCE_-7,DISTANCE_-6,DISTANCE_-5,DISTANCE_-4,DISTANCE_-3,DISTANCE_-2,DISTANCE_-1
count,33202.0,33202.0,33202.0,33202.0,33202.0,33202.0,33199.0,26794.0,27305.0,27815.0,...,25991.0,26491.0,26986.0,27547.0,28128.0,28767.0,29406.0,30089.0,30845.0,31626.0
mean,155.785043,702.451208,5321998.0,0.254593,1.744021,0.438166,16.262809,179.665845,179.557271,179.478004,...,53.073949,53.21781,53.379901,53.544887,53.729487,53.930128,54.051792,54.13048,54.278716,54.323942
std,162.450088,401.71839,3129451.0,0.435639,0.781773,0.617349,19.426577,34.762425,34.787738,34.823604,...,56.441309,56.441159,56.507987,56.562771,56.63247,56.676593,56.70485,56.627971,56.641991,56.60313
min,0.0,17.0,916.0,0.0,1.0,0.0,0.0,65.529,65.5912,65.5912,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
25%,16.0,476.0,2710456.0,0.0,1.0,0.0,1.0,159.84895,159.6345,159.4804,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
50%,88.0,588.0,5402846.0,0.0,2.0,0.0,5.0,186.1446,186.0224,185.8717,...,36.0,36.0,36.0,36.0,36.0,36.0,36.0,37.0,37.0,37.0
75%,260.0,974.0,8133522.0,1.0,2.0,1.0,24.0,204.96985,204.9321,204.88665,...,66.0,66.0,67.0,67.0,69.0,69.0,70.0,70.0,70.0,70.0
max,578.0,1983.0,10822460.0,1.0,3.0,2.0,85.0,290.3575,290.3575,290.3575,...,279.0,279.0,279.0,279.0,279.0,286.0,286.0,286.0,286.0,286.0


In [4]:
columns = [col for col in X_df]
# columns = [col for col in X_df if not col.startswith('WEATHER')]
# columns = [col for col in columns if not col.startswith('EVENT_TYPE')]
columns

['KEY',
 'KM',
 'event_index',
 'EMERGENCY_LANE',
 'LANES',
 'ROAD_TYPE',
 'EVENT_DETAIL',
 'EVENT_TYPE',
 'DATETIME_UTC_-10',
 'DATETIME_UTC_-9',
 'DATETIME_UTC_-8',
 'DATETIME_UTC_-7',
 'DATETIME_UTC_-6',
 'DATETIME_UTC_-5',
 'DATETIME_UTC_-4',
 'DATETIME_UTC_-3',
 'DATETIME_UTC_-2',
 'DATETIME_UTC_-1',
 'DATETIME_UTC_y_0',
 'DATETIME_UTC_y_1',
 'DATETIME_UTC_y_2',
 'DATETIME_UTC_y_3',
 'SPEED_AVG_-10',
 'SPEED_AVG_-9',
 'SPEED_AVG_-8',
 'SPEED_AVG_-7',
 'SPEED_AVG_-6',
 'SPEED_AVG_-5',
 'SPEED_AVG_-4',
 'SPEED_AVG_-3',
 'SPEED_AVG_-2',
 'SPEED_AVG_-1',
 'SPEED_AVG_Y_0',
 'SPEED_AVG_Y_1',
 'SPEED_AVG_Y_2',
 'SPEED_AVG_Y_3',
 'SPEED_SD_-10',
 'SPEED_SD_-9',
 'SPEED_SD_-8',
 'SPEED_SD_-7',
 'SPEED_SD_-6',
 'SPEED_SD_-5',
 'SPEED_SD_-4',
 'SPEED_SD_-3',
 'SPEED_SD_-2',
 'SPEED_SD_-1',
 'SPEED_MAX_-10',
 'SPEED_MAX_-9',
 'SPEED_MAX_-8',
 'SPEED_MAX_-7',
 'SPEED_MAX_-6',
 'SPEED_MAX_-5',
 'SPEED_MAX_-4',
 'SPEED_MAX_-3',
 'SPEED_MAX_-2',
 'SPEED_MAX_-1',
 'SPEED_MIN_-10',
 'SPEED_MIN_-9',

In [5]:
speeds[DATETIME] = pd.to_datetime(speeds[DATETIME])
for i in range(1, 11):
    print(f'Processing {i}')
    time = 'DATETIME_UTC_-' + str(i)
    speed_avg = 'SPEED_AVG_-' + str(i)
    speed_max = 'SPEED_MAX_-' + str(i)
    speed_min = 'SPEED_MIN_-' + str(i)
    speed_std = 'SPEED_SD_-' + str(i)
    n_cars = 'N_VEHICLES_-' + str(i)
    X_df[time] = pd.to_datetime(X_df[time])
    
    X_df.drop(columns=[speed_avg, speed_max, speed_min, speed_std, n_cars], inplace=True)
    X_df = pd.merge(X_df, speeds[[KEY, KM, DATETIME, SPEED_AVG, SPEED_MAX, SPEED_MIN, SPEED_SD, N_CARS]],
                    left_on=[KEY, KM, time], right_on=[KEY, KM, DATETIME], how='left')
    X_df.rename(columns={SPEED_AVG: speed_avg, SPEED_MAX: speed_max, SPEED_MIN: speed_min, SPEED_SD: speed_std, N_CARS: n_cars}, inplace=True)
    print('Done')

Processing 1
Done
Processing 2
Done
Processing 3
Done
Processing 4
Done
Processing 5
Done
Processing 6
Done
Processing 7
Done
Processing 8
Done
Processing 9
Done
Processing 10
Done


In [8]:
columns = [col for col in X_df]
# columns = [col for col in X_df if not col.startswith('WEATHER')]
# columns = [col for col in columns if not col.startswith('EVENT_TYPE')]
columns.sort()
columns

['DATETIME_UTC_-1',
 'DATETIME_UTC_-10',
 'DATETIME_UTC_-2',
 'DATETIME_UTC_-3',
 'DATETIME_UTC_-4',
 'DATETIME_UTC_-5',
 'DATETIME_UTC_-6',
 'DATETIME_UTC_-7',
 'DATETIME_UTC_-8',
 'DATETIME_UTC_-9',
 'DATETIME_UTC_x',
 'DATETIME_UTC_x',
 'DATETIME_UTC_x',
 'DATETIME_UTC_x',
 'DATETIME_UTC_x',
 'DATETIME_UTC_y',
 'DATETIME_UTC_y',
 'DATETIME_UTC_y',
 'DATETIME_UTC_y',
 'DATETIME_UTC_y',
 'DATETIME_UTC_y_0',
 'DATETIME_UTC_y_1',
 'DATETIME_UTC_y_2',
 'DATETIME_UTC_y_3',
 'DISTANCE_-1',
 'DISTANCE_-10',
 'DISTANCE_-2',
 'DISTANCE_-3',
 'DISTANCE_-4',
 'DISTANCE_-5',
 'DISTANCE_-6',
 'DISTANCE_-7',
 'DISTANCE_-8',
 'DISTANCE_-9',
 'EMERGENCY_LANE',
 'EVENT_DETAIL',
 'EVENT_TYPE',
 'KEY',
 'KM',
 'LANES',
 'N_VEHICLES_-1',
 'N_VEHICLES_-10',
 'N_VEHICLES_-2',
 'N_VEHICLES_-3',
 'N_VEHICLES_-4',
 'N_VEHICLES_-5',
 'N_VEHICLES_-6',
 'N_VEHICLES_-7',
 'N_VEHICLES_-8',
 'N_VEHICLES_-9',
 'ROAD_TYPE',
 'SPEED_AVG_-1',
 'SPEED_AVG_-10',
 'SPEED_AVG_-2',
 'SPEED_AVG_-3',
 'SPEED_AVG_-4',
 'SPEED

In [7]:
X_df.describe()

Unnamed: 0,KEY,KM,event_index,EMERGENCY_LANE,LANES,ROAD_TYPE,EVENT_DETAIL,SPEED_AVG_Y_0,SPEED_AVG_Y_1,SPEED_AVG_Y_2,...,SPEED_AVG_-9,SPEED_MAX_-9,SPEED_MIN_-9,SPEED_SD_-9,N_VEHICLES_-9,SPEED_AVG_-10,SPEED_MAX_-10,SPEED_MIN_-10,SPEED_SD_-10,N_VEHICLES_-10
count,33202.0,33202.0,33202.0,33202.0,33202.0,33202.0,33199.0,33202.0,31793.0,31033.0,...,32624.0,32624.0,32624.0,32624.0,32624.0,32547.0,32547.0,32547.0,32547.0,32547.0
mean,155.785043,702.451208,5321998.0,0.254593,1.744021,0.438166,16.262809,179.300981,179.71122,179.848292,...,179.578087,243.768054,126.338236,23.113379,176.430824,179.721208,243.923431,126.483254,23.121385,176.001583
std,162.450088,401.71839,3129451.0,0.435639,0.781773,0.617349,19.426577,34.908835,34.747733,34.644617,...,34.656037,54.635176,34.427525,8.53667,210.093831,34.621149,54.59698,34.410372,8.490347,209.553215
min,0.0,17.0,916.0,0.0,1.0,0.0,0.0,63.911,63.911,65.529,...,65.591231,71.192,59.866,0.0,2.0,65.529,66.338,59.866,0.0,2.0
25%,16.0,476.0,2710456.0,0.0,1.0,0.0,1.0,158.79545,159.4493,159.6427,...,159.373,208.722,108.406,17.407453,34.0,159.5348,208.722,108.406,17.432863,34.0
50%,88.0,588.0,5402846.0,0.0,2.0,0.0,5.0,185.29555,185.7158,185.8751,...,185.404472,245.936,132.676,22.696251,95.0,185.542021,245.936,132.676,22.706991,95.0
75%,260.0,974.0,8133522.0,1.0,2.0,1.0,24.0,204.77975,204.9433,204.9824,...,204.917279,279.914,150.474,27.959027,243.0,204.972713,279.914,150.474,27.972108,243.0
max,578.0,1983.0,10822460.0,1.0,3.0,2.0,85.0,311.7347,311.7347,311.7347,...,290.357455,470.838,275.06,146.444643,1445.0,290.357455,470.838,275.06,91.527902,1445.0


In [9]:
columns = [c for c in X_df if c.startswith('DATETIME_UTC_x') or c == 'DATETIME_UTC_y']
X_df.drop(columns=columns, inplace=True)
columns = [col for col in X_df]
# columns = [col for col in X_df if not col.startswith('WEATHER')]
# columns = [col for col in columns if not col.startswith('EVENT_TYPE')]
columns

['KEY',
 'KM',
 'event_index',
 'EMERGENCY_LANE',
 'LANES',
 'ROAD_TYPE',
 'EVENT_DETAIL',
 'EVENT_TYPE',
 'DATETIME_UTC_-10',
 'DATETIME_UTC_-9',
 'DATETIME_UTC_-8',
 'DATETIME_UTC_-7',
 'DATETIME_UTC_-6',
 'DATETIME_UTC_-5',
 'DATETIME_UTC_-4',
 'DATETIME_UTC_-3',
 'DATETIME_UTC_-2',
 'DATETIME_UTC_-1',
 'DATETIME_UTC_y_0',
 'DATETIME_UTC_y_1',
 'DATETIME_UTC_y_2',
 'DATETIME_UTC_y_3',
 'SPEED_AVG_Y_0',
 'SPEED_AVG_Y_1',
 'SPEED_AVG_Y_2',
 'SPEED_AVG_Y_3',
 'WEATHER_-10',
 'WEATHER_-9',
 'WEATHER_-8',
 'WEATHER_-7',
 'WEATHER_-6',
 'WEATHER_-5',
 'WEATHER_-4',
 'WEATHER_-3',
 'WEATHER_-2',
 'WEATHER_-1',
 'DISTANCE_-10',
 'DISTANCE_-9',
 'DISTANCE_-8',
 'DISTANCE_-7',
 'DISTANCE_-6',
 'DISTANCE_-5',
 'DISTANCE_-4',
 'DISTANCE_-3',
 'DISTANCE_-2',
 'DISTANCE_-1',
 'SPEED_AVG_-1',
 'SPEED_MAX_-1',
 'SPEED_MIN_-1',
 'SPEED_SD_-1',
 'N_VEHICLES_-1',
 'SPEED_AVG_-2',
 'SPEED_MAX_-2',
 'SPEED_MIN_-2',
 'SPEED_SD_-2',
 'N_VEHICLES_-2',
 'SPEED_AVG_-3',
 'SPEED_MAX_-3',
 'SPEED_MIN_-3',
 'SP