In [1]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
# Global plot styling: white background, seaborn colour codes, and 24pt sans fonts
# for text, titles and axis labels.
sns.set(style="white", color_codes=True)
sns.set_context(rc={'font.family': 'sans', 'font.size': 24, 'axes.titlesize':24, 'axes.labelsize':24})
# The imports that follow use the top-level `src` package, so the working directory
# has to be the directory containing it (assumed to sit two levels above this
# notebook -- adjust if the notebook is moved).
# The step up is guarded: an unconditional relative chdir would climb two more
# levels every time this cell is re-run and break the imports and resource paths.
if not os.path.isdir('src'):
    os.chdir('../..')

from src.utils import *
from src.utility import merge_speed_events
import src.data as data
import src.utility as utils
from src.preprocessing.other_features import avg_speed_for_roadtype_event

%matplotlib inline

In [4]:
# Read the three imputed speed tables (train / test / 2019) from the preprocessed
# dataset folder. `resources_path` is a project helper (presumably made available by
# `from src.utils import *`) that builds the file path from its parts.
speeds_train, speeds_test, speeds_2019 = (
    pd.read_csv(resources_path('dataset', 'preprocessed', file_name))
    for file_name in (
        'speeds_train_imputed_time.csv.gz',
        'speeds_test_imputed_time.csv.gz',
        'speeds_2019_imputed_time.csv.gz',
    )
)
# Quick sanity check on the sizes of the three tables.
print(speeds_train.shape, speeds_test.shape, speeds_2019.shape)

(11652930, 9) (3762660, 9) (438361, 9)


In [5]:
# Stack the three speed tables into one frame, order it by (KEY, KM, DATETIME) and
# keep only the first row for each such triple.
speeds_all = (
    pd.concat([speeds_train, speeds_test, speeds_2019], ignore_index=True)
    .sort_values([KEY, KM, DATETIME])
    .drop_duplicates([KEY, KM, DATETIME])
)
# Parse the timestamp column only after sorting/deduplicating, so both of those
# steps operate on the values exactly as they were read from the CSV files.
speeds_all[DATETIME] = pd.to_datetime(speeds_all[DATETIME])
speeds_all.shape


(15823003, 9)

In [6]:
# Load the project's base dataset in 'full' mode through the `src.data` helper.
# NOTE(review): the recorded output mentions a cached CSV under
# resources/dataset/preprocessed/full/train/ -- presumably the helper reads or
# writes that cache; confirm in src/data.
X_df = data.base_dataset(mode='full')
# Show (rows, columns) as the cell output.
X_df.shape

caching resources/dataset/preprocessed/full/train/base_dataset.csv.gz


(27545, 60)

In [7]:
# List every column name, then display summary statistics of the numeric columns
# (the `count` row shows how many values are non-missing per column).
print(X_df.columns)
X_df.describe()

Index(['event_index', 'KEY', 'KM', 'EMERGENCY_LANE', 'LANES', 'ROAD_TYPE',
       'EVENT_DETAIL', 'EVENT_TYPE', 'DATETIME_UTC_-4', 'DATETIME_UTC_-3',
       'DATETIME_UTC_-2', 'DATETIME_UTC_-1', 'DATETIME_UTC_y_0',
       'DATETIME_UTC_y_1', 'DATETIME_UTC_y_2', 'DATETIME_UTC_y_3',
       'SPEED_AVG_-4', 'SPEED_AVG_-3', 'SPEED_AVG_-2', 'SPEED_AVG_-1',
       'SPEED_AVG_Y_0', 'SPEED_AVG_Y_1', 'SPEED_AVG_Y_2', 'SPEED_AVG_Y_3',
       'SPEED_SD_-4', 'SPEED_SD_-3', 'SPEED_SD_-2', 'SPEED_SD_-1',
       'SPEED_MAX_-4', 'SPEED_MAX_-3', 'SPEED_MAX_-2', 'SPEED_MAX_-1',
       'SPEED_MIN_-4', 'SPEED_MIN_-3', 'SPEED_MIN_-2', 'SPEED_MIN_-1',
       'N_VEHICLES_-4', 'N_VEHICLES_-3', 'N_VEHICLES_-2', 'N_VEHICLES_-1',
       'WEATHER_-4', 'WEATHER_-3', 'WEATHER_-2', 'WEATHER_-1', 'DISTANCE_-4',
       'DISTANCE_-3', 'DISTANCE_-2', 'DISTANCE_-1', 'TEMPERATURE_-4',
       'TEMPERATURE_-3', 'TEMPERATURE_-2', 'TEMPERATURE_-1',
       'MIN_TEMPERATURE_-4', 'MIN_TEMPERATURE_-3', 'MIN_TEMPERATURE_-2',
      

Unnamed: 0,event_index,KEY,KM,EMERGENCY_LANE,LANES,ROAD_TYPE,EVENT_DETAIL,SPEED_AVG_-4,SPEED_AVG_-3,SPEED_AVG_-2,...,TEMPERATURE_-2,TEMPERATURE_-1,MIN_TEMPERATURE_-4,MIN_TEMPERATURE_-3,MIN_TEMPERATURE_-2,MIN_TEMPERATURE_-1,MAX_TEMPERATURE_-4,MAX_TEMPERATURE_-3,MAX_TEMPERATURE_-2,MAX_TEMPERATURE_-1
count,27545.0,27545.0,27545.0,27545.0,27545.0,27545.0,27545.0,13277.0,13321.0,13336.0,...,18337.0,18326.0,18346.0,18331.0,18337.0,18326.0,18346.0,18331.0,18337.0,18326.0
mean,29208.569468,198.094137,600.053984,0.279506,1.704738,0.358613,23.651988,185.924849,185.133403,183.973377,...,13.238098,13.247845,10.434863,10.420053,10.398429,10.377169,14.486537,14.488571,14.472651,14.487722
std,12723.926453,178.988736,358.76836,0.448765,0.751309,0.589432,21.9028,33.788831,34.11654,34.933422,...,7.956903,7.981672,7.191765,7.207014,7.2218,7.230031,7.999827,8.01599,8.038148,8.036155
min,1.0,0.0,17.0,0.0,1.0,0.0,-1.0,59.866,63.911,63.911,...,-16.0,-16.0,-17.0,-17.0,-17.0,-17.0,-15.0,-15.0,-15.0,-15.0
25%,21493.0,25.0,363.0,0.0,1.0,0.0,6.0,162.183211,161.487432,159.7775,...,8.0,8.0,5.0,5.0,5.0,5.0,9.0,9.0,9.0,9.0
50%,30638.0,150.0,576.0,0.0,2.0,0.0,15.0,189.774368,188.766667,187.542771,...,14.0,14.0,11.0,11.0,11.0,11.0,15.0,15.0,15.0,15.0
75%,38611.0,334.0,750.0,1.0,2.0,1.0,44.0,212.35912,211.787684,211.280811,...,19.0,19.0,16.0,16.0,16.0,16.0,20.0,20.0,20.0,20.0
max,47850.0,578.0,1997.0,1.0,3.0,2.0,86.0,295.831622,283.097806,298.556174,...,36.0,36.0,28.0,28.0,29.0,29.0,41.0,41.0,41.0,41.0


In [8]:
# Replace the look-back speed statistics stored in X_df (SPEED_AVG_-i, SPEED_MAX_-i,
# SPEED_MIN_-i, SPEED_SD_-i, N_VEHICLES_-i) with the values found in `speeds_all`
# for the same (KEY, KM) at the timestamp held in DATETIME_UTC_-i.
#
# The number of look-back steps is the number of SPEED_AVG_-* columns in X_df.
window_len = int(X_df.columns.str.match('^SPEED_AVG_-.*$').sum())
# A left join against unique right-hand keys must not add or remove rows.
n_rows_before = len(X_df)
# Loop-invariant slice of the speed table: join keys plus the statistics to copy.
speed_lookup = speeds_all[[KEY, KM, DATETIME, SPEED_AVG, SPEED_MAX, SPEED_MIN, SPEED_SD, N_CARS]]
for i in range(1, window_len+1):
    time = 'DATETIME_UTC_-' + str(i)
    speed_avg = 'SPEED_AVG_-' + str(i)
    speed_max = 'SPEED_MAX_-' + str(i)
    speed_min = 'SPEED_MIN_-' + str(i)
    speed_std = 'SPEED_SD_-' + str(i)
    n_cars = 'N_VEHICLES_-' + str(i)
    # The join key must be a real datetime to match the parsed DATETIME column.
    X_df[time] = pd.to_datetime(X_df[time])

    # Drop the stale statistics for this step before joining in the fresh ones.
    X_df = X_df.drop(columns=[speed_avg, speed_max, speed_min, speed_std, n_cars])
    # Rename the right-hand columns *before* merging: the timestamp key takes the
    # name of this step's look-back column and each statistic gets its `_-i` name.
    # Joining with left_on/right_on on differently named keys would instead leave a
    # copy of the DATETIME column in X_df after every pass; that leftover then
    # clashes with the key of the next merge (suffix collisions, an error in recent
    # pandas) and makes the cell impossible to re-run.
    step_lookup = speed_lookup.rename(columns={DATETIME: time, SPEED_AVG: speed_avg, SPEED_MAX: speed_max,
                                               SPEED_MIN: speed_min, SPEED_SD: speed_std, N_CARS: n_cars})
    X_df = pd.merge(X_df, step_lookup, on=[KEY, KM, time], how='left')
    assert len(X_df) == n_rows_before, 'merge changed the row count: duplicate (KEY, KM, DATETIME) in speeds_all'

In [10]:
# Summary statistics again, now that the look-back speed columns have been
# re-joined from `speeds_all`; compare the `count` row with the earlier describe().
X_df.describe()

Unnamed: 0,event_index,KEY,KM,EMERGENCY_LANE,LANES,ROAD_TYPE,EVENT_DETAIL,SPEED_AVG_Y_0,SPEED_AVG_Y_1,SPEED_AVG_Y_2,...,SPEED_AVG_-3,SPEED_MAX_-3,SPEED_MIN_-3,SPEED_SD_-3,N_VEHICLES_-3,SPEED_AVG_-4,SPEED_MAX_-4,SPEED_MIN_-4,SPEED_SD_-4,N_VEHICLES_-4
count,27545.0,27545.0,27545.0,27545.0,27545.0,27545.0,27545.0,13425.0,13479.0,12897.0,...,13779.0,13779.0,13779.0,13779.0,13779.0,13715.0,13715.0,13715.0,13715.0,13715.0
mean,29208.569468,198.094137,600.053984,0.279506,1.704738,0.358613,23.651988,182.25256,182.202198,181.92296,...,184.820312,250.379297,125.299999,23.506733,252.892341,185.674353,250.913814,126.31939,23.514246,250.053152
std,12723.926453,178.988736,358.76836,0.448765,0.751309,0.589432,21.9028,36.016595,36.232696,36.023743,...,34.163939,57.672978,36.066503,8.767394,320.471272,33.835603,57.412213,35.782251,8.994007,317.529297
min,1.0,0.0,17.0,0.0,1.0,0.0,-1.0,61.484,59.866,60.675,...,63.911,66.338,59.866,0.0,2.0,59.866,59.866,59.866,0.0,2.0
25%,21493.0,25.0,363.0,0.0,1.0,0.0,6.0,157.792998,157.542879,157.280759,...,160.623273,208.722,100.316,17.516894,41.0,161.493491,208.722,101.934,17.472763,41.0
50%,30638.0,150.0,576.0,0.0,2.0,0.0,15.0,186.173277,186.077479,184.990546,...,188.0116,252.408,131.058,23.217494,108.0,189.066296,254.026,132.676,23.293109,108.0
75%,38611.0,334.0,750.0,1.0,2.0,1.0,44.0,210.638053,210.48326,210.224429,...,211.641069,289.622,152.092,28.822014,324.5,212.223947,292.858,152.092,28.8749,319.5
max,47850.0,578.0,1997.0,1.0,3.0,2.0,86.0,285.341042,292.5344,286.937591,...,378.612,470.838,367.286,106.215002,1518.0,297.924545,470.838,278.296,203.649581,1547.0
