In [1]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
sns.set(style="white", color_codes=True)
sns.set_context(rc={'font.family': 'sans', 'font.size': 24, 'axes.titlesize':24, 'axes.labelsize':24})
#if you want to know current working dir
os.chdir('../..')

from src.utils import *
from src.utility import merge_speed_events
import src.data as data
import src.utility as utils
from src.preprocessing.other_features import avg_speed_for_roadtype_event

%matplotlib inline

In [2]:
speeds_train = pd.read_csv(resources_path('dataset', 'preprocessed', 'speeds_train_imputed_time.csv.gz'))
speeds_test = pd.read_csv(resources_path('dataset', 'preprocessed', 'speeds_test_imputed_time.csv.gz'))
speeds_2019 = pd.read_csv(resources_path('dataset', 'preprocessed', 'speeds_2019_imputed_time.csv.gz'))
print(speeds_train.shape, speeds_test.shape, speeds_2019.shape)

(11652930, 9) (3762660, 9) (438361, 9)


In [3]:
speeds_all = pd.concat([speeds_train, speeds_test, speeds_2019], ignore_index=True)
speeds_all.sort_values([KEY, KM, DATETIME], inplace=True)
speeds_all.drop_duplicates([KEY, KM, DATETIME], inplace=True)
speeds_all[DATETIME] = pd.to_datetime(speeds_all[DATETIME])
speeds_all.shape


(15823003, 9)

In [6]:
X_df = data.base_dataset(mode='full')
X_df.shape

(15457, 60)

In [11]:
print(X_df.columns)
X_df.describe()

Index(['event_index', 'KEY', 'KM', 'EMERGENCY_LANE', 'LANES', 'ROAD_TYPE',
       'EVENT_DETAIL', 'EVENT_TYPE', 'DATETIME_UTC_-4', 'DATETIME_UTC_-3',
       'DATETIME_UTC_-2', 'DATETIME_UTC_-1', 'DATETIME_UTC_y_0',
       'DATETIME_UTC_y_1', 'DATETIME_UTC_y_2', 'DATETIME_UTC_y_3',
       'SPEED_AVG_-4', 'SPEED_AVG_-3', 'SPEED_AVG_-2', 'SPEED_AVG_Y_0',
       'SPEED_AVG_Y_1', 'SPEED_AVG_Y_2', 'SPEED_AVG_Y_3', 'SPEED_SD_-4',
       'SPEED_SD_-3', 'SPEED_SD_-2', 'SPEED_MAX_-4', 'SPEED_MAX_-3',
       'SPEED_MAX_-2', 'SPEED_MIN_-4', 'SPEED_MIN_-3', 'SPEED_MIN_-2',
       'N_VEHICLES_-4', 'N_VEHICLES_-3', 'N_VEHICLES_-2', 'WEATHER_-4',
       'WEATHER_-3', 'WEATHER_-2', 'WEATHER_-1', 'DISTANCE_-4', 'DISTANCE_-3',
       'DISTANCE_-2', 'DISTANCE_-1', 'TEMPERATURE_-4', 'TEMPERATURE_-3',
       'TEMPERATURE_-2', 'TEMPERATURE_-1', 'MIN_TEMPERATURE_-4',
       'MIN_TEMPERATURE_-3', 'MIN_TEMPERATURE_-2', 'MIN_TEMPERATURE_-1',
       'MAX_TEMPERATURE_-4', 'MAX_TEMPERATURE_-3', 'MAX_TEMPERATURE_-2'

Unnamed: 0,event_index,KEY,KM,EMERGENCY_LANE,LANES,ROAD_TYPE,EVENT_DETAIL,SPEED_AVG_-4,SPEED_AVG_-3,SPEED_AVG_-2,...,TEMPERATURE_-2,TEMPERATURE_-1,MIN_TEMPERATURE_-4,MIN_TEMPERATURE_-3,MIN_TEMPERATURE_-2,MIN_TEMPERATURE_-1,MAX_TEMPERATURE_-4,MAX_TEMPERATURE_-3,MAX_TEMPERATURE_-2,MAX_TEMPERATURE_-1
count,15457.0,15457.0,15457.0,15457.0,15457.0,15457.0,15457.0,7330.0,7350.0,7369.0,...,9805.0,9806.0,9804.0,9796.0,9805.0,9806.0,9804.0,9796.0,9805.0,9806.0
mean,24697.527916,188.34017,612.704018,0.298506,1.732225,0.38041,19.946109,184.337667,183.227174,181.584885,...,16.029067,16.042015,13.23521,13.236423,13.200408,13.164593,17.096695,17.108003,17.085365,17.084234
std,10184.356458,175.679147,363.274057,0.457617,0.761076,0.591913,20.334143,33.925221,34.39475,35.47863,...,6.916509,6.938402,5.943575,5.97844,6.01407,6.031367,7.077833,7.101379,7.147008,7.14859
min,1.0,0.0,17.0,0.0,1.0,0.0,-1.0,59.866,63.911,63.911,...,-14.0,-14.0,-14.0,-14.0,-14.0,-14.0,-14.0,-14.0,-14.0,-14.0
25%,17908.0,16.0,385.0,0.0,1.0,0.0,4.0,160.056549,158.735212,156.440375,...,12.0,12.0,10.0,10.0,10.0,10.0,13.0,13.0,13.0,13.0
50%,26954.0,146.0,578.0,0.0,2.0,0.0,12.0,187.927385,186.346261,184.705083,...,16.0,16.0,14.0,14.0,14.0,14.0,18.0,18.0,18.0,18.0
75%,33017.0,326.0,750.0,1.0,2.0,1.0,34.0,210.415534,209.658235,208.471324,...,21.0,21.0,18.0,18.0,18.0,18.0,22.0,22.0,22.0,22.0
max,38005.0,578.0,1997.0,1.0,3.0,2.0,85.0,295.831622,283.097806,289.376848,...,36.0,36.0,28.0,28.0,28.0,28.0,39.0,39.0,39.0,35.0


In [12]:
window_len = sum(X_df.columns.str.match('^SPEED_AVG_-.*$')*1)
for i in range(2, window_len+2):
    time = 'DATETIME_UTC_-' + str(i)
    speed_avg = 'SPEED_AVG_-' + str(i)
    speed_max = 'SPEED_MAX_-' + str(i)
    speed_min = 'SPEED_MIN_-' + str(i)
    speed_std = 'SPEED_SD_-' + str(i)
    n_cars = 'N_VEHICLES_-' + str(i)
    X_df[time] = pd.to_datetime(X_df[time])

    X_df.drop(columns=[speed_avg, speed_max, speed_min, speed_std, n_cars], inplace=True)
    X_df = pd.merge(X_df, speeds_all[[KEY, KM, DATETIME, SPEED_AVG, SPEED_MAX, SPEED_MIN, SPEED_SD, N_CARS]],
                    left_on=[KEY, KM, time], right_on=[KEY, KM, DATETIME], how='left')
    X_df.rename(columns={SPEED_AVG: speed_avg, SPEED_MAX: speed_max, SPEED_MIN: speed_min, SPEED_SD: speed_std, N_CARS: n_cars}, inplace=True)

ValueError: You are trying to merge on datetime64[ns] and object columns. If you wish to proceed you should use pd.concat