In [1]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
sns.set(style="white", color_codes=True)
sns.set_context(rc={'font.family': 'sans', 'font.size': 24, 'axes.titlesize':24, 'axes.labelsize':24})
#if you want to know current working dir
os.chdir('../..')

from src.utils import *
from src.utility import merge_speed_events
import src.data as data
import src.utility as utils
from src.preprocessing.other_features import avg_speed_for_roadtype_event

%matplotlib inline

In [135]:
df = data.base_dataset()
sensors = data.sensors().drop_duplicates().sort_values([KEY, KM])
speeds = pd.concat([data.speeds_original('train'), data.speeds_original('test'), data.speeds_original('test2')]).drop_duplicates()

In [4]:
df.columns

Index(['event_index', 'KEY', 'KM', 'EMERGENCY_LANE', 'LANES', 'ROAD_TYPE',
       'EVENT_DETAIL', 'EVENT_TYPE', 'DATETIME_UTC_-4', 'DATETIME_UTC_-3',
       'DATETIME_UTC_-2', 'DATETIME_UTC_-1', 'DATETIME_UTC_y_0',
       'DATETIME_UTC_y_1', 'DATETIME_UTC_y_2', 'DATETIME_UTC_y_3',
       'SPEED_AVG_-4', 'SPEED_AVG_-3', 'SPEED_AVG_-2', 'SPEED_AVG_-1',
       'SPEED_AVG_Y_0', 'SPEED_AVG_Y_1', 'SPEED_AVG_Y_2', 'SPEED_AVG_Y_3',
       'SPEED_SD_-4', 'SPEED_SD_-3', 'SPEED_SD_-2', 'SPEED_SD_-1',
       'SPEED_MAX_-4', 'SPEED_MAX_-3', 'SPEED_MAX_-2', 'SPEED_MAX_-1',
       'SPEED_MIN_-4', 'SPEED_MIN_-3', 'SPEED_MIN_-2', 'SPEED_MIN_-1',
       'N_VEHICLES_-4', 'N_VEHICLES_-3', 'N_VEHICLES_-2', 'N_VEHICLES_-1',
       'WEATHER_-4', 'WEATHER_-3', 'WEATHER_-2', 'WEATHER_-1', 'DISTANCE_-4',
       'DISTANCE_-3', 'DISTANCE_-2', 'DISTANCE_-1', 'TEMPERATURE_-4',
       'TEMPERATURE_-3', 'TEMPERATURE_-2', 'TEMPERATURE_-1',
       'MIN_TEMPERATURE_-4', 'MIN_TEMPERATURE_-3', 'MIN_TEMPERATURE_-2',
      

In [7]:
sensors.head(10)

Unnamed: 0,KEY,EMERGENCY_LANE,KM,LANES,ROAD_TYPE
926,0.0,1,333,2,1
1588,0.0,1,337,2,1
472,0.0,1,342,2,1
1771,0.0,1,343,2,1
283,0.0,1,357,2,1
1196,0.0,1,385,2,1
659,0.0,1,403,2,1
1396,0.0,1,407,2,1
1821,0.0,1,411,2,1
215,0.0,1,423,2,1


In [86]:
sensors['KM_BEFORE'] = sensors['KM'].shift(1)
sensors['KEY_BEFORE'] = sensors['KEY'].shift(1)
sensors['KM_AFTER'] = sensors['KM'].shift(-1)
sensors['KEY_AFTER'] = sensors['KEY'].shift(-1)

sensors.loc[sensors.KEY_AFTER != sensors.KEY, 'KM_AFTER'] = np.nan
sensors.loc[sensors.KEY_BEFORE != sensors.KEY, 'KM_BEFORE'] = np.nan

sensors.drop(['KEY_BEFORE', 'KEY_AFTER'], axis=1, inplace=True)
sensors = sensors[[KEY, KM, 'KM_BEFORE', 'KM_AFTER']]
sensors.head(30)

Unnamed: 0,KEY,KM,KM_BEFORE,KM_AFTER
926,0.0,333,,337.0
1588,0.0,337,333.0,342.0
472,0.0,342,337.0,343.0
1771,0.0,343,342.0,357.0
283,0.0,357,343.0,385.0
1196,0.0,385,357.0,403.0
659,0.0,403,385.0,407.0
1396,0.0,407,403.0,411.0
1821,0.0,411,407.0,423.0
215,0.0,423,411.0,443.0


In [91]:
merged = pd.merge(df, sensors, left_on=[KEY, KM], right_on=[KEY, KM])

In [60]:
merged.columns

Index(['event_index', 'KEY', 'KM', 'EMERGENCY_LANE', 'LANES', 'ROAD_TYPE',
       'EVENT_DETAIL', 'EVENT_TYPE', 'DATETIME_UTC_-4', 'DATETIME_UTC_-3',
       'DATETIME_UTC_-2', 'DATETIME_UTC_-1', 'DATETIME_UTC_y_0',
       'DATETIME_UTC_y_1', 'DATETIME_UTC_y_2', 'DATETIME_UTC_y_3',
       'SPEED_AVG_-4', 'SPEED_AVG_-3', 'SPEED_AVG_-2', 'SPEED_AVG_-1',
       'SPEED_AVG_Y_0', 'SPEED_AVG_Y_1', 'SPEED_AVG_Y_2', 'SPEED_AVG_Y_3',
       'SPEED_SD_-4', 'SPEED_SD_-3', 'SPEED_SD_-2', 'SPEED_SD_-1',
       'SPEED_MAX_-4', 'SPEED_MAX_-3', 'SPEED_MAX_-2', 'SPEED_MAX_-1',
       'SPEED_MIN_-4', 'SPEED_MIN_-3', 'SPEED_MIN_-2', 'SPEED_MIN_-1',
       'N_VEHICLES_-4', 'N_VEHICLES_-3', 'N_VEHICLES_-2', 'N_VEHICLES_-1',
       'WEATHER_-4', 'WEATHER_-3', 'WEATHER_-2', 'WEATHER_-1', 'DISTANCE_-4',
       'DISTANCE_-3', 'DISTANCE_-2', 'DISTANCE_-1', 'TEMPERATURE_-4',
       'TEMPERATURE_-3', 'TEMPERATURE_-2', 'TEMPERATURE_-1',
       'MIN_TEMPERATURE_-4', 'MIN_TEMPERATURE_-3', 'MIN_TEMPERATURE_-2',
      

In [92]:
for i in range(1, 5):
    speed_avg_before = 'SPEED_AVG_BEFORE_-' + str(i)
    speed_avg_after = 'SPEED_AVG_AFTER_-' + str(i)
    datetime = 'DATETIME_UTC_-' + str(i)
    

    speeds[speed_avg_before] = speeds[SPEED_AVG]
    speeds[speed_avg_after] = speeds[SPEED_AVG]
    merged = pd.merge(merged, speeds[[KEY, KM, DATETIME, speed_avg_before]], left_on=[KEY, 'KM_BEFORE', datetime], right_on=[KEY, KM, DATETIME], suffixes=('_x_-' + str(i), '_y_-' + str(i)))
    
    merged = pd.merge(merged, speeds[[KEY, KM, DATETIME, speed_avg_after]], left_on=[KEY, 'KM_AFTER', datetime], right_on=[KEY, KM, DATETIME], suffixes=('_x_-' + str(i), '_y_-' + str(i)))


merged.drop(columns=['KM', 'DATETIME_UTC_y_-3', 'KM_y_-3', 'DATETIME_UTC_y_-4',
                     'DATETIME_UTC_y_-2', 'KM_y_-2', 'DATETIME_UTC_y_-1', 'KM_x_-2',
                     'KM_y_-1', 'KM_x_-3',
                     'KM_x_-4', 'KM_y_-4', 'DATETIME_UTC_y_-4'], inplace=True)
merged.rename(columns={'KM_x_-1': 'KM',
                       'DATETIME_UTC_x_-4': 'DATETIME_UTC_-4',
                       'DATETIME_UTC_x_-3': 'DATETIME_UTC_-3',
                       'DATETIME_UTC_x_-2': 'DATETIME_UTC_-2',
                       'DATETIME_UTC_x_-1': 'DATETIME_UTC_-1'}, inplace=True)
merged['DELTA_BEFORE'] = merged[KM] - merged['KM_BEFORE']
merged['DELTA_AFTER'] = merged['KM_AFTER'] - merged[KM]

In [80]:
merged[[KM, 'KM_x']].head(3)

KeyError: "['KM_x'] not in index"

In [93]:
merged.columns

Index(['event_index', 'KEY', 'KM', 'EMERGENCY_LANE', 'LANES', 'ROAD_TYPE',
       'EVENT_DETAIL', 'EVENT_TYPE', 'DATETIME_UTC_-4', 'DATETIME_UTC_-3',
       'DATETIME_UTC_-2', 'DATETIME_UTC_-1', 'DATETIME_UTC_y_0',
       'DATETIME_UTC_y_1', 'DATETIME_UTC_y_2', 'DATETIME_UTC_y_3',
       'SPEED_AVG_-4', 'SPEED_AVG_-3', 'SPEED_AVG_-2', 'SPEED_AVG_-1',
       'SPEED_AVG_Y_0', 'SPEED_AVG_Y_1', 'SPEED_AVG_Y_2', 'SPEED_AVG_Y_3',
       'SPEED_SD_-4', 'SPEED_SD_-3', 'SPEED_SD_-2', 'SPEED_SD_-1',
       'SPEED_MAX_-4', 'SPEED_MAX_-3', 'SPEED_MAX_-2', 'SPEED_MAX_-1',
       'SPEED_MIN_-4', 'SPEED_MIN_-3', 'SPEED_MIN_-2', 'SPEED_MIN_-1',
       'N_VEHICLES_-4', 'N_VEHICLES_-3', 'N_VEHICLES_-2', 'N_VEHICLES_-1',
       'WEATHER_-4', 'WEATHER_-3', 'WEATHER_-2', 'WEATHER_-1', 'DISTANCE_-4',
       'DISTANCE_-3', 'DISTANCE_-2', 'DISTANCE_-1', 'TEMPERATURE_-4',
       'TEMPERATURE_-3', 'TEMPERATURE_-2', 'TEMPERATURE_-1',
       'MIN_TEMPERATURE_-4', 'MIN_TEMPERATURE_-3', 'MIN_TEMPERATURE_-2',
      

In [94]:
merged.head(3)

Unnamed: 0,event_index,KEY,KM,EMERGENCY_LANE,LANES,ROAD_TYPE,EVENT_DETAIL,EVENT_TYPE,DATETIME_UTC_-4,DATETIME_UTC_-3,...,SPEED_AVG_BEFORE_-2,SPEED_AVG_AFTER_-2,DATETIME_UTC_-3.1,SPEED_AVG_BEFORE_-3,SPEED_AVG_AFTER_-3,DATETIME_UTC_-4.1,SPEED_AVG_BEFORE_-4,SPEED_AVG_AFTER_-4,DELTA_BEFORE,DELTA_AFTER
0,116,525,300,0,2,0,5,Ostacolo_in_carreggiata,2018-09-04 07:15:00,2018-09-04 07:30:00,...,218.067176,247.819246,2018-09-04 07:30:00,224.399862,248.938775,2018-09-04 07:15:00,223.633488,243.045416,22.0,5.0
1,183,525,300,0,2,0,34,Segnaletica_verticale,2018-09-24 15:00:00,2018-09-24 15:15:00,...,212.442126,221.511905,2018-09-24 15:15:00,211.08209,227.570281,2018-09-24 15:00:00,206.397549,219.455493,22.0,5.0
2,212,525,300,0,2,0,6,Gestione_viabilita,2018-09-30 19:30:00,2018-09-30 19:45:00,...,215.172896,87.804721,2018-09-30 19:45:00,216.567354,84.925738,2018-09-30 19:30:00,214.499183,82.108627,22.0,5.0


In [100]:
to_keep_1 = ['DATETIME_UTC_-' + str(k) for k in range(1, 5)]
to_keep_2 = ['SPEED_AVG_BEFORE_-' + str(k) for k in range(1, 5)]
to_keep_3 = ['SPEED_AVG_AFTER_-' + str(k) for k in range(1, 5)]
to_keep_4 = ['DELTA_BEFORE', 'DELTA_AFTER']
to_keep = [KEY, KM, *to_keep_1, *to_keep_2, *to_keep_3, *to_keep_4]
to_keep

['KEY',
 'KM',
 'DATETIME_UTC_-1',
 'DATETIME_UTC_-2',
 'DATETIME_UTC_-3',
 'DATETIME_UTC_-4',
 'SPEED_AVG_BEFORE_-1',
 'SPEED_AVG_BEFORE_-2',
 'SPEED_AVG_BEFORE_-3',
 'SPEED_AVG_BEFORE_-4',
 'SPEED_AVG_AFTER_-1',
 'SPEED_AVG_AFTER_-2',
 'SPEED_AVG_AFTER_-3',
 'SPEED_AVG_AFTER_-4',
 'DELTA_BEFORE',
 'DELTA_AFTER']

In [102]:
merged = merged[to_keep]
merged.head(3)

Unnamed: 0,KEY,KM,DATETIME_UTC_-1,DATETIME_UTC_-1.1,DATETIME_UTC_-2,DATETIME_UTC_-2.1,DATETIME_UTC_-3,DATETIME_UTC_-3.1,DATETIME_UTC_-4,DATETIME_UTC_-4.1,SPEED_AVG_BEFORE_-1,SPEED_AVG_BEFORE_-2,SPEED_AVG_BEFORE_-3,SPEED_AVG_BEFORE_-4,SPEED_AVG_AFTER_-1,SPEED_AVG_AFTER_-2,SPEED_AVG_AFTER_-3,SPEED_AVG_AFTER_-4,DELTA_BEFORE,DELTA_AFTER
0,525,300,2018-09-04 08:00:00,2018-09-04 08:00:00,2018-09-04 07:45:00,2018-09-04 07:45:00,2018-09-04 07:30:00,2018-09-04 07:30:00,2018-09-04 07:15:00,2018-09-04 07:15:00,226.159361,218.067176,224.399862,223.633488,248.404948,247.819246,248.938775,243.045416,22.0,5.0
1,525,300,2018-09-24 15:45:00,2018-09-24 15:45:00,2018-09-24 15:30:00,2018-09-24 15:30:00,2018-09-24 15:15:00,2018-09-24 15:15:00,2018-09-24 15:00:00,2018-09-24 15:00:00,213.37205,212.442126,211.08209,206.397549,226.651698,221.511905,227.570281,219.455493,22.0,5.0
2,525,300,2018-09-30 20:15:00,2018-09-30 20:15:00,2018-09-30 20:00:00,2018-09-30 20:00:00,2018-09-30 19:45:00,2018-09-30 19:45:00,2018-09-30 19:30:00,2018-09-30 19:30:00,222.587884,215.172896,216.567354,214.499183,92.977894,87.804721,84.925738,82.108627,22.0,5.0


In [113]:
X, y = data.dataset(onehot=False)

In [119]:
pd.set_option('display.max_columns', 500)
X[['SPEED_AVG_AFTER_-3', 'SPEED_AVG_-1']].describe()

Unnamed: 0,SPEED_AVG_AFTER_-3,SPEED_AVG_-1
count,3063.0,7308.0
mean,184.696591,179.689756
std,35.926888,36.410732
min,67.847096,66.947681
25%,156.956769,153.682752
50%,189.752345,182.519146
75%,213.563807,207.330876
max,284.138778,287.24403


In [125]:
features = pd.read_csv(resources_path('dataset', 'preprocessed', 'local', 'features', 'avg_speed_km_before_after', 'features.csv.gz'))

In [126]:
features.describe()

Unnamed: 0,KEY,KM,SPEED_AVG_BEFORE_-1,SPEED_AVG_BEFORE_-2,SPEED_AVG_BEFORE_-3,SPEED_AVG_BEFORE_-4,SPEED_AVG_AFTER_-1,SPEED_AVG_AFTER_-2,SPEED_AVG_AFTER_-3,SPEED_AVG_AFTER_-4,DELTA_BEFORE,DELTA_AFTER
count,2969.0,2969.0,2969.0,2969.0,2969.0,2969.0,2969.0,2969.0,2969.0,2969.0,2969.0,2969.0
mean,135.286965,640.294712,184.440586,185.630692,186.745411,188.06154,181.905023,183.270866,184.854566,186.416992,12.868643,12.934995
std,166.412912,355.344056,33.774266,33.001264,32.364021,31.583739,38.229867,37.111844,35.811351,35.242737,12.998351,12.462105
min,0.0,71.0,69.259146,68.550853,67.847096,69.201434,66.947681,68.550853,67.847096,69.201434,1.0,1.0
25%,16.0,458.0,157.862867,159.86343,161.238194,163.235841,151.897353,155.011435,157.23111,159.81566,4.0,4.0
50%,46.0,576.0,187.674092,189.426635,190.924,192.316182,186.281043,187.478894,189.950271,191.318634,7.0,7.0
75%,205.0,658.0,211.672028,211.912257,212.550951,213.107148,211.980472,212.076873,213.350233,213.907897,19.0,20.0
max,578.0,1907.0,261.004843,261.719755,262.4396,262.347143,300.5435,292.231677,284.138778,287.888429,139.0,74.0


In [127]:
dataset = data.base_dataset()

In [133]:
from src.utils.datetime_converter import convert_to_datetime
f = convert_to_datetime(features)
ssss = pd.merge(dataset, f, left_on=[KEY, KM, 'DATETIME_UTC_-1'], right_on=[KEY, KM, 'DATETIME_UTC_-1'], how='left')
ssss.dropna().head(2)


Unnamed: 0,event_index,KEY,KM,EMERGENCY_LANE,LANES,ROAD_TYPE,EVENT_DETAIL,EVENT_TYPE,DATETIME_UTC_-4_x,DATETIME_UTC_-3_x,DATETIME_UTC_-2_x,DATETIME_UTC_-1,DATETIME_UTC_y_0,DATETIME_UTC_y_1,DATETIME_UTC_y_2,DATETIME_UTC_y_3,SPEED_AVG_-4,SPEED_AVG_-3,SPEED_AVG_-2,SPEED_AVG_-1,SPEED_AVG_Y_0,SPEED_AVG_Y_1,SPEED_AVG_Y_2,SPEED_AVG_Y_3,SPEED_SD_-4,SPEED_SD_-3,SPEED_SD_-2,SPEED_SD_-1,SPEED_MAX_-4,SPEED_MAX_-3,SPEED_MAX_-2,SPEED_MAX_-1,SPEED_MIN_-4,SPEED_MIN_-3,SPEED_MIN_-2,SPEED_MIN_-1,N_VEHICLES_-4,N_VEHICLES_-3,N_VEHICLES_-2,N_VEHICLES_-1,WEATHER_-4,WEATHER_-3,WEATHER_-2,WEATHER_-1,DISTANCE_-4,DISTANCE_-3,DISTANCE_-2,DISTANCE_-1,TEMPERATURE_-4,TEMPERATURE_-3,TEMPERATURE_-2,TEMPERATURE_-1,MIN_TEMPERATURE_-4,MIN_TEMPERATURE_-3,MIN_TEMPERATURE_-2,MIN_TEMPERATURE_-1,MAX_TEMPERATURE_-4,MAX_TEMPERATURE_-3,MAX_TEMPERATURE_-2,MAX_TEMPERATURE_-1,DATETIME_UTC_-1.1,DATETIME_UTC_-2_y,DATETIME_UTC_-2.1,DATETIME_UTC_-3_y,DATETIME_UTC_-3.1,DATETIME_UTC_-4_y,DATETIME_UTC_-4.1,SPEED_AVG_BEFORE_-1,SPEED_AVG_BEFORE_-2,SPEED_AVG_BEFORE_-3,SPEED_AVG_BEFORE_-4,SPEED_AVG_AFTER_-1,SPEED_AVG_AFTER_-2,SPEED_AVG_AFTER_-3,SPEED_AVG_AFTER_-4,DELTA_BEFORE,DELTA_AFTER
94,507,234,663,0,1,0,36,Barriere,2018-09-01 11:00:00,2018-09-01 11:15:00,2018-09-01 11:30:00,2018-09-01 11:45:00,2018-09-01 12:00:00,2018-09-01 12:15:00,2018-09-01 12:30:00,2018-09-01 12:45:00,153.157183,150.934599,144.65403,150.882144,151.90883,151.67174,152.233575,153.837737,11.940589,12.845398,11.736729,13.266288,199.014,184.452,182.834,192.542,132.676,113.26,111.642,118.114,120.0,137.0,134.0,111.0,Debole Neve,Debole Neve,Debole Neve,Debole Neve,27.0,27.0,27.0,27.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,-4.0,2018-09-01 11:45:00,2018-09-01 11:30:00,2018-09-01 11:30:00,2018-09-01 11:15:00,2018-09-01 11:15:00,2018-09-01 11:00:00,2018-09-01 11:00:00,167.0585,165.203379,160.203865,156.791905,156.000092,169.014689,169.285687,159.81566,25.0,33.0
95,511,234,696,0,1,0,38,Regimazione_delle_acque,2018-09-04 14:30:00,2018-09-04 14:45:00,2018-09-04 15:00:00,2018-09-04 15:15:00,2018-09-04 15:30:00,2018-09-04 15:45:00,2018-09-04 16:00:00,2018-09-04 16:15:00,172.560984,169.511319,166.965154,162.584485,162.400226,171.227962,167.010508,166.029509,13.720444,26.292014,18.88538,16.017992,210.34,226.52,208.722,221.666,144.002,88.99,129.44,129.44,63.0,47.0,52.0,66.0,Quasi Sereno,Quasi Sereno,Quasi Sereno,Quasi Sereno,82.0,82.0,82.0,137.0,25.0,25.0,25.0,26.0,17.0,17.0,17.0,13.0,24.0,24.0,24.0,26.0,2018-09-04 15:15:00,2018-09-04 15:00:00,2018-09-04 15:00:00,2018-09-04 14:45:00,2018-09-04 14:45:00,2018-09-04 14:30:00,2018-09-04 14:30:00,152.128224,153.235759,149.764569,147.124637,131.058,115.172182,120.303059,109.5386,33.0,18.0


In [137]:
speeds[(speeds.KEY == 234) & (speeds.KM == 663) & (speeds.DATETIME_UTC == '2018-09-04 14:30:00')]

Unnamed: 0,KEY,DATETIME_UTC,KM,SPEED_AVG,SPEED_SD,SPEED_MIN,SPEED_MAX,N_VEHICLES,KEY_2
5170363,234.0,2018-09-04 14:30:00,663,147.124637,14.548297,111.642,194.16,157.0,234_663
