In [1]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
sns.set(style="white", color_codes=True)
sns.set_context(rc={'font.family': 'sans', 'font.size': 24, 'axes.titlesize':24, 'axes.labelsize':24})
#if you want to know current working dir
os.chdir('../..')

from src.utils import *
from src.utility import merge_speed_events
import src.data as data
import src.utility as utils
from src.preprocessing.other_features import avg_speed_for_roadtype_event

%matplotlib inline

In [37]:
df = data.base_dataset()
sensors = data.sensors().drop_duplicates().sort_values([KEY, KM])
speeds = pd.concat([data.speeds_original('train'), data.speeds_original('test'), data.speeds_original('test2')]).drop_duplicates()

In [4]:
df.columns

Index(['event_index', 'KEY', 'KM', 'EMERGENCY_LANE', 'LANES', 'ROAD_TYPE',
       'EVENT_DETAIL', 'EVENT_TYPE', 'DATETIME_UTC_-4', 'DATETIME_UTC_-3',
       'DATETIME_UTC_-2', 'DATETIME_UTC_-1', 'DATETIME_UTC_y_0',
       'DATETIME_UTC_y_1', 'DATETIME_UTC_y_2', 'DATETIME_UTC_y_3',
       'SPEED_AVG_-4', 'SPEED_AVG_-3', 'SPEED_AVG_-2', 'SPEED_AVG_-1',
       'SPEED_AVG_Y_0', 'SPEED_AVG_Y_1', 'SPEED_AVG_Y_2', 'SPEED_AVG_Y_3',
       'SPEED_SD_-4', 'SPEED_SD_-3', 'SPEED_SD_-2', 'SPEED_SD_-1',
       'SPEED_MAX_-4', 'SPEED_MAX_-3', 'SPEED_MAX_-2', 'SPEED_MAX_-1',
       'SPEED_MIN_-4', 'SPEED_MIN_-3', 'SPEED_MIN_-2', 'SPEED_MIN_-1',
       'N_VEHICLES_-4', 'N_VEHICLES_-3', 'N_VEHICLES_-2', 'N_VEHICLES_-1',
       'WEATHER_-4', 'WEATHER_-3', 'WEATHER_-2', 'WEATHER_-1', 'DISTANCE_-4',
       'DISTANCE_-3', 'DISTANCE_-2', 'DISTANCE_-1', 'TEMPERATURE_-4',
       'TEMPERATURE_-3', 'TEMPERATURE_-2', 'TEMPERATURE_-1',
       'MIN_TEMPERATURE_-4', 'MIN_TEMPERATURE_-3', 'MIN_TEMPERATURE_-2',
      

In [7]:
sensors.head(10)

Unnamed: 0,KEY,EMERGENCY_LANE,KM,LANES,ROAD_TYPE
926,0.0,1,333,2,1
1588,0.0,1,337,2,1
472,0.0,1,342,2,1
1771,0.0,1,343,2,1
283,0.0,1,357,2,1
1196,0.0,1,385,2,1
659,0.0,1,403,2,1
1396,0.0,1,407,2,1
1821,0.0,1,411,2,1
215,0.0,1,423,2,1


In [86]:
sensors['KM_BEFORE'] = sensors['KM'].shift(1)
sensors['KEY_BEFORE'] = sensors['KEY'].shift(1)
sensors['KM_AFTER'] = sensors['KM'].shift(-1)
sensors['KEY_AFTER'] = sensors['KEY'].shift(-1)

sensors.loc[sensors.KEY_AFTER != sensors.KEY, 'KM_AFTER'] = np.nan
sensors.loc[sensors.KEY_BEFORE != sensors.KEY, 'KM_BEFORE'] = np.nan

sensors.drop(['KEY_BEFORE', 'KEY_AFTER'], axis=1, inplace=True)
sensors = sensors[[KEY, KM, 'KM_BEFORE', 'KM_AFTER']]
sensors.head(30)

Unnamed: 0,KEY,KM,KM_BEFORE,KM_AFTER
926,0.0,333,,337.0
1588,0.0,337,333.0,342.0
472,0.0,342,337.0,343.0
1771,0.0,343,342.0,357.0
283,0.0,357,343.0,385.0
1196,0.0,385,357.0,403.0
659,0.0,403,385.0,407.0
1396,0.0,407,403.0,411.0
1821,0.0,411,407.0,423.0
215,0.0,423,411.0,443.0


In [91]:
merged = pd.merge(df, sensors, left_on=[KEY, KM], right_on=[KEY, KM])

In [60]:
merged.columns

Index(['event_index', 'KEY', 'KM', 'EMERGENCY_LANE', 'LANES', 'ROAD_TYPE',
       'EVENT_DETAIL', 'EVENT_TYPE', 'DATETIME_UTC_-4', 'DATETIME_UTC_-3',
       'DATETIME_UTC_-2', 'DATETIME_UTC_-1', 'DATETIME_UTC_y_0',
       'DATETIME_UTC_y_1', 'DATETIME_UTC_y_2', 'DATETIME_UTC_y_3',
       'SPEED_AVG_-4', 'SPEED_AVG_-3', 'SPEED_AVG_-2', 'SPEED_AVG_-1',
       'SPEED_AVG_Y_0', 'SPEED_AVG_Y_1', 'SPEED_AVG_Y_2', 'SPEED_AVG_Y_3',
       'SPEED_SD_-4', 'SPEED_SD_-3', 'SPEED_SD_-2', 'SPEED_SD_-1',
       'SPEED_MAX_-4', 'SPEED_MAX_-3', 'SPEED_MAX_-2', 'SPEED_MAX_-1',
       'SPEED_MIN_-4', 'SPEED_MIN_-3', 'SPEED_MIN_-2', 'SPEED_MIN_-1',
       'N_VEHICLES_-4', 'N_VEHICLES_-3', 'N_VEHICLES_-2', 'N_VEHICLES_-1',
       'WEATHER_-4', 'WEATHER_-3', 'WEATHER_-2', 'WEATHER_-1', 'DISTANCE_-4',
       'DISTANCE_-3', 'DISTANCE_-2', 'DISTANCE_-1', 'TEMPERATURE_-4',
       'TEMPERATURE_-3', 'TEMPERATURE_-2', 'TEMPERATURE_-1',
       'MIN_TEMPERATURE_-4', 'MIN_TEMPERATURE_-3', 'MIN_TEMPERATURE_-2',
      

In [92]:
for i in range(1, 5):
    speed_avg_before = 'SPEED_AVG_BEFORE_-' + str(i)
    speed_avg_after = 'SPEED_AVG_AFTER_-' + str(i)
    datetime = 'DATETIME_UTC_-' + str(i)
    

    speeds[speed_avg_before] = speeds[SPEED_AVG]
    speeds[speed_avg_after] = speeds[SPEED_AVG]
    merged = pd.merge(merged, speeds[[KEY, KM, DATETIME, speed_avg_before]], left_on=[KEY, 'KM_BEFORE', datetime], right_on=[KEY, KM, DATETIME], suffixes=('_x_-' + str(i), '_y_-' + str(i)))
    
    merged = pd.merge(merged, speeds[[KEY, KM, DATETIME, speed_avg_after]], left_on=[KEY, 'KM_AFTER', datetime], right_on=[KEY, KM, DATETIME], suffixes=('_x_-' + str(i), '_y_-' + str(i)))


merged.drop(columns=['KM', 'DATETIME_UTC_y_-3', 'KM_y_-3', 'DATETIME_UTC_y_-4',
                     'DATETIME_UTC_y_-2', 'KM_y_-2', 'DATETIME_UTC_y_-1', 'KM_x_-2',
                     'KM_y_-1', 'KM_x_-3',
                     'KM_x_-4', 'KM_y_-4', 'DATETIME_UTC_y_-4'], inplace=True)
merged.rename(columns={'KM_x_-1': 'KM',
                       'DATETIME_UTC_x_-4': 'DATETIME_UTC_-4',
                       'DATETIME_UTC_x_-3': 'DATETIME_UTC_-3',
                       'DATETIME_UTC_x_-2': 'DATETIME_UTC_-2',
                       'DATETIME_UTC_x_-1': 'DATETIME_UTC_-1'}, inplace=True)
merged['DELTA_BEFORE'] = merged[KM] - merged['KM_BEFORE']
merged['DELTA_AFTER'] = merged['KM_AFTER'] - merged[KM]

In [80]:
merged[[KM, 'KM_x']].head(3)

KeyError: "['KM_x'] not in index"

In [93]:
merged.columns

Index(['event_index', 'KEY', 'KM', 'EMERGENCY_LANE', 'LANES', 'ROAD_TYPE',
       'EVENT_DETAIL', 'EVENT_TYPE', 'DATETIME_UTC_-4', 'DATETIME_UTC_-3',
       'DATETIME_UTC_-2', 'DATETIME_UTC_-1', 'DATETIME_UTC_y_0',
       'DATETIME_UTC_y_1', 'DATETIME_UTC_y_2', 'DATETIME_UTC_y_3',
       'SPEED_AVG_-4', 'SPEED_AVG_-3', 'SPEED_AVG_-2', 'SPEED_AVG_-1',
       'SPEED_AVG_Y_0', 'SPEED_AVG_Y_1', 'SPEED_AVG_Y_2', 'SPEED_AVG_Y_3',
       'SPEED_SD_-4', 'SPEED_SD_-3', 'SPEED_SD_-2', 'SPEED_SD_-1',
       'SPEED_MAX_-4', 'SPEED_MAX_-3', 'SPEED_MAX_-2', 'SPEED_MAX_-1',
       'SPEED_MIN_-4', 'SPEED_MIN_-3', 'SPEED_MIN_-2', 'SPEED_MIN_-1',
       'N_VEHICLES_-4', 'N_VEHICLES_-3', 'N_VEHICLES_-2', 'N_VEHICLES_-1',
       'WEATHER_-4', 'WEATHER_-3', 'WEATHER_-2', 'WEATHER_-1', 'DISTANCE_-4',
       'DISTANCE_-3', 'DISTANCE_-2', 'DISTANCE_-1', 'TEMPERATURE_-4',
       'TEMPERATURE_-3', 'TEMPERATURE_-2', 'TEMPERATURE_-1',
       'MIN_TEMPERATURE_-4', 'MIN_TEMPERATURE_-3', 'MIN_TEMPERATURE_-2',
      

In [94]:
merged.head(3)

Unnamed: 0,event_index,KEY,KM,EMERGENCY_LANE,LANES,ROAD_TYPE,EVENT_DETAIL,EVENT_TYPE,DATETIME_UTC_-4,DATETIME_UTC_-3,...,SPEED_AVG_BEFORE_-2,SPEED_AVG_AFTER_-2,DATETIME_UTC_-3.1,SPEED_AVG_BEFORE_-3,SPEED_AVG_AFTER_-3,DATETIME_UTC_-4.1,SPEED_AVG_BEFORE_-4,SPEED_AVG_AFTER_-4,DELTA_BEFORE,DELTA_AFTER
0,116,525,300,0,2,0,5,Ostacolo_in_carreggiata,2018-09-04 07:15:00,2018-09-04 07:30:00,...,218.067176,247.819246,2018-09-04 07:30:00,224.399862,248.938775,2018-09-04 07:15:00,223.633488,243.045416,22.0,5.0
1,183,525,300,0,2,0,34,Segnaletica_verticale,2018-09-24 15:00:00,2018-09-24 15:15:00,...,212.442126,221.511905,2018-09-24 15:15:00,211.08209,227.570281,2018-09-24 15:00:00,206.397549,219.455493,22.0,5.0
2,212,525,300,0,2,0,6,Gestione_viabilita,2018-09-30 19:30:00,2018-09-30 19:45:00,...,215.172896,87.804721,2018-09-30 19:45:00,216.567354,84.925738,2018-09-30 19:30:00,214.499183,82.108627,22.0,5.0


In [100]:
to_keep_1 = ['DATETIME_UTC_-' + str(k) for k in range(1, 5)]
to_keep_2 = ['SPEED_AVG_BEFORE_-' + str(k) for k in range(1, 5)]
to_keep_3 = ['SPEED_AVG_AFTER_-' + str(k) for k in range(1, 5)]
to_keep_4 = ['DELTA_BEFORE', 'DELTA_AFTER']
to_keep = [KEY, KM, *to_keep_1, *to_keep_2, *to_keep_3, *to_keep_4]
to_keep

['KEY',
 'KM',
 'DATETIME_UTC_-1',
 'DATETIME_UTC_-2',
 'DATETIME_UTC_-3',
 'DATETIME_UTC_-4',
 'SPEED_AVG_BEFORE_-1',
 'SPEED_AVG_BEFORE_-2',
 'SPEED_AVG_BEFORE_-3',
 'SPEED_AVG_BEFORE_-4',
 'SPEED_AVG_AFTER_-1',
 'SPEED_AVG_AFTER_-2',
 'SPEED_AVG_AFTER_-3',
 'SPEED_AVG_AFTER_-4',
 'DELTA_BEFORE',
 'DELTA_AFTER']

In [102]:
merged = merged[to_keep]
merged.head(3)

Unnamed: 0,KEY,KM,DATETIME_UTC_-1,DATETIME_UTC_-1.1,DATETIME_UTC_-2,DATETIME_UTC_-2.1,DATETIME_UTC_-3,DATETIME_UTC_-3.1,DATETIME_UTC_-4,DATETIME_UTC_-4.1,SPEED_AVG_BEFORE_-1,SPEED_AVG_BEFORE_-2,SPEED_AVG_BEFORE_-3,SPEED_AVG_BEFORE_-4,SPEED_AVG_AFTER_-1,SPEED_AVG_AFTER_-2,SPEED_AVG_AFTER_-3,SPEED_AVG_AFTER_-4,DELTA_BEFORE,DELTA_AFTER
0,525,300,2018-09-04 08:00:00,2018-09-04 08:00:00,2018-09-04 07:45:00,2018-09-04 07:45:00,2018-09-04 07:30:00,2018-09-04 07:30:00,2018-09-04 07:15:00,2018-09-04 07:15:00,226.159361,218.067176,224.399862,223.633488,248.404948,247.819246,248.938775,243.045416,22.0,5.0
1,525,300,2018-09-24 15:45:00,2018-09-24 15:45:00,2018-09-24 15:30:00,2018-09-24 15:30:00,2018-09-24 15:15:00,2018-09-24 15:15:00,2018-09-24 15:00:00,2018-09-24 15:00:00,213.37205,212.442126,211.08209,206.397549,226.651698,221.511905,227.570281,219.455493,22.0,5.0
2,525,300,2018-09-30 20:15:00,2018-09-30 20:15:00,2018-09-30 20:00:00,2018-09-30 20:00:00,2018-09-30 19:45:00,2018-09-30 19:45:00,2018-09-30 19:30:00,2018-09-30 19:30:00,222.587884,215.172896,216.567354,214.499183,92.977894,87.804721,84.925738,82.108627,22.0,5.0
