In [None]:
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import os
import numpy as np

# Switch the working directory two levels up before the project-local imports
# that follow; presumably this is the repository root containing `src` — TODO confirm.
# The path is relative, so running this cell a second time climbs two more levels.
os.chdir('../..')

from src.utils import *
from src.utility import merge_speed_events

In [None]:
# Read the three input tables in one pass: speeds, events, sensors.
# Every file is located through resources_path('dataset', <folder>, <filename>).
speeds_df, events_df, sensors_df = (
    pd.read_csv(resources_path('dataset', folder, filename))
    for folder, filename in (
        ('preprocessed', 'speeds_train_imputed_time.csv.gz'),
        ('preprocessed', 'events_train.csv.gz'),
        ('originals', 'sensors.csv.gz'),
    )
)
# Row/column count of the speeds table as loaded.
speeds_df.shape

In [None]:
# Discard every speeds row that contains at least one missing value.
speeds_df = speeds_df.dropna()
# Shape after the drop, for comparison with the shape shown at load time.
speeds_df.shape

In [None]:
# Combine the speeds and events tables with the project helper imported from
# src.utility; its join logic is not visible in this notebook.
speeds_events_df = merge_speed_events(speeds_df, events_df)
# Shape of the combined table.
speeds_events_df.shape

In [None]:
# Left-join the sensor attributes on (KEY, KM) and keep only the columns
# needed by the following cells, in this order.
speeds_events_df = speeds_events_df.merge(
    sensors_df,
    how='left',
    left_on=[KEY, KM],
    right_on=[KEY, KM],
)[
    [
        KEY, KM, DATETIME, SPEED_AVG, N_CARS, 'IMPUTED',
        EVENT_TYPE, KM_END, KM_START, 'KM_EVENT', 'step_duration',
        ROAD_TYPE, LANES, 'EMERGENCY_LANE',
    ]
]
speeds_events_df.head(4)

In [None]:
# Rows whose KM_EVENT is missing get the midpoint of [KM_START, KM_END];
# rows that already have a KM_EVENT are left untouched.
idx = speeds_events_df['KM_EVENT'].isnull()
speeds_events_df['KM_EVENT'] = speeds_events_df['KM_EVENT'].mask(
    idx,
    speeds_events_df['KM_START']
    + (speeds_events_df['KM_END'] - speeds_events_df['KM_START']) / 2,
)
speeds_events_df.head(4)

In [None]:
# DISTANCE is the signed difference KM_EVENT - KM for each row. Once it is
# computed, the KM_* helper columns and the sensor attributes are dropped by
# selecting only the columns listed below.
speeds_events_df = speeds_events_df.assign(
    DISTANCE=speeds_events_df['KM_EVENT'] - speeds_events_df[KM]
)[
    [
        KEY, KM, DATETIME, SPEED_AVG, N_CARS, 'IMPUTED',
        EVENT_TYPE, 'DISTANCE', 'step_duration',
    ]
]
speeds_events_df.head(4)

In [None]:
# Earliest timestamp in DATETIME_UTC as an integer.
# 'int64' is spelled out on purpose: plain 'int' maps to a platform-dependent
# width and may be 32-bit, which cannot hold epoch timestamps at sub-second
# resolution.
min_time = pd.to_datetime(speeds_events_df.DATETIME_UTC).astype('int64').min()
# Convert to whole minutes; the 10**9 divisor assumes nanosecond resolution.
min_time = (min_time // (10**9)) // 60
# Disabled step: re-express DATETIME as a count of 15-minute slots since min_time.
# speeds_events_df[DATETIME] = ((pd.to_datetime(speeds_events_df.DATETIME_UTC).astype('int64')//(10**9))//60 - min_time)//15
speeds_events_df.head(4)

In [None]:
# Rows without a matching event carry NaN in the event columns; give them
# neutral values so they survive the later one-hot encoding and groupby.
speeds_events_df[EVENT_TYPE] = speeds_events_df.EVENT_TYPE.fillna('no_event')
# The column is addressed by its literal name, as in the cell that creates it;
# a bare DISTANCE identifier is not defined anywhere in this notebook.
speeds_events_df['DISTANCE'] = speeds_events_df['DISTANCE'].fillna(0)
speeds_events_df['step_duration'] = speeds_events_df.step_duration.fillna(0)
# Preview the earliest rows by DATETIME (the sort is for display only; it is not stored).
speeds_events_df.sort_values(DATETIME).head()

In [None]:
# Append one indicator column per distinct EVENT_TYPE value to the table.
speeds_events_onehot_df = pd.concat(
    [speeds_events_df, pd.get_dummies(speeds_events_df[EVENT_TYPE])],
    axis='columns',
)

In [None]:
# The categorical column is redundant now that its indicator columns exist.
speeds_events_onehot_df = speeds_events_onehot_df.drop(columns='EVENT_TYPE')
speeds_events_onehot_df.head(4)

In [None]:
# Collapse rows that share every non-indicator column, summing the event
# indicator columns so each group keeps a count per event type.
final_df = (
    speeds_events_onehot_df
    .groupby([KEY, KM, DATETIME, SPEED_AVG, N_CARS, 'IMPUTED', 'DISTANCE', 'step_duration'])
    .sum()
    .reset_index()
)
# Preview rows with a positive DISTANCE. The column is addressed by its literal
# name, matching the groupby keys above, rather than by a bare DISTANCE identifier.
final_df[final_df['DISTANCE'] > 0.0].head(10)

In [None]:
# Keep only the first row for each (KEY, KM, DATETIME) combination.
final_df = final_df.drop_duplicates(subset=[KEY, KM, DATETIME])
final_df.shape

In [None]:
# Order the rows by (KEY, KM, DATETIME) before persisting.
final_df = final_df.sort_values(by=[KEY, KM, DATETIME])
# Strip any column whose name starts with 'Unnamed'.
final_df = final_df.loc[:, ~final_df.columns.str.contains('^Unnamed')]
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra nameless first column — confirm that readers expect it.
final_df.to_csv(resources_path('dataset', 'training', 'train_2.csv.gz'))

## Fitting some models

In [None]:
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [None]:
# Reload the training table saved earlier in the notebook, plus the sensors table.
df, sensors_df = (
    pd.read_csv(resources_path('dataset', folder, filename))
    for folder, filename in (
        ('training', 'train_2.csv.gz'),
        ('originals', 'sensors.csv.gz'),
    )
)
df.head(5)

In [None]:
# Show the first five rows of the sensors table.
sensors_df.head(5)

In [None]:
# Restrict the data to a single (KEY, KM) pair: KEY 0 at KM 333.
df_1 = df[df[KEY].eq(0) & df[KM].eq(333)]
# Preview its earliest rows by DATETIME (the sort is not stored).
df_1.sort_values(DATETIME).head(5)

In [None]:
# Number of rows and columns available for the selected (KEY, KM) pair.
df_1.shape

In [None]:
# Target series: SPEED_AVG indexed by DATETIME.
y = df_1[[DATETIME, SPEED_AVG]].set_index(DATETIME)
# Exogenous input: N_CARS indexed by DATETIME. It is built the same way as `y`
# so that no in-place set_index runs on a slice of df_1. The earlier wider
# candidate (all columns except KEY, KM, SPEED_AVG, 'IMPUTED') was overwritten
# before any use and has been removed.
u = df_1[[DATETIME, N_CARS]].set_index(DATETIME)
u.head(10)

In [None]:
# Show the first rows of the target series.
y.head()

In [None]:
# Split target and exogenous input at row 7000: the first 7000 rows train,
# the remainder is held out for testing.
u = np.array(u)
print(y.shape)
print(u.shape)
y_train = y[0:7000]
y_test = y[7000:]
# Take the held-out part of `u` BEFORE truncating `u` to the training rows;
# slicing after the truncation always produced an empty u_test.
u_test = u[7000:]
u = u[0:7000]
# Note: `u` is rebound to its training slice, so re-running this cell without
# first re-running the cell that builds `u` splits the already truncated array.

In [None]:
# Earlier attempt with an exogenous input, kept for reference:
# armax = sm.tsa.ARMA(y_train, exog=u, order=(16, 8))
# Seasonal ARIMA on the training target with a constant trend and a seasonal
# period of 96 steps; stationarity and invertibility are not enforced.
sarima = sm.tsa.SARIMAX(
    y_train,
    order=(8, 2, 4),
    seasonal_order=(1, 1, 1, 96),
    trend='c',
    enforce_stationarity=False,
    enforce_invertibility=False,
)

In [None]:
# Fit the model. maxiter=1 limits the optimiser's iteration budget to one, and
# the callback prints a marker each time the optimiser invokes it.
results_model = sarima.fit(
    maxiter=1,
    disp=True,
    callback=lambda _params: print('End of iteration'),
)

In [None]:
# Display the fitted model's summary report.
results_model.summary()

In [None]:
# Wrap the fit residuals in a DataFrame and draw them as a line chart.
residuals = pd.DataFrame(data=results_model.resid)
residuals.plot.line(figsize=(20, 10))
plt.show()

In [None]:
# Kernel density estimate of the residuals; the trailing ';' hides the Axes repr.
residuals.plot.kde(figsize=(20, 10));

In [None]:
# Summary statistics of the residuals.
residuals.describe()

In [None]:
# Forecast as many steps ahead as there are rows in the test target and keep
# the result as a NumPy array.
predicted = np.array(results_model.forecast(steps=len(y_test)))

In [None]:
# Display the forecast values.
predicted

In [None]:
# Convert the test target to a NumPy array. This rebinds y_test, so the
# DATETIME index is no longer attached from here on.
y_test = np.array(y_test)

In [None]:
# Display the test target values.
y_test

In [None]:
# Overlay the first 100 held-out observations (default colour) with the first
# 100 forecast values (red) on one wide figure.
plt.figure(figsize=(20, 10))
plt.plot(y_test[:100])
plt.plot(predicted[:100], color='red')
plt.show()