# Library Imports

### giotto-time

In [1]:
# Feature creation
from giottotime.feature_creation import CalendarFeature
from giottotime.feature_creation import DetrendedFeature
from giottotime.feature_creation import PeriodicSeasonalFeature
from giottotime.feature_creation import ShiftFeature, MovingAverageFeature, ExogenousFeature
from giottotime.feature_creation import FeatureCreation

# Causality testing
from giottotime.causality_tests.shifted_pearson_correlation import ShiftedPearsonCorrelation
from giottotime.causality_tests.shifted_linear_coefficient import ShiftedLinearCoefficient

# Models
from giottotime.models.time_series_models import GAR
from giottotime.model_selection import FeatureSplitter

# Detrending
from giottotime.models import PolynomialTrend
from giottotime.models import ExponentialTrend

### Other imports

In [106]:
# Data handling
import pandas as pd
import numpy as np

# Scikit-learn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Plotting
import matplotlib.pyplot as plt
from src.plotting import plot_time_series
from src.tests import dickey_fuller_test

# Statsmodels
import statsmodels.api as sm

# Import Data

In [3]:
# Load the raw accidents table (path is relative to the kernel's working directory).
data = pd.read_csv('data/raw/df_accidents.csv')
# Parse the whole 'date' column in one vectorized call instead of invoking
# pd.to_datetime once per row inside a Python loop.
data['date'] = pd.to_datetime(data['date'])

In [4]:
# Move 'date' into the index only while it is still a column, so that
# re-running this cell does not raise a KeyError looking for a column that an
# earlier run already consumed.
if 'date' in data.columns:
    data = data.set_index('date')

# Single-column frame with just the accident counts, sharing the date index.
# .copy() keeps it independent of later changes to `data`.
time_series = data[['number of accidents']].copy()
time_series.head()

Unnamed: 0_level_0,number of accidents
date,Unnamed: 1_level_1
2014-01-01,608
2014-01-02,1702
2014-01-03,1371
2014-01-04,903
2014-01-05,775


# Detrending

In [5]:
# Fit a polynomial trend of order 3 to the temperature column.
# The value returned by fit() is the cell's displayed output.
polynomial_trend = PolynomialTrend(order=3)
polynomial_trend.fit(data['Temperature'])

PolynomialTrend(3, <function mean_squared_error at 0x13d070320>, BFGS)

In [6]:
# Overwrite the temperature column with the output of the fitted trend's transform.
# NOTE(review): the raw values are replaced in place, so running this cell a
# second time applies the transform to already-transformed data.
data['Temperature'] = polynomial_trend.transform(data['Temperature'])

In [105]:
# Plot temperature and accident counts together.
# The axis titles are listed in the same order as `y_columns` and `names`
# (temperature first, accidents second); previously they were reversed, which
# would label each axis with the other series' name.
# NOTE(review): assumes plot_time_series pairs y_axis_titles with y_columns by
# position — confirm against src.plotting.
plot_time_series(data,
                 y_columns=['Temperature', 'number of accidents'],
                 names=['Temp', 'accidents'],
                 title='Accidents',
                 y_axis_titles=['temp', 'acc'])

# Causality Testing

In [9]:
# Fit the shifted linear-coefficient test on the three series of interest,
# then display the fitted `max_corrs_` table.
cause = ShiftedLinearCoefficient(target_col="Temperature")
cause.fit(data.loc[:, ['number of accidents', 'Temperature', 'windspeed']])
cause.max_corrs_

y,Temperature,number of accidents,windspeed
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Temperature,1.021058,1.517919,0.023959
number of accidents,0.01807,0.316543,0.00071
windspeed,-1.278742,5.418223,0.413698


In [10]:
# Display the fitted test's `best_shifts_` table (one entry per x/y pair of series).
cause.best_shifts_

y,Temperature,number of accidents,windspeed
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Temperature,1,9,4
number of accidents,7,7,1
windspeed,8,1,1


In [11]:
# Apply the fitted test's transform to the table and preview the first 10 rows.
# NOTE(review): fit() was given an explicit three-column selection while
# transform() receives all of `data` — confirm the two are meant to differ.
cause.transform(data).head(10)

Unnamed: 0_level_0,number of accidents,Temperature,windspeed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-01-01,,-95.905985,
2014-01-02,,-93.529539,
2014-01-03,,-99.696935,
2014-01-04,,-99.239679,
2014-01-05,,-92.161693,2.166667
2014-01-06,,-79.158588,3.291667
2014-01-07,,-94.744516,3.583333
2014-01-08,,-93.634691,1.708333
2014-01-09,,-85.738826,0.958333
2014-01-10,608.0,-81.590094,2.208333


# Feature Creation

In [14]:
# Exogenous features taken directly from the temperature and windspeed columns.
temperature_feature = ExogenousFeature(data['Temperature'], output_name='temperature')
windspeed_feature = ExogenousFeature(data['windspeed'], output_name='windspeed')

# Seasonal feature anchored at the earliest timestamp of the target series.
# NOTE(review): no period is passed even though the feature is named 'weekly' —
# confirm that the library default is the intended period.
weekly_period = PeriodicSeasonalFeature(
    start_date=time_series.index.min(),
    output_name='weekly',
)

# Bundle all features, with `horizon` set to 20.
features_creation = FeatureCreation(
    horizon=20,
    time_series_features=[temperature_feature, windspeed_feature, weekly_period],
)

# fit_transform on the accident series yields the (X, y) pair used for modelling.
X, y = features_creation.fit_transform(time_series)

Float64Index([                  0.0, 0.0027397260273972603,
               0.005479452054794521,   0.00821917808219178,
               0.010958904109589041,    0.0136986301369863,
                0.01643835616438356,  0.019178082191780823,
               0.021917808219178082,  0.024657534246575342,
              ...
                 2.9753424657534246,     2.978082191780822,
                 2.9808219178082194,    2.9835616438356163,
                 2.9863013698630136,     2.989041095890411,
                 2.9917808219178084,    2.9945205479452053,
                 2.9972602739726026,                   3.0],
             dtype='float64', name='date', length=1096)


# Prediction

In [15]:
# Split the feature/target pair into train and test parts.
train_test_splitter = FeatureSplitter()
X_train, y_train, X_test, y_test = train_test_splitter.transform(X, y)

# Pin the forest's random_state so the fitted model — and therefore the
# predictions displayed by this cell — are identical on every
# Restart & Run All instead of changing with each run.
time_series_model = GAR(base_model=RandomForestRegressor(random_state=42))
time_series_model.fit(X_train, y_train)
predictions = time_series_model.predict(X_test)
predictions

Unnamed: 0_level_0,y_1,y_2,y_3,y_4,y_5,y_6,y_7,y_8,y_9,y_10,y_11,y_12,y_13,y_14,y_15,y_16,y_17,y_18,y_19,y_20
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2016-12-12,1064.92,931.04,1011.93,1133.85,1104.42,1141.51,1063.77,1060.52,1053.18,835.64,886.52,858.05,890.1,631.34,778.53,741.05,612.79,781.51,790.67,732.24
2016-12-13,1065.4,916.33,937.86,1133.78,1098.83,1101.64,1123.14,1076.77,1026.48,833.59,891.94,906.89,878.97,631.34,776.26,747.91,602.94,768.54,786.92,752.92
2016-12-14,1064.45,924.91,929.93,1133.23,1098.09,1101.64,1124.61,1081.59,1022.43,819.54,891.69,897.83,886.81,637.75,772.13,740.43,605.04,773.04,839.53,720.05
2016-12-15,938.53,912.36,1006.54,1040.04,1150.79,1185.65,1119.15,1082.6,872.28,787.8,902.32,895.27,935.95,752.35,780.55,636.55,690.72,801.77,800.94,885.72
2016-12-16,1004.75,922.34,950.29,1045.66,1150.76,1134.98,1119.02,1063.56,910.59,814.7,895.74,874.99,941.67,691.06,742.71,700.03,675.23,732.42,844.97,810.37
2016-12-17,1059.12,930.06,998.25,1092.38,1098.9,1127.87,1071.63,1065.52,1047.76,840.27,877.76,863.01,886.83,647.92,790.37,779.91,633.55,785.53,766.15,761.62
2016-12-18,999.56,914.55,950.29,1045.81,1150.76,1120.74,1118.16,1063.56,910.59,821.29,898.82,869.98,931.61,688.01,767.62,716.88,651.4,737.92,844.6,775.54
2016-12-19,1000.89,950.13,954.63,1029.76,1147.14,1119.94,1117.74,1063.56,911.73,830.59,884.74,868.72,932.7,739.24,750.0,742.92,652.01,727.51,847.24,764.48
2016-12-20,1063.82,930.33,993.29,1098.02,1124.1,1125.68,1049.25,1059.1,1008.9,835.61,899.17,841.42,875.52,666.67,791.32,743.12,671.22,864.13,772.84,709.33
2016-12-21,1048.38,924.57,883.73,1089.34,1096.67,1091.1,1129.38,1074.98,1018.32,837.22,883.62,925.58,875.17,674.88,760.82,728.61,629.42,815.25,780.55,725.7


In [16]:
# TODO: score the forecasts (open question: how should `predictions` be compared with `y_test`?)
# mean_absolute_error(y_test, predictions)

# Comparison

In [27]:
# Chronological split: the first 70% of the rows train the ARMA model and the
# remaining rows are kept for evaluation.
ratio = 0.7
idx = int(ratio * len(time_series))
y_train_arma, y_test_arma = time_series.iloc[:idx], time_series.iloc[idx:]

# ARMA of order (5, 0) fitted with daily frequency.
arma_model = sm.tsa.ARMA(y_train_arma, order=(5, 0), freq='D').fit()

# Predict over the date range spanned by the evaluation rows.
predictions = arma_model.predict(
    start=y_test_arma.index.min(),
    end=y_test_arma.index.max(),
)

In [68]:
# Put forecasts and held-out observations side by side, matched by position,
# on the forecast's index. Building the frame from a dict keeps each column's
# own dtype (rather than routing both through a transposed two-row frame) and
# raises if the two arrays differ in length instead of silently padding.
results = pd.DataFrame(
    {
        'pred': predictions.values,
        'y_test': y_test_arma.values.flatten(),
    },
    index=predictions.index,
)

# Preview only the first rows rather than dumping the whole frame.
results.head()

In [70]:
# Overlay the ARMA forecasts and the held-out observations.
plot_time_series(
    results,
    y_columns=['pred', 'y_test'],
    names=['predict', 'reference'],
)