### A Multivariate Time Series Guide to Forecasting and Modeling

In [123]:
#import required packages
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#read the data
df = pd.read_csv("AirQualityUCI.csv",parse_dates=[['Date', 'Time']])

#check the dtypes
df.dtypes

Date_Time        datetime64[ns]
CO(GT)                  float64
PT08.S1(CO)               int64
NMHC(GT)                  int64
C6H6(GT)                float64
PT08.S2(NMHC)             int64
NOx(GT)                   int64
PT08.S3(NOx)              int64
NO2(GT)                   int64
PT08.S4(NO2)              int64
PT08.S5(O3)               int64
T                       float64
RH                      float64
AH                      float64
dtype: object

In [124]:
df

Unnamed: 0,Date_Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-10-03 18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578
1,2004-10-03 19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255
2,2004-10-03 20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502
3,2004-10-03 21:00:00,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867
4,2004-10-03 22:00:00,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,2005-04-04 10:00:00,3.1,1314,-200,13.5,1101,472,539,190,1374,1729,21.9,29.3,0.7568
9353,2005-04-04 11:00:00,2.4,1163,-200,11.4,1027,353,604,179,1264,1269,24.3,23.7,0.7119
9354,2005-04-04 12:00:00,2.4,1142,-200,12.4,1063,293,603,175,1241,1092,26.9,18.3,0.6406
9355,2005-04-04 13:00:00,2.1,1003,-200,9.5,961,235,702,156,1041,770,28.3,13.5,0.5139


In [125]:
df['Date_Time'] = pd.to_datetime(df.Date_Time , format = '%d/%m/%Y %H.%M.%S')
data = df.drop(['Date_Time'], axis=1)
data.index = df.Date_Time

In [126]:
#missing value treatment
cols = data.columns
for j in cols:
    for i in range(0,len(data)):
       if data[j][i] == -200:
           data[j][i] = data[j][i-1]

#checking stationarity
from statsmodels.tsa.vector_ar.vecm import coint_johansen
#since the test works for only 12 variables, I have randomly dropped
#in the next iteration, I would drop another and check the eigenvalues
johan_test_temp = data.drop([ 'CO(GT)'], axis=1)

coint_johansen(johan_test_temp,-1,1).eig

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


array([1.75510896e-01, 1.52389933e-01, 1.15120416e-01, 1.04126281e-01,
       9.29485509e-02, 6.89397159e-02, 5.77070988e-02, 3.43554214e-02,
       3.05980659e-02, 1.18697142e-02, 2.46766099e-03, 7.09584856e-05])

In [127]:
#creating the train and validation set
train = data[:int(0.8*(len(data)))]
valid = data[int(0.8*(len(data))):]

#fit the model
from statsmodels.tsa.vector_ar.var_model import VAR

model = VAR(endog=train)
model_fit = model.fit()

# make prediction on validation
prediction = model_fit.forecast(model_fit.y, steps=len(valid))

  obj = getattr(results, attr)


In [139]:
#converting predictions to dataframe
import math
from sklearn.metrics import mean_squared_error
pred = pd.DataFrame(index=range(0,len(prediction)),columns=cols)
for j in range(0,13):
    for i in range(0, len(prediction)):
        pred.iloc[i][j] = prediction[i][j]

#check rmse
for i in cols:
    print('rmse value for', i, 'is : ', math.sqrt(mean_squared_error(pred[i], valid[i])))

rmse value for CO(GT) is :  1.408688883687367
rmse value for PT08.S1(CO) is :  205.89558284022903
rmse value for NMHC(GT) is :  6.673548711348792
rmse value for C6H6(GT) is :  7.13008724870588
rmse value for PT08.S2(NMHC) is :  277.8484437680197
rmse value for NOx(GT) is :  214.7832234091323
rmse value for PT08.S3(NOx) is :  244.95769661937948
rmse value for NO2(GT) is :  66.69695211710147
rmse value for PT08.S4(NO2) is :  490.08388934114504
rmse value for PT08.S5(O3) is :  446.51541648882096
rmse value for T is :  10.721325795600743
rmse value for RH is :  17.11167624817302
rmse value for AH is :  0.5216247245185525


In [141]:
#make final predictions
model = VAR(endog=data)
model_fit = model.fit()
yhat = model_fit.forecast(model_fit.y, steps=5)
print(yhat)

[[2.34596328e+00 1.08633212e+03 2.80762173e+02 1.24130779e+01
  1.05535947e+03 2.80882233e+02 6.59534851e+02 1.68444418e+02
  1.15918056e+03 8.50845529e+02 2.73639014e+01 1.55311062e+01
  5.15317053e-01]
 [2.45308371e+00 1.09701970e+03 2.84860181e+02 1.26808042e+01
  1.05798069e+03 2.93265167e+02 6.66191599e+02 1.68520527e+02
  1.18047398e+03 8.80226856e+02 2.63058958e+01 1.77966232e+01
  5.27377242e-01]
 [2.52881822e+00 1.10419777e+03 2.87641649e+02 1.27763638e+01
  1.05651409e+03 3.02654521e+02 6.73675534e+02 1.68229199e+02
  1.19528761e+03 9.04734692e+02 2.53214196e+01 1.99101731e+01
  5.38878549e-01]
 [2.57938356e+00 1.10874388e+03 2.89392031e+02 1.27529209e+01
  1.05219269e+03 3.09532654e+02 6.81729036e+02 1.67598574e+02
  1.20540194e+03 9.24982666e+02 2.44055266e+01 2.18844523e+01
  5.49769526e-01]
 [2.60995661e+00 1.11133845e+03 2.90344425e+02 1.26491506e+01
  1.04594002e+03 3.14342095e+02 6.90130990e+02 1.66671502e+02
  1.21213722e+03 9.41571389e+02 2.35532502e+01 2.37310011e+0



In [142]:
pd.DataFrame(yhat, columns=col)

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2.345963,1086.332115,280.762173,12.413078,1055.359467,280.882233,659.534851,168.444418,1159.180558,850.845529,27.363901,15.531106,0.515317
1,2.453084,1097.019702,284.860181,12.680804,1057.980688,293.265167,666.191599,168.520527,1180.47398,880.226856,26.305896,17.796623,0.527377
2,2.528818,1104.197767,287.641649,12.776364,1056.51409,302.654521,673.675534,168.229199,1195.287607,904.734692,25.32142,19.910173,0.538879
3,2.579384,1108.743882,289.392031,12.752921,1052.192689,309.532654,681.729036,167.598574,1205.401944,924.982666,24.405527,21.884452,0.54977
4,2.609957,1111.338453,290.344425,12.649151,1045.940017,314.342095,690.13099,166.671502,1212.137217,941.571389,23.55325,23.731001,0.560033
