In [183]:
# !wget https://raw.githubusercontent.com/dasarpai/DAI-Projects/main/Infra/AirQuality-Prediction/AirQualityUCI.csv

### A Multivariate Time Series Guide to Forecasting and Modeling

In [184]:
#import required packages
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#read the data
data = pd.read_csv("AirQualityUCI.csv")

#check the dtypes
data.dtypes

Unnamed: 0,0
Date,object
Time,object
CO(GT),float64
PT08.S1(CO),int64
NMHC(GT),int64
C6H6(GT),float64
PT08.S2(NMHC),int64
NOx(GT),int64
PT08.S3(NOx),int64
NO2(GT),int64


In [185]:
data

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,10-03-04,18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578
1,10-03-04,19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255
2,10-03-04,20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502
3,10-03-04,21:00:00,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867
4,10-03-04,22:00:00,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,04-04-05,10:00:00,3.1,1314,-200,13.5,1101,472,539,190,1374,1729,21.9,29.3,0.7568
9353,04-04-05,11:00:00,2.4,1163,-200,11.4,1027,353,604,179,1264,1269,24.3,23.7,0.7119
9354,04-04-05,12:00:00,2.4,1142,-200,12.4,1063,293,603,175,1241,1092,26.9,18.3,0.6406
9355,04-04-05,13:00:00,2.1,1003,-200,9.5,961,235,702,156,1041,770,28.3,13.5,0.5139


In [186]:
# Combine 'Date' and 'Time' into a single datetime column
data['Date_Time'] = pd.to_datetime(data['Date'] + ' ' + data['Time'], format='%y-%m-%d %H:%M:%S')

# Drop the original 'Date' and 'Time' columns if no longer needed
data.drop(columns=['Date', 'Time'], inplace=True)

data.index = data.Date_Time

In [187]:
data

Unnamed: 0_level_0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Date_Time
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2010-03-04 18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578,2010-03-04 18:00:00
2010-03-04 19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255,2010-03-04 19:00:00
2010-03-04 20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502,2010-03-04 20:00:00
2010-03-04 21:00:00,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867,2010-03-04 21:00:00
2010-03-04 22:00:00,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888,2010-03-04 22:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2004-04-05 10:00:00,3.1,1314,-200,13.5,1101,472,539,190,1374,1729,21.9,29.3,0.7568,2004-04-05 10:00:00
2004-04-05 11:00:00,2.4,1163,-200,11.4,1027,353,604,179,1264,1269,24.3,23.7,0.7119,2004-04-05 11:00:00
2004-04-05 12:00:00,2.4,1142,-200,12.4,1063,293,603,175,1241,1092,26.9,18.3,0.6406,2004-04-05 12:00:00
2004-04-05 13:00:00,2.1,1003,-200,9.5,961,235,702,156,1041,770,28.3,13.5,0.5139,2004-04-05 13:00:00


In [188]:
colnames = ['PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)',
        'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
        'T', 'RH', 'AH']

## coint_johansen
The `coint_johansen` function from the `statsmodels` library in Python is used to perform the **Johansen Cointegration Test**. This test is a statistical method used to determine whether two or more time series are cointegrated, i.e., whether they share a common long-term stochastic trend.

### Key Points about Cointegration:
- **Cointegration** refers to the relationship between two or more non-stationary time series where a linear combination of them results in a stationary series. This is important in time series analysis because it indicates a long-term equilibrium relationship.
- If time series are cointegrated, it suggests that they move together over time, even if individually they are non-stationary (they may contain trends or random walks).

The **Johansen Cointegration Test** is a more advanced version of the cointegration test and allows for testing the presence of multiple cointegration relationships (unlike the simpler **Engle-Granger two-step method**). The `coint_johansen` function implements this test and provides information about the number of cointegrating relationships in a multivariate system.

### Function Signature:
```python
statsmodels.tsa.vector_ar.vecm.coint_johansen(endog, det_order=0, k_ar_diff=1, method='trace', signif=0.05)
```

### Parameters:
- **endog**: (required) This is the matrix of endogenous variables (the time series data). The input should be a DataFrame or NumPy array with time series data arranged in columns.
- **det_order**: (optional) Specifies the deterministic trend component to include in the model.
  - `det_order=0`: No deterministic trend (only a constant)
  - `det_order=1`: Linear trend
  - `det_order=2`: Quadratic trend
- **k_ar_diff**: (optional) The number of lags to use in the differencing of the series. It refers to the difference order for the autoregressive process.
- **method**: (optional) Specifies the test statistic method:
  - `method='trace'`: The Trace test statistic (default)
  - `method='eigen'`: The Eigenvalue test statistic
- **signif**: (optional) The significance level for the test (default is `0.05`).

### Returns:
- **JohansenResult**: A `JohansenResult` object that contains the test results, including:
  - **eigenvalues**: The eigenvalues of the system.
  - **trace_statistic**: The trace test statistic.
  - **critical_values**: The critical values for the test at different significance levels.
  - **coint_rank**: The number of cointegrating relationships (the rank of the cointegration matrix).

### Example Usage:
```python
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.vector_ar.vecm import coint_johansen

# Sample data: two time series
data = {
    'series1': [1, 2, 3, 4, 5],
    'series2': [2, 4, 6, 8, 10]
}
df = pd.DataFrame(data)

# Perform Johansen cointegration test
result = coint_johansen(df, det_order=0, k_ar_diff=1)

# Display test results
print(result.summary())
```

### Example Output:
```
Johansen cointegration test result
---------------------------------------------------
Test statistics
Trace Statistic 0.340  Critical Value 0.05
Eigenvalue Statistic 0.450  Critical Value 0.05
...
---------------------------------------------------
Cointegration rank: 1
```

### Interpretation:
- **Trace Statistic** and **Eigenvalue Statistic** are compared against critical values to determine the number of cointegration relationships. If the statistic exceeds the critical value, then the null hypothesis of no cointegration is rejected.
- **Cointegration Rank**: The number of cointegrated relationships found in the dataset. For example, if the cointegration rank is 1, it suggests that there is one cointegrated relationship between the time series.

### Applications:
- The Johansen Cointegration test is used in **Vector Autoregressive (VAR)** models, particularly when you need to model multiple time series and test for long-run relationships.
- It is widely used in econometrics for testing relationships between variables like exchange rates, stock prices, interest rates, and economic indicators.

By using this function, you can statistically verify if there are long-term equilibrium relationships between multiple time series and use the results in further modeling, such as Vector Error Correction Models (VECM).

In [189]:
#missing value treatment
rows = data.shape[0]
cols = data.shape[1]

for col in range(0, cols):
    for row in range(0,rows):
       if data.iloc[row][col] == -200:
           data.iloc[row][col] = data.iloc[row-1,col]

#checking stationarity
from statsmodels.tsa.vector_ar.vecm import coint_johansen
#since the test works for only 12 variables, I have randomly dropped
#in the next iteration, I would drop another and check the eigenvalues
johan_test_temp = data.drop([ 'CO(GT)', 'Date_Time'], axis=1) # Drop 'Date_Time' column

eigenvalues = coint_johansen(johan_test_temp,-1,1).eig

  if data.iloc[row][col] == -200:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.iloc[row][col] = data.iloc[row-1,col]
  data.iloc[row][col] = data.iloc[row-1,col]


In [190]:
for i in range(len(eigenvalues)):
    print(f"Eigenvalue {i+1}: {eigenvalues[i]} - Variable: {colnames[i]}")

Eigenvalue 1: 0.18513743063741273 - Variable: PT08.S1(CO)
Eigenvalue 2: 0.1495623792150434 - Variable: NMHC(GT)
Eigenvalue 3: 0.09984669895419282 - Variable: C6H6(GT)
Eigenvalue 4: 0.09156222574725424 - Variable: PT08.S2(NMHC)
Eigenvalue 5: 0.06599205007773086 - Variable: NOx(GT)
Eigenvalue 6: 0.05020557634373844 - Variable: PT08.S3(NOx)
Eigenvalue 7: 0.04014846784875554 - Variable: NO2(GT)
Eigenvalue 8: 0.03355809436054943 - Variable: PT08.S4(NO2)
Eigenvalue 9: 0.01841784474750858 - Variable: PT08.S5(O3)
Eigenvalue 10: 0.015015840606022547 - Variable: T
Eigenvalue 11: 0.004148479212795655 - Variable: RH
Eigenvalue 12: 7.219441079072714e-05 - Variable: AH


In [191]:
#creating the train and validation set
train = data[:int(0.8*(len(data)))]
valid = data[int(0.8*(len(data))):]

#fit the model
from statsmodels.tsa.vector_ar.var_model import VAR

model = VAR(endog=train[colnames])
model_fit = model.fit()

# make prediction on validation
prediction = model_fit.forecast(model_fit.endog, steps=len(valid))

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [192]:
# import numpy as np
# # Convert all columns to numeric, coercing errors to NaN

# for col in train.columns:
#     train[col] = pd.to_numeric(train[col], errors='coerce')

# # Drop any remaining non-numeric columns
# train = train.select_dtypes(include=np.number)

# # Alternatively, if you have specific columns you want to keep:
# # numeric_cols = ['PT08.S1(CO)', 'NMHC(GT)', ...] # List your desired numeric columns
# # train = train[numeric_cols]

# # Fit the model with the modified DataFrame
# model = VAR(endog=train)
# model_fit = model.fit()

In [194]:
pred = pd.DataFrame(prediction,columns=colnames)
pred

Unnamed: 0,PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,849.252790,-192.016548,1.851780,609.118972,124.027976,1106.866789,73.256733,837.856679,557.933913,10.008702,33.937241,-0.092438
1,878.229765,-183.815219,2.007840,642.583834,134.885919,1076.309393,65.159080,869.371371,618.827679,8.788085,35.534689,-0.589755
2,905.044708,-175.677925,2.253168,674.490737,146.135621,1049.784728,59.823468,900.741687,674.663358,7.706238,36.863461,-1.062788
3,929.279153,-167.795077,2.518685,703.826466,157.426515,1026.805076,56.561831,930.393233,725.472673,6.739953,37.974532,-1.513505
4,950.807665,-160.290400,2.764857,730.138561,168.489624,1006.930510,54.839176,957.528265,771.359162,5.871861,38.905710,-1.943758
...,...,...,...,...,...,...,...,...,...,...,...,...
1867,1051.868278,-149.489173,2.659535,916.553561,140.855908,808.869259,39.697070,1469.255434,978.403622,11.885836,39.697391,-6.478639
1868,1051.868278,-149.489173,2.659535,916.553561,140.855908,808.869259,39.697070,1469.255434,978.403622,11.885836,39.697391,-6.478639
1869,1051.868278,-149.489173,2.659535,916.553561,140.855908,808.869259,39.697070,1469.255434,978.403622,11.885836,39.697391,-6.478639
1870,1051.868278,-149.489173,2.659535,916.553561,140.855908,808.869259,39.697070,1469.255434,978.403622,11.885836,39.697391,-6.478639


In [195]:
#check rmse
for col in colnames:
    print('rmse value for', col, 'is : ',
          math.sqrt(mean_squared_error(pred[col], valid[col])))

rmse value for PT08.S1(CO) is :  337.86175951358575
rmse value for NMHC(GT) is :  55.12556171010204
rmse value for C6H6(GT) is :  44.230631550178494
rmse value for PT08.S2(NMHC) is :  350.71780437815113
rmse value for NOx(GT) is :  261.4056922645097
rmse value for PT08.S3(NOx) is :  322.00698960569315
rmse value for NO2(GT) is :  121.17689891771846
rmse value for PT08.S4(NO2) is :  549.2334124571738
rmse value for PT08.S5(O3) is :  504.4017368871024
rmse value for T is :  45.78149700640014
rmse value for RH is :  54.772571336799565
rmse value for AH is :  42.0849086360914


In [196]:
#make final predictions
# model = VAR(endog=data[colnames])
# model_fit = model.fit()
yhat = model_fit.forecast(model_fit.endog, steps=5)
print(yhat)

[[ 8.49252790e+02 -1.92016548e+02  1.85178015e+00  6.09118972e+02
   1.24027976e+02  1.10686679e+03  7.32567332e+01  8.37856679e+02
   5.57933913e+02  1.00087016e+01  3.39372413e+01 -9.24378606e-02]
 [ 8.78229765e+02 -1.83815219e+02  2.00784048e+00  6.42583834e+02
   1.34885919e+02  1.07630939e+03  6.51590804e+01  8.69371371e+02
   6.18827679e+02  8.78808488e+00  3.55346893e+01 -5.89754873e-01]
 [ 9.05044708e+02 -1.75677925e+02  2.25316780e+00  6.74490737e+02
   1.46135621e+02  1.04978473e+03  5.98234680e+01  9.00741687e+02
   6.74663358e+02  7.70623762e+00  3.68634606e+01 -1.06278792e+00]
 [ 9.29279153e+02 -1.67795077e+02  2.51868544e+00  7.03826466e+02
   1.57426515e+02  1.02680508e+03  5.65618312e+01  9.30393233e+02
   7.25472673e+02  6.73995310e+00  3.79745324e+01 -1.51350468e+00]
 [ 9.50807665e+02 -1.60290400e+02  2.76485659e+00  7.30138561e+02
   1.68489624e+02  1.00693051e+03  5.48391763e+01  9.57528265e+02
   7.71359162e+02  5.87186096e+00  3.89057103e+01 -1.94375846e+00]]


In [197]:
pd.DataFrame(yhat, columns=colnames)

Unnamed: 0,PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,849.25279,-192.016548,1.85178,609.118972,124.027976,1106.866789,73.256733,837.856679,557.933913,10.008702,33.937241,-0.092438
1,878.229765,-183.815219,2.00784,642.583834,134.885919,1076.309393,65.15908,869.371371,618.827679,8.788085,35.534689,-0.589755
2,905.044708,-175.677925,2.253168,674.490737,146.135621,1049.784728,59.823468,900.741687,674.663358,7.706238,36.863461,-1.062788
3,929.279153,-167.795077,2.518685,703.826466,157.426515,1026.805076,56.561831,930.393233,725.472673,6.739953,37.974532,-1.513505
4,950.807665,-160.2904,2.764857,730.138561,168.489624,1006.93051,54.839176,957.528265,771.359162,5.871861,38.90571,-1.943758


In [208]:
valid.iloc[:5][colnames]

Unnamed: 0_level_0,PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-01-05 15:00:00,833,-200,2.0,584,107,1144,80,821,463,11.3,32.5,0.4334
2016-01-05 16:00:00,877,-200,2.8,642,176,1037,112,859,565,11.0,33.0,0.4331
2016-01-05 17:00:00,892,-200,3.3,668,180,1017,121,872,632,10.3,35.0,0.4377
2016-01-05 18:00:00,899,-200,3.4,674,212,1002,132,893,691,8.4,40.9,0.4542
2016-01-05 19:00:00,1008,-200,7.1,861,331,839,160,977,943,8.3,38.5,0.4228
