In [2]:
import numpy as np
import pandas as pd
import statsmodels
from statsmodels.regression import linear_model
from statsmodels.tsa.arima.model import ARIMA
from arch import arch_model

In [3]:
df = pd.read_csv('../data/air_quality_uci.csv', encoding='cp949')
df.head(3)

Unnamed: 0,Date,Time,CO,PT08_S1,NMHC,C6H6,PT08_S2,NOx,PT08_S3,NO2,PT08_S4,PT08_S5,T,RH,AH
0,2005-02-16,17:00:00,1.6,1083,-200,7.0,856,252,720,148,1010,975,8.2,41.8,0.4563
1,2005-02-16,18:00:00,2.2,1143,-200,9.5,960,313,641,153,1110,1136,7.4,44.5,0.4582
2,2005-02-16,19:00:00,3.4,1288,-200,15.9,1178,433,539,171,1300,1434,7.0,43.4,0.4366


In [4]:
df.shape

(1000, 15)

In [5]:
df['DateTime'] = df['Date'] + ' ' + df['Time']
df.head(3)

Unnamed: 0,Date,Time,CO,PT08_S1,NMHC,C6H6,PT08_S2,NOx,PT08_S3,NO2,PT08_S4,PT08_S5,T,RH,AH,DateTime
0,2005-02-16,17:00:00,1.6,1083,-200,7.0,856,252,720,148,1010,975,8.2,41.8,0.4563,2005-02-16 17:00:00
1,2005-02-16,18:00:00,2.2,1143,-200,9.5,960,313,641,153,1110,1136,7.4,44.5,0.4582,2005-02-16 18:00:00
2,2005-02-16,19:00:00,3.4,1288,-200,15.9,1178,433,539,171,1300,1434,7.0,43.4,0.4366,2005-02-16 19:00:00


In [6]:
df['DateTime'][3]

'2005-02-16 20:00:00'

In [7]:
type(df['DateTime'][3])

str

In [8]:
df['DateTimeIndex'] = pd.to_datetime(df['DateTime'])
df.head(3)

Unnamed: 0,Date,Time,CO,PT08_S1,NMHC,C6H6,PT08_S2,NOx,PT08_S3,NO2,PT08_S4,PT08_S5,T,RH,AH,DateTime,DateTimeIndex
0,2005-02-16,17:00:00,1.6,1083,-200,7.0,856,252,720,148,1010,975,8.2,41.8,0.4563,2005-02-16 17:00:00,2005-02-16 17:00:00
1,2005-02-16,18:00:00,2.2,1143,-200,9.5,960,313,641,153,1110,1136,7.4,44.5,0.4582,2005-02-16 18:00:00,2005-02-16 18:00:00
2,2005-02-16,19:00:00,3.4,1288,-200,15.9,1178,433,539,171,1300,1434,7.0,43.4,0.4366,2005-02-16 19:00:00,2005-02-16 19:00:00


In [9]:
type(df['DateTimeIndex'][3])

pandas._libs.tslibs.timestamps.Timestamp

In [10]:
df = df.set_index(keys=['DateTimeIndex'])

In [11]:
df.head(3)

Unnamed: 0_level_0,Date,Time,CO,PT08_S1,NMHC,C6H6,PT08_S2,NOx,PT08_S3,NO2,PT08_S4,PT08_S5,T,RH,AH,DateTime
DateTimeIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2005-02-16 17:00:00,2005-02-16,17:00:00,1.6,1083,-200,7.0,856,252,720,148,1010,975,8.2,41.8,0.4563,2005-02-16 17:00:00
2005-02-16 18:00:00,2005-02-16,18:00:00,2.2,1143,-200,9.5,960,313,641,153,1110,1136,7.4,44.5,0.4582,2005-02-16 18:00:00
2005-02-16 19:00:00,2005-02-16,19:00:00,3.4,1288,-200,15.9,1178,433,539,171,1300,1434,7.0,43.4,0.4366,2005-02-16 19:00:00


In [12]:
df.isnull().sum()

Date        0
Time        0
CO          0
PT08_S1     0
NMHC        0
C6H6        0
PT08_S2     0
NOx         0
PT08_S3     0
NO2         0
PT08_S4     0
PT08_S5     0
T           0
RH          0
AH          0
DateTime    0
dtype: int64

In [13]:
train_ratio = 0.9
n = len(df)
n_train = int(n*train_ratio)
df_train = df[:n_train]
df_test = df[n_train:]

In [14]:
df.columns

Index(['Date', 'Time', 'CO', 'PT08_S1', 'NMHC', 'C6H6', 'PT08_S2', 'NOx',
       'PT08_S3', 'NO2', 'PT08_S4', 'PT08_S5', 'T', 'RH', 'AH', 'DateTime'],
      dtype='object')

In [15]:
features = ['T', 'AH']
target = 'PT08_S4'

X_tn = df_train[features]
y_tn = df_train[target]
X_te = df_test[features]
y_te = df_test[target]

In [16]:
X_tn

Unnamed: 0_level_0,T,AH
DateTimeIndex,Unnamed: 1_level_1,Unnamed: 2_level_1
2005-02-16 17:00:00,8.2,0.4563
2005-02-16 18:00:00,7.4,0.4582
2005-02-16 19:00:00,7.0,0.4366
2005-02-16 20:00:00,6.6,0.4241
2005-02-16 21:00:00,5.9,0.4137
...,...,...
2005-03-26 00:00:00,17.1,1.2926
2005-03-26 01:00:00,16.8,1.3135
2005-03-26 02:00:00,16.6,1.3078
2005-03-26 03:00:00,16.1,1.2955


In [17]:
ols_reg = linear_model.OLS(endog=y_tn, exog=X_tn)
ols_model = ols_reg.fit()

In [18]:
ols_model.params

T     92.087713
AH   -87.960934
dtype: float64

In [21]:
ols_model.summary()

0,1,2,3
Dep. Variable:,PT08_S4,R-squared (uncentered):,0.791
Model:,OLS,Adj. R-squared (uncentered):,0.791
Method:,Least Squares,F-statistic:,1701.0
Date:,"Mon, 01 Jul 2024",Prob (F-statistic):,3.95e-306
Time:,14:40:57,Log-Likelihood:,-6942.3
No. Observations:,900,AIC:,13890.0
Df Residuals:,898,BIC:,13900.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
T,92.0877,1.590,57.911,0.000,88.967,95.209
AH,-87.9609,3.266,-26.930,0.000,-94.371,-81.550

0,1,2,3
Omnibus:,56.134,Durbin-Watson:,0.13
Prob(Omnibus):,0.0,Jarque-Bera (JB):,65.528
Skew:,-0.622,Prob(JB):,5.9e-15
Kurtosis:,3.446,Cond. No.,2.71


In [19]:
ols_model.predict()

array([ 7.14982671e+02,  6.41145375e+02,  6.06210246e+02,  5.70474673e+02,
        5.06928067e+02,  4.24383377e+02,  5.01211345e+02,  4.62573061e+02,
        4.90744733e+02,  4.90727140e+02,  4.99012322e+02,  4.87094354e+02,
        5.95831594e+02,  5.98250520e+02,  5.52153887e+02,  5.69225627e+02,
        6.70020734e+02,  8.90186820e+02,  7.86788070e+02,  7.59056203e+02,
        7.18254079e+02,  7.73770590e+02,  7.73348377e+02,  7.83287224e+02,
        8.12663961e+02,  7.29758631e+02,  6.55771801e+02,  6.38603304e+02,
        6.38752838e+02,  5.92902495e+02,  6.11012175e+02,  5.92533059e+02,
        5.84045568e+02,  5.38045692e+02,  4.72986159e+02,  5.47016969e+02,
        5.93219155e+02,  5.46700309e+02,  5.46383650e+02,  5.92277973e+02,
        7.30242416e+02,  9.52378827e+02,  9.89002806e+02,  1.04478320e+03,
        1.01961100e+03,  9.66284712e+02,  1.00321655e+03,  1.03107157e+03,
        9.84851788e+02,  8.55137342e+02,  7.44605698e+02,  7.25528449e+02,
        6.97576679e+02,  

In [20]:
ols_model.predict(X_te)

DateTimeIndex
2005-03-26 05:00:00    1387.973531
2005-03-26 06:00:00    1370.558743
2005-03-26 07:00:00    1371.579090
2005-03-26 08:00:00    1408.150292
2005-03-26 09:00:00    1489.956110
                          ...     
2005-03-30 04:00:00     973.827329
2005-03-30 05:00:00     854.042934
2005-03-30 06:00:00     928.926965
2005-03-30 07:00:00     902.312202
2005-03-30 08:00:00    1030.707234
Length: 100, dtype: float64