In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the hourly Sarajevo PM2.5 export.
# `parse_dates=True` only asks pandas to parse the index, and no index column is
# configured here, so the timestamp columns were left as plain strings. Naming
# the columns explicitly yields real datetime values (missing entries become NaT).
# NOTE(review): the path is machine-specific; consider a configurable data directory.
# NOTE(review): the file carries a saved index as 'Unnamed: 0'; it is kept as a
# regular column so any code relying on it keeps working.
data = pd.read_csv(
    'C:/Pythondatasets/Dataset_Sarajevo_Sa_Diskretnim_Vremenom.csv',
    parse_dates=['Date (LT)', 'pomocna'],
)
data

Unnamed: 0,Site,Parameter,Date (LT),Year,Month,Day,Hour,NowCast Conc.,AQI,AQI Category,Raw Conc.,Conc. Unit,Duration,QC Name,pomocna,diskretno_vrijeme
0,Sarajevo,PM2.5 - Principal,2018-05-22 17:00:00,2018,5,22,17,10.0,42,Good,7.0,UG/M3,1 Hr,Valid,,0.0
1,Sarajevo,PM2.5 - Principal,2018-05-22 18:00:00,2018,5,22,18,15.7,59,Moderate,20.0,UG/M3,1 Hr,Valid,2018-05-22 17:00:00,1.0
2,Sarajevo,PM2.5 - Principal,2018-05-22 19:00:00,2018,5,22,19,14.2,55,Moderate,13.0,UG/M3,1 Hr,Valid,2018-05-22 18:00:00,2.0
3,Sarajevo,PM2.5 - Principal,2018-05-22 20:00:00,2018,5,22,20,16.7,61,Moderate,19.0,UG/M3,1 Hr,Valid,2018-05-22 19:00:00,3.0
4,Sarajevo,PM2.5 - Principal,2018-05-22 21:00:00,2018,5,22,21,13.8,55,Moderate,11.0,UG/M3,1 Hr,Valid,2018-05-22 20:00:00,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41472,Sarajevo,PM2.5 - Principal,2023-04-30 21:00:00,2023,4,30,21,15.2,58,Moderate,21.0,UG/M3,1 Hr,Valid,2023-04-30 20:00:00,41472.0
41473,Sarajevo,PM2.5 - Principal,2023-04-30 22:00:00,2023,4,30,22,23.6,75,Moderate,32.0,UG/M3,1 Hr,Valid,2023-04-30 21:00:00,41473.0
41474,Sarajevo,PM2.5 - Principal,2023-04-30 23:00:00,2023,4,30,23,23.8,76,Moderate,24.0,UG/M3,1 Hr,Valid,2023-04-30 22:00:00,41474.0
41475,Sarajevo,PM2.5 - Principal,2023-05-01 00:00:00,2023,5,1,0,25.4,79,Moderate,27.0,UG/M3,1 Hr,Valid,2023-04-30 23:00:00,41475.0


In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix

In [5]:
# Predictors: the discrete time counter and the hour of day; target: the AQI value.
X = data.loc[:, ['diskretno_vrijeme', 'Hour']]
Y = data.loc[:, 'AQI']

In [6]:
# Hold out 30% of the rows for testing. A fixed random_state makes the shuffled
# partition identical on every run, so reported metrics can be reproduced.
# NOTE(review): the rows are hourly observations; a shuffled split places test
# rows between training rows in time — confirm this is the intended evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(29033, 2) (12444, 2) (29033,) (12444,)


In [7]:
# Linear Regression
# Ordinary least squares on the training partition; `fit` hands back the estimator.
model = LinearRegression().fit(X_train, Y_train)

# Predictions for both partitions, used for the error figures below.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

# RMSE = square root of the mean squared error.
RMSE_train = np.sqrt(mean_squared_error(Y_train, train_pred))
RMSE_test = np.sqrt(mean_squared_error(Y_test, test_pred))

print("RMSE Training Data = ", RMSE_train)
print("RMSE Test Data = ", RMSE_test)
print(50 * "-")

# `score` reports the coefficient of determination (R^2) for a regressor.
print("RSquared value on train: ", model.score(X_train, Y_train))
print("RSquared value on test: ", model.score(X_test, Y_test))

RMSE Training Data =  60.92910302560802
RMSE Test Data =  62.898232243091044
--------------------------------------------------
RSquared value on train:  0.0007618430871274562
RSquared value on test:  0.0010540108663007475


In [8]:
# Decision tree Regressor
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed, repeated fits are not guaranteed to be identical.
DT = DecisionTreeRegressor(random_state=42)
DT.fit(X_train, Y_train)

# Predictions for both partitions, used for the error figures below.
train_preds = DT.predict(X_train)
test_preds = DT.predict(X_test)

# RMSE = square root of the mean squared error.
RMSE_train = np.sqrt(metrics.mean_squared_error(Y_train, train_preds))
RMSE_test = np.sqrt(metrics.mean_squared_error(Y_test, test_preds))
print("RMSE Training Data = ", str(RMSE_train))
print("RMSE Test Data = ", str(RMSE_test))
print("-"*50)
print("RSquared value on train: ", DT.score(X_train, Y_train))
print("RSquared value on test: ", DT.score(X_test, Y_test))

# OVERFITTING: the tree has no depth limit, so it fits the training rows almost
# exactly; compare the train and test RMSE to see the gap.

RMSE Training Data =  0.4440407451772145
RMSE Test Data =  14.087022040554576
--------------------------------------------------
RSquared value on train:  0.999946928044999
RSquared value on test:  0.9498924098869531


In [9]:
# Random Forest Regressor
# A forest draws bootstrap samples and random feature subsets; fixing
# random_state makes the fitted ensemble, and the metrics below, repeatable.
RF = RandomForestRegressor(random_state=42).fit(X_train, Y_train)

# Predictions for both partitions, used for the error figures below.
train_preds1 = RF.predict(X_train)
test_preds1 = RF.predict(X_test)

# RMSE = square root of the mean squared error.
RMSE_train = np.sqrt(metrics.mean_squared_error(Y_train, train_preds1))
RMSE_test = np.sqrt(metrics.mean_squared_error(Y_test, test_preds1))
print("RMSE Training Data = ", str(RMSE_train))
print("RMSE Test Data = ", str(RMSE_test))
print("-"*50)
print("RSquared value on train: ", RF.score(X_train, Y_train))
print("RSquared value on test: ", RF.score(X_test, Y_test))

RMSE Training Data =  4.20392517112808
RMSE Test Data =  11.779921737509689
--------------------------------------------------
RSquared value on train:  0.9952430411507536
RSquared value on test:  0.9649611423868406


# K FOLD CROSS VALIDATION


In [10]:
from sklearn.model_selection import KFold
# Three-fold splitter with the default settings spelled out: no shuffling, no seed.
kf = KFold(n_splits=3, shuffle=False, random_state=None)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [12]:
# Fresh linear model for the cross-validation section.
# `fit` returns the estimator itself, so `mymodel` is another name for `model`.
model = LinearRegression()
model.fit(X_train, Y_train)
mymodel = model

In [13]:
from sklearn.model_selection import cross_val_score

In [14]:
# K-fold CV
# Three-fold cross-validation on the training partition, scored with R^2;
# the result holds one score per fold.
scores = cross_val_score(
    estimator=mymodel,
    X=X_train,
    y=Y_train,
    scoring='r2',
    cv=3,
)
scores

array([0.00105755, 0.00066062, 0.0001437 ])