## Load Dataset

In [13]:
import pandas as pd  # type: ignore
import os

# --- Configuration -----------------------------------------------------------
# Pollutant to model. Its name is used as the input/output file-name prefix
# here and as the target column name in the cells below.
outputs = ["O3", "SO2", "CO", "NO2", "PM2.5", "PM10"]
pollutant = outputs[5]

# Time window of the joined input file to read.
time_windows = ["hourly", "daily"]
tw = time_windows[1]

# --- Load and clean ----------------------------------------------------------
# Read the joined file for the chosen pollutant/time window and discard every
# row that contains a missing value.
data = pd.read_csv(f"subsystems/{pollutant}-all-joined-{tw}.csv")
data = data.dropna()

# os.makedirs(..., exist_ok=True) replaces the exists()/mkdir() pair: it is a
# single call and does not fail if the directory appears between check and
# creation.
os.makedirs("data-clean", exist_ok=True)
data.to_csv(f"data-clean/{pollutant}-clean-{tw}.csv", index=False)

# --- Train / test split ------------------------------------------------------
# 80 % of the rows are sampled with a fixed seed; the rows whose index labels
# were not sampled form the test set.
train_set = data.sample(frac=0.8, random_state=1)
test_set = data.drop(train_set.index)

# Dropping by label only yields an exact complement when index labels are
# unique; make that assumption explicit.
assert len(train_set) + len(test_set) == len(data), "train/test split does not partition the data"

# NOTE(review): unlike the cleaned file above, these file names do not include
# `tw`, so hourly and daily runs write to the same paths — confirm intended.
os.makedirs("train-set", exist_ok=True)
os.makedirs("test-set", exist_ok=True)
train_set.to_csv(f"train-set/{pollutant}-train.csv", index=False)
test_set.to_csv(f"test-set/{pollutant}-test.csv", index=False)

# Report row/column counts of the full, training and test frames.
print(data.shape)
print(train_set.shape)
print(test_set.shape)

(3646, 11)
(2917, 11)
(729, 11)


## Load Pycaret

In [14]:
from pycaret.regression import setup # type: ignore
# Build the regression experiment: `train_set` is the experiment data, the
# selected pollutant column is the target, and `test_set` is supplied
# explicitly as the hold-out set. session_id=123 is echoed in the setup
# summary as "Session id" (per PyCaret docs it pins the random seed).
s = setup(
    train_set,
    target=pollutant,
    test_data=test_set,
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,PM10
2,Target type,Regression
3,Original data shape,"(3646, 11)"
4,Transformed data shape,"(3646, 11)"
5,Transformed train set shape,"(2917, 11)"
6,Transformed test set shape,"(729, 11)"
7,Numeric features,10
8,Preprocess,True
9,Imputation type,simple


## Save Experiment Configurations

In [15]:
# pull() is called right after setup(), so `df` is expected to hold the setup
# summary table shown above (per PyCaret docs, pull() returns the last
# displayed grid).
df = s.pull()

# Single idempotent call instead of exists()/mkdir(): no failure if the
# directory is created between the check and the creation.
os.makedirs("configs", exist_ok=True)
df.to_csv(f"configs/{pollutant}-setup.csv", index=False)

## Compare Models

In [16]:
# Train and cross-compare the candidate regressors; the returned estimator is
# kept as `best` and reused by the evaluation, prediction and save cells below.
# NOTE(review): ranking presumably uses PyCaret's default sort metric for
# regression (R2) since no `sort=` is passed — confirm for the installed version.
best = s.compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,6.2272,81.6088,8.9749,0.8229,0.209,0.1833,1.859
et,Extra Trees Regressor,6.5966,90.3385,9.4459,0.8035,0.2186,0.192,0.31
lightgbm,Light Gradient Boosting Machine,6.6139,91.6745,9.4998,0.801,0.2177,0.1934,0.156
xgboost,Extreme Gradient Boosting,6.9224,97.3217,9.8014,0.7883,0.2261,0.201,0.103
rf,Random Forest Regressor,7.0715,99.5406,9.9133,0.7837,0.2295,0.2047,0.625
gbr,Gradient Boosting Regressor,7.4551,105.5852,10.2068,0.7704,0.2405,0.218,0.292
lr,Linear Regression,8.6149,132.2372,11.4521,0.7107,0.2736,0.2428,0.016
ridge,Ridge Regression,8.615,132.2368,11.4521,0.7107,0.2736,0.2428,0.014
br,Bayesian Ridge,8.6186,132.2704,11.4538,0.7106,0.2734,0.2426,0.03
lar,Least Angle Regression,8.6949,133.4039,11.5062,0.708,0.2769,0.2448,0.014


## Save Comparison

In [17]:
# pull() is called right after compare_models(), so `df` is expected to hold
# the model comparison grid shown above.
df = s.pull()

# Single idempotent call instead of exists()/mkdir(): no failure if the
# directory is created between the check and the creation.
os.makedirs("comparisions", exist_ok=True)

# NOTE(review): the displayed grid's first column (catboost, et, ...) looks
# like the frame's index, which index=False leaves out of the CSV — confirm
# the short model ids are not needed downstream.
df.to_csv(f"comparisions/{pollutant}-models.csv", index=False)

## Get the best model

In [18]:
# Show which estimator compare_models() returned ...
print(best)
# ... and open PyCaret's interactive evaluation widget for it (renders a
# "Plot Type" toggle in the notebook output).
s.evaluate_model(best)

<catboost.core.CatBoostRegressor object at 0x000002535A3C0430>


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

## Get Special Graphs

In [19]:
# NOTE: every statement in this cell is currently disabled. Uncomment the
# plot_model calls you need; each one is written with save=True and a
# pollutant-specific title. The seaborn/matplotlib imports are not referenced
# by the calls below.
# import seaborn as sns  # type: ignore
# import matplotlib.pyplot as plt  # type: ignore

# ax = s.plot_model(best, plot="residuals", plot_kwargs={"title": f"Residuals for {pollutant} Selected Model"}, save=True)
# s.plot_model(best, plot="error", plot_kwargs={"title": f"Prediction Error for {pollutant} Selected Model"}, save=True)
# s.plot_model(best, plot="cooks", plot_kwargs={"title": f"Cook's Distance for {pollutant} Selected Model"}, save=True)
# s.plot_model(best, plot="rfe", plot_kwargs={"title": f"RFECV for {pollutant} Selected Model"}, save=True)
# s.plot_model(best, plot="learning", plot_kwargs={"title": f"Learning Curve for {pollutant} Selected Model"}, save=True)
# s.plot_model(best, plot="vc", plot_kwargs={"title": f"Validation Curve for {pollutant} Selected Model"}, save=True)
# s.plot_model(best, plot="manifold", plot_kwargs={"title": f"t-SNE Manifold for {pollutant} Selected Model"}, save=True)
# s.plot_model(best, plot="feature", plot_kwargs={"title": f"Feature Importance for {pollutant} Selected Model"}, save=True)

## Make Predictions

In [20]:
# Score the held-out test split with the selected model. The call also displays
# a one-row metrics table (MAE, MSE, RMSE, R2, RMSLE, MAPE), which the next
# cell retrieves via pull().
predictions = s.predict_model(best, data=test_set)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,6.3403,81.766,9.0425,0.8343,0.2219,0.2019


## Save Stats of Best Model

In [21]:
# pull() is called right after predict_model(), so `df` is expected to hold
# the one-row error-metrics table for the selected model on the test split.
df = s.pull()

# Single idempotent call instead of exists()/mkdir(): no failure if the
# directory is created between the check and the creation.
os.makedirs("errors", exist_ok=True)
df.to_csv(f"errors/{pollutant}-errors.csv", index=False)

## Save Predictions

In [22]:
# Persist the frame returned by predict_model() for this pollutant.
# os.makedirs(..., exist_ok=True) is a single idempotent call, so it does not
# fail if the directory is created between a check and the creation.
os.makedirs("predictions", exist_ok=True)
predictions.to_csv(f"predictions/{pollutant}-predictions.csv", index=False)

## Save Model

In [23]:
# Serialize the preprocessing pipeline together with the selected model.
# The path is given without an extension; the recorded output shows the file
# written as 'models/best_<pollutant>.pkl'.
# os.makedirs(..., exist_ok=True) is a single idempotent call, so it does not
# fail if the directory is created between a check and the creation.
os.makedirs("models", exist_ok=True)
s.save_model(best, f'models/best_{pollutant}')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['PM2.5', 'HR', 'NO2', 'YEAR', 'WD',
                                              'MONTH', 'CO', 'SEASON', 'O3',
                                              'TEMP'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('clean_column_names',
                  TransformerWrapper(transformer=CleanColumnNames())),
                 ('trained_model',
                  <catboost.core.CatBoostRegressor object at 0x000002535A3C0430>)]),
 'models/best_PM10.pkl')

## Load Model

In [24]:
# Reload the pipeline saved under models/best_<pollutant> (same extension-less
# path that was passed to save_model) and print its steps as a sanity check.
# NOTE(review): the saved artifact is a .pkl file; unpickling can execute
# arbitrary code, so only load model files from a trusted source.
loaded_model = s.load_model(f'models/best_{pollutant}')
print(loaded_model)

Transformation Pipeline and Model Successfully Loaded


Pipeline(memory=FastMemory(location=C:\Users\FRANCI~1\AppData\Local\Temp\joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=['PM2.5', 'HR', 'NO2', 'YEAR', 'WD',
                                             'MONTH', 'CO', 'SEASON', 'O3',
                                             'TEMP'],
                                    transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=[],
                                    transformer=SimpleImputer(strategy='most_frequent'))),
                ('clean_column_names',
                 TransformerWrapper(transformer=CleanColumnNames())),
                ('trained_model',
                 <catboost.core.CatBoostRegressor object at 0x000002535A32A770>)])
