<a href="https://colab.research.google.com/github/costpetrides/Air-pollution-COVID-19-impact/blob/main/MachineLearning/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# XGBoost - Regression

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


import xgboost as xgb # XGBoost library :)

In [51]:
pip install xgboost optuna



In [52]:
# Import the data!

df=pd.read_csv('AGPATEST19.csv')
cols = df.columns.tolist()
df.head()

Unnamed: 0,NO,NO2,PM25,O3
0,1.5,11.5,4.7,81.8
1,1.1,11.1,4.1,72.0
2,1.1,7.2,7.2,73.5
3,1.0,8.5,8.7,74.2
4,1.0,10.0,9.2,72.2


In [53]:
cols = df.columns.tolist()    # Here we make a list with all labels names

In [54]:
# We split the data into training (60%), validation (20%), and test (20%) sets

train_end = int(0.7 * len(df))

train = df[:train_end]
test = df[train_end:]

train = df[:train_end]
test = df[train_end:]

In [58]:
# Scaling :)

def preprocess_data(dataframe):
    # Separate features and target variable
    x = dataframe.iloc[:, :-1].values
    y = dataframe.iloc[:, -1].values

    return x, y


# Apply preprocessing to your datasets
train_x, train_y = preprocess_data(train)
test_x, test_y = preprocess_data(test)

In [66]:
import optuna


def objective(trial):
    params = {
        'objective': 'reg:squaredlogerror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'verbosity': 0,
        'n_jobs': -1,
        'random_state': 42,
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'gamma': trial.suggest_loguniform('gamma', 1e-6, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-6, 1.0),
    }


    model = xgb.XGBRegressor(**params)

    # Train the model
    model.fit(train_x, train_y)

    # Make predictions on the test set
    pred_y = model.predict(test_x)

    # Evaluate the model on the test set using mean squared error
    mse = mean_squared_error(test_y, pred_y)

    return mse  # Optimize to minimize mean squared error

# Perform hyperparameter optimization with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params

# Use the best hyperparameters in your program
best_model = xgb.XGBRegressor(**best_params)
best_model.fit(train_x, train_y)

# Now, you can use the best_model for predictions or any other tasks

[I 2023-10-14 12:25:30,809] A new study created in memory with name: no-name-986331e7-7eda-4d65-8b65-9a26856749e9
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
  'gamma': trial.suggest_loguniform('gamma', 1e-6, 1.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-6, 1.0),
[I 2023-10-14 12:25:30,989] Trial 0 finished with value: 10872.265212210645 and parameters: {'learning_rate': 0.06316283913427446, 'n_estimators': 116, 'max_depth': 18, 'min_child_weight': 6, 'subsample': 0.9718568136118435, 'colsample_bytree': 0.8693450531637388, 'gamma': 0.006974542900134162, 'alpha': 0.003358268461199744}. Best is trial 0 with value: 10872.265212210645.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsampl

In [67]:
pred_y = best_model.predict(test_x)

# Evaluate the model
mse = mean_squared_error(test_y, pred_y)
print(f"Mean Squared Error: {mse:.2f}")

Mean Squared Error: 4030.70


In [68]:
pred_y = best_model.predict(test_x)

# Evaluate the model
mae = mean_absolute_error(test_y, pred_y)
mse = mean_squared_error(test_y, pred_y)
r2 = r2_score(test_y, pred_y)

# Display the custom regression report
print("Custom Regression Report:")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

Custom Regression Report:
Mean Absolute Error: 23.79
Mean Squared Error: 4030.70
R-squared (R2): -0.03
