<a href="https://colab.research.google.com/github/costpetrides/Air-pollution-COVID-19-impact/blob/main/MachineLearning/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# XGBoost - Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


import xgboost as xgb # XGBoost library :)

In [3]:
pip install xgboost optuna



In [5]:
# Import the data!

df=pd.read_csv('TestingData.csv')
cols = df.columns.tolist()
df.head()

Unnamed: 0,Mtemp,Htemp,Ltemp,Heat,Cool,Rain,WMSpeed,HWSpeed,NO,NO2,PM2.5,O3
0,13.4,17.4,9.7,4.9,0.0,2.6,6.9,33.8,1.5,11.46,4.71,81.79
1,10.4,15.3,7.4,7.9,0.0,2.2,8.4,46.7,1.08,11.13,4.08,72.0
2,7.5,10.4,5.9,10.8,0.0,0.0,7.7,30.6,1.08,7.21,7.23,73.5
3,8.2,10.2,7.2,10.1,0.0,0.0,6.3,38.6,1.04,8.54,8.71,74.17
4,8.1,10.2,6.6,10.3,0.0,0.0,5.1,29.0,1.04,9.96,9.17,72.17


In [6]:
cols = df.columns.tolist()    # Here we make a list with all labels names

In [7]:
# We split the data into training (60%), validation (20%), and test (20%) sets

train_end = int(0.7 * len(df))

train = df[:train_end]
test = df[train_end:]

train = df[:train_end]
test = df[train_end:]

In [8]:
# Scaling :)

def preprocess_data(dataframe):
    # Separate features and target variable
    x = dataframe.iloc[:, :-1].values
    y = dataframe.iloc[:, -1].values

    return x, y


# Apply preprocessing to your datasets
train_x, train_y = preprocess_data(train)
test_x, test_y = preprocess_data(test)

In [9]:
import optuna


def objective(trial):
    params = {
        'objective': 'reg:squaredlogerror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'verbosity': 0,
        'n_jobs': -1,
        'random_state': 42,
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'gamma': trial.suggest_loguniform('gamma', 1e-6, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-6, 1.0),
    }


    model = xgb.XGBRegressor(**params)

    # Train the model
    model.fit(train_x, train_y)

    # Make predictions on the test set
    pred_y = model.predict(test_x)

    # Evaluate the model on the test set using mean squared error
    mse = mean_squared_error(test_y, pred_y)

    return mse  # Optimize to minimize mean squared error

# Perform hyperparameter optimization with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params

# Use the best hyperparameters in your program
best_model = xgb.XGBRegressor(**best_params)
best_model.fit(train_x, train_y)

# Now, you can use the best_model for predictions or any other tasks

[I 2023-10-16 17:37:35,128] A new study created in memory with name: no-name-3a0a9444-feee-49a9-8f35-1c5aebd8058d
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
  'gamma': trial.suggest_loguniform('gamma', 1e-6, 1.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-6, 1.0),
[I 2023-10-16 17:37:35,258] Trial 0 finished with value: 8862.40075768177 and parameters: {'learning_rate': 0.14932513986244336, 'n_estimators': 405, 'max_depth': 6, 'min_child_weight': 11, 'subsample': 0.7543932456124278, 'colsample_bytree': 0.9378354961645317, 'gamma': 0.0004216671144236118, 'alpha': 1.5673505837478096e-06}. Best is trial 0 with value: 8862.40075768177.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
  'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample

In [10]:
pred_y = best_model.predict(test_x)

# Evaluate the model
mse = mean_squared_error(test_y, pred_y)
print(f"Mean Squared Error: {mse:.2f}")

Mean Squared Error: 327.89


In [11]:
pred_y = best_model.predict(test_x)

# Evaluate the model
mae = mean_absolute_error(test_y, pred_y)
mse = mean_squared_error(test_y, pred_y)
r2 = r2_score(test_y, pred_y)

# Display the custom regression report
print("Custom Regression Report:")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

Custom Regression Report:
Mean Absolute Error: 14.09
Mean Squared Error: 327.89
R-squared (R2): -0.13
