<a href="https://colab.research.google.com/github/cmuro27/Machine_Learning_Projects_and_Notes/blob/main/Project_ML_flight_price_prediction_XGB_Optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning - Comparison of Hyperparameter Tuning between GridSearch and Optuna.
### César Muro Cabral

We evaluate the r2 of regressions using XGBoost. For the hyperparameter tuning we compare optuna and gridsearch. Optuna is a package which allows to find the parameters for the best model and it takes much less time than gridsearch.  
  
We show how optuna achieves a r2 score 0.98 on the test set in 7 minutes meanwhile gridsearch a score of 0.97 in 20 minutes.  

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_csv("/content/drive/MyDrive/Physics/Books_articles_works/Programming/Individual_projects/Machine_learning_projects/flight-price-prediction/clean_Dataset.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        300153 non-null  int64  
 1   airline           300153 non-null  object 
 2   flight            300153 non-null  object 
 3   source_city       300153 non-null  object 
 4   departure_time    300153 non-null  object 
 5   stops             300153 non-null  object 
 6   arrival_time      300153 non-null  object 
 7   destination_city  300153 non-null  object 
 8   class             300153 non-null  object 
 9   duration          300153 non-null  float64
 10  days_left         300153 non-null  int64  
 11  price             300153 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 27.5+ MB


In [4]:
data = data.drop('Unnamed: 0',axis=1)
data.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [None]:
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import numpy as np

#Specify the independent variables and target variable from the dataframe
X=data.loc[:,~data.columns.isin(['price'])]
y=data['price']

#Lists of the numeric and categorical features
categorical_features = ['airline', 'flight', 'source_city', 'departure_time',
        'stops', 'arrival_time', 'destination_city','class','days_left']


# numerical features
numeric_features= ['duration']

#Pipeline for preprocessing the numeric data
numerical_transformer =  Pipeline(steps=[('imp_num',
                                          SimpleImputer(strategy='median')),
                                         ('sc',StandardScaler())])

# Pipeline for preprocessing the categorical data
categorical_transformer=Pipeline(steps=[('imp_cat',
                                         SimpleImputer(strategy='most_frequent')),
                                        ('onehot',
                                         OneHotEncoder(handle_unknown = 'ignore'))])

# Preprocessor
preprocessor = ColumnTransformer(transformers=[('num',numerical_transformer,
                                                numeric_features),
                                               ('cat', categorical_transformer,
                                                categorical_features)])

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,
                                                    random_state=27)

# Split the data in training and test sets.
# Although we will perform GridSearh with cross-validation
pipe_xgb=Pipeline(steps=[('preprocess', preprocessor),
                     ('xgb',xgb.XGBRegressor(seed=123))])

kf = KFold(n_splits=4, random_state=42, shuffle=True)

# Defining the grid parameters
gbm_param_grid={'xgb__learning_rate':[0.1,0.5,0.9],
                'xgb__n_estimators':[150],
                'xgb__subsample':[0.3,0.5,0.9],
                'xgb__n_estimators':[50],
                'xgb__max_depth': [2,5]}

xgb_cv = GridSearchCV(estimator=pipe_xgb,
                      param_grid=gbm_param_grid,
                      scoring='r2',
                      verbose=1,
                      cv=kf)

In [None]:
xgb_cv.fit(X_train,y_train)

Fitting 4 folds for each of 18 candidates, totalling 72 fits


In [None]:
#Print the tuned parameters and score
print('Tuned XGB regressor parameters: {}'.format(xgb_cv.best_params_))
print("Tuned XGB regressor r2 score: {}".format(xgb_cv.best_score_))

Tuned XGB regressor parameters: {'xgb__learning_rate': 0.9, 'xgb__max_depth': 5, 'xgb__n_estimators': 50, 'xgb__subsample': 0.9}
Tuned XGB regressor r2 score: 0.9742023198306706


In [None]:
#Print the r2 score on the test set
xgb_best = xgb_cv.best_estimator_
print('The r2 score on the test set is: ',xgb_best.score(X_test,y_test))

The r2 score on the test set is:  0.9737203959516665


In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.2.0-py3-none-any.whl (390 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.6/390.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.11.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1 (from optuna)
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.11.1 cmaes-0.9.1 colorlog-6.7.0 optuna-3.2.0


In [5]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    from sklearn.metrics import r2_score
    from sklearn.pipeline import Pipeline
    from sklearn.linear_model import Ridge
    import xgboost as xgb
    from sklearn.impute import SimpleImputer
    from sklearn.model_selection import GridSearchCV, train_test_split, KFold
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.compose import ColumnTransformer
    import numpy as np

#Specify the independent variables and target variable from the dataframe
    X=data.loc[:,~data.columns.isin(['price'])]
    y=data['price']

#Lists of the numeric and categorical features
    categorical_features = ['airline', 'flight', 'source_city', 'departure_time',
        'stops', 'arrival_time', 'destination_city','class','days_left']


# numerical features
    numeric_features= ['duration']

#Pipeline for preprocessing the numeric data
    numerical_transformer =  Pipeline(steps=[('imp_num',
                                          SimpleImputer(strategy='median')),
                                         ('sc',StandardScaler())])

# Pipeline for preprocessing the categorical data
    categorical_transformer=Pipeline(steps=[('imp_cat',
                                         SimpleImputer(strategy='most_frequent')),
                                        ('onehot',
                                         OneHotEncoder(handle_unknown = 'ignore'))])

# Preprocessor
    preprocessor = ColumnTransformer(transformers=[('num',numerical_transformer,
                                                numeric_features),
                                               ('cat', categorical_transformer,
                                                categorical_features)])

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,
                                                    random_state=27)

# Split the data in training and test sets.
# Although we will perform GridSearh with cross-validation
    pipe_xgb=Pipeline(steps=[('preprocess', preprocessor),
                     ('xgb',xgb.XGBRegressor(**param,seed=123))])
    pipe_xgb.fit(X_train, y_train)
    y_pred = pipe_xgb.predict(X_test)
    return r2_score(y_test, y_pred)




In [6]:
import optuna
study = optuna.create_study(direction='maximize', study_name='regression')
study.optimize(objective, n_trials=4)

[I 2023-07-16 21:18:54,206] A new study created in memory with name: regression
[I 2023-07-16 21:19:21,707] Trial 0 finished with value: 0.9659560570695426 and parameters: {'max_depth': 2, 'learning_rate': 0.6675822424218378, 'n_estimators': 365, 'min_child_weight': 3, 'gamma': 0.9450149030377069, 'subsample': 0.11695639558812533, 'colsample_bytree': 0.7385922350941926, 'reg_alpha': 0.6906082560230461, 'reg_lambda': 0.28357924690209596, 'random_state': 425}. Best is trial 0 with value: 0.9659560570695426.
[I 2023-07-16 21:22:04,172] Trial 1 finished with value: 0.9869813388912807 and parameters: {'max_depth': 5, 'learning_rate': 0.8935411856709311, 'n_estimators': 921, 'min_child_weight': 9, 'gamma': 0.28949884141023713, 'subsample': 0.7870588986901805, 'colsample_bytree': 0.9131925556604538, 'reg_alpha': 0.17931551520285063, 'reg_lambda': 0.21906673644264937, 'random_state': 715}. Best is trial 1 with value: 0.9869813388912807.
[I 2023-07-16 21:23:12,090] Trial 2 finished with value: 

In [8]:
print(f"The features for a XGB regressor to obtain the best model, with a r2 score of 0.98, are: {study.best_params}")

The features for a XGB regressor to obtain the best model, with a r2 score of 0.98, are: {'max_depth': 7, 'learning_rate': 0.8175001306595665, 'n_estimators': 860, 'min_child_weight': 3, 'gamma': 0.2413895584937705, 'subsample': 0.49232016984239596, 'colsample_bytree': 0.6645017605869546, 'reg_alpha': 0.04499593228587704, 'reg_lambda': 0.38807719841527816, 'random_state': 189}
