In [1]:
import pandas as pd
import numpy as np
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

# Read the train/test feature and target CSVs from the working directory;
# the first column of each file becomes the row index.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_name, index_col=[0])
    for csv_name in ('X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv')
)
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
# Full candidate column set (58 names), laid out one prefix group per line.
# Element order is unchanged from the original flat list. None of the cells
# visible in this notebook read features_list; modelling uses fil_feat.
features_list = [
    'mkt_op_carrier_difference',
    # Type_* columns
    'Type_Cold', 'Type_Fog', 'Type_Hail', 'Type_Precipitation', 'Type_Rain', 'Type_Snow',
    # dow_* columns
    'dow_Friday', 'dow_Monday', 'dow_Saturday', 'dow_Sunday', 'dow_Thursday', 'dow_Tuesday',
    # arr_* / dep_* columns
    'arr_dawn', 'arr_evening', 'arr_morning', 'arr_noon',
    'dep_dawn', 'dep_evening', 'dep_morning', 'dep_noon',
    # aircraft_* and muc_* columns
    'aircraft_4', 'aircraft_6',
    'muc_AS', 'muc_B6', 'muc_DL', 'muc_F9', 'muc_G4', 'muc_HA', 'muc_NK', 'muc_UA', 'muc_WN',
    # remaining individually named columns
    'Passengers_Seat_Ratio', 'distance', 'Taxi_Holdup', 'crs_elapsed_time',
    'origin_label', 'dest_label',
    # origin_0 .. origin_9, then dest_0 .. dest_9
    *[f'origin_{i}' for i in range(10)],
    *[f'dest_{i}' for i in range(10)],
]

#### Filtered Feature List (chosen by trial and error and by feature importance)

In [3]:
# Columns actually fed to the models (45 names). Element order is unchanged;
# this same list labels the feature-importance scores further down.
fil_feat = [
    'mkt_op_carrier_difference',
    # Type_* columns
    'Type_Cold', 'Type_Fog', 'Type_Hail', 'Type_Precipitation', 'Type_Rain', 'Type_Snow',
    # dow_* columns
    'dow_Friday', 'dow_Monday', 'dow_Saturday', 'dow_Sunday', 'dow_Thursday', 'dow_Tuesday',
    # arr_* / dep_* columns
    'arr_dawn', 'arr_evening', 'arr_morning', 'arr_noon',
    'dep_dawn', 'dep_evening', 'dep_morning', 'dep_noon',
    # individually named numeric columns
    'Passengers_Seat_Ratio', 'distance', 'Taxi_Holdup', 'crs_elapsed_time',
    # origin_0 .. origin_9, then dest_0 .. dest_9
    *[f'origin_{i}' for i in range(10)],
    *[f'dest_{i}' for i in range(10)],
]

## XGBoost Base Model

In [4]:
import xgboost as xgb

In [5]:
# Baseline regressor: no hyperparameters are passed, so every setting is the
# library default (the resolved values appear in the fit output below).
xg_reg = xgb.XGBRegressor()

In [6]:
# Train the baseline on the filtered columns only. y_train was read as a
# DataFrame, so it is flattened to a 1-D array before being used as the target.
xg_reg.fit(X=X_train[fil_feat], y=y_train.to_numpy().ravel())

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [7]:
# Baseline predictions for the test rows, using the same filtered columns the
# model was fitted on.
y_pred = xg_reg.predict(X_test[fil_feat])

#### Check Feature Importance (Ranked by XGBoost)

In [8]:
# feature_importances_ carries no column names, so label each score with the
# columns the model was fitted on. Note: importance_df is a pandas Series,
# despite the _df suffix.
importance = xg_reg.feature_importances_
importance_df = pd.Series(data=importance, index=fil_feat)

In [9]:
# Display the baseline importances ranked from highest to lowest score.
importance_df.sort_values(ascending=False)

dep_morning                  0.057716
Taxi_Holdup                  0.051784
dest_7                       0.038953
origin_7                     0.035915
origin_2                     0.035768
dow_Saturday                 0.035414
origin_1                     0.030707
arr_evening                  0.029565
dep_evening                  0.027424
origin_5                     0.026246
origin_4                     0.025641
Type_Fog                     0.024262
Type_Cold                    0.023990
Type_Snow                    0.023697
dow_Sunday                   0.023478
origin_3                     0.023360
dep_dawn                     0.022956
dow_Friday                   0.022722
dow_Thursday                 0.021377
dest_1                       0.021367
origin_6                     0.021219
dest_9                       0.021154
arr_morning                  0.020735
origin_9                     0.020635
mkt_op_carrier_difference    0.020622
dest_6                       0.020260
origin_8    

### Check Scores

In [10]:
# Test-set metrics for the baseline model. R2 is reported twice: once through
# the estimator's own score() and once through r2_score on the stored
# predictions; the two values are expected to match.
print('The R2 score is:', xg_reg.score(X=X_test[fil_feat], y=y_test))
print('The R2 score is:', r2_score(y_true=y_test, y_pred=y_pred))
print('The MSE is:', mean_squared_error(y_true=y_test, y_pred=y_pred))
print('The MAE is:', mean_absolute_error(y_true=y_test, y_pred=y_pred))

The R2 score is: 0.11286055431113862
The R2 score is: 0.11286055431113862
The MSE is: 2678.2325840332996
The MAE is: 24.11173859169771


## Optimize HyperParameters

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
# Search space handed to GridSearchCV: a list holding one grid, with a single
# candidate value per hyperparameter.
xg_params_grid = [
    dict(n_estimators=[45], max_depth=[5], reg_alpha=[0.57]),
]

In [13]:
# Grid search over xg_params_grid starting from the baseline estimator.
# cv and scoring are not passed, so scikit-learn's defaults apply;
# n_jobs=-1 requests all available processors.
gs_gx = GridSearchCV(xg_reg, xg_params_grid, n_jobs=-1)

In [14]:
# Run the cross-validated search on the training data (filtered columns only),
# with the target DataFrame flattened to a 1-D array.
gs_gx.fit(X=X_train[fil_feat], y=y_train.to_numpy().ravel())

GridSearchCV(estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0, gpu_id=-1,
                                    importance_type='gain',
                                    interaction_constraints='',
                                    learning_rate=0.300000012, max_delta_step=0,
                                    max_depth=6, min_child_weight=1,
                                    missing=nan, monotone_constraints='()',
                                    n_estimators=100, n_jobs=4,
                                    num_parallel_tree=1, random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, subsample=1,
                                    tree_method='exact', validate_parameters=1,
                                    verbosity=None),
             n_jobs=-1,
 

In [15]:
# Report the winning hyperparameter combination and its cross-validation score.
# (gs_gx.best_estimator_ is also available if the full model repr is wanted.)
print('The best parameters are:', gs_gx.best_params_)
print('The best CV score was:', gs_gx.best_score_)

The best parameters are: {'max_depth': 5, 'n_estimators': 45, 'reg_alpha': 0.57}
The best CV score was: 0.11602180747596143


### NOTE: The grid search was run in batches because of its computational cost. Fitting on a large, high-cardinality dataset meant long wait times for each run.

### Optimized XGBoost Model

In [16]:
# Final model: hyperparameters are written out by hand and match the best
# parameters printed by the grid search above; all other settings stay at
# the library defaults.
xg_model = xgb.XGBRegressor(max_depth=5, n_estimators=45,reg_alpha=0.57)

In [17]:
# Train the tuned model on the same filtered columns as the baseline, with the
# target DataFrame flattened to a 1-D array.
xg_model.fit(X=X_train[fil_feat], y=y_train.to_numpy().ravel())

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=45, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0.57, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [18]:
# Tuned-model predictions for the test rows.
# NOTE: this rebinds y_pred, replacing the baseline predictions assigned
# earlier; the metric cell below therefore scores the tuned model.
y_pred = xg_model.predict(X_test[fil_feat])

In [19]:
# Label the tuned model's importance scores with the columns it was fitted on.
# Note: importance_df_model is a pandas Series, despite the _df in its name.
importance_model = xg_model.feature_importances_
importance_df_model = pd.Series(data=importance_model, index=fil_feat)

In [20]:
# Display the tuned model's importances ranked from highest to lowest score.
importance_df_model.sort_values(ascending=False)

Taxi_Holdup                  0.089861
dep_morning                  0.081416
dest_7                       0.035911
origin_7                     0.034678
origin_6                     0.030319
arr_evening                  0.028679
dow_Saturday                 0.027361
Type_Fog                     0.027100
Type_Cold                    0.027018
dep_evening                  0.025810
dest_5                       0.025135
dest_8                       0.024622
mkt_op_carrier_difference    0.024315
dow_Thursday                 0.023323
origin_2                     0.023209
dest_2                       0.022676
dow_Monday                   0.022124
origin_5                     0.022039
dow_Tuesday                  0.021484
origin_3                     0.021371
origin_8                     0.020928
dow_Friday                   0.020846
arr_morning                  0.020595
origin_1                     0.019886
dow_Sunday                   0.019070
dest_4                       0.018934
dest_9      

In [21]:
# Test-set metrics for the tuned model. R2 is reported twice: once through the
# estimator's own score() and once through r2_score on the stored predictions;
# the two values are expected to match.
print('The R2 score is:', xg_model.score(X=X_test[fil_feat], y=y_test))
print('The R2 score is:', r2_score(y_true=y_test, y_pred=y_pred))
print('The MSE is:', mean_squared_error(y_true=y_test, y_pred=y_pred))
print('The MAE is:', mean_absolute_error(y_true=y_test, y_pred=y_pred))

The R2 score is: 0.1186715106110856
The R2 score is: 0.1186715106110856
The MSE is: 2660.6895781591475
The MAE is: 24.254642642709292


### Export Model to Pickle

In [22]:
import pickle

In [24]:
# Persist the tuned model to disk. The file is opened in a `with` block so the
# handle is flushed and closed even if pickling fails; the original left the
# handle from open() unclosed.
# NOTE(review): the file name spells "gxboost"; it is kept unchanged because
# any code that loads the model may depend on this exact path.
# Only unpickle this file from a trusted location, since pickle.load can
# execute arbitrary code.
with open('final_gxboost_model.sav', 'wb') as model_file:
    pickle.dump(xg_model, model_file)