In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

RANDOM_SEED = 671993

In [2]:
df = pd.read_csv('../data/forbes_2000_iqr_range_feature_engineering.csv')

In [3]:
def train_test_split_by_year(x_df: pd.DataFrame, y_df: pd.DataFrame):
    X_train = x_df[x_df.test == False].drop('test', axis=1).to_numpy()
    X_test = x_df[x_df.test].drop('test', axis=1).to_numpy()
    y_train = y_df[x_df.test == False].drop('test', axis=1).to_numpy().flatten()
    y_test = y_df[x_df.test].drop('test', axis=1).to_numpy().flatten()

    return X_train, X_test, y_train, y_test

In [4]:
x=df.drop(['company','market_value','year','industry','sector','country','continent'], axis=1)
y=df.loc[:,['market_value','test']]

X_train, X_test, y_train, y_test = train_test_split_by_year(x, y)

In [5]:
params = {
    'max_depth': [3, 5, 6, 10, 15, 20],
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.2, 0.3],
    'subsample': np.arange(0.5, 1.0, 0.1),
    'colsample_bytree': np.arange(0.4, 1.0, 0.1),
    'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
    'n_estimators': [100, 500, 1000],
    'reg_alpha':[0, 1, 10, 50, 100],
    'reg_lambda': [1, 10, 50, 100]
}


# Instantiate an XGBoost object with hyperparameters
xgb_reg = xgb.XGBRegressor(
    n_jobs=2,
    objective='reg:squarederror',
    booster='gbtree',
    random_state=RANDOM_SEED
)

In [6]:
clf = RandomizedSearchCV(
    estimator=xgb_reg,
    param_distributions=params,
    scoring='neg_mean_squared_error',
    n_iter=25,
    verbose=1250
)

clf.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5; 1/25] START colsample_bylevel=0.8999999999999999, colsample_bytree=0.5, learning_rate=0.01, max_depth=15, n_estimators=1000, reg_alpha=0, reg_lambda=50, subsample=0.5
[CV 1/5; 1/25] END colsample_bylevel=0.8999999999999999, colsample_bytree=0.5, learning_rate=0.01, max_depth=15, n_estimators=1000, reg_alpha=0, reg_lambda=50, subsample=0.5;, score=-63.562 total time= 1.2min
[CV 2/5; 1/25] START colsample_bylevel=0.8999999999999999, colsample_bytree=0.5, learning_rate=0.01, max_depth=15, n_estimators=1000, reg_alpha=0, reg_lambda=50, subsample=0.5
[CV 2/5; 1/25] END colsample_bylevel=0.8999999999999999, colsample_bytree=0.5, learning_rate=0.01, max_depth=15, n_estimators=1000, reg_alpha=0, reg_lambda=50, subsample=0.5;, score=-26.156 total time= 1.3min
[CV 3/5; 1/25] START colsample_bylevel=0.8999999999999999, colsample_bytree=0.5, learning_rate=0.01, max_depth=15, n_estimators=1000, reg_alpha=0, reg_lambda=50, subsam

RandomizedSearchCV(estimator=XGBRegressor(base_score=None, booster='gbtree',
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n_...
                   param_distributions={'colsample_bylevel': array([0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'colsample_bytree': array([0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
           

In [7]:
print("Best parameters:", clf.best_params_)
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))


Best parameters: {'subsample': 0.5, 'reg_lambda': 50, 'reg_alpha': 0, 'n_estimators': 1000, 'max_depth': 15, 'learning_rate': 0.01, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.8999999999999999}
Lowest RMSE:  5.454528827536231
