# Imports

In [1]:
import pandas as pd

# Read Data

In [2]:
X_train = pd.read_csv("/kaggle/input/ibm-classification-feature-selection/X_train_normalized.csv", index_col=[0])
X_test = pd.read_csv("/kaggle/input/ibm-classification-feature-selection/X_test_normalized.csv", index_col=[0])
y_train = pd.read_csv("/kaggle/input/ibm-classification-feature-selection/y_train.csv", index_col=[0])
y_test = pd.read_csv("/kaggle/input/ibm-classification-feature-selection/y_test.csv", index_col=[0]).reset_index(drop=True)

In [3]:
X_train.head(5)

Unnamed: 0,Volume,Sector_Basic Materials,Sector_Communication Services,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Real Estate,...,Open 60-Day Shifted Differenced,Adj Close 7-Day Upper Bollinger Band Differenced,True Range Differenced,True Range 26-Day SMA Differenced,Adj Close Differenced,High Differenced,Low Differenced,Open Differenced,Date,Symbol
0,-0.082146,-0.211123,-0.211123,-0.355936,-0.279668,4.506073,-0.394063,-0.386806,-0.411949,-0.256814,...,-0.099365,-0.065462,0.021662,0.207475,-0.143825,0.001395,-0.21695,-0.124697,2014-09-04,14
1,-0.199298,-0.211123,-0.211123,-0.355936,-0.279668,4.506073,-0.394063,-0.386806,-0.411949,-0.256814,...,0.128093,-0.026616,-0.135444,0.292787,-0.199996,-0.087753,-0.355281,-0.261008,2014-09-04,262
2,0.256066,-0.211123,-0.211123,-0.355936,-0.279668,-0.221923,-0.394063,-0.386806,-0.411949,-0.256814,...,-0.127797,-0.051266,-0.014738,0.014914,0.041423,-0.083876,-0.089539,-0.100843,2014-09-04,105
3,0.809517,-0.211123,-0.211123,-0.355936,-0.279668,-0.221923,2.537663,-0.386806,-0.411949,-0.256814,...,0.163632,0.078857,0.052865,0.340484,0.049223,0.117676,0.048791,-0.012241,2014-09-04,395
4,-0.205682,-0.211123,-0.211123,2.809494,-0.279668,-0.221923,-0.394063,-0.386806,-0.411949,-0.256814,...,0.106768,-0.007671,-0.240677,-0.105232,-0.049846,-0.320311,-0.064057,-0.346203,2014-09-04,202


In [4]:
X_test.head(5)

Unnamed: 0,Volume,Sector_Basic Materials,Sector_Communication Services,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Real Estate,...,Open 60-Day Shifted Differenced,Adj Close 7-Day Upper Bollinger Band Differenced,True Range Differenced,True Range 26-Day SMA Differenced,Adj Close Differenced,High Differenced,Low Differenced,Open Differenced,Date,Symbol
0,-0.263726,-0.211123,4.736577,-0.355936,-0.279668,-0.221923,-0.394063,-0.386806,-0.411949,-0.256814,...,-0.181107,-0.262042,-0.154274,-1.323139,-0.492231,-0.014108,-0.377124,0.161556,2021-01-04,139
1,-0.186121,-0.211123,-0.211123,-0.355936,3.575666,-0.221923,-0.394063,-0.386806,-0.411949,-0.256814,...,-0.216647,-0.008371,-0.027088,-0.381858,-0.122099,0.016899,-0.064058,0.035468,2021-01-04,255
2,-0.309133,-0.211123,-0.211123,-0.355936,-0.279668,-0.221923,2.537663,-0.386806,-0.411949,-0.256814,...,-0.778181,-0.13468,-0.002115,-0.513907,-0.962961,0.303723,-0.621021,0.386469,2021-01-04,378
3,-0.225167,-0.211123,-0.211123,-0.355936,-0.279668,-0.221923,-0.394063,-0.386806,2.427487,-0.256814,...,0.348439,-0.106183,-0.067764,-0.330715,-0.801156,0.156436,-0.497253,0.076363,2021-01-04,141
4,0.330085,-0.211123,-0.211123,-0.355936,-0.279668,-0.221923,-0.394063,-0.386806,-0.411949,3.893864,...,-0.056716,-0.11587,0.046599,-0.063529,-0.173969,0.040155,-0.140504,0.015022,2021-01-04,253


In [5]:
y_train.head(5)

Unnamed: 0,1-week Forward Return Sign
0,0.0
1,0.0
2,1.0
3,0.0
4,1.0


In [6]:
y_test.head(5)

Unnamed: 0,1-week Forward Return Sign
0,1.0
1,0.0
2,1.0
3,1.0
4,1.0


# Fit XGBoost Model

In [7]:
from xgboost import XGBClassifier

### Parameter Optimization

https://xgboost.readthedocs.io/en/stable/parameter.html
* We draw from uniform distribution when we want to give equal weight to all values in the range
* We draw from loguniform distribution when we want to give more weight to smaller values in the range. This is used for parameters where absolute change in its value has more of an effect at the lower end than the same absolute change at the upper end. 

In [8]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

def objective(trial, X_train, y_train):
    """Objective function returning auc to be used in hyperparameter tuning"""
    
    # Parameters to optimize
    params = {
        'booster': 'gbtree', # Default booster
        'device': 'cuda', # Use GPU
        'verbosity': 0, # Logging
        'objective': 'binary:logistic', # Learn using logistic regression probabilities
        'eta': 0.005, # Learning rate,
        'max_depth': 4, # Max depth of any 1 tree
        'eval_metric': 'auc', # Use auc,
        'scale_pos_weight': (y_train == 0).sum().values[0] / (y_train == 1).sum().values[0], # Handle class imbalance
        'n_estimators': 25000, # Number of trees in the model. Adding trees should improve model up to a point. Can use early stopping rounfs
        'gamma': trial.suggest_uniform('gamma', 0, 1), # Minimum reduction in loss function required to make a partition in the tree. Larger gamma, more conservative with parttions.
        'min_child_weight': trial.suggest_int('min_child_weight', 0, 10), # Minimum sum of weights needed in a child. If a tree partition results in a leaf with sum of weights less than this, there will be no further partitioning
        'lambda': trial.suggest_uniform('lambda', 0, 1), # L2 regularisation. Increasing value will make model more conservative
        'alpha': trial.suggest_uniform('alpha', 0, 1),# L1 regularisation. Increasing value will make model more conservative
    }
    
    # Time series split for cross validation
    tscv = TimeSeriesSplit(n_splits=5)
    
    # List to store AUC scores for each fold
    scores = []
    
    # Gap size for purging (4000 rows roughly 10 days)
    gap = 4000
    
    # For each fold, get AUC score
    for train_index, val_index in tscv.split(X_train):
        
        # Split data into training fold and validation fold, purging with a gap
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index[gap:]]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index[gap:]]
        
        # Fit model with these parameters
        model = XGBClassifier(**params) 
        model.fit(
            X_train_fold,
            y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            verbose=0, # Logging
            early_stopping_rounds=500 # Stop adding more estimators if no improvement
        )

        # Predict probabilities
        preds = model.predict_proba(X_val_fold)[:, 1]

        # Get AUC
        auc = roc_auc_score(y_val_fold, preds)
        scores.append(auc)
    
    # Calculate mean AUC across folds
    mean_auc = np.mean(scores)
    
    return mean_auc

In [9]:
# Create study
import optuna
study = optuna.create_study(direction='maximize')

# Optimize hyperparameters
study.optimize(
    lambda trial: objective(
        trial, 
        X_train=X_train.drop(['Date', 'Symbol'], axis=1), 
        y_train=y_train
    ), 
    n_trials=100
)

# Get best parameters
best_params = study.best_trial.params

# Add fixed params that aren't stored in optuna's best params object
best_params['booster'] = 'gbtree'
best_params['device'] = 'cuda'
best_params['verbosity'] = 0
best_params['objective'] = 'binary:logistic'
best_params['eval_metric'] = 'auc'
best_params['eta'] = 0.005
best_params['n_estimators'] = 25000
best_params['max_depth'] = 5
best_params['scale_pos_weight'] = (y_train == 0).sum().values[0] / (y_train == 1).sum().values[0]

[I 2024-06-08 10:41:22,878] A new study created in memory with name: no-name-ca303152-5abf-413b-83d3-ada30851460c
  'gamma': trial.suggest_uniform('gamma', 0, 1), # Minimum reduction in loss function required to make a partition in the tree. Larger gamma, more conservative with parttions.
  'lambda': trial.suggest_uniform('lambda', 0, 1), # L2 regularisation. Increasing value will make model more conservative
  'alpha': trial.suggest_uniform('alpha', 0, 1),# L1 regularisation. Increasing value will make model more conservative
[I 2024-06-08 10:42:12,881] Trial 0 finished with value: 0.5477393394518615 and parameters: {'gamma': 0.06732882488631309, 'min_child_weight': 0, 'lambda': 0.14912055444395533, 'alpha': 0.6720337364235318}. Best is trial 0 with value: 0.5477393394518615.
  'gamma': trial.suggest_uniform('gamma', 0, 1), # Minimum reduction in loss function required to make a partition in the tree. Larger gamma, more conservative with parttions.
  'lambda': trial.suggest_uniform('l

In [10]:
###########################################################################
# Train a model with optimized parameters to work out optimal n_estimators
###########################################################################
# Gap size for purging (4000 rows roughly 10 days)
gap = 4000

# List to store optimal number of estimators for each fold
n_estimators = []

# Time series split for cross validation
tscv = TimeSeriesSplit(n_splits=5)
    
# For each fold, get AUC score
for train_index, val_index in tscv.split(X_train.drop(['Date', 'Symbol'], axis=1)):

    # Split data into training fold and validation fold, purging with a gap of 4000 rows (roughly 10 days)
    X_train_fold, X_val_fold = X_train.drop(['Date', 'Symbol'], axis=1).iloc[train_index], X_train.drop(['Date', 'Symbol'], axis=1).iloc[val_index[gap:]]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index[gap:]]

    # Fit model with these parameters
    model = XGBClassifier(**best_params) 
    model.fit(
        X_train_fold,
        y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        verbose=0, # Logging
        early_stopping_rounds=500 # Stop adding more estimators if no improvement
    )
    
    # Get optimal number of estimators
    best_iter = model.best_iteration
    n_estimators.append(best_iter)

# Get number of estimators for best params, scaling it
best_params['n_estimators'] = int(np.mean(n_estimators) * 1.05)



In [11]:
# Print and save best parameters
import pickle
with open('best_params.pkl', 'wb') as f:
    pickle.dump(best_params, f)
print('Number of finished trials:', len(study.trials))
print('Best trial:', best_params)

Number of finished trials: 100
Best trial: {'gamma': 0.4897634275086528, 'min_child_weight': 1, 'lambda': 0.950630912339358, 'alpha': 0.18663138383301178, 'booster': 'gbtree', 'device': 'cuda', 'verbosity': 0, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'eta': 0.005, 'n_estimators': 495, 'max_depth': 5, 'scale_pos_weight': 0.7928676766322563}


In [12]:
# Plot optimization history
from optuna.visualization import plot_optimization_history
fig = plot_optimization_history(study)
fig.show()

In [13]:
# Plot hyperparameter importances
from optuna.visualization import plot_param_importances
fig = plot_param_importances(study)
fig.show()

### Train Model

In [14]:
# Train model on whole training set with best params
xgb = XGBClassifier(**best_params)
xgb.fit(
    X_train.drop(['Date', 'Symbol'], axis=1), 
    y_train,
    verbose=0
)

In [15]:
# Save model
with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb, f)