In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import sklearn
from statsmodels.tsa.stattools import adfuller
import optuna
import scipy
from scipy import stats
from numpy import log
from sklearn.model_selection import  train_test_split

1. Data importation and descriptive statistics

In [None]:
# Load the competition files, using the `ts_id` column as the row index.
# The former `squeeze=True` argument is dropped: it only affects single-column
# files (collapsing them to a Series) and was removed from `read_csv` in pandas 2.0.
data_train = pd.read_csv('../input/jane-street-market-prediction/train.csv', index_col='ts_id')
data_validation = pd.read_csv('../input/jane-street-market-prediction/example_test.csv', index_col='ts_id')

In [None]:
# Summary statistics of the training data.
data_train.describe()

In [None]:
# Concise structural summary of the training frame (index, dtypes, memory usage).
data_train.info()

2. Checking for missing values and, if there are any, imputing them with the column mean

In [None]:
# Total number of missing cells in the training data (summed over all columns).
data_train.isna().sum().sum()

In [None]:
# Total number of missing cells in the validation data (summed over all columns).
data_validation.isna().sum().sum()

There are a lot of missing values. They will be imputed with the mean of their column.

In [None]:
# Impute missing values with per-column means computed on the training data only.
# The same training means are applied to the validation frame, so both frames are
# filled consistently and no statistic is derived from the held-out data.
train_means = data_train.mean()
data_train = data_train.fillna(train_means)

# Align the training means to the validation columns; a column that does not exist
# in the training frame falls back to its own validation mean so nothing stays NaN.
validation_fill = train_means.reindex(data_validation.columns).fillna(data_validation.mean())
data_validation = data_validation.fillna(validation_fill)

Checking if all missing values are imputed

In [None]:
# Remaining missing cells in the training data; expected to be 0 after imputation.
data_train.isna().sum().sum()

In [None]:
# Remaining missing cells in the validation data; expected to be 0 after imputation.
data_validation.isna().sum().sum()

    3. Creating additional columns with the weighted returns (weight * resp) over the different time periods.
    They are used to build the action column, which indicates whether a given trade should be made.
    The condition is a positive return over at least one of the periods, so that no money is lost.
    The output is binary:
    1 - the trade shall be executed
    0 - the trade will be skipped
    
    

In [None]:
# Weighted return per response column: retN = weight * resp / resp_1 .. resp_4.
ret_sources = {
    'ret1': 'resp',
    'ret2': 'resp_1',
    'ret3': 'resp_2',
    'ret4': 'resp_3',
    'ret5': 'resp_4',
}
for ret_name, resp_name in ret_sources.items():
    data_train[ret_name] = data_train['weight'] * data_train[resp_name]

# action = 1 when at least one weighted return is strictly positive, otherwise 0.
any_positive = (data_train[list(ret_sources)] > 0).any(axis=1)
data_train['action'] = np.where(any_positive, 1, 0)

Removing ret columns as they are not needed anymore

In [None]:
# The intermediate weighted-return columns were only needed to derive `action`.
data_train.drop(columns=['ret1', 'ret2', 'ret3', 'ret4', 'ret5'], inplace=True)

4. Checking the distribution of the data. A Jarque-Bera test is performed on every feature column; its null hypothesis is that the data is normally distributed, so a small p-value means the feature data does not have a normal distribution

In [None]:
# Run the Jarque-Bera test on each column whose name contains 'feature'
# and print the column name next to the test result.
feature_frame = data_train.loc[:, data_train.columns.str.contains('feature')]
for column_name in feature_frame.columns:
    print(column_name, stats.jarque_bera(feature_frame[column_name]))

The p-value for all columns is 0, so the null hypothesis of normality is rejected: the feature data does not have a normal distribution

6. The dataset is huge, so bootstrapping will be used to draw a random sample.
I will start with a sample of 100000 rows.

Bootstrapping

In [None]:
from sklearn.utils import resample

# Bootstrap sample: 100000 rows drawn from data_train with replacement (seeded).
boot = resample(data_train, replace=True, n_samples=100000, random_state=42)
# Report the shape rather than dumping the whole frame into the cell output.
print('Bootstrap Sample: %s' % (boot.shape,))

# Out-of-bag observations: rows of data_train whose index label never appears in `boot`.
# The previous list comprehension iterated over the DataFrame, which yields column
# labels, and tested them against boot's columns - so it always produced an empty list.
# NOTE(review): assumes the ts_id index labels are unique - confirm.
oob = data_train.loc[~data_train.index.isin(boot.index)]
print('OOB Sample: %s' % (oob.shape,))

5. Separating the training data set into the feature values (A) and the target column (B)

In [None]:
# A: every column of the bootstrap sample whose name contains 'feature'.
# B: the binary `action` target column.
feature_mask = boot.columns.str.contains('feature')
A = boot.loc[:, feature_mask]
B = boot['action']

In [None]:
# 6. MODEL

In [None]:
# Debug aid: list the variables currently defined in the kernel namespace.
%whos

Splitting the data into a train set and a test set

In [None]:
# Hold out 30% of the bootstrap sample for testing; the seed keeps the split repeatable.
A_train, A_test, B_train, B_test = train_test_split(
    A,
    B,
    test_size=0.3,
    random_state=42,
)


In [None]:
# Wrap both splits in XGBoost's DMatrix container for the native training API.
dtrain = xgb.DMatrix(data=A_train, label=B_train)
dtest = xgb.DMatrix(data=A_test, label=B_test)



In [None]:
# Debug aid: list the variables currently defined in the kernel namespace.
%whos


In [None]:
def objective(trial):
    """Optuna objective: train one XGBoost booster and score it on the test split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy of the rounded predictions for `dtest` against `B_test`.

    Notes
    -----
    Reads the notebook-level variables `dtrain`, `dtest` and `B_test`.
    """
    # Import explicitly: a bare `import sklearn` does not guarantee that the
    # `sklearn.metrics` submodule has been loaded.
    from sklearn.metrics import accuracy_score

    # `n_estimators` is the scikit-learn wrapper's name for the number of trees.
    # The native `xgb.train` API takes it as `num_boost_round` (default 10), so it
    # must be passed explicitly or the tuned value has no effect on the model.
    num_boost_round = trial.suggest_int('n_estimators', 400, 600)

    params = {
        'max_depth': trial.suggest_int('max_depth', 10, 20),
        # `suggest_float` replaces the deprecated `suggest_uniform` (same range semantics).
        'learning_rate': trial.suggest_float('learning_rate', 0.01, .1),
        'subsample': trial.suggest_float('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 1),
        'gamma': trial.suggest_int('gamma', 0, 10),
        # NOTE(review): 'gpu_hist' needs a GPU-enabled XGBoost build; newer XGBoost
        # releases appear to prefer tree_method='hist' with device='cuda' - confirm.
        'tree_method': 'gpu_hist',
        'objective': 'binary:logistic',
    }

    bst = xgb.train(params, dtrain, num_boost_round=num_boost_round)
    preds = bst.predict(dtest)
    # binary:logistic yields probabilities; round them to 0/1 class labels.
    pred_labels = np.rint(preds)
    return accuracy_score(B_test, pred_labels)

In [None]:
# Seed the (default) TPE sampler so the sequence of sampled hyperparameters is
# repeatable across Restart & Run All, like the other seeded steps in this notebook.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5)

In [None]:
# Visualise how much each hyperparameter contributed to the objective value.
fig = optuna.visualization.plot_param_importances(study)
fig.show()

In [None]:
# Report the best objective value found and the hyperparameters that produced it.
print('Best trial: score {}, params {}'.format(study.best_value, study.best_params))

In [None]:
# Start from the tuned values and add the fixed settings that were not searched.
best_params = {
    **study.best_trial.params,
    'tree_method': 'gpu_hist',
    'objective': 'binary:logistic',
}
clf = xgb.XGBClassifier(**best_params)

In [None]:
# Fit on the training split only. Fitting on the full A/B would include the rows of
# A_test, which are predicted afterwards, so those predictions would be made on
# data the model has already seen.
clf.fit(A_train, B_train)

In [None]:
# Debug aid: list the variables currently defined in the kernel namespace.
%whos

In [None]:

# Predicted class labels for the test-split features.
preds = clf.predict(A_test)

In [None]:
# Show the predicted labels for the test split.
print(preds)