#### Create our first numerai model - will use ensemble of randomforest regressor, support vector regression and conditional inference. Will also use a cross-validation approach along with random search for hyperparameter tuning

In [13]:
!pip install xgboost



In [60]:
import pandas as pd
import numpy as np
import numerapi
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
#from xgboost import XGBRegressor
#from lightgbm import LGBMRegressor
#from rgf.sklearn import RGFRegressor
#from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import VotingRegressor

In [None]:
napi = numerapi.NumerAPI("public_id", "secret_key")
napi.download_current_dataset(unzip=True)
napi.upload_predictions("predictions.csv", model_id="model_id")

In [17]:
train_data=pd.read_csv('../data/numerai_training_data.csv').set_index('id')
test_data=pd.read_csv('../data/numerai_tournament_data.csv').set_index('id')

In [38]:
feature_names=[c for c in train_data.columns if 'feature' in c]

#### Random Forest

Lets consider a number of parameters to tune
- number of forests
- max features to split on each node
- max depth of the tree
- boostrap
- min samples split: minimum number of data points before node is split
- min sample leaf: minimum number of data points allowed in a leaf node

In [48]:
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 2000, num = 15)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

In [49]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [50]:
rf = RandomForestRegressor()
rf_randomsearch=RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100)
model_rf=rf_randomsearch.fit(train_data[feature_names],train_data['target'])









In [51]:
print('best model: {}'.format(model_rf.best_estimator_))
print('best score: {}'.format(model_rf.best_score_))
print('best model_params: {}'.format(model_rf.best_params_))

best model: RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=30,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=4, min_samples_split=5,
                      min_weight_fraction_leaf=0.0, n_estimators=585,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
best score: nan
best model_params: {'n_estimators': 585, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 30, 'bootstrap': False}


In [52]:
rf_model_best=model_rf.best_estimator_
rf_model_best.fit(train_data[feature_names],train_data['target'])
predictions=rf_model_best.predict(test_data[feature_names])

In [None]:
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=1)

#### gradient boosting

In [53]:
loss=['ls', 'lad', 'huber', 'quantile']
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 2000, num = 15)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
learning_rate = [0.1,0.01,0.001]


random_grid = {'loss': loss,
               'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'learning_rate':learning_rate}

In [54]:
gb = GradientBoostingRegressor()
gb_randomsearch=RandomizedSearchCV(estimator=gb, param_distributions=random_grid, n_iter=100)
gb_model=gb_randomsearch.fit(train_data[feature_names][:5],train_data['target'][:5])









In [55]:
print('best model: {}'.format(model_rf.best_estimator_))
print('best score: {}'.format(model_rf.best_score_))
print('best model_params: {}'.format(model_rf.best_params_))

best model: RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=30,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=4, min_samples_split=5,
                      min_weight_fraction_leaf=0.0, n_estimators=585,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
best score: nan
best model_params: {'n_estimators': 585, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 30, 'bootstrap': False}


In [56]:
gb_model_best=model_rf.best_estimator_
gb_model_best.fit(train_data[feature_names][:5],train_data['target'][:5])
predictions=gb_model_best.predict(test_data[feature_names])

In [57]:
predictions

array([0.4, 0.4, 0.4, ..., 0.4, 0.4, 0.4])

#### xgboost regressor

In [None]:
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 2000, num = 15)]
learning_rate=[0.0001, 0.001, 0.1]
min_child_weight=[0.1, 1, 5, 10, 50]
gamma=[0.5, 1, 1.5, 2, 5]
subsample=[0.6, 0.8, 1.0]
colsample_bytree=[0.6, 0.8, 1.0]
max_depth=[5, 10, 25, 50]
learning_rate=[0.0001, 0.001, 0.1]
n_estimators=[50, 100, 250, 500]
reg_alpha=[0.0001, 0.001, 0.1, 1]
reg_lambda=[0.0001, 0.001, 0.1, 1]

random_grid = {
    'min_child_weight':[0.1, 1, 5, 10, 50],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [5, 10, 25, 50],
    'learning_rate': [0.0001, 0.001, 0.1],
    'n_estimators': [50, 100, 250, 500],
    'reg_alpha': [0.0001, 0.001, 0.1, 1],
    'reg_lambda': [0.0001, 0.001, 0.1, 1]
    }

In [None]:
xgb = XGBRegressor()
xgb_randomsearch=RandomizedSearchCV(estimator=xgb,param_distributions=random_grid,n_iter=100)
xgb_model=xgb_randomsearch.fit(train_data[feature_names],train_data['target'])

In [None]:
print('best model: {}'.format(model_rf.best_estimator_))
print('best score: {}'.format(model_rf.best_score_)
print('best model_params: {}'.format(model_rf.best_params_)

In [None]:
xgb_model_best=model_rf.best_estimator_
xgb_model_best.fit(train_data[feature_names][:5],train_data['target'][:5])
predictions=xgb_model_best.predict(test_data[feature_names])

#### lgbm regressor

In [None]:
lgbm = LGBMBoostRegressor()

#### catboost regressor

In [None]:
cb_model = CatBoostRegressor()

#### support vector Regressor

In [41]:
kernel=['linear', 'poly', 'rbf', 'sigmoid']
degree=[2,3,4,5]
gamma=['scale', 'auto']
C=[1,2,3,4,5]
epsilon=[0.1,0.2,0.3,0.4,0.5]
shrinking=[True,False]

random_grid={'kernel':kernel,
             'degree':degree,
             'gamma':gamma,
             'C': C,
             'epsilon':epsilon,
             'shrinking':shrinking}

In [42]:
svr = SVR()
svr_randomsearch=RandomizedSearchCV(estimator=svr,param_distributions=random_grid,n_iter=100)
model_svr=svr_randomsearch.fit(train_data[feature_names][:20],train_data['target'][:20])



In [43]:
print('best model: {}'.format(model_svr.best_estimator_))
print('best score: {}'.format(model_svr.best_score_))
print('best model_params: {}'.format(model_svr.best_params_))

best model: SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
    kernel='sigmoid', max_iter=-1, shrinking=False, tol=0.001, verbose=False)
best score: -0.37863340992660494
best model_params: {'shrinking': False, 'kernel': 'sigmoid', 'gamma': 'auto', 'epsilon': 0.2, 'degree': 3, 'C': 1}


In [62]:
svr_model_best=model_svr.best_estimator_
svr_model_best.fit(train_data[feature_names][:5],train_data['target'][:5])
predictions=svr_model_best.predict(test_data[feature_names])

#### Ensemble methods with random forest, gradient boosting and support vector regressor

In [None]:
eclf1 = VotingRegressor(estimators=[('rf', rf_model_best), ('gb', gb_model_best), ('svr', svr_model_best)])
eclf1 = eclf1.fit(train_data[feature_names][:5], train_data['target'][:5])
predictions=eclf1.predict(test_data)