In [1]:
import pandas as pd
import numpy as np
import scipy
import sklearn

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, SelectPercentile, chi2, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import metrics

In [4]:
import optuna
from optuna.visualization import plot_intermediate_values

In [5]:
import numpy as np
import xgboost as xgb
import lightgbm as lgb

In [6]:
SEED = 85

In [7]:
from hypster_xgboost import *
from hypster import *

# Get Dataset

In [8]:
from scipy.sparse import csr_matrix, save_npz, load_npz

In [9]:
dataset = "adult" #adult, boston

In [10]:
if dataset=="adult":
    X_train = pd.read_pickle("./data/adult_X_train.pkl")
    y_train = pd.read_pickle("./data/adult_y_train.pkl")
    X_test = pd.read_pickle("./data/adult_X_test.pkl")
    y_test = pd.read_pickle("./data/adult_y_test.pkl")
    cat_columns = X_train.select_dtypes(include="object").columns
elif dataset=="newsgroup":
    X_train = load_npz("./data/X_train.npz")
    y_train = pd.read_pickle("./data/y_train.pkl")
    X_test = load_npz("./data/X_test.npz")
    y_test = pd.read_pickle("./data/y_test.pkl")
    cat_columns=None
else:
    X_train = pd.read_pickle("./data/boston_X_train.pkl")
    y_train = pd.read_pickle("./data/boston_y_train.pkl")
    X_test = pd.read_pickle("./data/boston_X_test.pkl")
    y_test = pd.read_pickle("./data/boston_y_test.pkl")
    cat_columns = None

In [11]:
#X_train = X_train.sample(n=10000, random_state=SEED, axis=0)

In [12]:
#y_train = y_train.iloc[X_train.index].reset_index(drop=True)
#X_train.reset_index(inplace=True, drop=True)

In [13]:
#pipeline - pipeline_objective OR regular pipeline
#consider making pre-made steps with best practices (FS, scaling, etc...) then add option to concat to make one pipeline 

In [14]:
#pipeline = Pipeline([("sel", SelectPercentile(chi2))])
#pipe_params = {"sel__percentile" : optuna.distributions.IntUniformDistribution(1,100)}
pipeline = None
pipe_params = None

In [15]:
#TODO: automatic seed
xgb_linear = XGBClassifierHypster(booster_list=['gblinear']
                               ,param_dict={'nthread' : 1
                                            #TODO check what happens when you run over parameters in optuna
                               #,'subsample' : 0.9
                               }
                               )
#gb_dart = XGBClassifierHypster(booster_list=['dart'])
#xgb_tree = XGBClassifierHypster(booster_list=['gbtree', 'dart'], user_param_dict={'max_depth' : 2})
xgb_tree = XGBClassifierHypster(booster_list=['gbtree', 'dart'], 
                               param_dict={'max_depth' : optuna.distributions.IntUniformDistribution(2, 20)
                                               ,'nthread' : 1
                                               #,'subsample' : 0.9 
                                               })
#lgb_estimator = LGBClassifierOptuna()
#sgd_estimator = SGDClassifierOptuna()
#rf_estimator  = RFClassifierOptuna()

In [16]:
estimators = [xgb_linear, xgb_tree]#, sgd|_estimator]

In [17]:
clf = HyPSTERClassifier(estimators, pipeline, pipe_params,
                        scoring="accuracy", cv=StratifiedKFold(n_splits=3, random_state=SEED), #sampler=sampler, 
                        refit=False, random_state=SEED, n_jobs=-1)

In [18]:
%%time
clf.fit(X_train, y_train, cat_columns=cat_columns, n_trials_per_estimator=60)

XGBoost Classifier
Score: 0.75919
Score: 0.75919
Score: 0.75919
Score: 0.75919
Score: 0.75919
Score: 0.75919
Score: 0.84177
Score: 0.81103
Score: 0.79586
Score: 0.84887
Score: 0.84887
Score: 0.85129
Score: 0.85169
Score: 0.85126
Score: 0.84967
Score: 0.85191
Score: 0.84709
Score: 0.84994
Score: 0.84988
Score: 0.84991
Score: 0.84997
Score: 0.85062
XGBoost Classifier
Score: 0.85676
Score: 0.85953
Score: 0.8598
Score: 0.85802
Score: 0.86035
Score: 0.86855
Score: 0.8668
Score: 0.86401
Score: 0.86776
Score: 0.86416
Score: 0.86723
Score: 0.8688
Score: 0.86978
Score: 0.86846
Score: 0.8692
Score: 0.86951
Score: 0.86945
Score: 0.86803
Score: 0.86963
Score: 0.86865
Score: 0.86923
Score: 0.86871
Score: 0.86852
Wall time: 6min 17s


In [28]:
clf.best_score_

0.8697829942805195

In [29]:
clf.best_params_

{'max_depth': 24,
 'scale_pos_weight': 1.0,
 'init_eta': 0.8340622808717366,
 'booster': 'gbtree',
 'lambda': 0.9844984066071544,
 'alpha': 0.0017308004549787055,
 'min_child_weight': 7,
 'gamma': 0.06799113929261205,
 'grow_policy': 'depthwise',
 'subsample': 0.7723758293965605,
 'colsample_bytree': 0.8292889527963906,
 'colsample_bynode': 0.596425664749337,
 'forest_boosting': True,
 'num_parallel_tree': 15}

In [30]:
clf.refit(X_train, y_train)

In [31]:
x = clf.best_estimator_

In [44]:
test_probs = clf.predict(X_test)

In [45]:
sklearn.metrics.accuracy_score(y_test, test_probs)

0.8710767151894847

In [46]:
test_probs = clf.predict_proba(X_test)
test_probs = test_probs[:,1]

In [47]:
sklearn.metrics.roc_auc_score(y_test, test_probs)

0.9249667381146391

# Misc