In [1]:
import pandas as pd
import numpy as np
import scipy
import sklearn

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, SelectPercentile, chi2, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import metrics

In [4]:
import optuna
from optuna.visualization import plot_intermediate_values

In [5]:
import numpy as np
import xgboost as xgb
import lightgbm as lgb

In [6]:
SEED = 85

In [7]:
from hypster_xgboost import *
from optuna_utils import *

# Get Dataset

In [8]:
from scipy.sparse import csr_matrix, save_npz, load_npz

In [9]:
dataset = "adult!"

In [10]:
if dataset=="adult":
    X_train = pd.read_pickle("./data/adult_X_train.pkl")
    y_train = pd.read_pickle("./data/adult_y_train.pkl")
    X_test = pd.read_pickle("./data/adult_X_test.pkl")
    y_test = pd.read_pickle("./data/adult_y_test.pkl")
    cat_columns = X_train.select_dtypes(include="object").columns
else:
    X_train = load_npz("./data/X_train.npz")
    y_train = pd.read_pickle("./data/y_train.pkl")
    X_test = load_npz("./data/X_test.npz")
    y_test = pd.read_pickle("./data/y_test.pkl")
    cat_columns=None

In [11]:
#X_train = X_train.sample(n=10000, random_state=SEED, axis=0)

In [12]:
#y_train = y_train.iloc[X_train.index].reset_index(drop=True)
#X_train.reset_index(inplace=True, drop=True)

In [13]:
#pipeline - pipeline_objective OR regular pipeline
#consider making pre-made steps with best practices (FS, scaling, etc...) then add option to concat to make one pipeline 

In [14]:
#pipeline = Pipeline([("sel", SelectPercentile(chi2))])
#pipe_params = {"sel__percentile" : optuna.distributions.IntUniformDistribution(1,100)}
pipeline = None
pipe_params = None

In [15]:
#TODO: automatic seed
xgb_linear = XGBClassifierOptuna(booster_list=['gblinear']
                               ,param_dict={'max_depth' : optuna.distributions.IntUniformDistribution(20, 30)
                               ,'verbosity' : 2
                               #,'subsample' : 0.9
                               }
                               )
#gb_dart = XGBClassifierOptuna(booster_list=['dart'])
#xgb_tree = XGBClassifierOptuna(booster_list=['gbtree', 'dart'], user_param_dict={'max_depth' : 2})
xgb_tree = XGBClassifierOptuna(booster_list=['gbtree', 'dart'], 
                               param_dict={'max_depth' : optuna.distributions.IntUniformDistribution(20, 30)
                                               ,'verbosity' : 2
                                               #,'subsample' : 0.9 
                                               })
#lgb_estimator = LGBClassifierOptuna()
#sgd_estimator = SGDClassifierOptuna()
#rf_estimator  = RFClassifierOptuna()

In [16]:
estimators = [xgb_linear]#, sgd|_estimator]

In [17]:
clf = HyPSTERClassifier(estimators, pipeline, pipe_params,
                        scoring=sklearn.metrics.roc_auc_score, greater_is_better=True,
                        cv=StratifiedKFold(n_splits=3, random_state=SEED), refit=False, random_state=SEED, n_jobs=-1)

In [18]:
%%time
clf.fit(X_train, y_train, cat_columns=cat_columns, n_trials_per_estimator=30)

XGBoost Classifier
Score: 0.5
Score: 0.49393
Score: 0.5
Score: 0.5
Score: 0.5
Score: 0.5
Score: 0.96331
Score: 0.96751
Score: 0.96687
Score: 0.9635
Score: 0.96923
Wall time: 35.9 s


In [19]:
clf.best_score_

0.969228572225576

In [20]:
clf.best_params_

{'max_depth': 27,
 'init_eta': 0.05145325944171825,
 'booster': 'gblinear',
 'lambda': 5.545234877448395e-06,
 'alpha': 2.8182308806808016e-07,
 'shotgun_feature_selector': 'cyclic'}

In [21]:
clf.refit(X_train, y_train)

In [22]:
x = clf.best_estimator_

In [23]:
x.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [24]:
clf.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [25]:
clf.predict_proba(X_test)

array([[0.97269696, 0.02730302],
       [0.87589103, 0.12410896],
       [0.98481816, 0.01518183],
       ...,
       [0.90507203, 0.09492799],
       [0.9598829 , 0.04011708],
       [0.92029834, 0.07970167]], dtype=float32)

In [30]:
test_probs = clf.predict_proba(X_test)
test_probs = test_probs[:,1]

In [31]:
sklearn.metrics.roc_auc_score(y_test, test_probs)

0.9710877330161598

# Misc