In [24]:
import pandas as pd
import numpy as np
import scipy
import sklearn

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel, SelectPercentile, chi2, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import metrics

In [27]:
import optuna
from optuna.visualization import plot_intermediate_values

In [28]:
import numpy as np
import xgboost as xgb
import lightgbm as lgb

In [29]:
from catboost import CatBoost
from catboost import Pool

In [30]:
SEED = 85

In [31]:
from XGBoostOptuna import *
from optuna_utils import *

# Get Dataset

In [32]:
from scipy.sparse import csr_matrix, save_npz, load_npz

In [33]:
X_train = load_npz("X_train.npz")
y_train = pd.read_pickle("y_train.pkl")

In [34]:
X_test = load_npz("X_test.npz")
y_test = pd.read_pickle("y_test.pkl")

In [47]:
pipeline = Pipeline([("sel", SelectPercentile(chi2))])
pipe_params = {"sel__percentile" : optuna.distributions.IntUniformDistribution(1,100)}

In [48]:
#TODO: automatic seed
xgb_estimator = XGBClassifierOptuna(booster_list=['gblinear'], seed=SEED) #'gbtree', 'dart'
#lgb_estimator = LGBClassifierOptuna(seed=SEED)
#sgd_estimator = SGDClassifierOptuna(seed=SEED)
#rf_estimator  = RFClassifierOptuna(seed=SEED)

In [49]:
estimators = [xgb_estimator]#, sgd_estimator]

In [50]:
studies = Study(X_train, y_train, estimators, pipeline, pipe_params, 
              cv=StratifiedKFold(n_splits=3, random_state=SEED), max_iter=200, max_fails=3,
              scoring=sklearn.metrics.roc_auc_score, greater_is_better=True, random_state=SEED)

In [None]:
studies.run(n_trials_per_estimator=100, n_jobs=-1)

XGBoost Classifier
Score: 0.5
Score: 0.5
Score: 0.5
Score: 0.5
Score: 0.94876
Score: 0.5
Score: 0.5
Score: 0.95842
Score: 0.578
Score: 0.5
Score: 0.5
Score: 0.96628
Score: 0.96599
Score: 0.96543
Score: 0.9658
Score: 0.96583
Score: 0.96582
Score: 0.9659
Score: 0.96158


In [None]:
study = studies.studies[0]

In [None]:
study.best_value

In [None]:
final_pipeline = study.best_trial.user_attrs['pipeline']

In [None]:
study.best_trial.value

In [None]:
final_pipeline = final_pipeline.fit(X_train, y_train)

In [None]:
test_probs = final_pipeline.predict_proba(X_test)
test_probs = test_probs[:,1]

In [None]:
sklearn.metrics.roc_auc_score(y_test, test_probs)

# Drafts

# Train-Test Split

In [95]:
from scipy.sparse import csr_matrix, save_npz, load_npz

In [177]:
save_npz("X_train.npz", X_train)
y_train.to_pickle("y_train.pkl")

In [None]:
X_train = load_npz("X_train.npz")
y_train = pd.read_pickle("y_train.pkl")

In [178]:
save_npz("X_test.npz", X_train)
y_train.to_pickle("y_test.pkl")

In [None]:
X_train = load_npz("X_test.npz")
y_train = pd.read_pickle("y_test.pkl")

In [210]:
df = pd.read_pickle("newsgroup.pkl")

In [211]:
df_train, df_test, y_train, y_test = train_test_split(df, df["binary_label"], 
                                                    test_size = 0.33, 
                                                    random_state = SEED, 
                                                    stratify = df["binary_label"])

In [212]:
#because Pandas....
df_train = df_train.copy()
df_test = df_test.copy()

In [213]:
sum(y_train)

2220

In [214]:
len(y_train)

10309

In [215]:
# Build Pipeline

In [216]:
pipeline = Pipeline([("tfidf", TfidfVectorizer(ngram_range=(1,2), 
                                               min_df=5,
                                               use_idf=False, 
                                               smooth_idf=False,                                               
                                               strip_accents="unicode",
                                               encoding = "latin1",
                                               stop_words="english",
                                               sublinear_tf=True,
                                               norm="l2"))

In [217]:
X_train = pipeline.fit_transform(df_train["text"], y_train)
X_test = pipeline.transform(df_test["text"])

In [218]:
X_train

<10309x17419 sparse matrix of type '<class 'numpy.float64'>'
	with 1595975 stored elements in Compressed Sparse Row format>

In [14]:
params = {'verbosity': 0, 'scale_pos_weight': 2.3218468468468467, 'objective': 'binary:logistic', 'booster': 'gblinear', 'reg_lambda': 8.473638939787033e-07, 'reg_alpha': 1.234287606808043e-10, 'feature_selector': 'cyclic', 'n_estimators': 10, 'n_jobs': -1, 'random_state': 85, 'learning_rate': 0.11372080930083986}

In [15]:
n_estimators = params['n_estimators']

In [16]:
learning_rates = ([0.11372080930083986] * (n_estimators-5)) + ([0.5] * 5)

In [17]:
x = XGBClassifierLR(learning_rates = learning_rates, **params)

In [18]:
#x = XGBClassifier(**params)

In [19]:
#n_estimators = params.pop('n_estimators')
#learning_rates = [0.11372080930083986] * n_estimators
#x = XGBClassifierLR2(params, n_estimators, learning_rates)

In [20]:
x = x.fit(X_train,np.array(y_train))

In [21]:
test_probs = x.predict_proba(X_test)
test_probs = test_probs[:,1]

In [22]:
sklearn.metrics.roc_auc_score(y_test, test_probs)

0.9671953352716167

In [23]:
x.get_params()

{'base_score': 0.5,
 'booster': 'gblinear',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.11372080930083986,
 'learning_rates': [0.11372080930083986,
  0.11372080930083986,
  0.11372080930083986,
  0.11372080930083986,
  0.11372080930083986,
  0.5,
  0.5,
  0.5,
  0.5,
  0.5],
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 10,
 'n_jobs': -1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 85,
 'reg_alpha': 1.234287606808043e-10,
 'reg_lambda': 8.473638939787033e-07,
 'scale_pos_weight': 2.3218468468468467,
 'seed': None,
 'silent': True,
 'subsample': 1,
 'verbosity': 0,
 'feature_selector': 'cyclic'}