In [1]:
import pandas as pd
import numpy as np

In [2]:
from scipy.sparse import save_npz, load_npz, vstack

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import PredefinedSplit, cross_val_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [4]:
from xgboost import XGBClassifier

In [5]:
from XGBoostOptuna import *
from optuna_utils import *

In [6]:
SEED = 42

# True Test Mode Vs. Inner Test mode

true_test_mode == True means Training on all "train_df" and predicting on "test_df" without knowing the real labels.

In [7]:
true_test_mode = True

# Load Datasets

In [8]:
train_non_numeric = pd.read_pickle("./data/train_non_numeric.pkl")
train_df = load_npz("./data/train_df.npz")

In [9]:
#convert mahzor to code from 0-4 for later use in Cross Validation
mahzor = LabelEncoder().fit_transform(train_non_numeric['mahzor_acharon'])

In [10]:
test_non_numeric = pd.read_pickle("./data/test_non_numeric.pkl")
test_df = load_npz("./data/test_df.npz")

In [11]:
df_columns = pd.read_pickle("./data/df_columns.pkl")

In [12]:
y = np.int32(train_non_numeric["nesher"])

# Create Train/Test Splits

In [13]:
if true_test_mode:
    X_train = train_df
    y_train = train_non_numeric["nesher"].copy()
    X_test = test_df
    train_mahzor = mahzor
else:
    # Creating Inner Test set by predicting last mahzor results
    X_train = train_df[mahzor < 4]
    y_train = y[mahzor<4]
    X_test = train_df[mahzor == 4]
    y_test = y[mahzor == 4]
    train_mahzor = mahzor[mahzor<4]

# Define Model

In [14]:
model = XGBClassifier(max_depth=6, n_jobs=-1, random_state=SEED)

# Find and Remove "Drifts" Between Train & Test

In [15]:
# We want to find and eliminate features that have different distributions between train and test datasets
# These features "confuse" the model and damage performance on test set

In [16]:
# The trick is to concatenate the datasets and define label "0" for samples from the training set
# and label "1" for samples from the test set

In [17]:
# We then extract univariate feature importances and see if there are informative features that help distinguish
# if a sample belongs to the train or test set.

In [18]:
# Finally, we keep only the features that are un-informative

In [19]:
X = vstack([X_train,X_test], format="csr")

In [20]:
# this creates a list of "0"s and "1"s according to the number of rows in each dataset
y = (X_train.shape[0] * [0]) + (X_test.shape[0] * [1])

In [21]:
imp = mutual_info_classif(X,y)

In [22]:
imp_df = pd.DataFrame({"importance" : imp})
imp_df['feature_name'] = df_columns.values
imp_df.sort_values(by="importance", ascending=False, inplace=True)

In [23]:
imp_df.head(20)

Unnamed: 0,importance,feature_name
39,0.662363,madad_eitanut
580,0.177274,"submix_תנועה_ בירורים (ת""ג, מידע, משך / אורך ..."
157,0.177274,"sub2_ בירורים (ת""ג, מידע, משך / אורך שירות)"
555,0.143935,submix_רפואי_שאלון רפואי ומסמכים נוספים
333,0.143935,sub2_שאלון רפואי ומסמכים נוספים
65,0.132642,manila_std_diff
112,0.127556,"sub1_ איתור קד""צ"
4282,0.090851,sub3_לטייס בקשה ליום
2978,0.090851,sub3_לגיוס מה הקוד
9843,0.090851,sub3_שדה חוזר בוגר


In [24]:
similar_distribution_feature_indices = (imp_df[imp_df['importance']<=0.01]).index

In [25]:
# If there are "informative" features - this means that these features have different distributions between train and test.
# Therefore we will keep only those who got low scores in the feature_importance score

In [26]:
X_train = X_train[:,similar_distribution_feature_indices]

In [27]:
X_test = X_test[:,similar_distribution_feature_indices]

In [28]:
df_columns = df_columns[similar_distribution_feature_indices].copy().reset_index(drop=True)

# Feature Importances

In [29]:
model = model.fit(X_train, y_train)
imp_df = pd.DataFrame({"importance" : model.feature_importances_})
imp_df['feature_name'] = df_columns.values
imp_df.sort_values(by="importance", ascending=False, inplace=True)

In [30]:
imp_df.head(20)

Unnamed: 0,importance,feature_name
358,0.022169,application_count_days_before_giyus_after_giyus
326,0.018892,destination_will
1672,0.00789,sub3_גיוס האם
103,0.005905,lohem_will
977,0.005575,kaba
3902,0.004857,dibur_is_missing
242,0.004571,application_count_days_before_giyus_0-14days
1036,0.00427,is_destination_3
6098,0.004135,ind_over_minimal_requirements_mm
3862,0.003938,is_destination_12


# Feature Selection

In [31]:
informative_feature_indices = (imp_df[imp_df['importance']>0]).index

In [32]:
X_train = X_train[:,informative_feature_indices].copy()

In [33]:
X_test = X_test[:,informative_feature_indices].copy()

In [34]:
df_columns = df_columns[informative_feature_indices].copy().reset_index(drop=True)

### Create CV Splits by mahzor

In [35]:
def create_predefined_splits(cv_splits, X, y):
    test_folds_idx = np.zeros(len(y), dtype=np.int)
    i = 0
    for train_index, test_index in cv_splits.split(X, y):
        test_folds_idx[test_index] = i
        i+=1
    return PredefinedSplit(test_folds_idx)

In [36]:
cv_splits = PredefinedSplit(train_mahzor)

# Optuna

In [37]:
#TODO: automatic seed
xgb_estimator = XGBClassifierOptuna(booster_list=['gblinear', 'gbtree', 'dart'], seed=SEED) 
#lgb_estimator = LGBClassifierOptuna(seed=SEED)
#sgd_estimator = SGDClassifierOptuna(seed=SEED)
#rf_estimator  = RFClassifierOptuna(seed=SEED)

In [38]:
estimators = [xgb_estimator]#, sgd_estimator]

In [39]:
studies = Study(X_train, y_train, estimators, #pipeline, pipe_params, 
              cv=cv_splits, max_iter=200, max_fails=3,
              scoring=sklearn.metrics.roc_auc_score, greater_is_better=True, random_state=SEED)

In [40]:
studies.run(n_trials_per_estimator=100, n_jobs=-1)

XGBoost Classifier
Score: 0.76725
Score: 0.76087
Score: 0.60152
Score: 0.77211
Score: 0.77063
Score: 0.77398
Score: 0.75458
Score: 0.67016
Score: 0.7718
Score: 0.76313
Score: 0.76912
Score: 0.77029
Score: 0.76792
Score: 0.77422
Score: 0.76064
Score: 0.78119
Score: 0.7709


[W 2019-06-22 02:38:06,458] Setting status of trial#37 as TrialState.FAIL because of the following error: ValueError("Input contains NaN, infinity or a value too large for dtype('float32').")
Traceback (most recent call last):
  File "C:\Users\user\Anaconda3\lib\site-packages\optuna\study.py", line 398, in _run_trial
    result = func(trial)
  File "c:\python_workspace\optuna-on-steroids\optuna_utils.py", line 60, in __call__
    fold_score = estimator.score_test(self.scoring)
  File "c:\python_workspace\optuna-on-steroids\XGBoostOptuna.py", line 102, in score_test
    return scorer(self.dtest.get_label(), preds)
  File "C:\Users\user\Anaconda3\lib\site-packages\sklearn\metrics\ranking.py", line 355, in roc_auc_score
    sample_weight=sample_weight)
  File "C:\Users\user\Anaconda3\lib\site-packages\sklearn\metrics\base.py", line 76, in _average_binary_score
    return binary_metric(y_true, y_score, sample_weight=sample_weight)
  File "C:\Users\user\Anaconda3\lib\site-packages\sklearn\m

Score: 0.77471
Score: 0.7744
Score: 0.77375
Score: 0.78212
Score: 0.78069
Score: 0.77996


In [41]:
study = studies.studies[0]

In [42]:
study.best_value

0.7821180827565675

In [44]:
final_pipeline = study.best_trial.user_attrs['pipeline']

In [45]:
study.best_trial.value

0.7821180827565675

In [46]:
final_pipeline = final_pipeline.fit(X_train, y_train)

In [47]:
test_probs = final_pipeline.predict_proba(X_test)
test_probs = test_probs[:,1]

In [48]:
sklearn.metrics.roc_auc_score(y_test, test_probs)

NameError: name 'y_test' is not defined

# Train Model

In [None]:
scores = cross_val_score(model, X_train, y_train, scoring="roc_auc", cv = cv_splits, n_jobs=-1)

In [None]:
print("We got " + str(round(scores.mean(), 3)) + " AUC with CV on the training set!")

In [None]:
model = model.fit(X_train, y_train)

In [None]:
test_probs = model.predict_proba(X_test)
test_probs = test_probs[:,1]

In [None]:
if true_test_mode == False:
    val_auc = metrics.roc_auc_score(y_test, test_probs)
    print("We got " + str(round(val_auc, 3)) + " AUC on the validation set!")

# Output for Kaggle

In [49]:
import time
timestr = time.strftime("%Y%m%d-%H%M%S")

In [51]:
if true_test_mode:
    submission = pd.DataFrame({"TZ": test_non_numeric['tz'], 'NESHER' : test_probs})
    #submission.to_csv('./submissions/sub' + timestr + '.csv', index=False)
    
    solution = pd.read_csv('./data/solution.csv')
    solution = solution.merge(submission, on="TZ")
    test_auc = metrics.roc_auc_score(solution["NESHER_x"], solution["NESHER_y"])
    print("We got " + str(round(test_auc, 3)) + " AUC on the test set!")

We got 0.771 AUC on the test set!
