In [3]:
import pandas as pd
import numpy as np
import xgboost
from harmonic_mean import *
from sklearn import linear_model, ensemble, tree
from evaluation import *
from data_processing import *
from sklearn import preprocessing, model_selection,pipeline,metrics,externals
from sklearn_pandas.pipeline import TransformerPipeline

In [4]:
# Read csv files

print("=================")
print("Loading data ...")

train = pd.read_csv("data/train.csv", index_col=0)
test = pd.read_csv("data/test.csv", index_col=0)

print("Done\n")

Loading data ...
Done



In [5]:
print("=================")
print("Processing data ...")

# Turn DataFrames into arrays for scikit-learn
# Forget the target column in X
# train_features = [
#     "ps_car_13",  #            : 1571.65 / shadow  609.23
# 	"ps_reg_03",  #            : 1408.42 / shadow  511.15
# 	"ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
# 	"ps_ind_03",  #            : 1219.47 / shadow  230.55
# 	"ps_ind_15",  #            :  922.18 / shadow  242.00
# 	"ps_reg_02",  #            :  920.65 / shadow  267.50
# 	"ps_car_14",  #            :  798.48 / shadow  549.58
# 	"ps_car_12",  #            :  731.93 / shadow  293.62
# 	"ps_car_01_cat",  #        :  698.07 / shadow  178.72
# 	"ps_car_07_cat",  #        :  694.53 / shadow   36.35
# 	"ps_ind_17_bin",  #        :  620.77 / shadow   23.15
# 	"ps_car_03_cat",  #        :  611.73 / shadow   50.67
# 	"ps_reg_01",  #            :  598.60 / shadow  178.57
# 	"ps_car_15",  #            :  593.35 / shadow  226.43
# 	"ps_ind_01",  #            :  547.32 / shadow  154.58
# 	"ps_ind_16_bin",  #        :  475.37 / shadow   34.17
# 	"ps_ind_07_bin",  #        :  435.28 / shadow   28.92
# 	"ps_car_06_cat",  #        :  398.02 / shadow  212.43
# 	"ps_car_04_cat",  #        :  376.87 / shadow   76.98
# 	"ps_ind_06_bin",  #        :  370.97 / shadow   36.13
# 	"ps_car_09_cat",  #        :  214.12 / shadow   81.38
# 	"ps_car_02_cat",  #        :  203.03 / shadow   26.67
# 	"ps_ind_02_cat",  #        :  189.47 / shadow   65.68
# 	"ps_car_11",  #            :  173.28 / shadow   76.45
# 	"ps_car_05_cat",  #        :  172.75 / shadow   62.92
# 	"ps_calc_09",  #           :  169.13 / shadow  129.72
# 	"ps_calc_05",  #           :  148.83 / shadow  120.68
# 	"ps_ind_08_bin",  #        :  140.73 / shadow   27.63
# 	"ps_car_08_cat",  #        :  120.87 / shadow   28.82
# 	"ps_ind_09_bin",  #        :  113.92 / shadow   27.05
# 	"ps_ind_04_cat",  #        :  107.27 / shadow   37.43
# 	"ps_ind_18_bin",  #        :   77.42 / shadow   25.97
# 	"ps_ind_12_bin",  #        :   39.67 / shadow   15.52
# 	"ps_ind_14",  #            :   37.37 / shadow   16.65
#     "target"
# ]


# train = train[train_features]
train = train.drop(train.columns[train.columns.str.contains("calc")], axis=1)
train_0 = train.drop("target", axis=1)

name_to_index = {name: train_0.columns.get_loc(name) for name in train_0.columns}

X0 = np.array(train_0)
y0 = np.array(train["target"])

X_test = np.array(test)

print("Done\n")

Processing data ...
Done



In [6]:
print("=================")
print("Setting the pipeline ...")

# Define the num pipeline
num_selector = filter_num_transform(name_to_index)
num_imputer = preprocessing.Imputer(missing_values=-1, strategy="mean")
num_pipeline = pipeline.Pipeline([
    ("selector", num_selector),
    ("imputer", num_imputer),
    ("scaler", preprocessing.StandardScaler())
])

# Define the bin pipeline
bin_selector = filter_bin_transform(name_to_index)
bin_imputer = preprocessing.Imputer(missing_values=-1, strategy="most_frequent")
bin_pipeline = pipeline.Pipeline([
    ("selector", bin_selector),
    ("imputer", bin_imputer)
])

# Define the cat pipeline
cat_selector = filter_cat_transform(name_to_index)
cat_imputer = preprocessing.Imputer(missing_values=-1, strategy="most_frequent")
cat_pipeline = pipeline.Pipeline([
    ("selector", cat_selector),
    ("imputer", cat_imputer),
    ("binarizer", preprocessing.OneHotEncoder())
])

preprocessor = pipeline.FeatureUnion([("num", num_pipeline), ("bin", bin_pipeline), ("cat", cat_pipeline)])

base = ensemble.GradientBoostingClassifier(verbose=2)
pipe = pipeline.Pipeline([
    ("preprocessor", preprocessor),
    ("clf",base)
])

cv = model_selection.StratifiedKFold(n_splits=4)
scorer = metrics.make_scorer(gini_scorer,needs_proba=True)
print("Done\n")

Setting the pipeline ...
Done



In [29]:
clf1 = xgboost.XGBClassifier(
    scale_pos_weight=1.6,
    max_depth=3,
    n_estimators=400,
    learning_rate=0.07,
)

clf2 = xgboost.XGBClassifier(
    scale_pos_weight=1.6,
    max_depth=4,
    n_estimators=300,
    learning_rate=0.07,
)

clf3 = xgboost.XGBClassifier(
    scale_pos_weight=1.6,
    max_depth=4,
    n_estimators=400,
    learning_rate=0.07,
)

harmonic_clf = HarmonicMeanClassifier(list_classifiers=[clf1, clf2, clf3])

pipe = pipeline.Pipeline([
    ("preprocessor", preprocessor),
    ("clf", harmonic_clf)
])
cross_val = model_selection.cross_val_score(pipe, X0, y0, cv=cv, scoring=scorer, verbose=10, n_jobs=4)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=nan, n_estimators=400, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1.6, seed=0, silent=True, subsample=1)
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=nan, n_estimators=400, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1.6, seed=0, silent=True, subsample=1)
XGBClassifier(base_score=0.5, colsample_bylevel=

[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed: 23.2min


[CV] ..................... , score=-0.27037248733437785, total=23.3min


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 23.3min remaining: 23.3min


[CV] ...................... , score=-0.2791996513997706, total=23.4min
[CV] ....................... , score=-0.286552356281614, total=23.5min


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 23.5min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 23.5min finished


In [30]:
pipe.fit(X0, y0)
print(gini_normalized(y0, pipe.predict_proba(X0)[:, 1]))

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=400, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1.6, seed=0, silent=True, subsample=1)
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=300, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1.6, seed=0, silent=True, subsample=1)
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=400, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1.6, seed=0, silent=True, subsampl

In [33]:
harmonic_clf.feature_importances_

AttributeError: 'HarmonicMeanClassifier' object has no attribute 'feature_importances_'

In [None]:
print("=================")
print("Predicting for submission ...")
y_test_p = pipe.predict_proba(X_test)[:,1]
prediction = pd.DataFrame(
    index=test.index,
    data=np.round(y_test_p, 3),
    columns=["target"])
prediction.to_csv("data/submission.csv")
print("Done\n")

# externals.joblib.dump(harmonic_clf,"XGB.pkl")

Predicting for submission ...
