In [6]:
import pandas as pd
import numpy as np
import xgboost
from sklearn import linear_model, ensemble, tree
from evaluation import *
from data_processing import *
from sklearn import preprocessing, model_selection,pipeline,metrics,externals
from sklearn_pandas.pipeline import TransformerPipeline

In [7]:
# Load the training and test sets from CSV; the first column of each file
# becomes the DataFrame index.

print("=================")
print("Loading data ...")

train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/train.csv", "data/test.csv")
)

print("Done\n")

Loading data ...
Done



In [8]:
print("=================")
print("Processing data ...")

# Turn DataFrames into arrays for scikit-learn
# Forget the target column in X
#
# Optional hand-picked feature subset, currently disabled. Enabling it
# reorders the training columns, which is why X_test below is built from
# train_0.columns rather than from the raw test frame.
# train_features = [
#     "ps_car_13",  #            : 1571.65 / shadow  609.23
#     "ps_reg_03",  #            : 1408.42 / shadow  511.15
#     "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
#     "ps_ind_03",  #            : 1219.47 / shadow  230.55
#     "ps_ind_15",  #            :  922.18 / shadow  242.00
#     "ps_reg_02",  #            :  920.65 / shadow  267.50
#     "ps_car_14",  #            :  798.48 / shadow  549.58
#     "ps_car_12",  #            :  731.93 / shadow  293.62
#     "ps_car_01_cat",  #        :  698.07 / shadow  178.72
#     "ps_car_07_cat",  #        :  694.53 / shadow   36.35
#     "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
#     "ps_car_03_cat",  #        :  611.73 / shadow   50.67
#     "ps_reg_01",  #            :  598.60 / shadow  178.57
#     "ps_car_15",  #            :  593.35 / shadow  226.43
#     "ps_ind_01",  #            :  547.32 / shadow  154.58
#     "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
#     "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
#     "ps_car_06_cat",  #        :  398.02 / shadow  212.43
#     "ps_car_04_cat",  #        :  376.87 / shadow   76.98
#     "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
#     "ps_car_09_cat",  #        :  214.12 / shadow   81.38
#     "ps_car_02_cat",  #        :  203.03 / shadow   26.67
#     "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
#     "ps_car_11",  #            :  173.28 / shadow   76.45
#     "ps_car_05_cat",  #        :  172.75 / shadow   62.92
#     "ps_calc_09",  #           :  169.13 / shadow  129.72
#     "ps_calc_05",  #           :  148.83 / shadow  120.68
#     "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
#     "ps_car_08_cat",  #        :  120.87 / shadow   28.82
#     "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
#     "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
#     "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
#     "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
#     "ps_ind_14",  #            :   37.37 / shadow   16.65
#     "target"
# ]


# train = train[train_features]
# Drop every training column whose name contains "calc".
train = train.drop(train.columns[train.columns.str.contains("calc")], axis=1)
train_0 = train.drop("target", axis=1)

# Map each remaining feature name to its column position in X0; the
# pipeline's selectors receive this mapping.
name_to_index = {name: train_0.columns.get_loc(name) for name in train_0.columns}

X0 = np.array(train_0)
y0 = np.array(train["target"])

# Build the test matrix from exactly the training feature columns, in the
# same order, so that a position in name_to_index addresses the same feature
# in X0 and in X_test. Using the raw test frame kept the "calc" columns and
# the original column order, which made positional selection unreliable.
# A column missing from test raises a KeyError here instead of failing later.
X_test = np.array(test[train_0.columns])

print("Done\n")

Processing data ...
Done



In [9]:
print("=================")
print("Setting the pipeline ...")

# Column selectors for the three feature groups. The filter_*_transform
# helpers are not defined in this notebook (presumably they come from the
# star import of data_processing); each receives the name -> position mapping.
num_selector = filter_num_transform(name_to_index)
bin_selector = filter_bin_transform(name_to_index)
cat_selector = filter_cat_transform(name_to_index)

# Imputers: -1 is treated as the missing-value marker in every group.
# Numeric features are filled with the mean, the others with the mode.
num_imputer = preprocessing.Imputer(strategy="mean", missing_values=-1)
bin_imputer = preprocessing.Imputer(strategy="most_frequent", missing_values=-1)
cat_imputer = preprocessing.Imputer(strategy="most_frequent", missing_values=-1)

# Numeric branch: select -> impute -> standardize
num_pipeline = pipeline.Pipeline(steps=[
    ("selector", num_selector),
    ("imputer", num_imputer),
    ("scaler", preprocessing.StandardScaler()),
])

# Binary branch: select -> impute
bin_pipeline = pipeline.Pipeline(steps=[
    ("selector", bin_selector),
    ("imputer", bin_imputer),
])

# Categorical branch: select -> impute -> one-hot encode
cat_pipeline = pipeline.Pipeline(steps=[
    ("selector", cat_selector),
    ("imputer", cat_imputer),
    ("binarizer", preprocessing.OneHotEncoder()),
])

# Concatenate the outputs of the three branches side by side
preprocessor = pipeline.FeatureUnion(transformer_list=[
    ("num", num_pipeline),
    ("bin", bin_pipeline),
    ("cat", cat_pipeline),
])

# Baseline estimator wrapped together with the preprocessing
base = ensemble.GradientBoostingClassifier(verbose=2)
pipe = pipeline.Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("clf", base),
])

# Evaluation setup: 4 stratified folds, scored on predicted probabilities
# with gini_scorer
cv = model_selection.StratifiedKFold(n_splits=4)
scorer = metrics.make_scorer(gini_scorer, needs_proba=True)
print("Done\n")

Setting the pipeline ...
Done



In [10]:
# Sweep XGBoost's scale_pos_weight over 10 evenly spaced values in [1, 5] and
# record the mean cross-validated score of each candidate.
a = []  # candidate scale_pos_weight values, in evaluation order
b = []  # mean CV score for the matching entry of `a`
for x in np.linspace(1, 5, 10):
    clf = xgboost.XGBClassifier(
        scale_pos_weight=x,
        max_depth=4,
        n_estimators=100,
        learning_rate=0.07,
    )

    pipe = pipeline.Pipeline([
        ("preprocessor", preprocessor),
        ("clf", clf)
    ])
    cross_val = model_selection.cross_val_score(pipe, X0, y0, cv=cv, scoring=scorer, verbose=10, n_jobs=4)
    a.append(x)
    # Average over however many folds `cv` produced rather than dividing by a
    # hard-coded 4, so the result stays correct if n_splits is changed.
    b.append(np.mean(cross_val))

# NOTE(review): after the loop, `clf` and `pipe` refer to the LAST candidate
# (scale_pos_weight=5.0), not to the best-scoring one; later cells use these
# names as they are.
print(clf)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ....................... , score=0.2791611503519537, total= 2.9min


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  2.9min


[CV] ....................... , score=0.2764091361753211, total= 2.9min


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.9min remaining:  2.9min


[CV] ...................... , score=0.26751909287817804, total= 2.9min
[CV] ...................... , score=0.25891061203653554, total= 2.9min


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.9min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ...................... , score=0.27801411478903526, total= 2.9min


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  2.9min


[CV] ...................... , score=0.26914025162007343, total= 2.9min
[CV] ...................... , score=0.27829409626254625, total= 3.0min


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  3.0min remaining:  3.0min


[CV] ....................... , score=0.2603586503996015, total= 2.9min


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  3.0min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  3.0min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ...................... , score=0.27734728463072295, total= 2.9min


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  2.9min


[CV] ....................... , score=0.2615221236322148, total= 2.9min


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.9min remaining:  2.9min


[CV] ....................... , score=0.2805329870990758, total= 2.9min
[CV] ....................... , score=0.2694876380030211, total= 2.9min


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  3.0min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  3.0min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ....................... , score=0.2803258514103866, total= 2.8min


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  2.8min


[CV] ....................... , score=0.2785671868012637, total= 2.8min


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.8min remaining:  2.8min


[CV] ...................... , score=0.26981083746761186, total= 2.9min
[CV] ....................... , score=0.2609603601539795, total= 2.9min


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.9min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ....................... , score=0.2801740357430447, total= 2.8min


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  2.8min


[CV] ...................... , score=0.26263862848090247, total= 2.8min


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.8min remaining:  2.8min


[CV] ....................... , score=0.2708533540839348, total= 2.8min
[CV] ...................... , score=0.27885474597327686, total= 2.8min


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.9min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ...................... , score=0.27176044453112824, total= 2.8min


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  2.8min


[CV] ...................... , score=0.27987386078457266, total= 2.8min


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.8min remaining:  2.8min


[CV] ....................... , score=0.2634212488238146, total= 2.8min
[CV] ....................... , score=0.2786208377624861, total= 2.8min


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.8min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.8min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ....................... , score=0.2724978176481746, total= 2.8min


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  2.8min


[CV] ....................... , score=0.2627800918002209, total= 2.8min
[CV] ....................... , score=0.2811597356036787, total= 2.8min


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.8min remaining:  2.8min


[CV] ....................... , score=0.2783167702066814, total= 2.8min


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.8min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.8min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ...................... , score=0.28035240723134247, total= 2.8min


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  2.9min


[CV] ....................... , score=0.2713676546347742, total= 2.9min


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.9min remaining:  2.9min


[CV] ........................ , score=0.281299446239154, total= 2.9min
[CV] ........................ , score=0.263354891191366, total= 2.9min


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.9min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ....................... , score=0.2806759843780024, total= 2.8min


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  2.8min


[CV] ...................... , score=0.27274620354105744, total= 2.8min


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.8min remaining:  2.8min


[CV] ........................ , score=0.264429618357046, total= 2.8min
[CV] ....................... , score=0.2785748905422197, total= 2.8min


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.8min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.8min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ...................... , score=0.27937435784438985, total= 2.8min


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  2.8min


[CV] ....................... , score=0.2714690673874034, total= 2.8min


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  2.8min remaining:  2.8min


[CV] ...................... , score=0.26473531411586404, total= 2.8min
[CV] ....................... , score=0.2796821431851582, total= 2.8min
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=5.0, seed=0, silent=True, subsample=1)


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.8min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  2.8min finished


NameError: name 'cross_val' is not defined

In [None]:
# Fit the current pipeline on the full training set, then score it with
# gini_normalized on those same rows. This is an in-sample figure, not a
# validation score.
pipe.fit(X0, y0)
y0_p = pipe.predict_proba(X0)[:, 1]
print(gini_normalized(y0, y0_p))

In [9]:
# Display the feature importances of `clf`, the classifier most recently
# bound to that name. The attribute is only meaningful once the classifier
# has been fitted (presumably via pipe.fit in the previous cell, since `pipe`
# holds `clf` as its "clf" step).
clf.feature_importances_

array([ 0.11967015,  0.12550281,  0.0802494 ,  0.04806919,  0.04062751,
        0.07542237,  0.02534191,  0.04525342,  0.02453741,  0.04867257,
        0.0114642 ,  0.01488335,  0.02011263,  0.00181014,  0.02272727,
        0.00683829,  0.01106195,  0.00985519,  0.00502816,  0.00683829,
        0.00321802,  0.00160901,  0.02393403,  0.00221239,  0.00784393,
        0.00422365,  0.00261464,  0.00120676,  0.00623492,  0.00221239,
        0.00160901,  0.00241352,  0.00482703,  0.00522928,  0.00080451,
        0.00502816,  0.01005631,  0.00563154,  0.00603379,  0.00402253,
        0.00362027,  0.01106195,  0.        ,  0.00482703,  0.        ,
        0.00362027,  0.00683829,  0.00140788,  0.00341915,  0.00261464,
        0.00261464,  0.00221239,  0.00140788,  0.00040225,  0.00623492,
        0.00281577,  0.00281577,  0.00241352,  0.00040225,  0.00221239,
        0.00522928,  0.00120676,  0.00321802,  0.00160901,  0.00301689,
        0.00724055,  0.        ,  0.00160901,  0.        ,  0.00

In [10]:
print("=================")
print("Predicting for submission ...")

# Probability of the positive class for every test row
y_test_p = pipe.predict_proba(X_test)[:, 1]

# One "target" column, rounded to 3 decimals, indexed like the test frame
prediction = pd.DataFrame(
    {"target": np.round(y_test_p, 3)},
    index=test.index,
)
prediction.to_csv("data/submission.csv")
print("Done\n")

# Persist the classifier alone (the preprocessing steps are not included)
externals.joblib.dump(clf,"XGB.pkl")

Predicting for submission ...


ValueError: unknown categorical feature present [ 8  7  4 ..., 11 10  9] during transform.