In [3]:
import pandas as pd
import numpy as np
import xgboost
from sklearn import linear_model, ensemble, tree
from evaluation import *
from data_processing import *
from sklearn import preprocessing, model_selection,pipeline,metrics,externals
from sklearn_pandas.pipeline import TransformerPipeline

In [2]:
# Load the train and test sets from CSV; the first column of each file
# becomes the DataFrame index.
print("=================")
print("Loading data ...")

train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/train.csv", "data/test.csv")
)

print("Done\n")

Loading data ...
Done



In [5]:
print("=================")
print("Processing data ...")

# Turn DataFrames into arrays for scikit-learn.
# Every column whose name contains "calc" is removed from the training frame.
train = train.drop(train.columns[train.columns.str.contains("calc")], axis=1)
# Forget the target column in X
train_0 = train.drop("target", axis=1)

# Map each retained feature name to its positional index in the X arrays.
name_to_index = {name: position for position, name in enumerate(train_0.columns)}

X0 = np.array(train_0)
y0 = np.array(train["target"])

# Build the test matrix from exactly the training feature columns, in the
# same order. Previously `test` kept its "calc" columns, so X_test had a
# different width than X0 and `name_to_index` was only valid for it if the
# extra columns happened to come after all retained ones. Selecting by name
# makes the positions match by construction and raises a KeyError if the
# test set is missing a training feature.
X_test = np.array(test[train_0.columns])

print("Done\n")

Processing data ...
Done



In [6]:
print("=================")
print("Setting the pipeline ...")

# In every branch below, -1 is the value the imputers treat as "missing".
# NOTE(review): preprocessing.Imputer is deprecated in newer scikit-learn
# releases in favour of sklearn.impute.SimpleImputer; kept here because the
# notebook targets an older version (it also relies on sklearn.externals).

# Numerical branch: select -> impute with the mean -> standardize
num_selector = filter_num_transform(name_to_index)
num_imputer = preprocessing.Imputer(strategy="mean", missing_values=-1)
num_pipeline = pipeline.Pipeline(steps=[
    ("selector", num_selector),
    ("imputer", num_imputer),
    ("scaler", preprocessing.StandardScaler()),
])

# Binary branch: select -> impute with the most frequent value
bin_selector = filter_bin_transform(name_to_index)
bin_imputer = preprocessing.Imputer(strategy="most_frequent", missing_values=-1)
bin_pipeline = pipeline.Pipeline(steps=[
    ("selector", bin_selector),
    ("imputer", bin_imputer),
])

# Categorical branch: select -> impute with the most frequent value -> one-hot
cat_selector = filter_cat_transform(name_to_index)
cat_imputer = preprocessing.Imputer(strategy="most_frequent", missing_values=-1)
cat_pipeline = pipeline.Pipeline(steps=[
    ("selector", cat_selector),
    ("imputer", cat_imputer),
    ("binarizer", preprocessing.OneHotEncoder()),
])

# Concatenate the outputs of the three branches side by side
preprocessor = pipeline.FeatureUnion(transformer_list=[
    ("num", num_pipeline),
    ("bin", bin_pipeline),
    ("cat", cat_pipeline),
])

# Baseline model: gradient boosting on top of the shared preprocessor
base = ensemble.GradientBoostingClassifier(verbose=2)
pipe = pipeline.Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("clf", base),
])

# Evaluation set-up: 4 stratified folds, scored with gini_scorer on
# predicted probabilities
cv = model_selection.StratifiedKFold(n_splits=4)
scorer = metrics.make_scorer(gini_scorer, needs_proba=True)
print("Done\n")

Setting the pipeline ...
Done



In [7]:
# XGBoost classifier used for the final model.
# Alternative configuration tried earlier:
# clf = xgboost.XGBClassifier(learning_rate=0.01, n_estimators=300, nthread=4, max_depth=16)
clf = xgboost.XGBClassifier(
    learning_rate=0.07,
    n_estimators=400,
    max_depth=4,
)

# Rebind `pipe` so the shared preprocessor now feeds the XGBoost model
pipe = pipeline.Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("clf", clf),
])
print(clf)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)


In [8]:
# Cross-validate the full pipeline (preprocessing is refit inside each fold)
# using the `cv` splitter and `scorer`, with 4 parallel jobs.
cross_val = model_selection.cross_val_score(
    pipe,
    X0,
    y0,
    scoring=scorer,
    cv=cv,
    n_jobs=4,
    verbose=10,
)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ........................ , score=0.278275383498173, total= 1.7min
[CV] ........................ , score=0.280650159281337, total= 1.7min
[CV] ...................... , score=0.26945143863430343, total= 1.7min


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  1.8min remaining:  1.8min


[CV] ....................... , score=0.2593114689307156, total= 1.7min


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  1.8min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  1.8min finished


In [20]:
# Fit the pipeline on the whole training set, then report gini_normalized on
# that same data. This is an in-sample figure, not a validation score.
pipe.fit(X0, y0)
train_positive_proba = pipe.predict_proba(X0)[:, 1]
print(gini_normalized(y0, train_positive_proba))

0.299570483301


In [19]:
# Display the importances of the XGBoost classifier. `clf` is the same object
# that sits in the "clf" step of `pipe`, so this requires `pipe.fit` to have
# been run first.
# NOTE(review): the values presumably correspond to the columns produced by
# the preprocessor (scaled numeric, binary, one-hot categorical), not to the
# raw column names in `name_to_index` — confirm before interpreting them.
clf.feature_importances_

array([ 0.035499  ,  0.11989284,  0.        ,  0.06028131,  0.05291359,
        0.04219692,  0.06363028,  0.00401875,  0.00468855,  0.09845947,
        0.05090422,  0.01004689,  0.00468855,  0.00334896,  0.00401875,
        0.00133958,  0.00200938,  0.00133958,  0.00468855,  0.00535834,
        0.00334896,  0.00401875,  0.00200938,  0.00066979,  0.00468855,
        0.00602813,  0.02478232,  0.01540522,  0.00535834,  0.00803751,
        0.        ,  0.        ,  0.00200938,  0.        ,  0.01607502,
        0.06229069,  0.        ,  0.00133958,  0.        ,  0.        ,
        0.00133958,  0.        ,  0.00133958,  0.00334896,  0.0087073 ,
        0.        ,  0.00133958,  0.00468855,  0.        ,  0.07702612,
        0.        ,  0.01473543,  0.00468855,  0.00200938,  0.        ,
        0.00803751,  0.        ,  0.        ,  0.        ,  0.        ,
        0.00334896,  0.        ,  0.00468855,  0.0174146 ,  0.        ,
        0.00736772,  0.00200938,  0.00133958,  0.00133958,  0.  

In [21]:
print("=================")
print("Predicting for submission ...")

# Probability of the positive class for each test row
y_test_p = pipe.predict_proba(X_test)[:,1]

# Single "target" column keyed by the test index.
# NOTE(review): probabilities are rounded to 3 decimals before writing; this
# can introduce ties if the submission is scored with a rank-based metric.
prediction = pd.DataFrame({"target": np.round(y_test_p, 3)}, index=test.index)
prediction.to_csv("data/submission.csv")
print("Done\n")

# Persist the classifier. Kept as the last expression so the list of written
# files is displayed as the cell output.
# NOTE(review): only `clf` is saved, not the preprocessing steps of `pipe`.
externals.joblib.dump(clf, "XGB.pkl")

Predicting for submission ...
Done



['XGB.pkl']