In [3]:
import pandas as pd
import numpy as np
import xgboost
from sklearn import linear_model, ensemble, tree
from evaluation import *
from data_processing import *
from sklearn import preprocessing, model_selection,pipeline,metrics,externals
from sklearn_pandas.pipeline import TransformerPipeline

In [2]:
# Load the training and test tables; the first CSV column becomes the row index.

print("=================")
print("Loading data ...")

train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/train.csv", "data/test.csv")
)

print("Done\n")

Loading data ...
Done



In [5]:
print("=================")
print("Processing data ...")

# Remove every column whose name contains "calc", then split the target off
# so that train_0 holds features only.
train = train.drop(train.columns[train.columns.str.contains("calc")], axis=1)
train_0 = train.drop("target", axis=1)

# Column name -> positional index in the feature matrix. The selectors built
# in the pipeline cell receive this mapping, so every array passed through
# the pipeline must use exactly this column layout.
name_to_index = {name: position for position, name in enumerate(train_0.columns)}

# Turn DataFrames into arrays for scikit-learn.
X0 = np.array(train_0)
y0 = np.array(train["target"])

# Build the test matrix from the same columns, in the same order, as the
# training features. Previously the full `test` frame (still holding the
# "calc" columns) was converted, so X_test only lined up with name_to_index
# if the column order happened to coincide. A missing column now fails loudly
# with a KeyError instead of silently shifting features.
X_test = np.array(test[train_0.columns])

print("Done\n")

Processing data ...
Done



In [6]:
print("=================")
print("Setting the pipeline ...")

# --- Numeric branch: select columns, fill the -1 marker with the column
# mean, then standardise.
num_selector = filter_num_transform(name_to_index)
num_imputer = preprocessing.Imputer(missing_values=-1, strategy="mean")
num_pipeline = pipeline.Pipeline(steps=[
    ("selector", num_selector),
    ("imputer", num_imputer),
    ("scaler", preprocessing.StandardScaler()),
])

# --- Binary branch: select columns and fill the -1 marker with the most
# frequent value.
bin_selector = filter_bin_transform(name_to_index)
bin_imputer = preprocessing.Imputer(missing_values=-1, strategy="most_frequent")
bin_pipeline = pipeline.Pipeline(steps=[
    ("selector", bin_selector),
    ("imputer", bin_imputer),
])

# --- Categorical branch: select columns, fill the -1 marker with the most
# frequent value, then one-hot encode.
cat_selector = filter_cat_transform(name_to_index)
cat_imputer = preprocessing.Imputer(missing_values=-1, strategy="most_frequent")
cat_pipeline = pipeline.Pipeline(steps=[
    ("selector", cat_selector),
    ("imputer", cat_imputer),
    ("binarizer", preprocessing.OneHotEncoder()),
])

# Concatenate the three branches column-wise into one feature matrix.
preprocessor = pipeline.FeatureUnion(transformer_list=[
    ("num", num_pipeline),
    ("bin", bin_pipeline),
    ("cat", cat_pipeline),
])

# Baseline estimator wrapped behind the shared preprocessing.
# NOTE(review): `pipe` is rebuilt around an XGBClassifier in a later cell,
# so this gradient-boosting pipeline is not the one that gets evaluated.
base = ensemble.GradientBoostingClassifier(verbose=2)
pipe = pipeline.Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("clf", base),
])

# Evaluation setup: 4 stratified folds, scored with gini_scorer on
# predicted probabilities.
cv = model_selection.StratifiedKFold(n_splits=4)
scorer = metrics.make_scorer(gini_scorer, needs_proba=True)
print("Done\n")

Setting the pipeline ...
Done



In [9]:
# Hyper-parameters for the XGBoost classifier used in the final pipeline.
# An alternative configuration tried earlier was learning_rate=0.01,
# n_estimators=300, nthread=4, max_depth=16.
xgb_params = {
    "max_depth": 4,
    "n_estimators": 400,
    "learning_rate": 0.07,
}
clf = xgboost.XGBClassifier(**xgb_params)

# Replace the previous `pipe` with one that feeds the shared preprocessor
# into the XGBoost classifier.
pipe = pipeline.Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("clf", clf),
])
print(clf)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=400, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)


In [10]:
# Score the full pipeline with the stratified folds in `cv` and the Gini
# scorer; folds are evaluated by 4 parallel workers with verbose progress.
cross_val = model_selection.cross_val_score(
    pipe,
    X0,
    y0,
    scoring=scorer,
    cv=cv,
    n_jobs=4,
    verbose=10,
)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ...................... , score=0.27923597421049573, total= 9.1min


[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  9.1min


[CV] ...................... , score=0.28202590412605383, total= 9.1min


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  9.1min remaining:  9.1min


[CV] ...................... , score=0.27047795784995754, total= 9.1min
[CV] ....................... , score=0.2841556422667079, total= 9.1min


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  9.1min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  9.1min finished


In [12]:
# Refit on the whole training set, then report the normalized Gini on those
# same rows. This is an in-sample figure, not a held-out estimate.
pipe.fit(X0, y0)
train_proba = pipe.predict_proba(X0)[:, 1]
print(gini_normalized(y0, train_proba))

0.358067388413


In [13]:
# Importances of the classifier that sits at the end of `pipe`. The classifier
# is fed the output of `preprocessor`, so entries correspond to the
# preprocessed feature columns, not to the raw DataFrame columns.
# NOTE(review): presumably ordered num, bin, then one-hot cat columns as in
# the FeatureUnion — confirm before mapping values back to names.
clf.feature_importances_

array([ 0.04730687,  0.0813253 ,  0.00194897,  0.05439405,  0.04429483,
        0.04518072,  0.12455705,  0.00673281,  0.01612332,  0.11924167,
        0.0504961 ,  0.01913537,  0.00885897,  0.01257973,  0.00673281,
        0.00868179,  0.00017718,  0.00017718,  0.00283487,  0.        ,
        0.00797307,  0.01948972,  0.00248051,  0.00832743,  0.00673281,
        0.00301205,  0.00318923,  0.01328845,  0.        ,  0.02090716,
        0.00212615,  0.00797307,  0.00531538,  0.00230333,  0.00070872,
        0.00673281,  0.00230333,  0.00141743,  0.00194897,  0.00230333,
        0.00478384,  0.00159461,  0.00336641,  0.00868179,  0.00478384,
        0.00690999,  0.0042523 ,  0.00496102,  0.00389794,  0.        ,
        0.00496102,  0.        ,  0.00106308,  0.00141743,  0.00549256,
        0.00035436,  0.00141743,  0.        ,  0.00177179,  0.00141743,
        0.00017718,  0.00017718,  0.00726435,  0.        ,  0.00124026,
        0.00212615,  0.00141743,  0.00283487,  0.00106308,  0.00

In [14]:
print("=================")
print("Predicting for submission ...")

# Take the second probability column for each test row and round it to
# 3 decimals before writing.
# NOTE(review): rounding can create ties between scores — confirm this is
# acceptable for a ranking-based metric.
y_test_p = pipe.predict_proba(X_test)[:, 1]
prediction = pd.DataFrame(
    {"target": np.round(y_test_p, 3)},
    index=test.index,
)
prediction.to_csv("data/submission.csv")
print("Done\n")

# Persist the fitted classifier.
# NOTE(review): only `clf` is saved, not the preprocessing steps of `pipe`;
# a reloaded model therefore expects already-preprocessed features.
externals.joblib.dump(clf, "XGB.pkl")

Predicting for submission ...
Done



['XGB.pkl']