In [38]:
import pandas as pd
import numpy as np

from sklearn import ensemble
from sklearn import model_selection
from sklearn import metrics
from sklearn import preprocessing
from sklearn import decomposition
from sklearn import feature_selection
from sklearn import calibration

import time

# Notebook-wide configuration.
# grid: when truthy, the hyper-parameter grid-search cell further down runs;
#       when falsy, that cell is skipped.
grid = False
# seed: value passed as random_state to the estimators and CV splitter below.
seed = 42

In [39]:
# Read the training and test tables from the local dataset/ directory
# (paths are relative to the notebook's working directory).
df_train, df_test = map(
    pd.read_csv,
    ("dataset/train.csv", "dataset/test.csv"),
)

In [40]:
# Step 1: drop constant columns, i.e. columns in which no row differs from
# the first row.
df_train = df_train.loc[:, (df_train != df_train.iloc[0]).any()]

# Step 2: drop columns whose values repeat an earlier column (first one wins).
# The duplicate check still runs on the transposed frame, but the result is
# applied as a boolean column mask, so the surviving columns keep their
# original dtypes. The previous `.T.drop_duplicates().T` round trip rebuilt
# the frame from the transposed data and upcast every column to one common
# dtype.
df_train = df_train.loc[:, ~df_train.T.duplicated().values]

# Step 3: restrict the test set to the surviving columns (TARGET excluded
# before indexing the test frame).
df_test = df_test[df_train.drop("TARGET", axis=1).columns]

In [41]:
# Labels as a NumPy array.
y_data = df_train["TARGET"].values
# Feature frame without the identifier and the label.
# A non-mutating drop keeps df_train intact, so this cell can be re-run
# (the in-place version raised KeyError on "TARGET" the second time) and
# x_data is an independent frame rather than an alias of df_train.
x_data = df_train.drop(["ID", "TARGET"], axis=1)

In [42]:
# Feature frame for the test set (identifier column removed) ...
x_test = df_test.drop("ID", axis=1)
# ... and the identifiers themselves, kept for the submission file.
id_test = df_test["ID"].values

In [43]:
# Scale every feature column of the training data to unit L2 norm and keep
# the per-column norms that were used.
x_data_norm, column_norms = preprocessing.normalize(
    x_data, axis=0, norm="l2", return_norm=True
)
# Guard against division by zero for an all-zero column.
column_norms = np.where(column_norms == 0, 1.0, column_norms)

# Apply the TRAINING norms to the test data. Normalising the test set with
# its own column norms (as before) scales the same feature by a different
# factor in train and test, so the model would see shifted inputs at
# prediction time.
x_test_norm = np.asarray(x_test, dtype=float) / column_norms

In [44]:
# Project the normalised features onto 5 principal components.
# random_state is pinned to the notebook seed: PCA may select a randomized
# SVD solver depending on the input size, and without a fixed seed the
# components (and everything derived from them) change between runs.
pca = decomposition.PCA(n_components=5, random_state=seed)
# Fit on the training matrix only, then reuse the fitted projection for test.
x_data_pca = pca.fit_transform(x_data_norm)
x_test_pca = pca.transform(x_test_norm)

In [47]:
# Append the five PCA projections to the training features as
# columns pca_0 .. pca_4 (modifies x_data in place).
for component in range(5):
    x_data["pca_%d" % component] = x_data_pca[:, component]

In [49]:
# Append the five PCA projections to the test features as
# columns pca_0 .. pca_4 (modifies x_test in place).
for component in range(5):
    x_test["pca_%d" % component] = x_test_pca[:, component]

In [50]:
# Re-normalise now that the PCA columns are part of the feature frames:
# scale every training column to unit L2 norm and keep the norms used.
x_data_norm, column_norms = preprocessing.normalize(
    x_data, axis=0, norm="l2", return_norm=True
)
# Guard against division by zero for an all-zero column.
column_norms = np.where(column_norms == 0, 1.0, column_norms)

# Scale the test columns with the TRAINING norms so that train and test
# share one scale per feature; normalising the test set on its own norms
# (as before) gives each feature a different factor in the two sets.
x_test_norm = np.asarray(x_test, dtype=float) / column_norms

In [51]:
# Extra-trees classifier used as the importance source for the feature
# selection step that follows; class weights are balanced and the
# notebook seed fixes the randomness.
model_extra_tree = ensemble.ExtraTreesClassifier(
    random_state=seed,
    class_weight="balanced",
)
model_extra_tree_fitted = model_extra_tree.fit(
    x_data_norm,
    y_data,
)

In [52]:
# Wrap the already-fitted extra-trees model in a selector (prefit=True, so
# nothing is refitted here) and reduce both matrices to the selected columns.
model_select_feat = feature_selection.SelectFromModel(
    estimator=model_extra_tree_fitted,
    prefit=True,
)

# The two transforms are independent of each other.
x_test_norm = model_select_feat.transform(x_test_norm)
x_data_norm = model_select_feat.transform(x_data_norm)

In [48]:
# Optional hyper-parameter search for the gradient boosting model.
# Runs only when the `grid` flag from the configuration cell is truthy.
if grid:
    # Seeded like the final model so repeated searches are comparable.
    model_gradient_boosting = ensemble.GradientBoostingClassifier(random_state=seed)

    n_estimators_values = [200, 300, 400, 500]
    learning_rate_values = [0.01, 0.001]
    max_depth_values = [3, 6, 9]

    # Stored under its own name: assigning the dict to `grid` would overwrite
    # the boolean flag that guards this cell.
    param_grid = {"n_estimators": n_estimators_values,
                  "learning_rate": learning_rate_values,
                  "max_depth": max_depth_values
                 }

    # Keyword arguments: newer scikit-learn releases reject a positional
    # `shuffle` argument.
    kfold = model_selection.KFold(n_splits=3, shuffle=True, random_state=seed)

    # NOTE(review): the submission uses predicted probabilities, so a
    # ranking metric may be a better fit than 'accuracy' - confirm against
    # the task's evaluation metric before changing it.
    grid_search = model_selection.GridSearchCV(estimator=model_gradient_boosting, param_grid=param_grid, cv=kfold, scoring='accuracy', n_jobs=8)
    # Search on the same matrix the final model is trained on
    # (normalised + feature-selected), not on the raw feature frame.
    grid_result = grid_search.fit(x_data_norm, y_data)

    print("Accuracy: %f. Best params: %s" % (grid_result.best_score_, grid_result.best_params_))

Melhor Acurácia: 0.960445 utilizando {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}


In [53]:
# Final estimator: gradient boosting with fixed hyper-parameters and the
# notebook seed.
model_gradient_boosting = ensemble.GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=200,
    max_depth=3,
    subsample=0.95,
    random_state=seed,
)
# Wrap it in isotonic probability calibration using 10 CV folds.
model_calibrated = calibration.CalibratedClassifierCV(
    model_gradient_boosting,
    cv=10,
    method='isotonic',
)

# Left as the last expression so the fitted estimator is displayed.
model_calibrated.fit(x_data_norm, y_data)

CalibratedClassifierCV(base_estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              presort='auto', random_state=42, subsample=0.95, verbose=0,
              warm_start=False),
            cv=10, method='isotonic')

In [54]:
# Class probabilities for the test matrix; column 1 is written out as TARGET.
model_predict_proba = model_calibrated.predict_proba(x_test_norm)

# Build the submission table (ID first, then TARGET) and save it without
# the index column.
df_sub = pd.DataFrame({"ID": id_test})
df_sub["TARGET"] = model_predict_proba[:, 1]
df_sub.to_csv("submission1.csv", index=False)