In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics, cross_validation
from sklearn.grid_search import GridSearchCV

%matplotlib inline

In [31]:
train = pd.read_csv('data/train_modified.csv')
target = 'Disbursed'
IDcol = 'ID'

train[target].value_counts()

0    19680
1      320
Name: Disbursed, dtype: int64

In [32]:
# prepare the data
x_columns = [x for x in train.columns if x not in [target, IDcol]]
X = train[x_columns]
Y = train[target]

In [33]:
# 先用默认的方法拟合数据
gbm0 = GradientBoostingClassifier(random_state=10)
gbm0.fit(X, Y)
y_pred = gbm0.predict(X)
y_predprob = gbm0.predict_proba(X)[:, 1]
print("accuracy = %.4g" % metrics.accuracy_score(Y, y_pred))
print("prob = %f" % metrics.roc_auc_score(Y, y_predprob))

accuracy = 0.9852
prob = 0.900531


In [35]:
# 首先设置一个较小的时间步长来搜索最好的迭代次数
param_test1 = {'n_estimators':np.arange(20, 81, 10)}
gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1, min_samples_split=300
            , min_samples_leaf=20, max_depth=8, max_features='sqrt',subsample=0.8, random_state=10)
            , param_grid=param_test1, scoring='roc_auc', iid=False, cv=5)
gsearch1.fit(X, Y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.81285, std: 0.01967, params: {'n_estimators': 20},
  mean: 0.81438, std: 0.01947, params: {'n_estimators': 30},
  mean: 0.81404, std: 0.01970, params: {'n_estimators': 40},
  mean: 0.81593, std: 0.01868, params: {'n_estimators': 50},
  mean: 0.81927, std: 0.01596, params: {'n_estimators': 60},
  mean: 0.81722, std: 0.01750, params: {'n_estimators': 70},
  mean: 0.81485, std: 0.01732, params: {'n_estimators': 80}],
 {'n_estimators': 60},
 0.8192660696138212)

In [38]:
param_test2 = {'max_depth': np.arange(3, 14, 2), 'min_samples_split': np.arange(100, 801, 200)}
gsearch2 = gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1
            , n_estimators=60, min_samples_leaf=20, max_features='sqrt',subsample=0.8, random_state=10)
            , param_grid=param_test2, scoring='roc_auc', iid=False, cv=5)
gsearch2.fit(X, Y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_


([mean: 0.81199, std: 0.02073, params: {'max_depth': 3, 'min_samples_split': 100},
  mean: 0.81267, std: 0.01985, params: {'max_depth': 3, 'min_samples_split': 300},
  mean: 0.81238, std: 0.01937, params: {'max_depth': 3, 'min_samples_split': 500},
  mean: 0.80925, std: 0.02051, params: {'max_depth': 3, 'min_samples_split': 700},
  mean: 0.81846, std: 0.01843, params: {'max_depth': 5, 'min_samples_split': 100},
  mean: 0.81630, std: 0.01810, params: {'max_depth': 5, 'min_samples_split': 300},
  mean: 0.81315, std: 0.01898, params: {'max_depth': 5, 'min_samples_split': 500},
  mean: 0.81262, std: 0.02090, params: {'max_depth': 5, 'min_samples_split': 700},
  mean: 0.81807, std: 0.02004, params: {'max_depth': 7, 'min_samples_split': 100},
  mean: 0.82137, std: 0.01733, params: {'max_depth': 7, 'min_samples_split': 300},
  mean: 0.81681, std: 0.01786, params: {'max_depth': 7, 'min_samples_split': 500},
  mean: 0.81383, std: 0.02327, params: {'max_depth': 7, 'min_samples_split': 700},
  me

In [41]:
param_test3 = {'min_samples_leaf': np.arange(60, 101, 10), 'min_samples_split': np.arange(800, 1901, 200)}
gsearch3 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1
            , n_estimators=60, max_depth=7, max_features='sqrt',subsample=0.8, random_state=10)
            , param_grid=param_test3, scoring='roc_auc', iid=False, cv=5)
gsearch3.fit(X, Y)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_


([mean: 0.81828, std: 0.02251, params: {'min_samples_leaf': 60, 'min_samples_split': 800},
  mean: 0.81731, std: 0.02344, params: {'min_samples_leaf': 60, 'min_samples_split': 1000},
  mean: 0.82220, std: 0.02250, params: {'min_samples_leaf': 60, 'min_samples_split': 1200},
  mean: 0.81447, std: 0.02125, params: {'min_samples_leaf': 60, 'min_samples_split': 1400},
  mean: 0.81495, std: 0.01626, params: {'min_samples_leaf': 60, 'min_samples_split': 1600},
  mean: 0.81528, std: 0.02140, params: {'min_samples_leaf': 60, 'min_samples_split': 1800},
  mean: 0.81590, std: 0.02517, params: {'min_samples_leaf': 70, 'min_samples_split': 800},
  mean: 0.81573, std: 0.02207, params: {'min_samples_leaf': 70, 'min_samples_split': 1000},
  mean: 0.82021, std: 0.02521, params: {'min_samples_leaf': 70, 'min_samples_split': 1200},
  mean: 0.81512, std: 0.01995, params: {'min_samples_leaf': 70, 'min_samples_split': 1400},
  mean: 0.81395, std: 0.02081, params: {'min_samples_leaf': 70, 'min_samples_split

In [42]:
gbm1 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=60, max_depth=7,
        min_samples_leaf=60, min_samples_split=1200, max_features='sqrt', subsample=0.8, random_state=10)
gbm1.fit(X, Y)
y_pred = gbm1.predict(X)
y_predprob = gbm1.predict_proba(X)[:, 1]
print("accuracy = %.4g" % metrics.accuracy_score(Y, y_pred))
print("prob = %f" % metrics.roc_auc_score(Y, y_predprob))

accuracy = 0.984
prob = 0.908099


In [43]:
param_test4 = {'max_features': np.arange(7, 20, 2)}
gsearch4 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1
            , n_estimators=60, max_depth=7, subsample=0.8, random_state=10)
            , param_grid=param_test4, scoring='roc_auc', iid=False, cv=5)
gsearch4.fit(X, Y)
gsearch4.grid_scores_, gsearch3.best_params_, gsearch3.best_score_


([mean: 0.81312, std: 0.01925, params: {'max_features': 7},
  mean: 0.80756, std: 0.02441, params: {'max_features': 9},
  mean: 0.80348, std: 0.02478, params: {'max_features': 11},
  mean: 0.80439, std: 0.01530, params: {'max_features': 13},
  mean: 0.79978, std: 0.02228, params: {'max_features': 15},
  mean: 0.80019, std: 0.01149, params: {'max_features': 17},
  mean: 0.80774, std: 0.01601, params: {'max_features': 19}],
 {'min_samples_leaf': 60, 'min_samples_split': 1200},
 0.8222032996697154)

In [44]:
param_test5 = {'subsample': [0.6, 0.7, 0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1
            , n_estimators=60, max_depth=7, min_samples_split=1200, min_samples_leaf=60, random_state=10)
            , param_grid=param_test5, scoring='roc_auc', iid=False, cv=5)
gsearch5.fit(X, Y)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_


([mean: 0.80741, std: 0.01607, params: {'subsample': 0.6},
  mean: 0.81122, std: 0.01778, params: {'subsample': 0.7},
  mean: 0.81192, std: 0.01814, params: {'subsample': 0.75},
  mean: 0.81053, std: 0.01817, params: {'subsample': 0.8},
  mean: 0.81075, std: 0.01675, params: {'subsample': 0.85},
  mean: 0.81147, std: 0.01636, params: {'subsample': 0.9}],
 {'subsample': 0.75},
 0.8119168413363822)

In [45]:
gbm2 = GradientBoostingClassifier(learning_rate=0.05, n_estimators=120, max_depth=7,
        min_samples_leaf=60, min_samples_split=1200, max_features='sqrt', subsample=0.75, random_state=10)
gbm2.fit(X, Y)
y_pred = gbm2.predict(X)
y_predprob = gbm2.predict_proba(X)[:, 1]
print("accuracy = %.4g" % metrics.accuracy_score(Y, y_pred))
print("prob = %f" % metrics.roc_auc_score(Y, y_predprob))


accuracy = 0.984
prob = 0.904292


In [46]:
gbm3 = GradientBoostingClassifier(learning_rate=0.01, n_estimators=600, max_depth=7,
        min_samples_leaf=60, min_samples_split=1200, max_features='sqrt', subsample=0.75, random_state=10)
gbm3.fit(X, Y)
y_pred = gbm3.predict(X)
y_predprob = gbm3.predict_proba(X)[:, 1]
print("accuracy = %.4g" % metrics.accuracy_score(Y, y_pred))
print("prob = %f" % metrics.roc_auc_score(Y, y_predprob))


accuracy = 0.984
prob = 0.907265


In [47]:
gbm4 = GradientBoostingClassifier(learning_rate=0.005, n_estimators=1200, max_depth=7,
        min_samples_leaf=60, min_samples_split=1200, max_features='sqrt', subsample=0.75, random_state=10)
gbm4.fit(X, Y)
y_pred = gbm4.predict(X)
y_predprob = gbm4.predict_proba(X)[:, 1]
print("accuracy = %.4g" % metrics.accuracy_score(Y, y_pred))
print("prob = %f" % metrics.roc_auc_score(Y, y_predprob))



accuracy = 0.984
prob = 0.906993
