In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV # 能够在指定的范围内自动搜索具有不同超参数的不同模型组合
from sklearn import cross_validation, metrics # metrics用来计算真实值与预测值之间的预测误差

import matplotlib.pylab as plt
%matplotlib inline

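# Tree-ensemble estimators available in sklearn.ensemble
# (only RandomForestClassifier is imported above):
# RandomForestClassifier
# RandomForestRegressor
# ExtraTreesClassifier
# ExtraTreesRegressor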

In [3]:
# Read the training data from disk.
train = pd.read_csv("train_modified.csv")
# `target` names the column holding the binary classification label;
# `IDcol` names the other column that is kept out of the feature set.
target, IDcol = 'Disbursed', 'ID'
# Last expression of the cell: per-class row counts of the label column.
train[target].value_counts()

0    19680
1      320
Name: Disbursed, dtype: int64

In [7]:
# Separate the sample features from the class labels: every column except
# the label column and the ID column is treated as a feature.
x_columns = [col for col in train.columns if col != target and col != IDcol]
X = train[x_columns]
y = train[target]

In [8]:
# Baseline: fit a forest with default hyper-parameters (only out-of-bag
# scoring and the random seed are set) and look at what it reports.
rf0 = RandomForestClassifier(random_state=10, oob_score=True).fit(X, y)
print(rf0.oob_score_)
# Column 1 of predict_proba is taken as the score for label 1 (labels are 0/1).
# The AUC below is computed on the same rows the model was fitted on.
y_predprob = rf0.predict_proba(X)[:, 1]
train_auc = metrics.roc_auc_score(y, y_predprob)
print("AUC Score (Train): %f" % train_auc)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=True, random_state=10, verbose=0, warm_start=False)

0.98005
AUC Score (Train): 0.999833


In [16]:
# Step 1: grid-search n_estimators over a fixed range while the other
# hyper-parameters stay at the starting values given below. Candidates are
# scored with ROC AUC under cv=5 cross-validation.
param_test1 = {'n_estimators': list(range(10, 71, 10))}  # 10, 20, ..., 70
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(
        random_state=10,
        max_features='sqrt',
        max_depth=8,
        min_samples_leaf=20,
        min_samples_split=100),
    param_grid=param_test1,
    cv=5,
    scoring='roc_auc').fit(X, y)
# Cell output: score of every candidate, the best setting, and the best score.
(gsearch1.grid_scores_,
 gsearch1.best_params_,
 gsearch1.best_score_)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=10, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 20, 30, 40, 50, 60, 70]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

([mean: 0.80681, std: 0.02236, params: {'n_estimators': 10},
  mean: 0.81600, std: 0.03275, params: {'n_estimators': 20},
  mean: 0.81818, std: 0.03136, params: {'n_estimators': 30},
  mean: 0.81838, std: 0.03118, params: {'n_estimators': 40},
  mean: 0.82034, std: 0.03001, params: {'n_estimators': 50},
  mean: 0.82113, std: 0.02966, params: {'n_estimators': 60},
  mean: 0.81992, std: 0.02836, params: {'n_estimators': 70}],
 {'n_estimators': 60},
 0.8211334476626017)

In [17]:
# Step 2: grid-search the maximum tree depth (max_depth) together with the
# minimum number of samples needed to split an internal node
# (min_samples_split), with n_estimators held at 60.
param_test2 = {
    'max_depth': list(range(3, 14, 2)),            # 3, 5, ..., 13
    'min_samples_split': list(range(50, 191, 20))  # 50, 70, ..., 190
}
gsearch2 = GridSearchCV(
    estimator=RandomForestClassifier(
        random_state=10,
        oob_score=True,
        max_features='sqrt',
        min_samples_leaf=20,
        n_estimators=60),
    param_grid=param_test2,
    cv=5,
    iid=False,
    scoring='roc_auc').fit(X, y)
# Cell output: score of every candidate, the best setting, and the best score.
(gsearch2.grid_scores_,
 gsearch2.best_params_,
 gsearch2.best_score_)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=1,
            oob_score=True, random_state=10, verbose=0, warm_start=False),
       fit_params={}, iid=False, n_jobs=1,
       param_grid={'max_depth': [3, 5, 7, 9, 11, 13], 'min_samples_split': [50, 70, 90, 110, 130, 150, 170, 190]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

([mean: 0.79379, std: 0.02347, params: {'max_depth': 3, 'min_samples_split': 50},
  mean: 0.79339, std: 0.02410, params: {'max_depth': 3, 'min_samples_split': 70},
  mean: 0.79350, std: 0.02462, params: {'max_depth': 3, 'min_samples_split': 90},
  mean: 0.79367, std: 0.02493, params: {'max_depth': 3, 'min_samples_split': 110},
  mean: 0.79387, std: 0.02521, params: {'max_depth': 3, 'min_samples_split': 130},
  mean: 0.79373, std: 0.02524, params: {'max_depth': 3, 'min_samples_split': 150},
  mean: 0.79378, std: 0.02532, params: {'max_depth': 3, 'min_samples_split': 170},
  mean: 0.79349, std: 0.02542, params: {'max_depth': 3, 'min_samples_split': 190},
  mean: 0.80960, std: 0.02602, params: {'max_depth': 5, 'min_samples_split': 50},
  mean: 0.80920, std: 0.02629, params: {'max_depth': 5, 'min_samples_split': 70},
  mean: 0.80888, std: 0.02522, params: {'max_depth': 5, 'min_samples_split': 90},
  mean: 0.80923, std: 0.02777, params: {'max_depth': 5, 'min_samples_split': 110},
  mean: 0.

In [19]:
# Out-of-bag score of the model with the settings chosen so far.
rf1 = RandomForestClassifier(
    random_state=10,
    oob_score=True,
    max_features='sqrt',
    min_samples_leaf=20,
    min_samples_split=110,
    max_depth=13,
    n_estimators=60).fit(X, y)
rf1.oob_score_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=13, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=110,
            min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=1,
            oob_score=True, random_state=10, verbose=0, warm_start=False)

0.984

In [20]:
# Step 3: tune the minimum samples needed to split an internal node
# (min_samples_split) jointly with the minimum samples required at a leaf
# (min_samples_leaf), with n_estimators=60 and max_depth=13 held fixed.
param_test3 = {
    'min_samples_split': list(range(80, 141, 20)),  # 80, 100, 120, 140
    'min_samples_leaf': list(range(10, 51, 10))     # 10, 20, ..., 50
}
gsearch3 = GridSearchCV(
    estimator=RandomForestClassifier(
        random_state=10,
        oob_score=True,
        max_features='sqrt',
        max_depth=13,
        n_estimators=60),
    param_grid=param_test3,
    cv=5,
    iid=False,
    scoring='roc_auc').fit(X, y)
# Cell output: score of every candidate, the best setting, and the best score.
(gsearch3.grid_scores_,
 gsearch3.best_params_,
 gsearch3.best_score_)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=13, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=1,
            oob_score=True, random_state=10, verbose=0, warm_start=False),
       fit_params={}, iid=False, n_jobs=1,
       param_grid={'min_samples_split': [80, 100, 120, 140], 'min_samples_leaf': [10, 20, 30, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

([mean: 0.82093, std: 0.02287, params: {'min_samples_leaf': 10, 'min_samples_split': 80},
  mean: 0.81913, std: 0.02141, params: {'min_samples_leaf': 10, 'min_samples_split': 100},
  mean: 0.82048, std: 0.02328, params: {'min_samples_leaf': 10, 'min_samples_split': 120},
  mean: 0.81798, std: 0.02099, params: {'min_samples_leaf': 10, 'min_samples_split': 140},
  mean: 0.82094, std: 0.02535, params: {'min_samples_leaf': 20, 'min_samples_split': 80},
  mean: 0.82097, std: 0.02327, params: {'min_samples_leaf': 20, 'min_samples_split': 100},
  mean: 0.82487, std: 0.02110, params: {'min_samples_leaf': 20, 'min_samples_split': 120},
  mean: 0.82169, std: 0.02406, params: {'min_samples_leaf': 20, 'min_samples_split': 140},
  mean: 0.82352, std: 0.02271, params: {'min_samples_leaf': 30, 'min_samples_split': 80},
  mean: 0.82164, std: 0.02381, params: {'min_samples_leaf': 30, 'min_samples_split': 100},
  mean: 0.82070, std: 0.02528, params: {'min_samples_leaf': 30, 'min_samples_split': 120},
  

In [21]:
# Step 4: finally, tune the maximum number of features considered
# (max_features) with all previously chosen settings held fixed.
param_test4 = {'max_features': list(range(3, 10, 2))}  # 3, 5, 7, 9
gsearch4 = GridSearchCV(
    estimator=RandomForestClassifier(
        random_state=10,
        oob_score=True,
        min_samples_leaf=20,
        min_samples_split=120,
        max_depth=13,
        n_estimators=60),
    param_grid=param_test4,
    cv=5,
    iid=False,
    scoring='roc_auc').fit(X, y)
# Cell output: score of every candidate, the best setting, and the best score.
(gsearch4.grid_scores_,
 gsearch4.best_params_,
 gsearch4.best_score_)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=13, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=120,
            min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=1,
            oob_score=True, random_state=10, verbose=0, warm_start=False),
       fit_params={}, iid=False, n_jobs=1,
       param_grid={'max_features': [3, 5, 7, 9]}, pre_dispatch='2*n_jobs',
       refit=True, scoring='roc_auc', verbose=0)

([mean: 0.81981, std: 0.02586, params: {'max_features': 3},
  mean: 0.81639, std: 0.02533, params: {'max_features': 5},
  mean: 0.82487, std: 0.02110, params: {'max_features': 7},
  mean: 0.81704, std: 0.02209, params: {'max_features': 9}],
 {'max_features': 7},
 0.8248650279471544)

In [23]:
# Fit the final model with the best parameters found by the searches and
# look at its out-of-bag score.
rf2 = RandomForestClassifier(
    random_state=10,
    oob_score=True,
    max_features=7,
    min_samples_leaf=20,
    min_samples_split=120,
    max_depth=13,
    n_estimators=60).fit(X, y)
rf2.oob_score_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=13, max_features=7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=120,
            min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=1,
            oob_score=True, random_state=10, verbose=0, warm_start=False)

0.984