In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn import metrics

import matplotlib.pylab as plt
%matplotlib inline

In [8]:
train = pd.read_csv('./train_modified/train_modified_1.csv')
target='Disbursed' # Disbursed的值就是二元分类的输出
IDcol = 'ID'
train['Disbursed'].value_counts() 

Disbursed
0    19680
1      320
Name: count, dtype: int64

In [10]:
x_columns = [x for x in train.columns if x not in [target, IDcol]]
X = train[x_columns]
y = train['Disbursed']

In [16]:
rf0 = RandomForestClassifier(oob_score=True, random_state=10)
rf0.fit(X,y)
print (rf0.oob_score_)
y_predprob = rf0.predict_proba(X)[:,1]
print ("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))

0.98315
AUC Score (Train): 0.999994


In [26]:
rf1 = RandomForestClassifier(n_estimators= 60, max_depth=13, min_samples_split=110,
                                  min_samples_leaf=20,max_features='sqrt' ,oob_score=True, random_state=10)
rf1.fit(X,y)
print (rf1.oob_score_)
y_predprob = rf1.predict_proba(X)[:,1]
print ("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))

0.984
AUC Score (Train): 0.928755


In [27]:
rf2 = RandomForestClassifier(n_estimators= 60, max_depth=13, min_samples_split=120,
                                  min_samples_leaf=20,max_features=7 ,oob_score=True, random_state=10)
rf2.fit(X,y)
print (rf2.oob_score_)
y_predprob = rf2.predict_proba(X)[:,1]
print ("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))

0.984
AUC Score (Train): 0.926915


In [30]:
param_test1 = {'n_estimators':[10,20,30,40,50,60,70]}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(min_samples_split=100,
                                  min_samples_leaf=20,max_depth=8,max_features='sqrt' ,random_state=10), 
                       param_grid = param_test1, scoring='roc_auc',cv=5)
gsearch1.fit(X,y)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

({'mean_fit_time': array([0.09838862, 0.15245118, 0.22026167, 0.29403601, 0.36355453,
         0.4313941 , 0.50080442]),
  'std_fit_time': array([0.01477097, 0.00942052, 0.00221559, 0.00400959, 0.00896303,
         0.00424901, 0.00383059]),
  'mean_score_time': array([0.00930533, 0.01120896, 0.01373477, 0.01822448, 0.02227917,
         0.02318182, 0.02554889]),
  'std_score_time': array([0.00133327, 0.00103048, 0.00123022, 0.00099286, 0.00251708,
         0.00250156, 0.00120107]),
  'param_n_estimators': masked_array(data=[10, 20, 30, 40, 50, 60, 70],
               mask=[False, False, False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'n_estimators': 10},
   {'n_estimators': 20},
   {'n_estimators': 30},
   {'n_estimators': 40},
   {'n_estimators': 50},
   {'n_estimators': 60},
   {'n_estimators': 70}],
  'split0_test_score': array([0.81797431, 0.82673558, 0.8370927 , 0.83676321, 0.8351753 ,
         0.83643769, 0.83286093]),
  'spl

In [41]:
param_test2 = {'max_depth':[5,7,9,11,13,15], 'min_samples_split':[50,70,90,110,130,150,170,190]}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 60, 
                                  min_samples_leaf=20,max_features='sqrt' ,oob_score=True, random_state=10),
   param_grid = param_test2, scoring='roc_auc', cv=5)
gsearch2.fit(X,y)
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

({'mean_fit_time': array([0.49129376, 0.47828097, 0.47687159, 0.48589921, 0.49862442,
         0.47429762, 0.46076512, 0.47128286, 0.55039992, 0.53791513,
         0.53007936, 0.53249183, 0.53824148, 0.57704897, 0.55669217,
         0.56419601, 0.63999052, 0.61052275, 0.60992126, 0.60951157,
         0.60203562, 0.60724201, 0.62559657, 0.62187357, 0.67820683,
         0.67297196, 0.64773517, 0.62646928, 0.6014976 , 0.60586019,
         0.60852742, 0.66621413, 0.65892005, 0.67895923, 0.65724907,
         0.6762991 , 0.68043671, 0.61486492, 0.62273846, 0.61553617,
         0.64437981, 0.64706955, 0.69526682, 0.7267612 , 0.88064709,
         1.16829796, 1.18097062, 1.17360916]),
  'std_fit_time': array([0.03211026, 0.02228559, 0.01229285, 0.03175376, 0.0221795 ,
         0.0204363 , 0.01104086, 0.01611919, 0.02673691, 0.02136254,
         0.01380047, 0.00981869, 0.0105582 , 0.05252355, 0.02076915,
         0.02629949, 0.03519757, 0.01986391, 0.03081263, 0.02344248,
         0.0314626 , 0.

In [36]:
param_test3 = {'min_samples_split':[80,100,120,140], 'min_samples_leaf':[10,20,30,40,50]}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 60, max_depth=13,
                                  max_features='sqrt' ,oob_score=True, random_state=10),
   param_grid = param_test3, scoring='roc_auc', cv=5)
gsearch3.fit(X,y)
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

({'mean_fit_time': array([0.68342915, 0.65838799, 0.66302476, 0.68330107, 0.63549919,
         0.59536963, 0.59253755, 0.5832088 , 0.58364491, 0.62193518,
         0.62465463, 0.60838799, 0.59512801, 0.60320954, 0.59693599,
         0.57658706, 0.57149906, 0.58114705, 0.5858007 , 0.57308121]),
  'std_fit_time': array([0.02604363, 0.02284157, 0.0301389 , 0.04203803, 0.0169528 ,
         0.00890089, 0.0067608 , 0.00484318, 0.00505838, 0.03341658,
         0.03870672, 0.02806758, 0.01074748, 0.02959104, 0.01161059,
         0.00542265, 0.01176887, 0.013596  , 0.00834515, 0.01647223]),
  'mean_score_time': array([0.03102241, 0.02915578, 0.02774835, 0.02720642, 0.02812848,
         0.02626357, 0.02551064, 0.02574511, 0.0270555 , 0.02787118,
         0.02725344, 0.02661486, 0.02621989, 0.02959027, 0.02509928,
         0.02730293, 0.02560201, 0.02467108, 0.0272172 , 0.02562599]),
  'std_score_time': array([0.00245529, 0.00171689, 0.0026804 , 0.00130649, 0.00341559,
         0.00219389, 0.0018

In [37]:
param_test4 = {'max_features':[3,5,7,9]}
gsearch4 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 60, max_depth=13, min_samples_split=120,
                                  min_samples_leaf=20 ,oob_score=True, random_state=10),
   param_grid = param_test4, scoring='roc_auc', cv=5)
gsearch4.fit(X,y)
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

({'mean_fit_time': array([0.41274347, 0.52686086, 0.62125111, 0.6905858 ]),
  'std_fit_time': array([0.00339065, 0.01414571, 0.02077714, 0.01048934]),
  'mean_score_time': array([0.02423925, 0.02845879, 0.02698178, 0.02693725]),
  'std_score_time': array([0.0025778 , 0.00354095, 0.00186718, 0.00186767]),
  'param_max_features': masked_array(data=[3, 5, 7, 9],
               mask=[False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'max_features': 3},
   {'max_features': 5},
   {'max_features': 7},
   {'max_features': 9}],
  'split0_test_score': array([0.81893102, 0.82697972, 0.83293834, 0.81775994]),
  'split1_test_score': array([0.79912387, 0.79626763, 0.79838748, 0.80414563]),
  'split2_test_score': array([0.78474935, 0.77782211, 0.80341916, 0.78332023]),
  'split3_test_score': array([0.84169565, 0.83561793, 0.8346374 , 0.83346434]),
  'split4_test_score': array([0.85455967, 0.8452466 , 0.85494276, 0.84648517]),
  'mean_test_score': array(