In [2]:
import pandas as pd
import pprint
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.metrics import accuracy_score

In [5]:
df_train = pd.read_csv('../dataset/train_modified.csv')
df_test = pd.read_csv('../dataset/test_modified.csv')
target_var = 'Attrition'
predictor = [x for x in df_train.columns if x != target_var]

In [7]:
validation_size = 0.3
seed = 7
scoring = 'accuracy'
X_train, X_test, y_train, y_test = train_test_split(
    df_train[predictor],
    df_train[target_var],
    test_size=validation_size,
    random_state=seed)
kfold = StratifiedKFold(n_splits=10, random_state=seed)

In [8]:
def cross_val(model, X_train, y_train, X_test, y_test, kfold):
    cv_results = cross_val_score(
        model, X_train, y_train, cv=kfold, scoring=scoring)
    print('cv-mean: %.4f, cv-std: %.4f' % (cv_results.mean(),
                                           cv_results.std()))
    model.fit(X_train, y_train)
    train_result = model.predict(X_train)
    pred_result = model.predict(X_test)
    train_score = accuracy_score(y_train, train_result)
    pred_score = accuracy_score(y_test, pred_result)
    print('训练集分数:  %.4f' % train_score)
    print('测试集分数： %.4f' % pred_score)
    return

In [9]:
model = LogisticRegression()
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1],
    'tol': [1e-6, 1e-5, 1e-4],
    'random_state': [1, 2, 3, 4, 5]
}

print('原始模型:')
cross_val(model, X_train, y_train, X_test, y_test, kfold)

grid_search = GridSearchCV(
    estimator=model, param_grid=parameters, scoring=scoring, cv=kfold).fit(
        X_train, y_train)
print('优化模型:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

原始模型:
cv-mean: 0.8859, cv-std: 0.0278
训练集分数:  0.9039
测试集分数： 0.8394
优化模型:
最佳参数: {'C': 1, 'penalty': 'l1', 'random_state': 1, 'tol': 0.0001}
最佳得分: 0.8870
cv-mean: 0.8872, cv-std: 0.0327
训练集分数:  0.9052
测试集分数： 0.8515


## 随机森林的调参过程
### 默认参数的模型性能

In [129]:
model = RandomForestClassifier()
model.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)>

In [130]:
model = RandomForestClassifier(random_state=seed)
print('原始模型:')
cross_val(model, X_train, y_train, X_test, y_test, kfold)

原始模型:
cv-mean: 0.8558, cv-std: 0.0421
训练集分数:  0.9857
测试集分数： 0.8303


### 确定最佳的n_estimators

In [131]:
parameters = {
    'n_estimators': range(10, 110, 10)
}

grid_search = GridSearchCV(
    estimator=model, param_grid=parameters, scoring=scoring, cv=kfold).fit(
        X_train, y_train)

print('优化模型:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

优化模型:
最佳参数: {'n_estimators': 60}
最佳得分: 0.8675
cv-mean: 0.8675, cv-std: 0.0478
训练集分数:  1.0000
测试集分数： 0.8455


### 确定最大深度max_depth和min_samples_split

In [132]:
parameters = {'max_depth':range(3,14,2)#, 'min_samples_split':range(50,201,20)
             }

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=seed, n_estimators=60),
    param_grid=parameters,
    scoring=scoring, cv=kfold).fit(X_train, y_train)
print('优化模型:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

优化模型:
最佳参数: {'max_depth': 13}
最佳得分: 0.8662
cv-mean: 0.8662, cv-std: 0.0468
训练集分数:  1.0000
测试集分数： 0.8485


In [133]:
parameters = {'max_depth':[13, 14, 15]#, 'min_samples_split':[50, 55, 60]
             }

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=seed, n_estimators=60),
    param_grid=parameters,
    scoring=scoring, cv=kfold).fit(X_train, y_train)
print('优化模型:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

优化模型:
最佳参数: {'max_depth': 14}
最佳得分: 0.8688
cv-mean: 0.8688, cv-std: 0.0473
训练集分数:  1.0000
测试集分数： 0.8424


### 确定min_samples_split和min_samples_leaf

In [134]:
parameters = {'min_samples_split':range(2,20), 'min_samples_leaf':range(1,10)}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=seed, n_estimators=60, max_depth=14), param_grid=parameters, scoring=scoring, cv=kfold).fit(
        X_train, y_train)
print('优化模型:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

KeyboardInterrupt: 

### 确定max_features

In [None]:
parameters = {
    'max_features': range(3, 10)
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=seed, n_estimators=60, max_depth=14,
                                    min_samples_leaf=4, min_samples_split=12), param_grid=parameters, scoring=scoring, cv=kfold).fit(
        X_train, y_train)
print('优化模型:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

通过对随机森林的参数调节，其结果甚至没有最优的logistic回归性能好。

## xgboost调参 

In [52]:
xgb.XGBClassifier()

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

### 设定基础参数

In [124]:
from sklearn.metrics import roc_auc_score

In [125]:
def modelfit(alg, X_train, y_train, X_test, y_test,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train, label=y_train)
        xgtest = xgb.DMatrix(X_test)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], folds=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(X_train, y_train,eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(X_train)
    dtrain_predprob = alg.predict_proba(X_train)[:,1]
        
    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % accuracy_score(y_train, dtrain_predictions))
    print ("AUC Score (Train): %f" % roc_auc_score(y_train, dtrain_predprob))
    
#     Predict on testing data:
    result = alg.predict_proba(X_test)[:,1]
    print ('AUC Score (Test): %f' % roc_auc_score(y_test, result))
                
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [126]:
modelfit(gbm, X_train, y_train, X_test, y_test, cv_folds=kfold)


Model Report
Accuracy : 0.9961
AUC Score (Train): 0.999987
AUC Score (Test): 0.740979


TypeError: 'str' object is not callable

# gbm = xgb.XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'reg:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=seed)

print('xgboost基础模型:')
cross_val(gbm, X_train, y_train, X_test, y_test, kfold)

### 确定学习率和对应的n_estimators

In [101]:
parameters = {'n_estimators': range(100, 550, 50)}
grid_search = GridSearchCV(
    estimator=gbm, param_grid=parameters, scoring=scoring, cv=kfold).fit(
        X_train, y_train)
print('确定learning_rate=0.1时的n_estimators:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

优化模型:
最佳参数: {'n_estimators': 150}
最佳得分: 0.8792
cv-mean: 0.8792, cv-std: 0.0439
测试集分数： 0.8364


### max_depth 和 min_weight 参数调优

In [116]:
parameters = {
 'max_depth':range(1,15,2),
 'min_child_weight':range(1,6,2)
}
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective= 'reg:logistic',
    learning_rate =0.1,
    n_estimators=150), param_grid=parameters, scoring=scoring, cv=kfold).fit(
        X_train, y_train)
print('确定max_depth 和 min_weight:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

确定max_depth 和 min_weight:
最佳参数: {'max_depth': 3, 'min_child_weight': 1}
最佳得分: 0.8818
cv-mean: 0.8818, cv-std: 0.0417
测试集分数： 0.8273


In [117]:
parameters = {
 'min_child_weight':[6, 7, 8, 9]
}
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective= 'reg:logistic',
    learning_rate =0.1,
    n_estimators=150, max_depth=3), param_grid=parameters, scoring=scoring, cv=kfold).fit(
        X_train, y_train)
print('确定min_weight:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

确定min_weight:
最佳参数: {'min_child_weight': 6}
最佳得分: 0.8844
cv-mean: 0.8844, cv-std: 0.0383
测试集分数： 0.8364


### gamma调优

In [107]:
parameters = {
 'gamma':[i/10.0 for i in range(0,5)]
}
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective= 'reg:logistic',
    learning_rate =0.1,
    n_estimators=150, max_depth=3, min_child_weight=6), param_grid=parameters, scoring=scoring, cv=kfold).fit(
        X_train, y_train)
print('确定gamma:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

确定min_weight:
最佳参数: {'gamma': 0.0}
最佳得分: 0.8844
cv-mean: 0.8844, cv-std: 0.0383
测试集分数： 0.8364


### subsample和colsample_bytree调优

In [110]:
parameters = {
 'subsample':[i/10.0 for i in range(6,11)],
 'colsample_bytree':[i/10.0 for i in range(6,11)]
}
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective= 'reg:logistic',
    learning_rate =0.1,
    n_estimators=150, max_depth=3, min_child_weight=6), param_grid=parameters, scoring=scoring, cv=kfold).fit(
        X_train, y_train)
print('确定subsample和colsample_bytree:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

确定subsample和colsample_bytree:
最佳参数: {'colsample_bytree': 0.6, 'subsample': 1.0}
最佳得分: 0.8883
cv-mean: 0.8883, cv-std: 0.0377
测试集分数： 0.8333


### 正则化调优

In [114]:
parameters = {
 'reg_alpha':[0, 1e-5, 1e-2, 0.1, 1, 100]
}
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective= 'reg:logistic',
    learning_rate =0.1,
    n_estimators=150, max_depth=3, min_child_weight=6, colsample_bytree=0.6, subsample=1.0), param_grid=parameters, scoring=scoring, cv=kfold).fit(
        X_train, y_train)
print('确定alpha:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

确定subsample和colsample_bytree:
最佳参数: {'reg_alpha': 0}
最佳得分: 0.8883
cv-mean: 0.8883, cv-std: 0.0377
测试集分数： 0.8333


In [115]:
parameters = {
 'reg_lambda':[0, 1e-5, 1e-2, 0.1, 1, 100]
}
grid_search = GridSearchCV(
    estimator=xgb.XGBClassifier(objective= 'reg:logistic',
    learning_rate =0.1,
    n_estimators=150, max_depth=3, min_child_weight=6, colsample_bytree=0.6, subsample=1.0), param_grid=parameters, scoring=scoring, cv=kfold).fit(
        X_train, y_train)
print('确定lambda:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

确定alpha:
最佳参数: {'reg_lambda': 1}
最佳得分: 0.8883
cv-mean: 0.8883, cv-std: 0.0377
测试集分数： 0.8333
