In [4]:
import pandas as pd
import pprint
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.metrics import accuracy_score

In [5]:
# Load the raw training and test sets.
df_train = pd.read_csv('../dataset/pfm_train.csv')
df_test = pd.read_csv('../dataset/pfm_test.csv')

# Columns removed from both frames before modelling. Reassigning instead of
# using inplace=True keeps the operation explicit.
dropped_columns = ['Over18', 'StandardHours', 'EmployeeNumber']
df_train = df_train.drop(dropped_columns, axis=1)
df_test = df_test.drop(dropped_columns, axis=1)

# Target variable
target_var = 'Attrition'

# String-typed (object dtype) columns
character_var = [
    col for col, dtype in df_train.dtypes.items() if dtype == 'object'
]
# Every remaining column except the target
numeric_var = [
    col for col in df_train.columns
    if col != target_var and col not in character_var
]

# One-hot encode the object columns of each frame.
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

predictor = [x for x in df_train.columns if x != target_var]

# get_dummies is applied to each frame independently, so a category that
# appears in only one of the two files would leave the frames with different
# columns. Align the test frame to the training predictors: missing dummy
# columns are added as all-zero, columns unknown to the training set are
# dropped, and the column order matches `predictor`. The target column is
# kept if the test file happens to contain it.
test_columns = list(predictor)
if target_var in df_test.columns:
    test_columns.append(target_var)
df_test = df_test.reindex(columns=test_columns, fill_value=0)

In [6]:
# Hold-out configuration: 30% of the labelled data is kept for validation.
validation_size = 0.3
seed = 7
scoring = 'accuracy'
X_train, X_test, y_train, y_test = train_test_split(
    df_train[predictor],
    df_train[target_var],
    test_size=validation_size,
    random_state=seed)

# 10-fold cross-validation over the training split.
# random_state is intentionally not passed: without shuffle=True it has no
# effect on the folds, and recent scikit-learn releases raise a ValueError
# when random_state is set while shuffle is False. The rows were already
# shuffled by train_test_split above, so the folds are unchanged.
kfold = KFold(n_splits=10)

In [7]:
def cross_val(model, X_train, y_train, X_test, y_test, kfold):
    """Print cross-validation statistics and hold-out accuracy for ``model``.

    The model is first scored with ``cross_val_score`` on the training data
    using the ``kfold`` splitter and the module-level ``scoring`` setting.
    It is then fitted on the full training data (mutating ``model``) and its
    accuracy on the hold-out data is printed.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Hold-out features.
        y_test: Hold-out labels.
        kfold: Cross-validation splitter passed as ``cv``.

    Returns:
        None. Results are only printed.
    """
    fold_scores = cross_val_score(
        model, X_train, y_train, scoring=scoring, cv=kfold)
    cv_summary = (fold_scores.mean(), fold_scores.std())
    print('cv-mean: %.4f, cv-std: %.4f' % cv_summary)

    # Refit on the whole training split before scoring the hold-out split.
    model.fit(X_train, y_train)
    holdout_accuracy = accuracy_score(y_test, model.predict(X_test))
    print('测试集分数： %.4f' % holdout_accuracy)

In [8]:
# The grid below searches both 'l1' and 'l2' penalties. The solver is pinned
# to 'liblinear', which supports both; the current scikit-learn default
# ('lbfgs') rejects penalty='l1', so every L1 candidate would fail to fit.
model = LogisticRegression(solver='liblinear')
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1],
    'tol': [1e-6, 1e-5, 1e-4],
    # NOTE(review): random_state is searched like a hyper-parameter here;
    # this selects a lucky seed rather than a better model - consider fixing it.
    'random_state': [1, 2, 3, 4, 5]
}

# Baseline: untuned model.
print('原始模型:')
cross_val(model, X_train, y_train, X_test, y_test, kfold)

# Exhaustive search over the grid with the shared CV splitter and metric.
grid_search = GridSearchCV(
    estimator=model, param_grid=parameters, scoring=scoring, cv=kfold).fit(
        X_train, y_train)
print('优化模型:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

原始模型:
cv-mean: 0.8805, cv-std: 0.0380
测试集分数： 0.8394
优化模型:
最佳参数: {'C': 1, 'penalty': 'l1', 'random_state': 2, 'tol': 0.0001}
最佳得分: 0.8831
cv-mean: 0.8831, cv-std: 0.0423
测试集分数： 0.8485


## 随机森林的调参过程
### 默认参数的模型性能

In [19]:
# Baseline random forest: library defaults, only the seed is fixed.
# `model` is reused as the estimator of the n_estimators search that follows.
model = RandomForestClassifier(random_state=seed)
print('原始模型:')
cross_val(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    kfold=kfold)

原始模型:
cv-mean: 0.8558, cv-std: 0.0421
测试集分数： 0.8303


### 确定最佳的n_estimators

In [20]:
# Search the number of trees from 10 to 100 in steps of 10.
parameters = {
    'n_estimators': range(10, 110, 10)
}

grid_search = GridSearchCV(
    estimator=model, param_grid=parameters, scoring=scoring, cv=kfold).fit(
        X_train, y_train)

# `grid_scores_` was removed from scikit-learn; `cv_results_` carries the
# same per-candidate information (mean / std of the test score and the
# parameter setting), so print it one candidate per line.
cv_results = grid_search.cv_results_
for mean_score, std_score, candidate in zip(cv_results['mean_test_score'],
                                            cv_results['std_test_score'],
                                            cv_results['params']):
    print('mean: %.5f, std: %.5f, params: %s' % (mean_score, std_score,
                                                 candidate))
print('优化模型:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

[mean: 0.85584, std: 0.04206, params: {'n_estimators': 10}, mean: 0.86234, std: 0.04690, params: {'n_estimators': 20}, mean: 0.86623, std: 0.04791, params: {'n_estimators': 30}, mean: 0.86494, std: 0.04506, params: {'n_estimators': 40}, mean: 0.86364, std: 0.04763, params: {'n_estimators': 50}, mean: 0.86753, std: 0.04782, params: {'n_estimators': 60}, mean: 0.86753, std: 0.04675, params: {'n_estimators': 70}, mean: 0.86623, std: 0.04861, params: {'n_estimators': 80}, mean: 0.86623, std: 0.04575, params: {'n_estimators': 90}, mean: 0.86494, std: 0.04761, params: {'n_estimators': 100}]
优化模型:
最佳参数: {'n_estimators': 60}
最佳得分: 0.8675
cv-mean: 0.8675, cv-std: 0.0478
测试集分数： 0.8455


### 确定最大深度max_depth和min_samples_split

In [None]:
# Jointly search max_depth and min_samples_split with n_estimators fixed at 60.
parameters = {
    'max_depth': range(3, 14, 2),
    'min_samples_split': range(50, 201, 20),
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=seed, n_estimators=60),
    param_grid=parameters,
    scoring=scoring,
    cv=kfold)
# fit() returns the search object itself, so fitting in a separate statement
# leaves `grid_search` bound to the same fitted instance.
grid_search.fit(X_train, y_train)

print('优化模型:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

In [16]:
# Search min_samples_leaf (1, 3, 5, 7, 9) with max_depth fixed at 7 and
# max_features fixed at 'sqrt'.
model = RandomForestClassifier(
    random_state=seed,
    max_features='sqrt',
    max_depth=7)
parameters = {'min_samples_leaf': range(1, 11, 2)}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring=scoring,
    cv=kfold)
# fit() returns the search object itself, so `grid_search` is the fitted search.
grid_search.fit(X_train, y_train)

print('优化模型:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

优化模型:
最佳参数: {'min_samples_leaf': 5}
最佳得分: 0.8701
cv-mean: 0.8701, cv-std: 0.0431
测试集分数： 0.8303


In [17]:
# Re-check the number of trees over a wider range with max_features,
# max_depth and min_samples_leaf held fixed.
model = RandomForestClassifier(
    random_state=seed,
    max_features='sqrt',
    max_depth=7,
    min_samples_leaf=5)
parameters = {'n_estimators': [10, 100, 500, 1000, 1500]}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring=scoring,
    cv=kfold)
# fit() returns the search object itself, so `grid_search` is the fitted search.
grid_search.fit(X_train, y_train)

print('优化模型:')
print('最佳参数: %s' % str(grid_search.best_params_))
print('最佳得分: %.4f' % grid_search.best_score_)
cross_val(grid_search.best_estimator_, X_train, y_train, X_test, y_test, kfold)

优化模型:
最佳参数: {'n_estimators': 10}
最佳得分: 0.8701
cv-mean: 0.8701, cv-std: 0.0431
测试集分数： 0.8303


In [18]:
# Gradient-boosted trees for the attrition target.
# - objective: 'binary:logistic' is the classification objective; the
#   original 'reg:logistic' is the regression variant of the same logistic
#   loss and is not meant for a classifier.
# - the former `verbose=True` keyword was dropped: it is not an estimator
#   parameter of XGBClassifier (verbosity during training is controlled via
#   fit(..., verbose=...)), so it was only forwarded as an unused booster
#   parameter.
gbm = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=5,
    gamma=0.2,
    subsample=1,
    colsample_bytree=1.0,
    colsample_bylevel=1.0,
    reg_alpha=1e-5,
    reg_lambda=1.0,
    seed=seed)

print('xgboost模型:')
cross_val(gbm, X_train, y_train, X_test, y_test, kfold)

xgboost模型:
cv-mean: 0.8883, cv-std: 0.0368
测试集分数： 0.8242
