In [116]:
'''导入库'''
import pprint
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [49]:
# Load the raw training and test sets.
df_train = pd.read_csv('../dataset/pfm_train.csv')
df_test = pd.read_csv('../dataset/pfm_test.csv')

# Earlier analysis showed that two variables have zero variance, so they can
# be dropped. EmployeeNumber is a unique identifier and is dropped as well.
for frame in (df_train, df_test):
    frame.drop(
        columns=['Over18', 'StandardHours', 'EmployeeNumber'], inplace=True)

# Target variable.
target_var = 'Attrition'

# String-typed columns: every training column whose dtype is `object`.
character_var = [
    name for name, dtype in df_train.dtypes.items() if dtype == 'object'
]
# Everything that is neither the target nor a string-typed column.
numeric_var = [
    name for name in df_train.columns
    if not (name == target_var or name in character_var)
]

In [50]:
# Fit the min-max scaler on the training numeric columns only, then apply the
# same fitted scaler to both the training and the test frame.
scaler = MinMaxScaler()
pattern = scaler.fit(df_train[numeric_var])  # keeps the return value of fit()
for frame in (df_train, df_test):
    frame[numeric_var] = scaler.transform(frame[numeric_var])

In [51]:
# One-hot encode the string-typed columns of both frames.
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

# Every training column except the target is a predictor.
predictor = [x for x in df_train.columns if x != target_var]

# The two frames are encoded independently, so a category that is absent from
# the test file would leave df_test without that dummy column (and a category
# seen only in the test file would add one the models were never trained on).
# Align df_test to the training predictors, in the same order, filling dummy
# columns that are missing with 0. A target column is kept if the test file
# happens to contain one.
test_columns = predictor + ([target_var] if target_var in df_test.columns else [])
df_test = df_test.reindex(columns=test_columns, fill_value=0)

In [158]:
# Hold-out configuration: 30% of the training file is kept aside for validation.
validation_size = 0.3
seed = 7
scoring = 'accuracy'
X_train, X_test, y_train, y_test = train_test_split(
    df_train[predictor],
    df_train[target_var],
    test_size=validation_size,
    random_state=seed)

# 10-fold stratified CV without shuffling. `random_state` is deliberately not
# passed: it only has an effect together with shuffle=True, and current
# scikit-learn releases raise a ValueError when it is given while shuffle is
# left off. Omitting it keeps the same deterministic, unshuffled folds.
kfold = StratifiedKFold(n_splits=10)

In [10]:
def find_param(estimator, param_grid, scoring, cv, X_train, X_test, y_train, y_test):
    """Print baseline CV, grid-search CV and hold-out scores for an estimator.

    Steps, each of which prints one line:
      1. cross-validate ``estimator`` as given on the training data and print
         the mean and standard deviation of the fold scores;
      2. run a grid search over ``param_grid`` on the training data and print
         the best cross-validated score;
      3. predict ``X_test`` with the best estimator found and print its
         accuracy against ``y_test`` (always accuracy, independent of
         ``scoring``);
      4. print the best parameter combination.

    Args:
        estimator: estimator to evaluate and tune.
        param_grid: parameter grid handed to ``GridSearchCV``.
        scoring: scoring name used for both cross-validation runs.
        cv: cross-validation splitter (or fold count) used for both runs.
        X_train, y_train: data used for cross-validation and the grid search.
        X_test, y_test: hold-out data used for the final accuracy.

    Returns:
        None; results are only printed.
    """
    baseline_scores = cross_val_score(
        estimator, X_train, y_train, cv=cv, scoring=scoring)
    print("原始模型交叉验证分数: %f (%f)"
          % (baseline_scores.mean(), baseline_scores.std()))

    searcher = GridSearchCV(
        estimator=estimator, param_grid=param_grid, scoring=scoring, cv=cv)
    searcher = searcher.fit(X_train, y_train)
    print("优化模型交叉验证分数: %f" % (searcher.best_score_))

    holdout_pred = searcher.best_estimator_.predict(X_test)
    holdout_accuracy = accuracy_score(y_test, holdout_pred)
    print('优化模型测试集分数： %.4f' % holdout_accuracy)
    print(searcher.best_params_)

In [11]:
# Logistic-regression baseline and grid search.
# The grid below includes penalty='l1', which the 'lbfgs' solver (the default
# in current scikit-learn) does not support. Pin the solver to 'liblinear',
# which handles both 'l1' and 'l2', so every grid point can actually be fitted.
model = LogisticRegression(solver='liblinear')
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1],
    'tol': [1e-6, 1e-5, 1e-4],
    'random_state': [1, 2, 3, 4, 5]
}

find_param(model, parameters, scoring, kfold, X_train, X_test, y_train, y_test)

原始模型交叉验证分数: 0.890958 (0.015047)
优化模型交叉验证分数: 0.890909
优化模型测试集分数： 0.8485
{'C': 1, 'penalty': 'l2', 'random_state': 1, 'tol': 1e-06}


In [12]:
# Starting GBDT configuration; this first tuning step searches only the
# number of boosting stages.
gbdt_model = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=8,
    min_samples_split=500,
    min_samples_leaf=50,
    max_features='sqrt',
    subsample=0.8,
    random_state=45,
)
parameters = dict(n_estimators=range(20, 81, 10))  # 20, 30, ..., 80
find_param(
    estimator=gbdt_model, param_grid=parameters, scoring=scoring, cv=kfold,
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

原始模型交叉验证分数: 0.878205 (0.027314)
优化模型交叉验证分数: 0.877922
优化模型测试集分数： 0.8485
{'n_estimators': 80}


In [13]:
# Same GBDT configuration with n_estimators fixed at 80; search tree depth
# together with the minimum number of samples required to split a node.
gbdt_model = GradientBoostingClassifier(
    n_estimators=80,
    learning_rate=0.1,
    max_depth=8,
    min_samples_split=500,
    min_samples_leaf=50,
    max_features='sqrt',
    subsample=0.8,
    random_state=45,
)
parameters = dict(
    max_depth=range(5, 16, 2),              # 5, 7, ..., 15
    min_samples_split=range(200, 1001, 200),  # 200, 400, ..., 1000
)
find_param(
    estimator=gbdt_model, param_grid=parameters, scoring=scoring, cv=kfold,
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

原始模型交叉验证分数: 0.878205 (0.026016)
优化模型交叉验证分数: 0.881818
优化模型测试集分数： 0.8485
{'max_depth': 5, 'min_samples_split': 200}


In [17]:
# Re-run the depth / split search on a lower range, reusing the current
# `gbdt_model` as the estimator.
parameters = dict(
    max_depth=range(1, 6),
    min_samples_split=[100, 200, 300],
)
find_param(
    estimator=gbdt_model, param_grid=parameters, scoring=scoring, cv=kfold,
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

原始模型交叉验证分数: 0.878205 (0.026016)
优化模型交叉验证分数: 0.884416
优化模型测试集分数： 0.8576
{'max_depth': 3, 'min_samples_split': 100}


In [18]:
# Narrow min_samples_split further (80, 90, 100) while keeping the same
# depth range; the estimator is the current `gbdt_model`.
parameters = dict(
    max_depth=range(1, 6),
    min_samples_split=range(80, 101, 10),
)
find_param(
    estimator=gbdt_model, param_grid=parameters, scoring=scoring, cv=kfold,
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

原始模型交叉验证分数: 0.878205 (0.026016)
优化模型交叉验证分数: 0.884416
优化模型测试集分数： 0.8576
{'max_depth': 3, 'min_samples_split': 80}


In [19]:
# Keep lowering min_samples_split (10, 20, ..., 80) with the same depth
# range; the estimator is the current `gbdt_model`.
parameters = dict(
    max_depth=range(1, 6),
    min_samples_split=range(10, 81, 10),
)
find_param(
    estimator=gbdt_model, param_grid=parameters, scoring=scoring, cv=kfold,
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

原始模型交叉验证分数: 0.878205 (0.026016)
优化模型交叉验证分数: 0.884416
优化模型测试集分数： 0.8576
{'max_depth': 3, 'min_samples_split': 10}


In [21]:
# Final pass over min_samples_split at single-step resolution (2 .. 10)
# with the same depth range; the estimator is the current `gbdt_model`.
parameters = dict(
    max_depth=range(1, 6),
    min_samples_split=range(2, 11),
)
find_param(
    estimator=gbdt_model, param_grid=parameters, scoring=scoring, cv=kfold,
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

原始模型交叉验证分数: 0.878205 (0.026016)
优化模型交叉验证分数: 0.884416
优化模型测试集分数： 0.8576
{'max_depth': 3, 'min_samples_split': 2}


In [22]:
# Rebuild the GBDT with max_depth fixed at 3, then search the minimum leaf
# size together with the minimum split size.
gbdt_model = GradientBoostingClassifier(
    n_estimators=80,
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=500,
    min_samples_leaf=50,
    max_features='sqrt',
    subsample=0.8,
    random_state=45,
)
parameters = dict(
    min_samples_leaf=range(10, 31, 10),  # 10, 20, 30
    min_samples_split=range(2, 11),
)
find_param(
    estimator=gbdt_model, param_grid=parameters, scoring=scoring, cv=kfold,
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

原始模型交叉验证分数: 0.878205 (0.026016)
优化模型交叉验证分数: 0.889610
优化模型测试集分数： 0.8364
{'min_samples_leaf': 10, 'min_samples_split': 2}


In [23]:
# Refine min_samples_leaf at single-step resolution (1 .. 10) against the
# same min_samples_split range; the estimator is the current `gbdt_model`.
parameters = dict(
    min_samples_leaf=range(1, 11),
    min_samples_split=range(2, 11),
)
find_param(
    estimator=gbdt_model, param_grid=parameters, scoring=scoring, cv=kfold,
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

原始模型交叉验证分数: 0.878205 (0.026016)
优化模型交叉验证分数: 0.890909
优化模型测试集分数： 0.8424
{'min_samples_leaf': 4, 'min_samples_split': 9}


In [29]:
# Rebuild the GBDT with min_samples_split=9 and min_samples_leaf=4, then
# search the number of features considered per split (3 .. 8).
gbdt_model = GradientBoostingClassifier(
    n_estimators=80,
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=9,
    min_samples_leaf=4,
    max_features='sqrt',
    subsample=0.8,
    random_state=45,
)
parameters = dict(max_features=range(3, 9))
find_param(
    estimator=gbdt_model, param_grid=parameters, scoring=scoring, cv=kfold,
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

原始模型交叉验证分数: 0.891093 (0.020985)
优化模型交叉验证分数: 0.890909
优化模型测试集分数： 0.8424
{'max_features': 6}


In [30]:
# Rebuild the GBDT with max_features fixed at 6, then search the row
# subsampling fraction.
gbdt_model = GradientBoostingClassifier(
    n_estimators=80,
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=9,
    min_samples_leaf=4,
    max_features=6,
    subsample=0.8,
    random_state=45,
)
parameters = dict(subsample=[0.6, 0.7, 0.75, 0.8, 0.85, 0.9])
find_param(
    estimator=gbdt_model, param_grid=parameters, scoring=scoring, cv=kfold,
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

原始模型交叉验证分数: 0.891093 (0.020985)
优化模型交叉验证分数: 0.890909
优化模型测试集分数： 0.8424
{'subsample': 0.8}


In [159]:
# Fit the tuned GBDT on the training split, then one-hot encode the leaf
# index each training row reaches in every tree.
gbdt_model.fit(X_train, y_train)
# apply() yields a 3-D array here; keep index 0 of its last axis so the
# encoder receives a 2-D (rows x trees) matrix of leaf indices.
train_leaves = gbdt_model.apply(X_train)[:, :, 0]
gbdt_enc = OneHotEncoder()
new_feature_train = gbdt_enc.fit(train_leaves).transform(train_leaves).toarray()

In [160]:
# Append the one-hot GBDT leaf features of the training rows to the original
# training predictors. The training-side array is `new_feature_train`; the
# name `new_feature` is only assigned later, for the test rows.
# Not idempotent: re-running this cell appends the leaf features again.
X_train = np.concatenate([X_train, new_feature_train], axis=1)

In [162]:
# Encode the GBDT leaf indices of the hold-out rows with the encoder fitted on
# the training rows. The result is stored as `new_feature_test`, the name the
# final logistic-regression cell passes to find_param.
new_feature_test = gbdt_enc.transform(gbdt_model.apply(X_test)[:, :, 0]).toarray()
new_feature = new_feature_test  # original name kept as an alias
# Not idempotent: re-running this cell appends the leaf features again.
X_test = np.concatenate([X_test, new_feature_test], axis=1)

In [164]:
# Tune the logistic regression (`model`) on the one-hot GBDT leaf features
# alone: the data passed in are `new_feature_train` / `new_feature_test`,
# not the concatenated X_train / X_test.
# NOTE(review): `gbdt_model` was fitted on the whole training split before
# this cross-validation, so the CV scores printed here may be optimistic.
parameters = dict(
    penalty=['l2'],
    C=np.linspace(0.1, 0.5, 5),
    tol=[1e-6, 2*1e-6, 3*1e-6],
)

find_param(
    estimator=model, param_grid=parameters, scoring=scoring, cv=kfold,
    X_train=new_feature_train, X_test=new_feature_test,
    y_train=y_train, y_test=y_test)

原始模型交叉验证分数: 0.928745 (0.038001)
优化模型交叉验证分数: 0.932468
优化模型测试集分数： 0.8303
{'C': 0.30000000000000004, 'penalty': 'l2', 'tol': 1e-06}
