In [236]:
import pandas as pd
import numpy as np
import pprint
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.metrics import accuracy_score

from sklearn.externals import joblib
from sklearn.preprocessing import LabelEncoder

import sys
sys.path.append('../code')

import discretization

In [237]:
df_train = pd.read_csv('../dataset/pfm_train.csv')
df_test = pd.read_csv('../dataset/pfm_test.csv')

df_train['source'] = 'train'
df_test['source'] = 'test'

df = pd.concat([df_train, df_test], axis=0)

df.drop(['Over18', 'StandardHours', 'EmployeeNumber'], axis=1, inplace=True)

In [238]:
nan_number = [x for x in df.dtypes.index if df.dtypes[x] == 'object' and x != 'source']
for col in nan_number:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

In [241]:
target_var = 'Attrition'
predictor = [x for x in df.columns if x not in ['source'] ]

In [244]:
df_train = df[df['source']=='train'].copy()
df_test = df[df['source']=='test'].copy()

In [245]:
df_Ent = discretization.discretization(df_train[predictor], 'Attrition')

基本信息增益是: 3.281446
Age的最佳划分点是 33.500000, 最大信息增益是 2.671735。
BusinessTravel的最佳划分点是 0.500000, 最大信息增益是 2.646939。
Department的最佳划分点是 1.500000, 最大信息增益是 2.646455。
DistanceFromHome的最佳划分点是 10.500000, 最大信息增益是 2.649758。
Education的最佳划分点是 4.500000, 最大信息增益是 2.645389。
EducationField的最佳划分点是 4.500000, 最大信息增益是 2.645434。
EnvironmentSatisfaction的最佳划分点是 1.500000, 最大信息增益是 2.651310。
Gender的最佳划分点是 0.500000, 最大信息增益是 2.643008。
JobInvolvement的最佳划分点是 1.500000, 最大信息增益是 2.655163。
JobLevel的最佳划分点是 1.500000, 最大信息增益是 2.672332。
JobRole的最佳划分点是 7.500000, 最大信息增益是 2.656099。
JobSatisfaction的最佳划分点是 1.500000, 最大信息增益是 2.650642。
MaritalStatus的最佳划分点是 1.500000, 最大信息增益是 2.666477。
MonthlyIncome的最佳划分点是 2487.500000, 最大信息增益是 2.672683。
NumCompaniesWorked的最佳划分点是 4.500000, 最大信息增益是 2.645148。
OverTime的最佳划分点是 0.500000, 最大信息增益是 2.689530。
PercentSalaryHike的最佳划分点是 21.500000, 最大信息增益是 2.645870。
PerformanceRating的最佳划分点是 3.500000, 最大信息增益是 2.644305。
RelationshipSatisfaction的最佳划分点是 1.500000, 最大信息增益是 2.645120。
StockOptionLevel的最佳划分点是 0.500000, 最大信息增益是 2

In [248]:
from sklearn.preprocessing import Binarizer
for col in df_Ent.keys():
    bin_encoder = Binarizer(df_Ent[col]['best_point'])
    df[col] = bin_encoder.fit_transform(df[col].reshape(-1,1))

In [249]:
df_train = df[df['source']=='train'].copy()
df_test = df[df['source']=='test'].copy()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,source
0,1,0.0,1,0,0,0,0,0,1,1,...,1,1,1,1,1,1,1,0,1,train
1,1,0.0,1,0,0,0,0,1,0,1,...,0,1,1,1,0,1,1,0,1,train
2,1,1.0,1,0,0,0,0,0,1,0,...,1,0,1,1,1,1,1,0,1,train
3,1,0.0,1,0,0,0,0,1,0,1,...,1,1,1,1,1,1,1,1,1,train
4,0,1.0,1,0,0,0,0,0,1,1,...,0,1,0,1,1,0,0,0,0,train


In [250]:
predictor = [x for x in df.columns if x not in ['source', 'Attrition'] ]

In [251]:
kfold = StratifiedKFold(n_splits=10, random_state=7)

X_train = df_train.loc[train_inx,predictor]
y_train = df_train.loc[train_inx, target_var]
X_test = df_train.loc[test_inx,predictor]
y_test = df_train.loc[test_inx, target_var]

In [199]:
validation_size = 0.3
seed = 7
scoring = 'accuracy'
X_train, X_test, y_train, y_test = train_test_split(
    df_train[predictor],
    df_train[target_var],
    test_size=validation_size,
    random_state=seed)
kfold = StratifiedKFold(n_splits=10, random_state=seed)

In [200]:
def cross_val(model, X_train, y_train, X_test, y_test, kfold):
    cv_results = cross_val_score(
        model, X_train, y_train, cv=kfold, scoring=scoring)
    print('cv-mean: %.4f, cv-std: %.4f' % (cv_results.mean(),
                                           cv_results.std()))
    model.fit(X_train, y_train)
    train_result = model.predict(X_train)
    pred_result = model.predict(X_test)
    train_score = accuracy_score(y_train, train_result)
    pred_score = accuracy_score(y_test, pred_result)
    print('训练集分数:  %.4f' % train_score)
    print('测试集分数： %.4f' % pred_score)
    return

In [201]:
def tunning_params(model, params, scoring=scoring, kfold=kfold, X_train=X_train,
                   y_train=y_train, X_test=X_test, y_test=y_test):
    grid_search = GridSearchCV(
    estimator=model, param_grid=params, scoring=scoring, cv=kfold).fit(
        X_train, y_train)
    print('优化后模型:')
    print('最佳参数: %s' % str(grid_search.best_params_))
    print('最佳得分: %.4f' % grid_search.best_score_)
    model = model.set_params(**(grid_search.best_params_))
    cross_val(model, X_train, y_train, X_test, y_test, kfold)
    return model

In [262]:
base_score = 0.8
for i in range(0, 5000):
    X_train, X_test, y_train, y_test = train_test_split(df_train[predictor]
                                                        ,df_train[target_var], test_size=0.2)
    lm = LogisticRegression()
    lm.fit(X_train, y_train)
    lmp = lm.predict(X_test)
    xxx = lm.score(X_test, y_test)
    if xxx > 0.94:
        break
xxx

0.86818181818181817

In [261]:
lrm = LogisticRegression()
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 1.2, 1.5, 2],
    'tol': [1e-6, 1e-5, 1e-4]
}

print('原始模型:')
cross_val(lrm, X_train, y_train, X_test, y_test, kfold)

lrm = tunning_params(lrm, parameters)

原始模型:
cv-mean: 0.8768, cv-std: 0.0366
训练集分数:  0.8870
测试集分数： 0.8273
优化后模型:
最佳参数: {'C': 1, 'penalty': 'l2', 'tol': 1e-05}
最佳得分: 0.8779
cv-mean: 0.8781, cv-std: 0.0361
训练集分数:  0.8870
测试集分数： 0.8273


## 随机森林的调参过程
### 默认参数的模型性能

In [114]:
rfm = RandomForestClassifier(random_state=seed)
print('原始模型:')
cross_val(rfm, X_train, y_train, X_test, y_test, kfold)

原始模型:
cv-mean: 0.8387, cv-std: 0.0210
训练集分数:  0.9807
测试集分数： 0.9000


### 确定最佳的n_estimators

In [115]:
parameters = {
    'n_estimators': range(10, 110, 10)
}

rfm = tunning_params(rfm, parameters)

优化后模型:
最佳参数: {'n_estimators': 30}
最佳得分: 0.8675
cv-mean: 0.8677, cv-std: 0.0167
训练集分数:  0.9987
测试集分数： 0.8394


### 确定最大深度max_depth和min_samples_split

In [116]:
parameters = {'max_depth':range(3,14,2)
             }

rfm = tunning_params(rfm, parameters)

优化后模型:
最佳参数: {'max_depth': 13}
最佳得分: 0.8636
cv-mean: 0.8638, cv-std: 0.0247
训练集分数:  0.9974
测试集分数： 0.8212


### 确定min_samples_split和min_samples_leaf

In [117]:
parameters = {'min_samples_split':range(2,20), 'min_samples_leaf':range(1,10)}

rfm = tunning_params(rfm, parameters)

优化后模型:
最佳参数: {'min_samples_leaf': 1, 'min_samples_split': 8}
最佳得分: 0.8701
cv-mean: 0.8703, cv-std: 0.0189
训练集分数:  0.9325
测试集分数： 0.8303


### 确定max_features

In [118]:
parameters = {
    'max_features': range(3, 10)
}

rfm = tunning_params(rfm, parameters)

优化后模型:
最佳参数: {'max_features': 9}
最佳得分: 0.8701
cv-mean: 0.8703, cv-std: 0.0189
训练集分数:  0.9325
测试集分数： 0.8303


通过对随机森林的参数调节，其结果甚至没有最优的logistic回归性能好。

## xgboost调参 

### 设定基础参数

In [252]:
gbm = xgb.XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=seed)

print('xgboost基础模型:')
cross_val(gbm, X_train, y_train, X_test, y_test, kfold)

xgboost基础模型:
cv-mean: 0.8821, cv-std: 0.0317
训练集分数:  1.0000
测试集分数： 0.8333


### 确定学习率和对应的n_estimators

In [253]:
parameters = {'n_estimators': [100,200,500,1000,1500]}
gbm = tunning_params(gbm, parameters)

优化后模型:
最佳参数: {'n_estimators': 1000}
最佳得分: 0.8818
cv-mean: 0.8821, cv-std: 0.0317
训练集分数:  1.0000
测试集分数： 0.8333


### max_depth 和 min_weight 参数调优

In [254]:
parameters = {
 'max_depth':range(1,15,2),
 'min_child_weight':range(1,6,2)
}

gbm = tunning_params(gbm, parameters)

优化后模型:
最佳参数: {'max_depth': 3, 'min_child_weight': 1}
最佳得分: 0.8831
cv-mean: 0.8834, cv-std: 0.0335
训练集分数:  1.0000
测试集分数： 0.8303


### gamma调优

In [255]:
parameters = {
 'gamma':[i/10.0 for i in range(0,5)]
}

gbm = tunning_params(gbm, parameters)

优化后模型:
最佳参数: {'gamma': 0.3}
最佳得分: 0.8844
cv-mean: 0.8846, cv-std: 0.0372
训练集分数:  1.0000
测试集分数： 0.8394


### subsample和colsample_bytree调优

In [256]:
parameters = {
 'subsample':[i/10.0 for i in range(6,11)],
 'colsample_bytree':[i/10.0 for i in range(6,11)]
}

gbm = tunning_params(gbm, parameters)

优化后模型:
最佳参数: {'colsample_bytree': 0.8, 'subsample': 0.8}
最佳得分: 0.8844
cv-mean: 0.8846, cv-std: 0.0372
训练集分数:  1.0000
测试集分数： 0.8394


### 正则化调优

In [257]:
parameters = {
 'reg_alpha':[0, 1e-5, 1e-2, 0.1, 1, 100], 'reg_lambda':[0, 1e-5, 1e-2, 0.1, 1, 100]
}

gbm = tunning_params(gbm, parameters)

优化后模型:
最佳参数: {'reg_alpha': 0.1, 'reg_lambda': 100}
最佳得分: 0.8896
cv-mean: 0.8898, cv-std: 0.0272
训练集分数:  0.9299
测试集分数： 0.8485


In [259]:
parameters = {
 'reg_lambda':[70, 60, 80]
}

gbm = tunning_params(gbm, parameters)

优化后模型:
最佳参数: {'reg_lambda': 70}
最佳得分: 0.8909
cv-mean: 0.8911, cv-std: 0.0269
训练集分数:  0.9429
测试集分数： 0.8545


In [260]:
gbm.fit(df_train[predictor], df_train[target_var])
predictions = gbm.predict(df_test[predictor]).astype(np.int32)
submission = pd.DataFrame({'result': predictions})
submission.to_csv("submission.csv", index=False)

In [234]:
train_inx = X_train.index.tolist()
test_inx = X_test.index.tolist()

## GradientBoostingClassifier调优

In [126]:
gbcm = GradientBoostingClassifier()
cross_val(gbcm, X_train, y_train, X_test, y_test, kfold)

cv-mean: 0.8500, cv-std: 0.0202
训练集分数:  0.9489
测试集分数： 0.9273


In [127]:
parameters= [{'n_estimators':range(20,81,10),
              'max_depth':range(3,14,2),
              'learning_rate':[0.1, 0.5, 1.0],
              'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]
            }]

gbcm = tunning_params(gbcm, parameters)

优化后模型:
最佳参数: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 70, 'subsample': 0.6}
最佳得分: 0.8831
cv-mean: 0.8637, cv-std: 0.0281
训练集分数:  0.9442
测试集分数： 0.8242


## AdaBoostClassifier调优

In [128]:
adbm = AdaBoostClassifier(random_state=seed)
cross_val(adbm, X_train, y_train, X_test, y_test, kfold)

cv-mean: 0.8557, cv-std: 0.0304
训练集分数:  0.8750
测试集分数： 0.9273


In [129]:
parameters= [{'n_estimators':range(20,81,10),
              'learning_rate':[0.1, 0.5, 1.0],
            }]
adbm = tunning_params(adbm, parameters)

优化后模型:
最佳参数: {'learning_rate': 0.5, 'n_estimators': 80}
最佳得分: 0.8766
cv-mean: 0.8768, cv-std: 0.0311
训练集分数:  0.8974
测试集分数： 0.8273


## ExtraTreesClassifier调优

In [130]:
etcm = ExtraTreesClassifier(random_state=seed)
cross_val(etcm, X_train, y_train, X_test, y_test, kfold)

cv-mean: 0.8342, cv-std: 0.0239
训练集分数:  1.0000
测试集分数： 0.9227


In [131]:
parameters = {
    'n_estimators': range(10, 50, 20),'max_depth':range(3,11,2),
    'min_samples_split':range(2,10), 'min_samples_leaf':range(3,6),'max_features': range(3, 6)
}
etcm = tunning_params(etcm, parameters)

优化后模型:
最佳参数: {'max_depth': 7, 'max_features': 5, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 10}
最佳得分: 0.8610
cv-mean: 0.8612, cv-std: 0.0142
训练集分数:  0.8714
测试集分数： 0.8152


In [132]:
models = []
models.append(('lrm', lrm))
models.append(('rfm', rfm))
models.append(('gbm', gbm))
models.append(('gbcm', gbcm))
models.append(('etcm', etcm))
models.append(('ad', adbm))
ensemble = VotingClassifier(models)
cross_val(ensemble, X_train, y_train, X_test, y_test, kfold)

cv-mean: 0.8580, cv-std: 0.0152
训练集分数:  0.8955
测试集分数： 0.9273


In [140]:
ensemble.fit(df_train[predictor], df_train[target_var])
predictions = ensemble.predict(df_test[predictor])
submission = pd.DataFrame({'result': predictions})
submission.to_csv("submission.csv", index=False)

KeyError: "['Age_0' 'Age_1' 'Age_2' 'Age_3' 'Age_4' 'DistanceFromHome_0'\n 'DistanceFromHome_1' 'DistanceFromHome_2' 'DistanceFromHome_3'\n 'DistanceFromHome_4' 'DistanceFromHome_5' 'MonthlyIncome_0'\n 'MonthlyIncome_1' 'MonthlyIncome_2' 'MonthlyIncome_3' 'MonthlyIncome_4'\n 'MonthlyIncome_5' 'NumCompaniesWorked_0' 'NumCompaniesWorked_1'\n 'NumCompaniesWorked_2' 'PercentSalaryHike_0' 'PercentSalaryHike_1'\n 'PercentSalaryHike_2' 'PercentSalaryHike_3' 'PercentSalaryHike_4'\n 'PercentSalaryHike_5' 'PercentSalaryHike_6' 'PercentSalaryHike_7'\n 'TotalWorkingYears_0' 'TotalWorkingYears_1' 'TotalWorkingYears_2'\n 'TotalWorkingYears_3' 'TotalWorkingYears_4' 'TotalWorkingYears_5'\n 'TotalWorkingYears_6' 'YearsAtCompany_0' 'YearsAtCompany_1'\n 'YearsAtCompany_2' 'YearsAtCompany_3' 'YearsAtCompany_4'\n 'YearsAtCompany_5' 'YearsAtCompany_6' 'YearsInCurrentRole_0'\n 'YearsInCurrentRole_1' 'YearsInCurrentRole_2' 'YearsInCurrentRole_3'\n 'YearsInCurrentRole_4' 'YearsSinceLastPromotion_0'\n 'YearsSinceLastPromotion_1' 'YearsSinceLastPromotion_2'\n 'YearsSinceLastPromotion_3' 'YearsWithCurrManager_0'\n 'YearsWithCurrManager_1' 'YearsWithCurrManager_2' 'YearsWithCurrManager_3'\n 'YearsWithCurrManager_4'] not in index"

In [133]:
X_train.columns

Index(['Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel',
       'JobSatisfaction', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently',
       'BusinessTravel_Travel_Rarely', 'Department_Human Resources',
       'Department_Research & Development', 'Department_Sales',
       'EducationField_Human Resources', 'EducationField_Life Sciences',
       'EducationField_Marketing', 'EducationField_Medical',
       'EducationField_Other', 'EducationField_Technical Degree',
       'Gender_Female', 'Gender_Male', 'JobRole_Healthcare Representative',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRole_Research Scientist',
       'JobRole_Sales Executive', 'JobRole_Sales Representative',
       'MaritalStatus_Divorced', 'Mar

In [None]:
joblib.dump(lrm, 'lr.pkl')
joblib.dump(rfm, 'rf.pkl')
joblib.dump(etcm, 'etcm.pkl')
joblib.dump(adbm, 'adbm.pkl')
joblib.dump(gbcm, 'gbcm.pkl')
joblib.dump(adbm, 'adbm.pkl')