In [39]:
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')

from matplotlib import pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA

In [40]:
# 目标变量
target_var = 'Attrition'

# 连续变量
num_col = ['Age', 'MonthlyIncome', 'TotalWorkingYears','PercentSalaryHike',
           'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
           'YearsWithCurrManager', 'NumCompaniesWorked']
# 有序变量
ord_col = ['DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'JobInvolvement',
           'JobLevel', 'JobSatisfaction', 'PerformanceRating', 'RelationshipSatisfaction',
           'StockOptionLevel', 'WorkLifeBalance', 'TrainingTimesLastYear']

# 分类变量
cat_col = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
           'MaritalStatus', 'OverTime']

In [41]:
train = pd.read_csv('pfm_train.csv', header=0)
test = pd.read_csv('pfm_test.csv', header=0)

In [42]:
y = train['Attrition']
X = train.drop([x for x in train.columns if x not in num_col+ord_col+cat_col], axis=1)
test = test.drop([x for x in test.columns if x not in num_col+ord_col+cat_col], axis=1)

In [43]:
label = LabelEncoder()
for col in cat_col:
    X[col] = label.fit_transform(X[col])
    test[col] = label.transform(test[col])

## 划分数据集

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=45)
log_reg = LogisticRegression(random_state=45)
kfold = StratifiedKFold(n_splits=10, random_state=45)

In [70]:
num_list = []
num_list.append(('X_train', X_train[num_col]))

## 连续变量对数变换

In [71]:
num_log = X_train[num_col].copy()
num_log[num_col] = num_log[num_col].apply(lambda x: np.log(1+x))
num_list.append(('num_log',num_log))
#X_test_log[num_col] = X_test[num_col].apply(lambda x: np.log(1+x))

## 对数变换后MinMax

In [72]:
num_log_minmax = num_log.copy()
scaler = MinMaxScaler()
_ = scaler.fit(num_log_minmax)
num_log_minmax = scaler.transform(num_log_minmax)
num_list.append(('num_log_minmax', num_log_minmax))

## 对数变换后Standard

In [73]:
num_log_standard = num_log.copy()
scaler = StandardScaler()
_ = scaler.fit(num_log_standard)
num_log_standard = scaler.transform(num_log_standard)
num_list.append(('num_log_standard', num_log_standard))

## 直接MinMax

In [74]:
num_minmax = X_train[num_col].copy()
scaler = MinMaxScaler()
_ = scaler.fit(num_minmax)
num_minmax = scaler.transform(num_minmax)
num_list.append(('num_minmax', num_minmax))

## 直接standard

In [75]:
num_standard = X_train[num_col].copy()
scaler = StandardScaler()
_ = scaler.fit(num_standard)
num_standard = scaler.transform(num_standard)
num_list.append(('num_standard', num_standard))

## 处理分类变量

In [76]:
cat_list = []
cat_list.append(('X_train', X_train[cat_col]))

### 分类变量dummy

In [45]:
cat_dummies = X_train[cat_col].copy()
cat_dummies = pd.get_dummies(cat_dummies, columns=cat_col)
#X_test = pd.get_dummies(X_test, columns=cat_col)
#cat_list.append(('cat_dummies', cat_dummies))

### dummy后standard

In [78]:
cat_dummies_standard = cat_dummies.copy()
scaler = StandardScaler()
_ = scaler.fit(cat_dummies_standard)
cat_dummies_standard = scaler.transform(cat_dummies_standard)
cat_list.append(('cat_dummies_standard', cat_dummies_standard))

## 有序变量处理

In [79]:
ord_list = []
ord_list.append(('X_train', X_train[ord_col]))

### 连续变量对数变换

In [80]:
ord_log = X_train[ord_col].copy()
ord_log = ord_log.apply(lambda x: np.log(1+x))
ord_list.append(('ord_log',ord_log))
#X_test_log[num_col] = X_test[num_col].apply(lambda x: np.log(1+x))

## 对数变换后MinMax

In [81]:
ord_log_minmax = ord_log.copy()
scaler = MinMaxScaler()
_ = scaler.fit(ord_log_minmax)
ord_log_minmax = scaler.transform(ord_log_minmax)
ord_list.append(('ord_log_minmax', ord_log_minmax))

## 对数变换后Standard

In [82]:
ord_log_standard = ord_log.copy()
scaler = StandardScaler()
_ = scaler.fit(ord_log_standard)
ord_log_standard = scaler.transform(ord_log_standard)
ord_list.append(('ord_log_standard', ord_log_standard))

## 直接MinMax

In [83]:
ord_minmax = X_train[ord_col].copy()
scaler = MinMaxScaler()
_ = scaler.fit(ord_minmax)
ord_minmax = scaler.transform(ord_minmax)
ord_list.append(('ord_minmax', ord_minmax))

## 直接standard

In [84]:
ord_standard = X_train[ord_col].copy()
scaler = StandardScaler()
_ = scaler.fit(ord_standard)
ord_standard = scaler.transform(ord_standard)
ord_list.append(('ord_standard', ord_standard))

### 分类变量dummy

In [85]:
ord_dummies = X_train[ord_col].copy()
ord_dummies = pd.get_dummies(ord_dummies, columns=ord_col)
#X_test = pd.get_dummies(X_test, columns=cat_col)
ord_list.append(('ord_dummies', ord_dummies))

### dummy后standard

In [86]:
ord_dummies_standard = ord_dummies.copy()
scaler = StandardScaler()
_ = scaler.fit(ord_dummies_standard)
ord_dummies_standard = scaler.transform(ord_dummies_standard)
ord_list.append(('ord_dummies_standard', ord_dummies_standard))

In [108]:
data = pd.concat([X_train[num_col], X_train[ord_col], cat_dummies, y_train], axis=1)
# y = y_train
predictor = [x for x in data.columns if x != target_var]

In [111]:
tlmeanacc = 0
tlmaxacc = 0
i = 1
for train, test in kfold.split(X, y):
    print('%d'%i)
    rf = RandomForestClassifier(random_state=45, max_features='sqrt')
    rf.fit(data.iloc[train, predictor], data.iloc[train, target_var])
    score = rf.score(data.iloc[test, predictor], data.iloc[test, target_var])
    tlmeanacc += score/10
    if tlmaxacc <= score:
        tlmaxacc = tlmeanacc
        feature_importances = pd.DataFrame({'coef':rf.feature_importances_, 'cols':predictor}).sort_values(by='coef', ascending=False)
if tlmaxacc <= tlmeanacc:
    tlmaxacc = tlmeanacc
print(tlmaxacc)
print(feature_importances)

1


TypeError: cannot perform reduce with flexible type

In [29]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=45, max_features='log2')
rf.fit(matrix, y_train)
feature_importances = pd.DataFrame({'coef':rf.feature_importances_, 'cols':matrix.columns})
feature_importances = feature_importances.sort_values(by='coef', ascending=False)
feature_importances['cumsum'] = feature_importances['coef'].cumsum()
useful_cols = feature_importances[feature_importances['cumsum']<=0.8]['cols'].tolist()
useful_cols

['MonthlyIncome',
 'Age',
 'YearsAtCompany',
 'TotalWorkingYears',
 'DistanceFromHome',
 'OverTime_Yes',
 'PercentSalaryHike',
 'YearsWithCurrManager',
 'JobSatisfaction',
 'EnvironmentSatisfaction',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'RelationshipSatisfaction',
 'MaritalStatus_Single',
 'Education',
 'NumCompaniesWorked',
 'TrainingTimesLastYear',
 'StockOptionLevel',
 'WorkLifeBalance',
 'JobInvolvement']

In [30]:
new = matrix[useful_cols]
rf.fit(new, y_train)
feature_importances = pd.DataFrame({'coef':rf.feature_importances_, 'cols':new.columns})
feature_importances = feature_importances.sort_values(by='coef', ascending=False)
feature_importances['cumsum'] = feature_importances['coef'].cumsum()
useful_cols = feature_importances[feature_importances['cumsum']<=0.8]['cols'].tolist()
useful_cols

['MonthlyIncome',
 'TotalWorkingYears',
 'Age',
 'DistanceFromHome',
 'OverTime_Yes',
 'PercentSalaryHike',
 'YearsAtCompany',
 'YearsWithCurrManager',
 'WorkLifeBalance',
 'NumCompaniesWorked',
 'EnvironmentSatisfaction',
 'YearsInCurrentRole']

In [31]:
new = matrix[useful_cols]
rf.fit(new, y_train)
feature_importances = pd.DataFrame({'coef':rf.feature_importances_, 'cols':new.columns})
feature_importances = feature_importances.sort_values(by='coef', ascending=False)
feature_importances['cumsum'] = feature_importances['coef'].cumsum()
useful_cols = feature_importances[feature_importances['cumsum']<=0.8]['cols'].tolist()
useful_cols

['MonthlyIncome',
 'Age',
 'TotalWorkingYears',
 'DistanceFromHome',
 'OverTime_Yes',
 'PercentSalaryHike',
 'YearsWithCurrManager',
 'EnvironmentSatisfaction']

In [87]:
result = pd.DataFrame(columns=['name', 'mean', 'std'])
poly = PolynomialFeatures(2)
for num_name, num_values in num_list:
    for ord_name, ord_values in ord_list:
        for cat_name, cat_values in cat_list:
            feature = '{}+{}+{}'.format(num_name, ord_name, cat_name)
            matrix = np.hstack((num_values, ord_values, cat_values))
            new = poly.fit_transform(matrix)
            score = cross_val_score(estimator=log_reg, cv=kfold, X=new, y=y_train)
            print('mean={:.3f}, std={:.3f}'.format(np.mean(score), np.std(score)))
            result.loc[len(result)] = [feature, np.mean(score), np.std(score)]

mean=0.845, std=0.015
mean=0.847, std=0.015
mean=0.851, std=0.017
mean=0.847, std=0.016
mean=0.845, std=0.015
mean=0.842, std=0.014
mean=0.844, std=0.019
mean=0.849, std=0.018
mean=0.845, std=0.011
mean=0.840, std=0.015
mean=0.848, std=0.016
mean=0.849, std=0.018
mean=0.844, std=0.020
mean=0.853, std=0.016
mean=0.842, std=0.014
mean=0.845, std=0.017
mean=0.847, std=0.019
mean=0.853, std=0.012
mean=0.842, std=0.018
mean=0.839, std=0.020
mean=0.848, std=0.021
mean=0.838, std=0.024
mean=0.840, std=0.031
mean=0.846, std=0.030
mean=0.782, std=0.038
mean=0.814, std=0.030
mean=0.824, std=0.033
mean=0.823, std=0.039
mean=0.835, std=0.044
mean=0.819, std=0.029
mean=0.823, std=0.043
mean=0.834, std=0.039
mean=0.818, std=0.022
mean=0.798, std=0.028
mean=0.822, std=0.046
mean=0.828, std=0.034
mean=0.828, std=0.044
mean=0.830, std=0.041
mean=0.821, std=0.025
mean=0.785, std=0.030
mean=0.812, std=0.030
mean=0.822, std=0.025
mean=0.823, std=0.041
mean=0.846, std=0.020
mean=0.842, std=0.020
mean=0.844

In [None]:
matrix = np.hstack((num_log_minmax,ord_dummies,cat_dummies))
new = poly.fit_transform(matrix)
for i in range(1, new.shape[1]+1):
    rfe = RFE(estimator=log_reg, n_features_to_select=i)
    cv_score = cross_val_score(estimator=rfe, cv=kfold, X=new, y=y_train)
#     _ = rfe.fit(opt_train, y_train)
#     score = rfe.score(opt_test, y_test)
    print('%d: CV result:%.3f'%(i, np.mean(cv_score)))

In [88]:
result.sort_values(by='mean', ascending=False)

Unnamed: 0,name,mean,std
67,num_log_minmax+ord_dummies+cat_dummies,0.865420,0.022198
95,num_log_standard+ord_dummies_standard+cat_dumm...,0.861776,0.019151
143,num_standard+ord_dummies_standard+cat_dummies_...,0.860615,0.017167
115,num_minmax+ord_dummies+cat_dummies,0.855737,0.021308
61,num_log_minmax+ord_minmax+cat_dummies,0.854604,0.029944
46,num_log+ord_dummies_standard+cat_dummies,0.854532,0.025510
13,X_train+ord_minmax+cat_dummies,0.853443,0.016438
17,X_train+ord_standard+cat_dummies_standard,0.853428,0.012484
94,num_log_standard+ord_dummies_standard+cat_dummies,0.853370,0.024871
142,num_standard+ord_dummies_standard+cat_dummies,0.853356,0.027100


In [28]:
opt_train = X_train.copy()
opt_test = X_test.copy()

opt_train[num_col+ord_col] = opt_train[num_col+ord_col].apply(lambda x: np.log(1+x))
opt_test[num_col+ord_col] = opt_test[num_col+ord_col].apply(lambda x: np.log(1+x))

scaler = StandardScaler()
_ = scaler.fit(opt_train[num_col+ord_col])
opt_train[num_col+ord_col] = scaler.transform(opt_train[num_col+ord_col])
opt_test[num_col+ord_col] = scaler.transform(opt_test[num_col+ord_col])

opt_train = pd.get_dummies(opt_train, columns=cat_col)
opt_test = pd.get_dummies(opt_test, columns=cat_col)

In [29]:
for i in range(1, opt_train.shape[1]+1):
    rfe = RFE(estimator=log_reg, n_features_to_select=i)
    cv_score = cross_val_score(estimator=rfe, cv=kfold, X=opt_train, y=y_train)
    _ = rfe.fit(opt_train, y_train)
    score = rfe.score(opt_test, y_test)
    print('%d: CV result:%.3f, test result:%.3f'%(i, np.mean(cv_score), score))
#   pipe = make_pipeline(rfe, log_reg)
#     score = cross_val_score(estimator=pipe, cv=kfold, X=opt_train, y=y_train)
#     print('mean={:.3f}, std={:.3f}'.format(np.mean(score), np.std(score)))

1: CV result:0.839, test result:0.836
2: CV result:0.839, test result:0.836
3: CV result:0.838, test result:0.836
4: CV result:0.838, test result:0.840
5: CV result:0.838, test result:0.840
6: CV result:0.834, test result:0.855
7: CV result:0.830, test result:0.858
8: CV result:0.840, test result:0.858
9: CV result:0.845, test result:0.858
10: CV result:0.845, test result:0.862
11: CV result:0.850, test result:0.865
12: CV result:0.856, test result:0.862
13: CV result:0.853, test result:0.869
14: CV result:0.849, test result:0.869
15: CV result:0.851, test result:0.873
16: CV result:0.857, test result:0.880
17: CV result:0.861, test result:0.876
18: CV result:0.858, test result:0.884
19: CV result:0.859, test result:0.880
20: CV result:0.869, test result:0.876
21: CV result:0.864, test result:0.869
22: CV result:0.869, test result:0.869
23: CV result:0.874, test result:0.876
24: CV result:0.874, test result:0.873
25: CV result:0.875, test result:0.869
26: CV result:0.880, test result:0

In [30]:
score = cross_val_score(estimator=log_reg, cv=kfold, X=X, y=y)
print('mean: {:.3f}, std={:.3f}'.format(np.mean(score), np.std(score)))

mean: 0.870, std=0.022


In [61]:
X[num_col] = X[num_col].apply(lambda x: np.log(1+x))
test[num_col] = test[num_col].apply(lambda x: np.log(1+x))

scaler = StandardScaler()
_ = scaler.fit(X[num_col+ord_col])
X[num_col+ord_col] = scaler.transform(X[num_col+ord_col])
test[num_col+ord_col] = scaler.transform(test[num_col+ord_col])

X = pd.get_dummies(X, columns=cat_col)
test = pd.get_dummies(test, columns=cat_col)

In [62]:
score = cross_val_score(estimator=log_reg, cv=kfold, X=X, y=y)
print('mean: {:.3f}, std={:.3f}'.format(np.mean(score), np.std(score)))

mean: 0.888, std=0.023


In [63]:
params = {'penalty': ['l1', 'l2'], 'C':[.5, 1, 1.5]}
grid = GridSearchCV(estimator=log_reg, param_grid=params, cv=kfold)
_ = grid.fit(opt_train, y_train)
print(grid.score(opt_test, y_test))
print(grid.best_params_)

0.88
{'C': 0.5, 'penalty': 'l2'}


In [64]:
score = cross_val_score(estimator=LogisticRegression(C=0.5, random_state=45), cv=kfold, X=X, y=y)
print('mean: {:.3f}, std={:.3f}'.format(np.mean(score), np.std(score)))

mean: 0.891, std=0.022


In [65]:
clf = LogisticRegression(C=0.5, random_state=45)
clf.fit(X, y)
pred = clf.predict(test).astype(int)
result = pd.DataFrame({'result':pred})
result.to_csv('result.csv', index=False)