# 员工离职预测之特征工程

In [31]:
# 目标变量
target_var = 'Attrition'

# 连续变量
num_col = ['Age', 'MonthlyIncome', 'TotalWorkingYears','PercentSalaryHike',
           'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
           'YearsWithCurrManager', 'NumCompaniesWorked']
# 有序变量
ord_col = ['DistanceFromHome', 'StockOptionLevel', 'JobInvolvement',
           'PerformanceRating', 'RelationshipSatisfaction',
           'WorkLifeBalance']

# 分类变量
cat_col = ['BusinessTravel', 'Department', 'JobSatisfaction',
           'OverTime']

In [25]:
#pandas, numpy, matplotlib, seaborn
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')

from matplotlib import pyplot as plt
plt.style.use('ggplot')
import numpy as np
%matplotlib inline

from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import learning_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomTreesEmbedding

from sklearn.pipeline import make_pipeline
from scipy import stats
import sys
sys.path.append('../code')
import chimerge

In [35]:
df_train = pd.read_csv('../dataset/pfm_modified.csv')
df_train['sum'] = 1

## 特征选择
<img src='FS1.png' />

### 利用卡方检验过滤非连续变量

In [32]:
chi2_result = pd.DataFrame({'pvalue':np.zeros(len(ord_col+cat_col))}, index=ord_col+cat_col)
for col in ord_col+cat_col:
    freq = df_train.pivot_table(index=col, columns='Attrition', values='sum', aggfunc=sum)
    _, p, _, _ = stats.chi2_contingency(freq.values)
    chi2_result.loc[col, 'pvalue'] = p
chi2_result.sort_values(by='pvalue')

Unnamed: 0,pvalue
OverTime,1.839919e-18
StockOptionLevel,2.299846e-12
JobInvolvement,1.050064e-05
JobSatisfaction,0.0004375365
BusinessTravel,0.004086993
Department,0.02527176
WorkLifeBalance,0.03466896
DistanceFromHome,0.1155587
PerformanceRating,0.1506839
RelationshipSatisfaction,0.2580105


零假设H0: 特征与目标变量独立。显著性水平α=0.05，如果p < alpha，拒绝H0，说明特征变量与目标变量不独立。在chi2_result中过滤p值大于0.05的特征变量。

In [36]:
used_cat_col = chi2_result[chi2_result['pvalue']<=0.05].index.tolist()
print('提取变量:'+str(used_cat_col))

unused_col = [x for x in ord_col+cat_col if x not in used_cat_col]
print('与目标独立的变量有：'+str(unused_col))

提取变量:['StockOptionLevel', 'JobInvolvement', 'WorkLifeBalance', 'BusinessTravel', 'Department', 'JobSatisfaction', 'OverTime']
与目标独立的变量有：['DistanceFromHome', 'PerformanceRating', 'RelationshipSatisfaction']


通过对分类变量和有序变量进行卡方检验，删除了**'DistanceFromHome', 'Education', 'PerformanceRating', 'RelationshipSatisfaction', 'TrainingTimesLastYear', 'Gender'**6个变量。<br>
从前面的数据探索也可以看出，从这几个变量与目标变量的关系来看，确实没有明显的特征。但是在OverTime上，加班的离职率明显是高于不加班的离职率。

### 用sklearn包的feature_selection过滤变量

In [37]:
# 卡方检验
_, pvalue = chi2(df_train[num_col+ord_col+cat_col], df_train[target_var])
chi2_test = pd.Series(pvalue, index=num_col+ord_col+cat_col)

# ANVOA（f_classif)
_, pvalue = f_classif(df_train[num_col+ord_col+cat_col], df_train[target_var])
f_test = pd.Series(pvalue, index=num_col+ord_col+cat_col)

# 互信息
mi = mutual_info_classif(df_train[num_col+ord_col+cat_col], df_train[target_var])
mi_test = pd.Series({'mi':mi}, index=num_col+ord_col+cat_col)

#### 数据处理和预测
数据处理部分，为非连续变量创建虚拟变量

In [38]:
def data_processing(result_df=chi2_test, df=df_train, model=LogisticRegression()):
    selected_col = result_df[result_df<=0.05].index.tolist()

    new_num_col = [x for x in selected_col if x in num_col]
    new_cat_col = [x for x in selected_col if x in cat_col]
    new_ord_col = [x for x in selected_col if x in ord_col]
    
    sample_data = df[new_ord_col+new_cat_col+new_num_col].copy()

    # 为分类变量和有序变量创建虚拟变量
    for col in new_cat_col + new_ord_col:
        dummy = pd.get_dummies(sample_data[col], prefix=col)
        #onehot_col.extend(dummy.columns.tolist())
        sample_data = pd.concat([sample_data, dummy], axis=1)
        sample_data.drop([col], axis=1, inplace=True)
        
    #std_scaler = StandardScaler()
    #for col in new_num_col:
    # 分割训练集，测试集，70%作为训练集，30%作为测试集
    #test_size = 0.3
    seed = 45
    scoring = 'accuracy'
    #X_train, X_test, y_train, y_test = train_test_split(
    #    sample_data,
    #    df_train[target_var],
    #    test_size=test_size,
    #    random_state=seed)
    kfold = StratifiedKFold(n_splits=10, random_state=seed)
    
    #lr_model = LogisticRegression(random_state=seed)
    model = model.set_params(**({'random_state':seed}))
    cv = cross_val_score(cv=kfold, estimator=model, scoring=scoring, X=sample_data, y=df_train[target_var])
    #lr_model.fit(X_train, y_train)
    #y_pred = lr_model.predict(X_test)
    #score = lr_model.score(X_test, y_test)
    #print('准确度是: {:0.4f}'.format(score))
    print(cv.mean())

    return new_ord_col+new_cat_col+new_num_col

In [39]:
models = []
for x in [chi2_test, f_test]:
    for z in [LogisticRegression(), RandomForestClassifier(), GradientBoostingClassifier()]:
        models.append((x, z))

In [40]:
for x, z in models:
    data_processing(x, df_train, z)

0.856342447535
0.839969869786
0.85086288123
0.865433957728
0.852730653923
0.861813673557


In [41]:
selected_col = f_test[f_test<=0.05].index.tolist()

new_num_col = [x for x in selected_col if x in num_col]
new_cat_col = [x for x in selected_col if x in cat_col]
new_ord_col = [x for x in selected_col if x in ord_col]

In [42]:
new_num_col

['Age',
 'MonthlyIncome',
 'TotalWorkingYears',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

In [18]:
for col in new_ord_col:
    print(df_train.groupby(col)['sum'].sum()/1100)

DistanceFromHome
1     0.120909
2     0.139091
3     0.054545
4     0.043636
5     0.044545
6     0.040909
7     0.047273
8     0.053636
9     0.051818
10    0.057273
11    0.020000
12    0.011818
13    0.015455
14    0.014545
15    0.017273
16    0.022727
17    0.013636
18    0.017273
19    0.015455
20    0.015455
21    0.009091
22    0.012727
23    0.016364
24    0.020909
25    0.016364
26    0.016364
27    0.009091
28    0.011818
29    0.020909
Name: sum, dtype: float64
EnvironmentSatisfaction
1    0.184545
2    0.178182
3    0.293636
4    0.294545
Name: sum, dtype: float64
JobInvolvement
1    0.052727
2    0.239091
3    0.572727
4    0.086364
Name: sum, dtype: float64
JobLevel
1    0.356364
2    0.349091
3    0.135455
4    0.069091
5    0.040909
Name: sum, dtype: float64
JobSatisfaction
1    0.187273
2    0.176364
3    0.285455
4    0.301818
Name: sum, dtype: float64
RelationshipSatisfaction
1    0.187273
2    0.190909
3    0.292727
4    0.280000
Name: sum, dtype: float64
StockOpti

## 参考资料
- [A Complete Tutorial to Learn Data Science with Python from Scratch](https://www.analyticsvidhya.com/blog/2016/01/complete-tutorial-learn-data-science-python-scratch-2/)
- [Introduction to Feature Selection methods with an example (or how to select the right variables?)](https://www.analyticsvidhya.com/blog/2016/12/introduction-to-feature-selection-methods-with-an-example-or-how-to-select-the-right-variables/)