# 一、資料匯入及預處理

### 資料來源：https://www.kaggle.com/ludobenistant/hr-analytics

In [1]:
import pandas as pd
pd.set_option("display.max_columns",25) #設定pandas最多顯示出25個欄位資訊
df = pd.read_csv("HR_comma_sep.csv",encoding = "big5") #15000筆
df.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary
6606,0.63,0.5,4,167,3,1,0,0,technical,medium
11764,0.79,0.65,3,235,10,0,0,0,technical,low
12235,0.83,0.85,4,255,5,0,1,0,management,low
9868,0.69,0.98,3,261,4,0,0,0,sales,medium
3845,0.64,0.9,4,211,3,0,0,0,hr,medium


### One-hot Encoding

In [2]:
df_job = pd.get_dummies(df['dept'])
df_job.sample(5)

Unnamed: 0,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
8977,0,0,0,0,0,0,0,0,1,0
1937,0,0,1,0,0,0,0,0,0,0
1105,0,0,0,0,0,0,0,0,0,1
4683,0,0,0,0,0,0,0,1,0,0
5415,0,1,0,0,0,0,0,0,0,0


In [3]:
df_ml = pd.merge(df, df_job,left_index=True,right_index=True)
df_ml.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
5011,0.52,0.54,5,239,3,0,0,0,support,medium,0,0,0,0,0,0,0,0,1,0
6629,0.32,0.55,4,283,4,0,0,0,product_mng,low,0,0,0,0,0,0,1,0,0,0
10974,0.37,0.86,6,260,3,0,0,0,technical,medium,0,0,0,0,0,0,0,0,0,1
4888,0.79,0.59,5,197,4,0,0,0,technical,medium,0,0,0,0,0,0,0,0,0,1
14951,0.39,0.54,2,154,3,0,1,0,marketing,low,0,0,0,0,0,1,0,0,0,0


### LabelEncoding

In [4]:
df_ml['salary_rank'] = df_ml['salary'].replace({'low':1,'medium':2,'high':3})
df_ml.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,salary_rank
9045,0.94,0.92,3,273,3,0,0,0,technical,high,0,0,0,0,0,0,0,0,0,1,3
1779,0.38,0.52,2,128,3,0,1,0,sales,medium,0,0,0,0,0,0,0,1,0,0,2
13674,0.8,0.7,4,246,3,0,0,1,marketing,low,0,0,0,0,0,1,0,0,0,0,1
3886,0.72,0.67,3,164,4,0,0,0,RandD,medium,0,1,0,0,0,0,0,0,0,0,2
9789,0.92,0.91,3,202,2,0,0,0,sales,medium,0,0,0,0,0,0,0,1,0,0,2


In [5]:
df_ml = df_ml.dropna().reset_index(drop=True)

In [6]:
from sklearn.model_selection import train_test_split

X = df_ml[['satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years','IT','RandD','accounting','hr','management','marketing','product_mng','sales','support','technical','salary_rank']]
y = df_ml['left']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)

In [7]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# 二、單一分類器

### 決策分類樹

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

tree = DecisionTreeClassifier(criterion='gini',max_depth=5)
tree.fit(X_train, y_train)
print(metrics.classification_report(y_test, tree.predict(X_test)))

             precision    recall  f1-score   support

          0       0.98      0.99      0.99      3440
          1       0.98      0.93      0.95      1060

avg / total       0.98      0.98      0.98      4500



### KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, knn.predict(X_test_std)))

             precision    recall  f1-score   support

          0       0.97      0.98      0.98      3440
          1       0.94      0.91      0.93      1060

avg / total       0.97      0.97      0.97      4500



### SVC

In [10]:
from sklearn.svm import SVC

svc = SVC(C=1.0, kernel="rbf")
svc.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, svc.predict(X_test_std)))

             precision    recall  f1-score   support

          0       0.97      0.97      0.97      3440
          1       0.92      0.91      0.91      1060

avg / total       0.96      0.96      0.96      4500



# 三、VotingClassifier

In [11]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

clf1 = DecisionTreeClassifier(max_depth=5)
clf2 = KNeighborsClassifier(n_neighbors=2)
clf3 = SVC(kernel='rbf',probability=True) #probability要設成True(同時計算每個分類的機率)，classification_report才能work

eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2),('svc', clf3)], voting='soft', weights=[3, 1, 1])
eclf.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, eclf.predict(X_test_std)))

             precision    recall  f1-score   support

          0       0.98      0.99      0.99      3440
          1       0.98      0.93      0.95      1060

avg / total       0.98      0.98      0.98      4500



# 四、Bagging

### 註: 基於決策樹的學習演算法可以不用標準化沒關係，而且訓練速度通常較快

#### OOB

In [12]:
from sklearn.ensemble import BaggingClassifier

bagc = BaggingClassifier(n_estimators=100, oob_score=True)
bagc.fit(X,y)
print("oob_score(accuary):",bagc.oob_score_)

oob_score(accuary): 0.990932728849


In [13]:
from sklearn.ensemble import BaggingClassifier

bagc = BaggingClassifier(n_estimators=100)
bagc.fit(X_train, y_train)
print(metrics.classification_report(y_test, bagc.predict(X_test)))

             precision    recall  f1-score   support

          0       0.99      0.99      0.99      3440
          1       0.98      0.97      0.98      1060

avg / total       0.99      0.99      0.99      4500



# 五、隨機森林(Random Forest)

#### OOB

In [14]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, criterion='gini', max_features='auto', oob_score=True)
rfc.fit(X,y)
print("oob_score(accuary):",rfc.oob_score_)

oob_score(accuary): 0.992732848857


In [15]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X,y)
print(metrics.classification_report(y_test, rfc.predict(X_test)))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      3440
          1       1.00      1.00      1.00      1060

avg / total       1.00      1.00      1.00      4500



# 六、AdaBoost

In [16]:
from sklearn.ensemble import AdaBoostClassifier
adb = AdaBoostClassifier(n_estimators=100)
adb.fit(X_train, y_train)
print(metrics.classification_report(y_test, adb.predict(X_test)))

             precision    recall  f1-score   support

          0       0.97      0.97      0.97      3440
          1       0.92      0.91      0.91      1060

avg / total       0.96      0.96      0.96      4500

