In [27]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import matplotlib.pyplot as plt
%matplotlib inline

# 0. データセットの読み込み

In [28]:
# Relative path to the HR dataset CSV.
HR_DATASET_PATH = '../datasets/HR_comma_sep.csv'

# Read the CSV into a DataFrame and preview its first five rows.
hr_df = pd.read_csv(filepath_or_buffer=HR_DATASET_PATH)
hr_df.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [29]:
# Sanity check of the loaded frame: its dimensions, its column names, and a
# per-column flag telling whether any value is missing.
for summary in (hr_df.shape, hr_df.columns, hr_df.isnull().any()):
    print(summary)

(14999, 10)
Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'sales', 'salary'],
      dtype='object')
satisfaction_level       False
last_evaluation          False
number_project           False
average_montly_hours     False
time_spend_company       False
Work_accident            False
left                     False
promotion_last_5years    False
sales                    False
salary                   False
dtype: bool


In [30]:
# Encode the salary level as an ordinal integer (low < medium < high).
# This is an ordinal encoding, not a dummy variable.
SALARY_LEVELS = {'low': 1, 'medium': 2, 'high': 3}

# The previous `hr_df.salary.replace(..., inplace=True)` was a chained in-place
# call on an intermediate Series; under pandas Copy-on-Write it leaves hr_df
# unchanged. Build the encoded column explicitly and rebind hr_df in a single
# statement instead: if the cell is re-run after `sales` is gone, get_dummies
# raises before hr_df is rebound, so the frame is never left half-converted.
hr_df = pd.get_dummies(
    hr_df.assign(salary=hr_df['salary'].map(SALARY_LEVELS)),
    # One-hot encode `sales` into `sales_*` indicator columns.
    columns=['sales'],
)

---
# 1. ランダムフォレスト(Random Forests)

## 学習アルゴリズム



In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
# cross_val_predict is imported from sklearn.model_selection: the legacy
# sklearn.cross_validation module was removed in scikit-learn 0.20, so the old
# import raises ImportError on current releases.
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix


# Feature columns used for modelling: the numeric / ordinal columns first,
# followed by the `sales_*` indicator columns produced by get_dummies on `sales`.
use_cols = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
    'salary',
] + [
    'sales_' + suffix
    for suffix in (
        'IT', 'RandD', 'accounting', 'hr', 'management',
        'marketing', 'product_mng', 'sales', 'support', 'technical',
    )
]

# Balance the classes 1:1 by undersampling: keep every row with left == 1
# (employees who left) and draw an equally sized random sample of rows with
# left == 0 (employees who stayed).
feature_df1 = hr_df.loc[hr_df.left == 1, use_cols]
# A fixed random_state makes the drawn sample, and therefore every metric
# computed from it below, reproducible across kernel restarts.
feature_df0 = hr_df.loc[hr_df.left == 0, use_cols].sample(len(feature_df1), random_state=0)
feature_df = pd.concat([feature_df1, feature_df0])
# Look the labels up through the feature index so rows and labels stay aligned.
labels = hr_df.loc[feature_df.index, 'left']

# Standardization: rescale the listed columns with a StandardScaler that is
# fitted on the balanced feature frame itself.
transformed_cols = [
    'satisfaction_level', 'last_evaluation', 'number_project',
    'average_montly_hours', 'time_spend_company',
]
# fit() returns the scaler itself, so construction and fitting can be chained.
ss = StandardScaler().fit(feature_df[transformed_cols])
feature_df[transformed_cols] = ss.transform(feature_df[transformed_cols])

# Training: exhaustive grid search over random-forest hyper-parameters.
# random_state and n_jobs are single-value entries; they are part of the grid
# so that they show up in clf.best_params_ afterwards.
parameters = dict(
    n_estimators=[5, 10, 20, 30, 50, 100, 300],
    max_features=range(3, len(feature_df.columns), 2),
    random_state=[0],
    n_jobs=[1],
    min_samples_split=[3, 5, 10, 15, 20, 25, 30, 40, 50, 100],
    max_depth=[3, 5, 10, 15, 20, 25, 30, 40, 50, 100],
)
CV = 10       # number of cross-validation folds
SCORE = 'f1'  # base metric name; the search scores with its "_weighted" variant

clf = GridSearchCV(
    estimator=RandomForestClassifier(),  # classifier to tune
    param_grid=parameters,               # hyper-parameter combinations to try
    scoring='%s_weighted' % SCORE,       # model-selection metric
    cv=CV,                               # cross-validation folds per combination
)
clf.fit(feature_df, labels)

# Report the winning combination and the estimator refitted with it.
for result in (clf.best_params_, clf.best_estimator_):
    print(result)

# Out-of-fold predictions for every sample. The estimator passed in is the
# GridSearchCV object itself, not just its best estimator.
label_pred = cross_val_predict(clf, feature_df, labels, cv=CV)

# Accuracy, precision, recall and F1 of the out-of-fold predictions, in that order.
for template, metric in (
    ('正確度: %s', accuracy_score),
    ('適合率: %s', precision_score),
    ('再現率: %s', recall_score),
    ('F値: %s', f1_score),
):
    print(template % metric(labels, label_pred))

# Confusion matrix: rows are the true classes, columns the predicted classes.
print('分割表')
confusion_df = pd.DataFrame(
    confusion_matrix(labels, label_pred),
    index=['在籍者', '離職者'],
    columns=['在籍すると予測', '離職すると予測'],
)
confusion_df



{'n_estimators': 100}
正確度: 0.984178101372
適合率: 0.992312072893
再現率: 0.975917110053
F値: 0.984046308062
分割表


Unnamed: 0,在籍すると予測,離職すると予測
在籍者,3544,27
離職者,86,3485


## ランダムフォレストによるデータ解析

### 特徴量の重要度

In [None]:
# Feature importances reported by the best estimator found by the grid search.
fti = clf.best_estimator_.feature_importances_

# One row per feature column, labelled with the column name.
features = [str(col) for col in feature_df.columns]
feature_importance_df = DataFrame(data=fti, index=features, columns=['重要度'])
feature_importance_df

`satisfaction_level`が最も重要な特徴量であると結論付けられる

In [17]:
# Collect the features whose importance is below the threshold (candidates for
# removal). Nothing is dropped here; the names are only listed.
threshold = 0.03
# fti holds one importance per feature, in the same order as use_cols.
drop_features = [
    feat for feat, importance in zip(use_cols, fti) if importance < threshold
]

# Display the collected names. The cell previously ended with `drop_col`, a
# name that is never defined, which raised NameError.
drop_features

['Work_accident',
 'promotion_last_5years',
 'salary',
 'sales_IT',
 'sales_RandD',
 'sales_accounting',
 'sales_hr',
 'sales_management',
 'sales_marketing',
 'sales_product_mng',
 'sales_sales',
 'sales_support',
 'sales_technical']

### 決定木の最大深度による評価値(accuracy, f value, ...)の変化

In [None]:
from sklearn.model_selection import LeaveOneOut, cross_val_predict

# Depths to evaluate: 1 .. MAX_DEPTH inclusive. The previous range(1, MAX_DEPTH)
# stopped one short and never evaluated MAX_DEPTH itself.
MAX_DEPTH = max(parameters['max_depth'])
depths = range(1, MAX_DEPTH + 1)

# Every hyper-parameter except max_depth stays at the grid-search optimum.
# They do not depend on the depth, so build them once outside the loop.
fixed_params = {
    name: clf.best_params_[name]
    for name in (
        'n_estimators',
        'max_features',
        'random_state',
        'n_jobs',
        'min_samples_split',
    )
}

# Compute the leave-one-out accuracy for each maximum tree depth.
# NOTE: leave-one-out fits one forest per sample per depth, so this is very slow.
accuracy_scores = []
for depth in depths:
    rfc = RandomForestClassifier(max_depth=depth, **fixed_params)

    # cross_val_predict fits a fresh clone on each leave-one-out training split
    # and returns the held-out predictions as a flat 1-D array in sample order.
    # The manual loop it replaces appended 1-element arrays, which handed
    # accuracy_score an (n, 1)-shaped y_pred.
    predicted_labels = cross_val_predict(rfc, feature_df, labels, cv=LeaveOneOut())

    # Report the generalization accuracy at this depth.
    score = accuracy_score(labels, predicted_labels)
    print('max depth={0}: {1}'.format(depth, score))

    accuracy_scores.append(score)

# Plot accuracy against maximum depth as a line chart.
X = list(depths)
plt.plot(X, accuracy_scores)

plt.xlabel('max depth')
plt.ylabel('accuracy rate')
plt.show()

## 参考文献
 - [ランダムフォレスト - Wikipedia](https://ja.wikipedia.org/wiki/%E3%83%A9%E3%83%B3%E3%83%80%E3%83%A0%E3%83%95%E3%82%A9%E3%83%AC%E3%82%B9%E3%83%88)
 - [3.2.4.3.1. sklearn.ensemble.RandomForestClassifier — scikit-learn 0.19.0 documentation](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)