In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import matplotlib.pyplot as plt

%matplotlib inline 

# 0. データセットの読み込み

In [2]:
HR_DATASET_PATH = '../datasets/HR_comma_sep.csv'
hr_df = pd.read_csv(HR_DATASET_PATH)
hr_df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [3]:
print(hr_df.shape)
print(hr_df.columns)
print(hr_df.isnull().any())

(14999, 10)
Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'sales', 'salary'],
      dtype='object')
satisfaction_level       False
last_evaluation          False
number_project           False
average_montly_hours     False
time_spend_company       False
Work_accident            False
left                     False
promotion_last_5years    False
sales                    False
salary                   False
dtype: bool


In [4]:
# salary(給与水準)をダミー変数へ置換する
hr_df.salary.replace({'low': 1, 'medium': 2, 'high': 3}, inplace=True)
# salesをダミー変数へ
hr_df = pd.get_dummies(hr_df, columns=['sales'])

 - [1.17. Neural network models (supervised) — scikit-learn 0.19.0 documentation](http://scikit-learn.org/stable/modules/neural_networks_supervised.html#classification)
 - [MLPClassifierの引数](http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier)

In [5]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

use_cols = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
    'salary',
    'sales_IT',
    'sales_RandD',
    'sales_accounting',
    'sales_hr',
    'sales_management', 
    'sales_marketing',
    'sales_product_mng',
    'sales_sales',
    'sales_support', 
    'sales_technical'
]

# 離職者数:在籍者数 = 1:1に直す
X1 = hr_df[hr_df.left == 1][use_cols]
X0 = hr_df[hr_df.left == 0][use_cols].sample(len(X1))
X = pd.concat([X1, X0])
Y = hr_df.loc[X.index, 'left']

# 標準化
transformed_cols = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
]
ss = StandardScaler()
ss.fit(X[transformed_cols])
X[transformed_cols] = ss.transform(X[transformed_cols])


# 交差検証(ホールドアウト法)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, train_size=0.7, random_state=0)

clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=0)
clf.fit(x_train, y_train)

print('学習データ数: %s' % x_train.shape[0])
print('学習データのうち離職した人数: %s' % y_train[y_train == 1].shape[0])
print('検証データ数: %s' % x_test.shape[0])
print('検証データのうち離職した人数: %s' % y_test[y_test == 1].shape[0])

y_pred = clf.predict(x_test)

print('---モデルの評価---')
print('正確度: %s' % accuracy_score(y_test, y_pred))
print('適合率: %s' % precision_score(y_test, y_pred))
print('再現率: %s' % recall_score(y_test, y_pred))
print('F値: %s' % f1_score(y_test, y_pred))

print('---分割表---')
confusion_df = pd.DataFrame(confusion_matrix(y_test, y_pred), index=['在籍者', '離職者'], columns=['在籍すると予測', '離職すると予測'])
confusion_df



学習データ数: 4999
学習データのうち離職した人数: 2510
検証データ数: 2143
検証データのうち離職した人数: 1061
---モデルの評価---
正確度: 0.938870741951
適合率: 0.941176470588
再現率: 0.934967012253
F値: 0.938061465721
---分割表---


Unnamed: 0,在籍すると予測,離職すると予測
在籍者,1020,62
離職者,69,992
