In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC 
from sklearn import tree
#from sklearn.neighbors import KNeighborsClassifier
from sklearn import neighbors
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)



In [2]:
df = pd.read_csv('C:/Users/epatdeb/Edurekapythonproject/HR_Data.csv')
df.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'sales', 'salary'],
      dtype='object')

In [3]:
# Best Employee are those whose last_evaluation > .85
# Most experienced employee are those whose time_spend_company >= 6 and number_project >= 4

bestemp = df [df['last_evaluation'] >= .85 ] [df['time_spend_company'] >= 5 ][df['number_project'] >= 4 ]
empfeatures = bestemp.drop(axis=1,labels='left')
empleft = bestemp['left']
print ("Total number of elements is: {}".format(len(empfeatures)))

#One hot encoder (get_dummies) applied on columns=["sales", "salary"] to transform to numeric format with n-columns 
# (where n - no. of elements)

empfeatures = pd.get_dummies(empfeatures, columns=["sales", "salary"], prefix=["sales", "salary"])



Total number of elements is: 1126


  after removing the cwd from sys.path.


In [4]:
#Feature Scaling applied on empfeatures['average_montly_hours', 'time_spend_company' and 'number_project']

std_scale = preprocessing.StandardScaler().fit(empfeatures['average_montly_hours'])
df_std = std_scale.transform(empfeatures['average_montly_hours'])

std_scale = preprocessing.StandardScaler().fit(empfeatures['time_spend_company'])
df_std1 = std_scale.transform(empfeatures['time_spend_company'])

std_scale = preprocessing.StandardScaler().fit(empfeatures['number_project'])
df_std2 = std_scale.transform(empfeatures['number_project'])

empfeatures['average_montly_hours'] = df_std
empfeatures['time_spend_company'] = df_std1
empfeatures['number_project'] = df_std2
empfeatures




Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,salary_high,salary_low,salary_medium
1,0.80,0.86,0.405582,0.739115,0.466293,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,0.72,0.87,0.405582,-0.318201,-0.497094,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
7,0.92,0.85,0.405582,0.657783,-0.497094,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
8,0.89,1.00,0.405582,-0.291090,-0.497094,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
12,0.84,0.92,-1.058153,-0.019984,-0.497094,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
17,0.78,0.99,-1.058153,0.549340,0.466293,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
19,0.76,0.89,0.405582,0.739115,-0.497094,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
25,0.89,0.92,0.405582,0.196901,-0.497094,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
26,0.82,0.87,-1.058153,0.115569,-0.497094,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
34,0.84,0.87,-1.058153,0.305344,0.466293,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0


In [5]:
# Split the dataset among train and test model with 80:20

X_train, X_test, Y_train, Y_test = train_test_split(empfeatures, empleft, test_size=0.20, random_state = 20)
KX = int(round(np.sqrt(len(Y_test))))   # To be used for calculating the value of K in K-nearest neighbor 
print ("X_train Length: {} , X_test Length: {}, Y_train Length: {} and Y_test Length: {}".format(len(X_train), len(X_test), len(Y_train), len(Y_test)))

X_train Length: 900 , X_test Length: 226, Y_train Length: 900 and Y_test Length: 226


In [119]:
#SVM Algorithm fit, train, test and predict
# Kernel --> ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ 
def svm():
    krnl = ['linear', 'poly', 'rbf', 'sigmoid']
    kvals= {}

    for k_val in krnl:
        clf = SVC(kernel= k_val)
        clf.fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)
        mae_svm = mean_absolute_error(Y_test, Y_pred)
        kvals[k_val] = mae_svm

#print (kvals)

    kvals_np = pd.DataFrame(kvals, index=range(0,1))
#print (kvals_np)
#sorted(kvals_np.T[0])

    ax = kvals_np.min(axis=1).convert_objects(convert_numeric=True)
    optim_C = kvals_np.T.index[kvals_np.T[0] == float(ax)]
    #print ("The most optimum 'Kernel' type is {} and corresponding MAE is {}". format(optim_C, ax))
    ax = ax.astype(float)
    return ax.values[0]


In [89]:
#Decision Tree Algorithm
def dectree():
    clf1 = tree.DecisionTreeClassifier()
    clf1.fit(X_train, Y_train)
    Y_pred = clf1.predict(X_test)
    mae_dct = mean_absolute_error(Y_test, Y_pred)
    return mae_dct


In [90]:
#RandomForest Algorithm
def rndforest():
    clf2 = RandomForestClassifier(n_estimators=85)
    clf2.fit(X_train, Y_train)

    KNeighborsClassifier = clf2.predict(X_test)
    mae_rnf = mean_absolute_error(Y_test, Y_pred)
    return mae_rnf


In [100]:
#K-Nearest Neighbor Algorithm
def knn(KX):
    #print ("Number of K neighbors : ", KX)
    clf3 = neighbors.KNeighborsClassifier(n_neighbors=KX)
    clf3.fit(X_train, Y_train)
    Y_pred = clf3.predict(X_test)
    mae_knn = mean_absolute_error(Y_test, Y_pred)
    return mae_knn

In [115]:
#Logistic Regression Algorithm
def logreg():
    c_val = [0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 1, 10, 100, 1000, 10000, 1e5]
    Logistic_Vals = {}

    for x_var in c_val:
        clf4 = LogisticRegression(C= float(x_var))
        clf4.fit(X_train, Y_train)
        Y_pred = clf4.predict(X_test)
        mae_lreg = mean_absolute_error(Y_test, Y_pred)
        Logistic_Vals[x_var] = mae_lreg

    logpd = pd.DataFrame(Logistic_Vals, index = range(0,1))

#logpd.columns[logpd]
#logpd.iloc[logpd.min(axis=1)]


    ax = logpd.min(axis=1).convert_objects(convert_numeric=True)
    optim_C = logpd.T.index[logpd.T[0] == float(ax)]
    #print ("The most optimum value for C is {} and corresponding MAE is {}". format(optim_C, ax))
    ax = ax.astype(float)
    #return format(ax)
    return ax.values[0]

#logreg()



0.079646017699115043

In [122]:
iterate = 100
my_svm = []
my_dec = []
my_rnd = []
my_knn = []
my_log = []

print ("***"*10)
for xx in range(iterate):
    MAE_SVM = svm()
    my_svm.append(MAE_SVM)
    MAE_DEC = dectree()
    my_dec.append(MAE_DEC)
    MAE_RND = rndforest()
    my_rnd.append(MAE_RND)
    MAE_KNN = knn(KX)
    my_knn.append(MAE_KNN)
    MAE_LOG = logreg()
    my_log.append(MAE_LOG)

print ("MAE_SVM : ", sum(my_svm)/iterate)
print ("MAE_DEC : ", sum(my_dec)/iterate)   # BEST MODEL as per average MSE after 100 iterations.
print ("MAE_RND : ", sum(my_rnd)/iterate)
print ("MAE_KNN : ", sum(my_knn)/iterate)
print ("MAE_LOG : ", sum(my_log)/iterate)

******************************














MAE_SVM :  0.0884955752212
MAE_DEC :  0.0509292035398
MAE_RND :  0.0840707964602
MAE_KNN :  0.119469026549
MAE_LOG :  0.0796460176991




In [54]:
# Correlation between dependent variable('left') and independent variables
# Based on correlation with 'left': I decide which features are the most important factors to the model

df2 = df.corr()
df2 [df2.index == 'left']

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
left,-0.388375,0.006567,0.023787,0.071287,0.144822,-0.154622,1.0,-0.061788


In [61]:
# Factors that may impact the best employees to leave company
best_leftemp = bestemp[bestemp['left'] == 1]
#
#print (best_leftemp.columns)

best_leftemp.describe(include='all')

# From the describe(), the statistic (perticularly the 'std') shows the following:

#  satisfaction_level:   If the satisfaction level is  < ~87% 
#  time_spend_company:   If the time spent in company >= 5 years
#  promotion_last_5years:  If no promotion in last 5 years

#  average_montly_hours : If average_montly_hours > 247

#

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
count,824.0,824.0,824.0,824.0,824.0,824.0,824.0,824.0,824,824
unique,,,,,,,,,10,3
top,,,,,,,,,sales,low
freq,,,,,,,,,237,504
mean,0.770024,0.934757,4.734223,247.445388,5.203883,0.046117,1.0,0.001214,,
std,0.187463,0.05122,0.647263,20.355275,0.403128,0.209865,0.0,0.034837,,
min,0.09,0.85,4.0,139.0,5.0,0.0,1.0,0.0,,
25%,0.76,0.89,4.0,234.0,5.0,0.0,1.0,0.0,,
50%,0.82,0.94,5.0,247.0,5.0,0.0,1.0,0.0,,
75%,0.87,0.99,5.0,261.0,5.0,0.0,1.0,0.0,,


In [72]:
best_leftemp.index = best_leftemp['salary']
best_leftemp.groupby('salary').head()

Defaulting to column but this will raise an ambiguity error in a future version
  


Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
salary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
medium,0.8,0.86,5,262,6,0,1,0,sales,medium
low,0.72,0.87,5,223,5,0,1,0,sales,low
low,0.92,0.85,5,259,5,0,1,0,sales,low
low,0.89,1.0,5,224,5,0,1,0,sales,low
low,0.84,0.92,4,234,5,0,1,0,sales,low
low,0.78,0.99,4,255,6,0,1,0,sales,low
medium,0.85,0.91,5,226,5,0,1,0,management,medium
medium,0.1,0.95,6,244,5,0,1,0,IT,medium
medium,0.9,0.98,4,264,6,0,1,0,product_mng,medium
medium,0.76,0.86,5,223,5,1,1,0,product_mng,medium
