# Why are the best and most experienced employees leaving? 
Dataset is from: https://www.kaggle.com/ludobenistant/hr-analytics
Data visualization using Python packages: Numpy,Pandas,Seaborn,Plotly and Matplotlib

# Import packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as matplot
import seaborn as sns
%matplotlib inline

# Load Dataset

In [4]:
#load dataset
hr = pd.read_csv('C:\\Users\\dxd4380\\Documents\\personal\\Programming samples\\Python\\1.Data Viz- HR Analytics\\HR_comma_sep.csv')
#hr = pd.DataFrame.from_csv('../1.Data Viz- HR Analytics/HR_comma_sep.csv', index_col=None)


# Get preliminary view of data
### Dataset contains 15000 samples with employees indicating if they left or not along with Nine(9)  data points describing their experience in the company. (Also called features/dimensions for machine learning purposes) 

### Understanding datatypes of various features 
** After looking at the data types and first few rows of data as shown below

### Quantitative:
* Continuous variables: satisfaction_level,last_evaluation,average_montly_hours
* Discrete: number_project, time_spend_company

### Qualitative:
* Binary:Work_accident,left,promotion_last_5years
* Unordered/Nominal: sales
* Ordered/Ordinal: salary

### *** To understand differences between different types of data, please read:
http://blog.minitab.com/blog/understanding-statistics/understanding-qualitative-quantitative-attribute-discrete-and-continuous-data-types




In [13]:
#Check datatypes of features
df = pd.DataFrame(hr)
dtypeCount =[df.iloc[:,i].apply(type).value_counts() for i in range(df.shape[1])]
print(dtypeCount)

[<class 'float'>    14999
Name: satisfaction_level, dtype: int64, <class 'float'>    14999
Name: last_evaluation, dtype: int64, <class 'int'>    14999
Name: number_project, dtype: int64, <class 'int'>    14999
Name: average_montly_hours, dtype: int64, <class 'int'>    14999
Name: time_spend_company, dtype: int64, <class 'int'>    14999
Name: Work_accident, dtype: int64, <class 'int'>    14999
Name: left, dtype: int64, <class 'int'>    14999
Name: promotion_last_5years, dtype: int64, <class 'str'>    14999
Name: sales, dtype: int64, <class 'str'>    14999
Name: salary, dtype: int64]


In [5]:
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


# Basic Logistic Regression
Some data manipulation is performed before logistic regression. As you notice in the list of features and their data types, there are two features that are qualitative: Sales and Salary. Sales and Salary are converted into dummy variables. Accuracy scores of test and training datasets are close to 80%

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#Initializing duplicate dataset without overwriting the original
hr2=hr

#Check datatypes of all columns in the parameters are compatible
df = pd.DataFrame(hr)
dtypeCount =[df.iloc[:,i].apply(type).value_counts() for i in range(df.shape[1])]
print(dtypeCount)

#Convert categorical variables into dummy variables

hr_sales = pd.get_dummies(hr['sales'])
hr2=pd.concat([hr2, hr_sales], axis=1)

hr_salary = pd.get_dummies(hr['salary'])
hr2=pd.concat([hr2, hr_salary], axis=1)

#Build Target and parameters
y = hr2['left'].values
X = hr2.drop(['left','salary','sales'], axis=1).values

Xlr, Xtestlr, ylr, ytestlr = train_test_split(X,y,random_state=5,stratify=y)

clf = LogisticRegression()
# Fit the model on the trainng data.
clf.fit(Xlr, ylr)
# Print the accuracy from the testing data.
print(accuracy_score(clf.predict(Xtestlr), ytestlr))
print(accuracy_score(clf.predict(Xlr), ylr))


[<class 'float'>    14999
Name: satisfaction_level, dtype: int64, <class 'float'>    14999
Name: last_evaluation, dtype: int64, <class 'int'>    14999
Name: number_project, dtype: int64, <class 'int'>    14999
Name: average_montly_hours, dtype: int64, <class 'int'>    14999
Name: time_spend_company, dtype: int64, <class 'int'>    14999
Name: Work_accident, dtype: int64, <class 'int'>    14999
Name: left, dtype: int64, <class 'int'>    14999
Name: promotion_last_5years, dtype: int64, <class 'str'>    14999
Name: sales, dtype: int64, <class 'str'>    14999
Name: salary, dtype: int64]
0.802933333333
0.792070406258


In [15]:
# Imbalanced data check

In [62]:
uniquetest, countstest = np.unique(ytestlr,return_counts=True)
testvals=dict(zip(uniquetest, countstest))
print("test":{},format(testvals[1]/len(ytestlr))

uniquetrain, countstrain= np.unique(ylr,return_counts=True)
trainvals=dict(zip(uniquetrain, countstrain))
print(trainvals[1]/len(ylr))

uniqueorig, countsorig= np.unique(y,return_counts=True)
origvals=dict(zip(uniqueorig, countsorig))
print(origvals[1]/len(y))


0.238133333333
0.238065605832
0.238082538836


Let's look at the confusion matrix and classification report to understand the split between precision and recall. For the employees that left both the precision and recall are not very high. 0.64 precision means that quite a few employees that stayed were predicted as they left. 0.39 recall means that many employees that left were incorrectly classified as left.

In [None]:
# Predict the labels of the test set: y_pred
y_pred = clf.predict(Xtestlr)

print(confusion_matrix(ytestlr, y_pred))
print(classification_report(ytestlr, y_pred))

# Find best 'k', Model complexity curve

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Setup arrays to store train and test accuracies
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

# Loop over different values of k
for i, k in enumerate(neighbors):
    # Setup a k-NN Classifier with k neighbors: knn
    knn = KNeighborsClassifier(n_neighbors=k)

    # Fit the classifier to the training data
    knn.fit(Xlr, ylr)
    
    #Compute accuracy on the training set
    train_accuracy[i] = knn.score(Xlr,ylr)

    #Compute accuracy on the testing set
    test_accuracy[i] = knn.score(Xtestlr,ytestlr)

# Generate plot
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label = 'Testing Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()

# Add regularization/Tuning

## Find optimal Regularization paramter 'C'

In [None]:
# Import necessary modules
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV 

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Setup the hyperparameter grid
c_space = [0.001, 0.1, 1, 10, 100,1000,10000]
param_grid = {'C': c_space}

# Instantiate a logistic regression classifier: logreg
logreg = LogisticRegression()

# Instantiate the GridSearchCV object: logreg_cv
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Fit it to the data
logreg_cv.fit(Xlr, ylr)

# Print the tuned parameters and scoreb
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_)) 
print("Best Training score is {}".format(logreg_cv.best_score_))
print("Best Test score is {}".format(accuracy_score(logreg_cv.predict(Xtestlr), ytestlr)))



In [None]:
# Import necessary modules
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

# Compute predicted probabilities: y_pred_prob
y_pred_prob = logreg.predict_proba(Xtestlr)[:,1] 

# Compute and print AUC score
print("AUC: {}".format(roc_auc_score(ytestlr, y_pred_prob)))

# Compute cross-validated AUC scores: cv_auc
cv_auc = cross_val_score(clf,X, y, cv=5,scoring ='roc_auc')

# Print list of AUC scores
print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc))