## Figuring out which employees are more likely to quit

<p>This analysis uses two data sets: hr_data.csv and employee_satisfaction_evaluation.xlsx.</p>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the two raw data sets: HR records (CSV) and satisfaction/evaluation data (Excel).
file_hr = 'hr_data.csv'
file_satisfaction = 'employee_satisfaction_evaluation.xlsx'
# Read through the path variables so each file name is defined in exactly one place.
hrdf = pd.read_csv(file_hr)
sat_df = pd.read_excel(file_satisfaction)

In [None]:
hrdf.head()

In [None]:
hrdf.info()

In [None]:
hrdf.select_dtypes(exclude=['int','float']).columns # to find catigorical non-numerical data

## Exploratory Data Analysis


In [None]:
hrdf.salary.unique()

In [None]:
hrdf.department.unique() # unique depts

In [None]:
round((hrdf.left.sum()/len(hrdf)) * 100,2) # pct of people how left the company

In [None]:
hrdf.time_spend_company.mean() # average number of years at company

In [None]:
hrdf.average_montly_hours.mean() # average number of monthly hours

In [None]:
hrdf.promotion_last_5years.sum() # total number of promotions

In [None]:
# Number of employees who left, per department, sorted ascending by that count.
# aggfunc is passed as the string 'sum': handing np.sum to pandas aggregations is
# deprecated in pandas 2.1+ and emits a FutureWarning.
table = (
    pd.pivot_table(hrdf, values='left', columns='department', aggfunc='sum')
    .T
    .sort_values(by='left')
)
table

In [None]:
plt.barh(table.index,table.left)
#plt.xticks(rotation=90)
plt.title('# Employees left by Dept')


In [None]:
hrdf.department.value_counts().plot.barh()

In [None]:
# exploring satisfaction dataset
sat_df.head()

In [None]:
sat_df.info()

In [None]:
# joining both dataset by index
df_all = hrdf.set_index('employee_id').join(sat_df.set_index('EMPLOYEE #')) 
df_all = df_all.reset_index()

In [None]:
df_all.head()

In [None]:
df_all.isnull().sum()

In [None]:
# Fill missing values with the mean of their column.
# numeric_only=True restricts the means to numeric columns: on pandas >= 2.0 a plain
# df.mean() raises TypeError when the frame still holds text columns (here the
# categorical columns that are one-hot encoded later).
df_all = df_all.fillna(df_all.mean(numeric_only=True))

In [None]:
df_all = df_all.drop(columns='employee_id')

In [None]:
df_all.head()

In [None]:
# Correlation heatmap of the numeric columns, showing the lower triangle only.
plt.figure(figsize=(12, 8))
# numeric_only=True: on pandas >= 2.0, corr() raises if text columns are present,
# and the categorical columns have not been encoded yet at this point.
corr_mat = df_all.corr(numeric_only=True)
# Boolean mask that hides the upper triangle and the diagonal, which only repeat
# the lower triangle or show each column's correlation with itself.
mask = np.zeros_like(corr_mat, dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr_mat, annot=True, mask=mask, cmap='coolwarm')
plt.title("Correlation Map")

## Cleaning & Preprocessing

In [None]:
# One-hot encode the categorical columns. drop_first=True keeps k-1 dummy columns
# for a column with k levels by removing the first level.
categorical = ['department', 'salary']
df_final = pd.get_dummies(data=df_all, columns=categorical, drop_first=True)
df_final.head()

In [None]:
# data split
from sklearn.model_selection import train_test_split
X = df_final.drop(['left'], axis=1).values
y = df_final.left.values

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

In [None]:
# normalize data
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
df_train = pd.DataFrame(X_train)

In [None]:
df_train.head()

## Modeling - Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [None]:
# create and fit the model
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# predict
predictions = model.predict(X_test)

In [None]:
print("Accuracy: {:.2f}%".format((100*accuracy_score(predictions,y_test))))

In [None]:
print(confusion_matrix(y_test,predictions))

In [None]:
print(classification_report(y_test,predictions))

In [None]:
# random forests
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report


In [None]:
m2_classifier = RandomForestClassifier()
m2_classifier.fit(X_train,y_train)

In [None]:
predict_classifier = m2_classifier.predict(X_test)

In [None]:
print("Accuracy: {:.2f}%".format(100*accuracy_score(y_test,predict_classifier)))

In [None]:
print(confusion_matrix(y_test,predict_classifier))

In [None]:
print(classification_report(y_test,predict_classifier))

In [None]:
# feature importances
train_feature_names = df_final.drop(['left'],axis=1).columns
feature_importance = pd.DataFrame(m2_classifier.feature_importances_,
                                 index = train_feature_names,
                                 columns = ['importance'])

In [None]:
feature_importance.sort_values('importance', ascending=False)

## Findings:

Satisfaction level has the highest feature importance in the random forest model, making it the strongest predictor of whether an employee leaves.