<a href="https://colab.research.google.com/github/datascience-uniandes/classification_tutorial/blob/master/rrhh/rrhh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification: Predict whether an employee will leave the company

MINE-4101: Applied Data Science  
Universidad de los Andes  
  
Last update: October, 2023

In [None]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, f1_score

import matplotlib.pyplot as plt
%matplotlib inline

### Reading the dataset

In [None]:
# Load the comma-separated HR records; the path is relative to the working directory.
rrhh_df = pd.read_csv(
    filepath_or_buffer = './data/rrhh.csv',
    sep = ',',
)

In [None]:
# Rename the 'sales' column to 'department'.
# The result is re-bound instead of using inplace = True: the cell stays a plain
# expression that yields a new frame, which is the form pandas recommends.
rrhh_df = rrhh_df.rename(columns = {'sales' : 'department'})

In [None]:
# (rows, columns) of the loaded frame.
rrhh_df.shape

In [None]:
# Data type pandas assigned to each column.
rrhh_df.dtypes

In [None]:
# Preview the first five rows.
rrhh_df.head(n = 5)

In [None]:
# Share of each value in the 'left' column, counting missing values as their own category.
rrhh_df['left'].value_counts(
    normalize = True,
    dropna = False,
)

### Splitting train and test datasets

In [None]:
# Columns used as model inputs, one per line for easier editing.
features = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
]

In [None]:
# Feature matrix: every row, restricted to the columns listed in `features`.
X = rrhh_df.loc[:, features]

In [None]:
# Target vector: the 'left' column.
Y = rrhh_df.loc[:, 'left']

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size = 0.2,
    random_state = 100,
    stratify = Y,
)

In [None]:
# Class proportions in the training partition.
Y_train.value_counts(normalize = True)

In [None]:
# Class proportions in the test partition.
Y_test.value_counts(normalize = True)

### Training the model

In [None]:
# Decision tree limited to depth 4, splitting on entropy, with a fixed seed.
model = DecisionTreeClassifier(
    criterion = 'entropy',
    max_depth = 4,
    random_state = 200,
)

In [None]:
# Fit the tree on the training partition only.
model.fit(X = X_train, y = Y_train)

In [None]:
# Draw the fitted tree on a large (30 x 20 inch) canvas.
plt.figure(figsize = (30, 20))
# feature_names_in_ is a NumPy array. plot_tree gets a plain list instead, because
# at least one scikit-learn release validates this argument strictly as list/None
# and rejects arrays with InvalidParameterError.
plot_tree(model, feature_names = model.feature_names_in_.tolist())
plt.show()

### Evaluating the model

In [None]:
# Predicted class for every row of the held-out partition.
predictions = model.predict(X = X_test)

In [None]:
# Confusion matrix of true labels against predicted labels on the test partition.
ConfusionMatrixDisplay.from_predictions(y_true = Y_test, y_pred = predictions)

In [None]:
# Report each score for the test-set predictions, one per line.
scorers = (
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
)
for metric_name, scorer in scorers:
    print(metric_name + ':', scorer(Y_test, predictions))

### Showing class distributions among features

In [None]:
# One panel per feature, overlaying the histogram of employees who left (left == 1)
# on the histogram of those who did not (left == 0).

# Column -> panel title, in drawing order.
hist_titles = {
    'satisfaction_level': 'Satisfaction Level',
    'last_evaluation': 'Last Evaluation',
    'number_project': 'Number of Projects',
    'average_montly_hours': 'Average Monthly Hours',
    'time_spend_company': 'Time Spend in Company',
    'Work_accident': 'Work Accident',
    'promotion_last_5years': 'Promotion Last 5 Years',
}

# Row masks for the two classes, computed once instead of once per panel.
left_mask = rrhh_df['left'] == 1
not_left_mask = rrhh_df['left'] == 0

fig, axes = plt.subplots(nrows = 4, ncols = 2, figsize = (15, 15))
panels = axes.flatten()

for ax, (column, title) in zip(panels, hist_titles.items()):
    ax.hist(rrhh_df.loc[left_mask, column], label = 'Left', alpha = 0.5)
    ax.hist(rrhh_df.loc[not_left_mask, column], label = 'No left', alpha = 0.5)
    ax.set_title(title)
    # Every panel gets a legend so each one can be read without referring to the first.
    ax.legend()

# The 4 x 2 grid has eight axes but only seven features are plotted;
# hide the leftover axes so no empty frame is rendered.
for ax in panels[len(hist_titles):]:
    ax.set_visible(False)

plt.show()

### TODO:

What is the effect of adding the categorical features (department, salary)? By how much do the performance metrics improve?