- **Importer les librairies nécessaires pour le travail**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Le nécessaire pour plotly
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# Librairies de modélisation de données
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFECV
from boruta import BorutaPy
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, roc_auc_score, 
                             log_loss, classification_report)
from imblearn.over_sampling import SMOTE
import xgboost

# Interprétation de modèle
import shap

In [2]:
# Pour voir toutes les colonnes 
pd.set_option("display.max_columns", None)

In [3]:
# Importer les données
attrition = pd.read_csv("../data/HR-Employee-Attrition.csv")

# Afficher la taille des données
print(f"La base fait {attrition.shape[0]} lignes",
       f"et {attrition.shape[1]} colonnes\n")

# Afficher les premières lignes de nos données
attrition.head()

La base fait 1470 lignes et 35 colonnes



Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


- **Encodage de la target**

In [4]:
# Créer la target map
target_map = {"Yes":1, "No":0}
# Encoder la variable cible en utilisant la méthode apply() de Pandas
attrition["Attrition"] = attrition["Attrition"].apply(lambda x: target_map[x])

In [5]:
# Afficher la distribution de la target
round(attrition["Attrition"].value_counts(normalize=True)*100, 2)

0    83.88
1    16.12
Name: Attrition, dtype: float64

In [6]:
attrition.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


# Feature Engineering

In [43]:

# Utiliser la méthode apply de Pandas pour encoder la target
attrition["Attrition_numerical"] = attrition["Attrition"]

# Créer une liste uniquement de colonnes numériques
num_cols = attrition.select_dtypes(include=np.number).columns.tolist()


In [44]:
attrition = attrition.drop(["Attrition_numerical"], axis=1)

# Liste vide pour les colonnes de type catégoriel
categorical = []
for col, value in attrition.iteritems():
    if value.dtype == "object":
        categorical.append(col)

# Stocker les variables numériques
numerical = attrition.columns.difference(categorical)

In [49]:
attrition.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


In [50]:
# Définir le DataFrame de variables catégorielles
attrition_cat = attrition[categorical]

# One Hot Encoder les variables catégorielles avec
# la fonction get_dummies de Pandas
attrition_cat = pd.get_dummies(attrition_cat)
# Afficher les 5 premièrs lignes des données encodées
attrition_cat.head()

# Créer le DataFrame des variables numériques
attrition_num = attrition[numerical]
# Concatener les deux DataFrames pour n'en faire qu'un seul
attrition_final = pd.concat([attrition_num, attrition_cat], axis=1)

- **Division des données en jeu de train et de test**

In [51]:
# Définir la seed
SEED = 42

# Diviser les données
df_train, df_test = train_test_split(attrition_final, 
                                     test_size=0.20, 
                                     random_state=SEED, 
                                     stratify=attrition["Attrition"])

In [52]:
# Distribution de la target dans le train set
round(df_train["Attrition"].value_counts(normalize=True)*100, 2)

0    83.84
1    16.16
Name: Attrition, dtype: float64

In [53]:
# Distribution de la target dans le test set
round(df_test["Attrition"].value_counts(normalize=True)*100, 2)

0    84.01
1    15.99
Name: Attrition, dtype: float64

- **Création des matrices et vecteurs de train et test set**

# Modèle de baseline

In [58]:
# Créer la matrice et le vecteur de train
X_train = df_train.drop(columns=["Attrition"], axis=1)
y_train = df_train["Attrition"]

# Créer la matrice et le vecteur de test
X_test = df_test.drop(columns=["Attrition"], axis=1)
y_test = df_test["Attrition"]

In [67]:
# Instancier le SMOTE
over_sm = SMOTE(random_state=SEED)
# Définir les vecteurs de train et de test
sm_train, sm_target = over_sm.fit_resample(X_train, y_train)

# Définir le dictionnaire paramètres de l'estimateur RandomForest
rfc_params = {"n_jobs" : -1, 
             "n_estimators" : 1000,
             "warm_start" : True,
             "max_features" : 0.3,
             "max_depth" : 4,
             "min_samples_leaf" : 2,
             "max_features" : 'sqrt',
             "random_state" : SEED,
             "verbose" : 1
            }

In [68]:
# Instancier le modèle avec les paramètres définis au préalable
rfc = RandomForestClassifier(**rfc_params)
# Entrainer le modèle
rfc.fit(sm_train, sm_target)
# Calculer les prédictions sur le jeu de test
rfc_pred = rfc.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   11.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    1.6s finished


- **Affichage des scores de l'accuracy et du roc de l'auc**

In [71]:
# Afficher le score de l'accuracy
print(f"Score de l'accuracy : {round(accuracy_score(y_test, rfc_pred), 4)*100}%")
# Afficher le score de l'auc
print(f"Score de l'auc : {round(roc_auc_score(y_test, rfc_pred), 4)*100}%\n")
print("*"*60)
# Afficher le rapport de classification
print(classification_report(y_test, rfc_pred))
print("*"*60)

Score de l'accuracy : 83.33%
Score de l'auc : 68.55%

************************************************************
              precision    recall  f1-score   support

           0       0.90      0.90      0.90       247
           1       0.48      0.47      0.47        47

    accuracy                           0.83       294
   macro avg       0.69      0.69      0.69       294
weighted avg       0.83      0.83      0.83       294

************************************************************


# Explicabilité du modèle

- **Importance Globale des Variables**

In [73]:
trace = go.Scatter(
    y = rfc.feature_importances_,
    x = attrition_final.columns.values,
    mode="markers",
    marker=dict(
        sizemode = "diameter",
        sizeref = 1,
        size = 13,
        #size= rf.feature_importances_,
        color = rfc.feature_importances_,
        colorscale="Portland",
        showscale=True
    ),
    text = attrition_final.columns.values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= "Random Forest Importance des Variables",
    hovermode= "closest",
     xaxis= dict(
         ticklen= 5,
         showgrid=False,
        zeroline=False,
        showline=False
     ),
    yaxis=dict(
        title= "Importance de Variables",
        showgrid=False,
        zeroline=False,
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename="scatter2010");