# HR Analytics – Analyse et Préparation des Données
Projet TechNova Partners

Ce notebook reprend intégralement le script Python fourni et génère tous les graphiques interactifs.

In [None]:
# --------------------------------------------------
# 1. Import des packages essentiels
# --------------------------------------------------
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.base import BaseEstimator, TransformerMixin

# Optional dependency: SHAP.
# `shap_available` ends up True only when both the import and the notebook
# JavaScript initialisation succeed; any failure leaves it False and prints
# an installation hint instead of aborting the notebook.
shap_available = False
try:
    import shap
    shap.initjs()
    shap_available = True
except Exception:
    print("SHAP non disponible — installez shap si vous voulez les graphes SHAP (pip install shap).")

# Display options: never truncate the column list when printing DataFrames,
# and use seaborn's "whitegrid" style for the static plots.
pd.options.display.max_columns = None
sns.set(style="whitegrid")

In [None]:
# --------------------------------------------------
# 2. Loading the CSV files
# --------------------------------------------------
# The three extracts are expected in a "data" folder under the current
# working directory.
base_path = os.path.abspath('.')
data_path = os.path.join(base_path, 'data')

sirh, evals, sondage = (
    pd.read_csv(os.path.join(data_path, _filename))
    for _filename in ("extrait_sirh.csv", "extrait_eval.csv", "extrait_sondage.csv")
)

# Normalise the column names (lower case, no surrounding whitespace).
for _frame in (sirh, evals, sondage):
    _frame.columns = _frame.columns.str.lower().str.strip()

# Build the join key of the evaluation extract by removing the "e_"/"E_"
# marker from `eval_number` and casting the remainder to int.
# The first replace is case-insensitive; the second is kept as in the
# original and is normally redundant.
_eval_codes = evals['eval_number'].str.replace('e_', '', case=False)
_eval_codes = _eval_codes.str.replace('E_', '')
evals['id_employee'] = _eval_codes.astype(int)

# The survey extract names its key `code_sondage`; align it with the others.
sondage = sondage.rename(columns={"code_sondage": "id_employee"})

# Inner joins: only employees present in all three extracts are kept.
df = (
    sirh
    .merge(evals, on="id_employee", how="inner")
    .merge(sondage, on="id_employee", how="inner")
)

In [None]:
# --------------------------------------------------
# 3. Target variable and features
# --------------------------------------------------
# Features: everything except the target and the employee identifier.
X = df.drop(columns=["a_quitte_l_entreprise", "id_employee"])

# Target: cast to str so the encoder sees one homogeneous label type.
y_raw = df["a_quitte_l_entreprise"].astype(str)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)

# Show which integer code was assigned to each original label.
_class_codes = label_encoder.transform(label_encoder.classes_)
print("Label mapping:", dict(zip(label_encoder.classes_, _class_codes)))

In [None]:
# --------------------------------------------------
# 4. Train/test split
# --------------------------------------------------
# 80/20 split, stratified on the target so both parts keep the same class
# proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

for _split_label, _split in (("Taille X_train:", X_train), ("Taille X_test:", X_test)):
    print(_split_label, _split.shape)

In [None]:
# --------------------------------------------------
# 5. Preprocessing with rare-category grouping
# --------------------------------------------------
class RareCategoryEncoder(BaseEstimator, TransformerMixin):
    """Replace infrequent categories with the placeholder ``'autre'``.

    ``fit`` records, for every column of the training frame, the categories
    whose relative frequency is at least ``min_freq``. ``transform`` then
    replaces every value that is not in that list with the string ``'autre'``.

    Because ``value_counts`` ignores missing values by default, a missing
    value is never part of the kept list and is therefore also replaced by
    ``'autre'``.

    Parameters
    ----------
    min_freq : float, default=0.01
        Minimum relative frequency (share of the non-missing rows of a
        column) a category needs in order to be kept as-is.

    Attributes
    ----------
    frequent_categories_ : dict
        Maps each column name seen in ``fit`` to the list of categories kept
        for that column. Only available after ``fit``.
    """

    def __init__(self, min_freq=0.01):
        # Only hyper-parameters are stored here. Learned state is created in
        # `fit`, so an unfitted instance is recognisable as such and a refit
        # never inherits columns from a previous fit.
        self.min_freq = min_freq

    def _check_fitted(self):
        """Raise ``NotFittedError`` if ``fit`` has not been called yet."""
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "frequent_categories_")

    def fit(self, X, y=None):
        """Learn the frequent categories of every column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; every column is treated as categorical.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Build a fresh mapping on every call so refitting on a frame with
        # different columns leaves no stale entries behind.
        frequent = {}
        for col in X.columns:
            freqs = X[col].value_counts(normalize=True)
            frequent[col] = freqs[freqs >= self.min_freq].index.tolist()
        self.frequent_categories_ = frequent
        return self

    def transform(self, X):
        """Return a copy of ``X`` with non-frequent values set to ``'autre'``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column seen during ``fit``.

        Returns
        -------
        pandas.DataFrame
            A copy of ``X``; the input frame is left untouched.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        """
        self._check_fitted()
        X_out = X.copy()
        for col, cats in self.frequent_categories_.items():
            # `where` keeps values for which the mask is True and substitutes
            # the placeholder everywhere else.
            X_out[col] = X_out[col].where(X_out[col].isin(cats), 'autre')
        return X_out

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, which equal the input names.

        Lets an enclosing ``Pipeline`` / ``ColumnTransformer`` propagate
        feature names through this step.

        Parameters
        ----------
        input_features : array-like of str, optional
            Input column names. When omitted, the columns seen during
            ``fit`` are used.

        Returns
        -------
        numpy.ndarray of object
        """
        self._check_fitted()
        if input_features is None:
            input_features = list(self.frequent_categories_)
        return np.asarray(input_features, dtype=object)

# Column groups are derived from the dtypes of the full feature frame X.
# Columns whose dtype is in neither list (e.g. bool) end up in no group.
numeric_features = X.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# train_test_split returns subsets of X. Take explicit copies before writing
# to them, so the assignments below act on independent frames and do not
# raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Normalise the text categories (trim + lower case) so that spelling variants
# such as "Oui " and "oui" fall into the same category.
# NOTE(review): astype(str) turns missing values into the literal string
# 'nan', which then behaves like a regular category — confirm this is intended.
for col in categorical_features:
    X_train[col] = X_train[col].astype(str).str.strip().str.lower()
    X_test[col] = X_test[col].astype(str).str.strip().str.lower()

# Categorical branch: group rare categories first, then one-hot encode.
# NOTE(review): with drop='first' and handle_unknown='ignore', a category
# unseen during fit is encoded as all zeros, i.e. the same encoding as the
# dropped first category — confirm this is acceptable.
_categorical_pipeline = Pipeline(steps=[
    ('rare', RareCategoryEncoder(min_freq=0.01)),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Numeric columns are standardised; categorical columns go through the
# pipeline above.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', _categorical_pipeline, categorical_features),
])

print("Colonnes numériques :", numeric_features)
print("Colonnes catégorielles :", categorical_features)