# HR Analytics – Analyse et Préparation des Données
## Projet TechNova Partners
Notebook Jupyter fonctionnel pour l'exploration, la préparation et la modélisation des données.

In [None]:
# Import des librairies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Render matplotlib figures inline in the Jupyter notebook
# (IPython magic: only valid when run inside IPython/Jupyter).
%matplotlib inline
# Use seaborn's "whitegrid" style for the plots below.
sns.set(style='whitegrid')
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

## 1. Chargement des fichiers

In [None]:
# Load the three raw CSV extracts (SIRH, evaluations, survey).
sirh = pd.read_csv('data/extrait_sirh.csv')
evals = pd.read_csv('data/extrait_eval.csv')
sondage = pd.read_csv('data/extrait_sondage.csv')

# Preview the first rows of each table under its own banner.
for _banner, _frame in (
    ('=== SIRH ===', sirh),
    ('=== Eval ===', evals),
    ('=== Sondage ===', sondage),
):
    print(_banner)
    display(_frame.head())

## 2. Exploration des fichiers

In [None]:
# Print the structural summary (`DataFrame.info`) of each table.
for _banner, _frame in (
    ('--- Info SIRH ---', sirh),
    ('--- Info Eval ---', evals),
    ('--- Info Sondage ---', sondage),
):
    print(_banner)
    _frame.info()

In [None]:
# Show descriptive statistics for every column (numeric and non-numeric)
# of each table.
for _banner, _frame in (
    ('--- Description SIRH ---', sirh),
    ('--- Description Eval ---', evals),
    ('--- Description Sondage ---', sondage),
):
    print(_banner)
    display(_frame.describe(include='all'))

## 3. Nettoyage et préparation des colonnes

In [None]:
# Normalise column names on all three tables: lower-case them, then remove
# surrounding whitespace.
for _frame in (sirh, evals, sondage):
    _frame.columns = _frame.columns.str.lower().str.strip()

# Build the shared `id_employee` key:
#   - evaluations: remove the 'E_' marker from `eval_number` and cast to int;
#   - survey: rename `code_sondage` to `id_employee`.
evals['id_employee'] = (
    evals['eval_number']
    .str.replace('E_', '')
    .astype(int)
)
sondage = sondage.rename(columns={'code_sondage': 'id_employee'})

## 4. Création du DataFrame central

In [None]:
# Join the three tables on `id_employee`. Inner joins keep only the
# employees that appear in every table.
df = (
    sirh
    .merge(evals, on='id_employee', how='inner')
    .merge(sondage, on='id_employee', how='inner')
)
display(df.head())

## 5. Statistiques descriptives

In [None]:
# Mean of every numeric column for each value of the departure flag.
print('=== Moyenne par statut de départ ===')
_by_departure = df.groupby('a_quitte_l_entreprise')
display(_by_departure.mean(numeric_only=True))

## 6. Visualisations

In [None]:
# Box plot: distribution of `revenu_mensuel` for each value of the
# departure flag.
plt.figure(figsize=(6, 4))
sns.boxplot(data=df, x='a_quitte_l_entreprise', y='revenu_mensuel')
plt.title("Revenu mensuel vs Départ de l'entreprise")
plt.show()

# Count plot: number of rows per `poste`, split by the departure flag.
plt.figure(figsize=(10, 4))
sns.countplot(data=df, x='poste', hue='a_quitte_l_entreprise')
plt.title("Poste vs Départ de l'entreprise")
plt.xticks(rotation=45)  # tilt the x tick labels
plt.show()

## 7. Préparation des données pour la modélisation

In [None]:
# Split features and target.
# Besides the target and the join key, `eval_number` is removed as well: it is
# the raw string identifier that `id_employee` was parsed from, so it carries
# no extra information. Because it is never encoded, leaving it in X would
# hand string values to the estimators fitted later in the notebook.
# `errors='ignore'` keeps this cell working if the column is already gone.
X = (
    df.drop(columns=['a_quitte_l_entreprise', 'id_employee'])
    .drop(columns=['eval_number'], errors='ignore')
)
y = df['a_quitte_l_entreprise']

In [None]:
# Fonction d'encodage automatique
def encode_categorical_features(df, ordinal_cols=None, nominal_cols=None):
    """Return a copy of ``df`` with the requested categorical columns encoded.

    The input frame is never modified. Columns that are not listed in either
    argument are carried over unchanged.

    Args:
        df: Source DataFrame.
        ordinal_cols: Optional mapping ``{column: ordered list of categories}``.
            Each listed column is replaced by the output of an
            ``OrdinalEncoder`` built with that category order.
        nominal_cols: Optional list of column names to one-hot encode. The
            listed columns are removed and replaced by the indicator columns
            of a ``OneHotEncoder(drop='first')``, named via
            ``get_feature_names_out`` and appended at the end of the frame.

    Returns:
        A new DataFrame containing the encoded features.
    """
    result = df.copy()

    # Ordinal columns keep their name; their values become the encoder output.
    for column, ordered_values in (ordinal_cols or {}).items():
        ordinal_encoder = OrdinalEncoder(categories=[ordered_values])
        result[column] = ordinal_encoder.fit_transform(df[[column]])

    # Nothing to one-hot encode: the (possibly ordinal-encoded) copy is final.
    if not nominal_cols:
        return result

    one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
    indicators = pd.DataFrame(
        one_hot_encoder.fit_transform(df[nominal_cols]),
        columns=one_hot_encoder.get_feature_names_out(nominal_cols),
        index=df.index,  # keep row alignment with the source frame
    )
    remaining = result.drop(columns=nominal_cols)
    return pd.concat([remaining, indicators], axis=1)

In [None]:
# Ordinal column with its category order, and the columns to one-hot encode.
ordinal_mapping = {'niveau_education': [1, 2, 3, 4, 5]}
nominal_cols = [
    'genre',
    'departement',
    'poste',
    'domaine_etude',
    'ayant_enfants',
    'frequence_deplacement',
    'heure_supplementaires',
]

X_encoded = encode_categorical_features(
    X,
    ordinal_cols=ordinal_mapping,
    nominal_cols=nominal_cols,
)
display(X_encoded.head())

## 8. Analyse des corrélations

In [None]:
# Keep every column whose dtype is not `object`.
numeric_cols = X_encoded.select_dtypes(exclude=['object']).columns
_numeric_features = X_encoded[numeric_cols]

# Annotated heatmap of the pairwise correlations between those columns.
plt.figure(figsize=(10, 8))
sns.heatmap(_numeric_features.corr(), annot=True, cmap='coolwarm')
plt.title('Corrélations de Pearson')
plt.show()

# Pairwise scatter plots of the same columns, coloured by the target.
_pairplot_data = pd.concat([_numeric_features, y], axis=1)
sns.pairplot(_pairplot_data, hue='a_quitte_l_entreprise')
plt.show()

## 9. Train/Test split et modélisation

In [None]:
# Hold out 20% of the rows for testing; the split is stratified on the target
# and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
for _label, _part in (('Taille X_train:', X_train), ('Taille X_test:', X_test)):
    print(_label, _part.shape)

In [None]:
# Models to compare, keyed by the label used in the printed report.
models = {
    'Dummy': DummyClassifier(strategy='most_frequent'),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        class_weight='balanced',
    ),
}

# Fit each model on the training split, then print its classification report
# on both splits.
for name, model in models.items():
    print(f'=== Modèle : {name} ===')
    model.fit(X_train, y_train)

    print('Train Metrics:')
    y_train_pred = model.predict(X_train)
    _train_report = classification_report(y_train, y_train_pred, zero_division=0)
    print(_train_report)

    print('Test Metrics:')
    y_test_pred = model.predict(X_test)
    _test_report = classification_report(y_test, y_test_pred, zero_division=0)
    print(_test_report)