## Getting the model ready


In [5]:
import pandas as pd
from sklearn.pipeline import Pipeline
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


In [6]:
# 1. DATA LOADING AND PREPROCESSING
# ---------------------------------
def load_and_preprocess_data(filepath):
    """Load the HR attrition CSV and return model-ready features and target.

    Steps: read the CSV, drop the constant/ID columns, encode ``Attrition``
    as 1 ('Yes') / 0 ('No'), and one-hot encode the remaining categorical
    columns (dropping the first level of each).

    Parameters
    ----------
    filepath : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    features : pandas.DataFrame
        Every column except ``Attrition``, with categorical columns replaced
        by their dummy columns.
    target : pandas.Series
        ``Attrition`` encoded as 1 / 0.

    Raises
    ------
    KeyError
        If ``Attrition`` or one of the columns to drop is missing.
    ValueError
        If ``Attrition`` holds anything other than 'Yes' / 'No'
        (including missing values).
    """
    # Load the dataset
    hr_data = pd.read_csv(filepath)

    # Remove irrelevant columns (constants like 'StandardHours' or IDs)
    hr_data = hr_data.drop(
        ['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber'], axis=1
    )

    # Encode the target variable (Attrition: Yes/No -> 1/0).
    # Series.map turns every label that is not a key of the mapping into NaN,
    # so check explicitly instead of letting NaN leak into the target.
    raw_attrition = hr_data['Attrition']
    encoded_attrition = raw_attrition.map({'Yes': 1, 'No': 0})
    unmapped = encoded_attrition.isna()
    if unmapped.any():
        unexpected = raw_attrition[unmapped].unique().tolist()
        raise ValueError(
            f"Unexpected values in 'Attrition' (expected 'Yes'/'No'): {unexpected}"
        )
    hr_data['Attrition'] = encoded_attrition

    # One-hot encode the categorical features.
    # With no explicit `columns`, get_dummies picks the non-numeric columns
    # itself (object, string or category dtype per the pandas docs), so
    # string/category-typed columns are not missed. The target is already
    # numeric at this point and is therefore left alone.
    hr_data = pd.get_dummies(hr_data, drop_first=True)

    # Split features and target
    features = hr_data.drop('Attrition', axis=1)
    target = hr_data['Attrition']

    return features, target

# Run the loader on the raw HR export to obtain the modelling inputs.
# NOTE(review): this is an absolute path, so the cell only runs where this
# exact location exists.
HR_DATA_CSV = '/workspaces/employee_attrition_project/notebook/data/HR_Analytics.csv'
features, target = load_and_preprocess_data(HR_DATA_CSV)

In [8]:
# Quantify the class imbalance: share of each Attrition class in the target
class_shares = target.value_counts(normalize=True)
print(class_shares)

Attrition
0    0.838776
1    0.161224
Name: proportion, dtype: float64


### Observations
- The target is imbalanced (about 84% "No" vs. 16% "Yes"). I could address this by resampling (e.g. `RandomUnderSampler` or oversampling), but it is probably better to handle the imbalance through the model's `class_weight` parameter, which avoids either discarding data or inflating the dataset.

In [9]:
# Hold out 25% of the rows as the test set.
# Stratifying on the target keeps the (unbalanced) Attrition class ratio the
# same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    stratify=target,
    test_size=0.25,
    random_state=432,
)

In [None]:
# 2. DEFINING THE OBJECTIVE FUNCTION WITH PRUNING
# -----------------------------------------------

def objective(trial):
    # -- Define the search space for hyperparameters

    params= {
        'n_estimators': 40,
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2"]),
        "random_state": 432,
        "njobs": -1
    }

    # Inizialize the model
    model = RandomForestClassifier(**params)


    # Manual training loop with pruning
    for epoch in range(1, 21):  # 20 epochs