# Imports

In [89]:
import pandas as pd
from scipy.stats import randint, uniform

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor

from xgboost import XGBRegressor

# Data

In [90]:
# Load the training table from CSV into `df`; every preprocessing cell below rebinds `df`.
df = pd.read_csv("data/train.csv")

# 7595 colonnes

# Preprocessing

#### On retire les colonnes non informatives (constantes, dupliquées, quasi-constantes)

In [20]:
# Constant columns: at most one distinct non-null value (nunique ignores NaN by default).
distinct_counts = df.nunique()
unique_cols = distinct_counts.index[distinct_counts <= 1].tolist()

df = df.drop(unique_cols, axis="columns")

In [21]:
# Identical columns: after transposing, each column is a row, so `duplicated`
# flags every column whose values repeat an earlier column (first occurrence kept).
duplicate_mask = df.transpose().duplicated(keep="first")
duplicate_cols = duplicate_mask.index[duplicate_mask].tolist()

# Keep only the columns that were not flagged.
df = df.loc[:, ~duplicate_mask]

In [22]:
# Quasi-constant columns: drop any column whose most frequent value
# (NaN counted as a value) covers at least `threshold` of the rows.
threshold = 0.95


def _top_value_share(col):
    """Return the fraction of rows taken by the most frequent value of `col`."""
    # value_counts sorts by descending frequency, so the first entry is the largest share.
    shares = col.value_counts(normalize=True, dropna=False)
    return shares.iloc[0]


dominant_ratio = df.apply(_top_value_share)

is_imbalanced = dominant_ratio >= threshold
imbalanced_cols = dominant_ratio.index[is_imbalanced].tolist()

df = df.drop(imbalanced_cols, axis="columns")

In [23]:
# Number of constant columns that were dropped.
len(unique_cols)

1484

In [24]:
# Number of columns dropped because they duplicate an earlier column.
len(duplicate_cols)

3003

In [25]:
# Number of columns dropped because one value covers at least `threshold` of the rows.
len(imbalanced_cols)

651

In [26]:
# Persist the reduced table; the sampling cell reloads it from this file.
df.to_parquet("data/train_V1.parquet")

# Simplification du modèle : échantillon stratifié de 20 %

In [27]:
# Reload the reduced table written by the preprocessing step.
df = pd.read_parquet("data/train_V1.parquet")

n_bins = 10

# Quantile-bin the satisfaction target; the bins are only used as stratification
# labels. duplicates="drop" merges bins whose edges coincide.
satisfaction_bins = pd.qcut(df["satisfaction"], q=n_bins, duplicates="drop")

# Keep a 20% sample stratified on the bins; the remaining 80% goes to `_`.
df_sample, _ = train_test_split(
    df,
    train_size=0.2,
    stratify=satisfaction_bins,
    random_state=42,
)

In [28]:
# Persist the stratified sample; the modelling cell reloads it from this file.
df_sample.to_parquet("data/train_sample_V1.parquet")

# model

In [60]:
# Load the sampled table and separate the three KPI targets from the features.
df = pd.read_parquet("data/train_sample_V1.parquet")

targets = ["wip", "investissement", "satisfaction"]

X = df.drop(targets, axis="columns")
y = df.loc[:, targets]

# 80/20 hold-out split shared by every model below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Baseline: one random forest fitted directly on the three KPI columns.
model_RFR = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
model_RFR.fit(X_train, y_train)

# Re-attach the KPI names and the hold-out row index to the raw prediction array.
pred_RFR = model_RFR.predict(X_test)
df_pred_RFR = pd.DataFrame(pred_RFR, index=y_test.index, columns=y_test.columns)

# Mean absolute error per KPI on the hold-out split.
abs_err_RFR = (y_test - df_pred_RFR).abs()
mae_per_kpi = abs_err_RFR.mean()
print(mae_per_kpi)

wip               2.019244e+06
investissement    7.735294e+04
satisfaction      8.727667e-02
dtype: float64


In [None]:
# NOTE(review): disabled randomized search for the HGBR model, kept as commented-out code.
# from scipy.stats import randint, uniform
# from sklearn.metrics import make_scorer
# from sklearn.model_selection import RandomizedSearchCV

# def mae_satisfaction(y_true, y_pred):
#     # y_true / y_pred = DataFrame or array
#     # Take column 0 if "satisfaction" is the first KPI
#     # NOTE(review): `targets` lists "satisfaction" third (column 2), so column 0 is "wip";
#     # y_true[:, 0] would also raise if y_true is a DataFrame. Fix before re-enabling.
#     return mean_absolute_error(y_true[:, 0], y_pred[:, 0])

# # Build the scorer for sklearn
# satisfaction_scorer = make_scorer(mae_satisfaction, greater_is_better=False)


# base_model_HGBR = HistGradientBoostingRegressor(random_state=42)
# model = MultiOutputRegressor(base_model_HGBR)

# param_distributions = {
#     'estimator__max_iter': randint(100, 500),
#     'estimator__max_depth': randint(3, 10),
#     'estimator__learning_rate': uniform(0.01, 0.2),
#     'estimator__min_samples_leaf': randint(20, 100),
#     'estimator__l2_regularization': uniform(0, 1)
# }

# search = RandomizedSearchCV(
#     model,
#     param_distributions=param_distributions,
#     n_iter=20,
#     scoring=satisfaction_scorer,  # <- here
#     cv=3,
#     verbose=2,
#     n_jobs=-1,
#     random_state=42
# )

# search.fit(X_train, y_train)

# print("Best params:", search.best_params_)
# print("Best MAE satisfaction:", -search.best_score_)

In [86]:
# Histogram gradient boosting, wrapped in MultiOutputRegressor so that one
# independent model is fitted per KPI column.
base_model_HGBR = HistGradientBoostingRegressor(
    learning_rate=0.1,
    max_iter=400,
    max_depth=6,
    min_samples_leaf=80,
    l2_regularization=0.37,
    # Early stopping: hold out 10% of the training rows and stop once the
    # validation score has not improved for 20 iterations.
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20,
    random_state=42,
)
model_HGBR = MultiOutputRegressor(base_model_HGBR)
model_HGBR.fit(X_train, y_train)

# Re-attach the KPI names and the hold-out row index to the raw prediction array.
pred_HGBR = model_HGBR.predict(X_test)
df_pred_HGBR = pd.DataFrame(pred_HGBR, index=y_test.index, columns=y_test.columns)

# Mean absolute error per KPI on the hold-out split.
# NOTE(review): the recorded run reports an investissement MAE of ~4e-05, versus
# ~8e+04 (random forest) and ~2e+03 (XGBoost), and the cells' execution counts are
# out of order. Confirm all three models were fitted on the same X_train / y_train.
abs_err_HGBR = (y_test - df_pred_HGBR).abs()
mae_per_kpi = abs_err_HGBR.mean()
print(mae_per_kpi)

wip               1.880638e+06
investissement    4.301231e-05
satisfaction      4.774033e-02
dtype: float64


In [None]:
# Randomized hyper-parameter search for the XGBoost regressor.

# n_jobs=1 on the estimator: the search already runs candidate fits in parallel
# (n_jobs=-1 below); letting each fit also claim every core oversubscribes the CPU.
xgb = XGBRegressor(
    objective="reg:squarederror",
    tree_method="hist",
    n_jobs=1,
    random_state=42
)

# scipy's uniform(loc, scale) draws from [loc, loc + scale];
# randint(low, high) draws integers from [low, high).
param_dist = {
    "n_estimators": randint(200, 800),
    "max_depth": randint(3, 10),
    "learning_rate": uniform(0.01, 0.2),
    "subsample": uniform(0.6, 0.4),
    "colsample_bytree": uniform(0.6, 0.4),
    "min_child_weight": randint(1, 10),
    "reg_alpha": uniform(0, 1),
    "reg_lambda": uniform(0.5, 2),
}

# Position of the satisfaction KPI among the target columns, looked up by name
# so the scorer does not depend on the order of `targets`.
satisfaction_idx = list(y_train.columns).index("satisfaction")


def mae_satisfaction(y_true, y_pred):
    """Return the mean absolute error on the satisfaction KPI only.

    Parameters
    ----------
    y_true : DataFrame or 2-D array
        True targets, one column per KPI, in the column order of `y_train`.
    y_pred : 2-D array
        Predictions with the same column order.

    Returns
    -------
    float
        Mean absolute error computed on the satisfaction column.
    """
    # Select by position and convert to arrays so the (different) row indexes
    # of y_true and y_pred are never aligned against each other.
    true_values = pd.DataFrame(y_true).iloc[:, satisfaction_idx].to_numpy()
    pred_values = pd.DataFrame(y_pred).iloc[:, satisfaction_idx].to_numpy()
    return mean_absolute_error(true_values, pred_values)


# The three KPIs live on very different scales (recorded MAEs: wip ~1e6,
# satisfaction ~1e-2). A uniform average of per-target RMSE would therefore rank
# candidates on wip alone, so candidates are scored on satisfaction MAE, the
# quantity the evaluation cells of this notebook report.
satisfaction_scorer = make_scorer(mae_satisfaction, greater_is_better=False)

search = RandomizedSearchCV(
    xgb,
    param_distributions=param_dist,
    n_iter=40,
    scoring=satisfaction_scorer,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

best_xgb = search.best_estimator_
# Outside the search there is no competing parallelism, so the refitted model
# may use every core again.
best_xgb.set_params(n_jobs=-1)

print(search.best_params_)
# make_scorer(greater_is_better=False) negates the metric; flip the sign back.
print("CV MAE satisfaction:", -search.best_score_)

In [32]:
# XGBoost with fixed hyper-parameters, wrapped in MultiOutputRegressor so that
# one independent model is fitted per KPI column.
xgb_base = XGBRegressor(
    objective="reg:squarederror",
    tree_method="hist",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42,
)
model_XGB = MultiOutputRegressor(xgb_base)
model_XGB.fit(X_train, y_train)

# Re-attach the KPI names and the hold-out row index to the raw prediction array.
pred_XGB = model_XGB.predict(X_test)
df_pred_XGB = pd.DataFrame(pred_XGB, index=y_test.index, columns=y_test.columns)

# Mean absolute error per KPI on the hold-out split.
abs_err_XGB = (y_test - df_pred_XGB).abs()
mae_per_kpi = abs_err_XGB.mean()
print(mae_per_kpi)

wip               1.809183e+06
investissement    1.539287e+03
satisfaction      4.647920e-02
dtype: float64


#### Blending

In [33]:
# Blend: unweighted element-wise average of the HGBR and XGBoost predictions.
pred_blend = (pred_HGBR + pred_XGB) / 2

df_pred_blend = pd.DataFrame(
    pred_blend,
    columns=y_test.columns,
    index=y_test.index
)

# Score the blend with the same per-KPI mean absolute error used for the
# individual models, so it can be compared against them. A dedicated name is
# used so the last model's `mae_per_kpi` is not overwritten.
mae_per_kpi_blend = (y_test - df_pred_blend).abs().mean()
print(mae_per_kpi_blend)

# Metric

In [87]:
# Per-row satisfaction error of the HGBR model on the hold-out split.

# predicted satisfaction (a Series, despite the df_ prefix)
df_pred = df_pred_HGBR["satisfaction"]

# true satisfaction, copied so it stays independent of y_test
df_true = y_test["satisfaction"].copy()

# absolute error
df_error = (df_true - df_pred).abs()

# final table: truth, prediction and absolute error side by side
df_results = pd.DataFrame(
    {
        "satisfaction": df_true,
        "satisfaction_pred": df_pred,
        "satisfaction_error": df_error,
    }
)

df_results.head()

Unnamed: 0,satisfaction,satisfaction_pred,satisfaction_error
4731,0.873203,0.887201,0.013998
53154,0.723735,0.729809,0.006074
50581,0.362903,0.351712,0.011191
67231,0.339268,0.242476,0.096792
13403,0.703752,0.734048,0.030296


In [88]:
# Count and share of hold-out rows whose absolute satisfaction error is below the tolerance.
tolerance = 0.05
mask = df_results["satisfaction_error"].lt(tolerance)

# Summing a boolean Series counts the True values; its mean is their proportion.
count_ok, ratio_ok = mask.sum(), mask.mean()

count_ok, ratio_ok

(1956, 0.6356841078973026)