# Imports

In [3]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.multioutput import MultiOutputRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error

# Data

In [None]:
# Load the raw training set from CSV into the working frame `df`.
df = pd.read_csv("data/train.csv")

# 7595 colonnes

# Preprocessing

#### On retire les colonnes non informatives

In [None]:
# Constant columns: at most one distinct value (nunique ignores NaN by default),
# so they cannot help any model.
distinct_counts = df.nunique()
unique_cols = distinct_counts.index[distinct_counts <= 1].tolist()

# Remove them from the working frame.
df = df.drop(columns=unique_cols)

In [None]:
# Identical columns: transpose so each column becomes a row, then flag every
# row that repeats an earlier one (the first occurrence is kept).
duplicate_mask = df.T.duplicated()
duplicate_cols = duplicate_mask.index[duplicate_mask].tolist()

# Keep only the columns that were not flagged as repeats.
keep_mask = ~duplicate_mask
df = df.loc[:, keep_mask]

In [None]:
# Imbalanced columns: a single value (NaN counted as a value) fills at least
# `threshold` of the rows.
threshold = 0.95


def _top_value_share(col):
    """Return the fraction of rows taken by the most frequent value of `col`."""
    shares = col.value_counts(normalize=True, dropna=False)
    # value_counts sorts by descending frequency, so the first entry is the mode.
    return shares.iloc[0]


dominant_ratio = df.apply(_top_value_share)

imbalanced_cols = dominant_ratio.index[dominant_ratio >= threshold].tolist()

df = df.drop(columns=imbalanced_cols)

In [None]:
# Number of columns removed because they had at most one distinct value.
len(unique_cols)

In [None]:
# Number of columns removed because they repeated an earlier column.
len(duplicate_cols)

In [None]:
# Number of columns removed because one value reached the dominance threshold.
len(imbalanced_cols)

In [None]:
# Persist the column-filtered frame; it is read back from this path further down.
df.to_parquet("data/train_V1.parquet")

# Simplification de model

In [None]:
# Reload the column-filtered data saved to Parquet.
df = pd.read_parquet("data/train_V1.parquet")

n_bins = 10

# Quantile-bin the continuous satisfaction target so it can drive a stratified
# split. duplicates="drop" merges bins with identical edges, so fewer than
# n_bins bins may come out.
satisfaction_bins = pd.qcut(
    df["satisfaction"],
    q=n_bins,
    duplicates="drop"
)

# Keep a 20% subsample that follows the satisfaction distribution.
# Only the first returned part is kept: binding the remainder to `_` would
# shadow IPython's last-output variable and keep the unused 80% of the frame
# alive in the kernel.
df_sample = train_test_split(
    df,
    train_size=0.2,
    random_state=42,
    stratify=satisfaction_bins
)[0]

In [None]:
# Persist the stratified subsample; the modelling cells load it from this path.
df_sample.to_parquet("data/train_sample_V1.parquet")

# model

In [4]:
# The three KPI columns to predict; every other column is used as a feature.
targets = ["wip", "investissement", "satisfaction"]

df = pd.read_parquet("data/train_sample_V1.parquet")

y = df.loc[:, targets]
X = df.drop(columns=targets)

# Fixed-seed 80/20 hold-out split shared by all models below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Random forest baseline, fitted directly on the three-column target frame
# (no MultiOutputRegressor wrapper).
model_RFR = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

model_RFR.fit(X_train, y_train)
pred_RFR = model_RFR.predict(X_test)

# Stored under an RFR-specific name: the previous `df_pred_HGBR` mislabelled
# these predictions and was overwritten by the gradient-boosting cell.
df_pred_RFR = pd.DataFrame(
    pred_RFR,
    columns=y_test.columns,
    index=y_test.index
)

# Mean absolute error per target column on the hold-out split.
mae_per_kpi = (y_test - df_pred_RFR).abs().mean()
print(mae_per_kpi)

In [None]:
# Histogram gradient boosting: hyperparameters gathered in one mapping.
hgbr_params = {
    "max_iter": 200,
    "max_depth": 6,
    "learning_rate": 0.1,
    "random_state": 42,
}
base_model_HGBR = HistGradientBoostingRegressor(**hgbr_params)

# The base estimator is wrapped in MultiOutputRegressor so it can be fitted
# on all three target columns.
model_HGBR = MultiOutputRegressor(base_model_HGBR)
model_HGBR.fit(X_train, y_train)

pred_HGBR = model_HGBR.predict(X_test)

# Re-attach target names and test-row labels to the raw prediction array.
df_pred_HGBR = pd.DataFrame(
    data=pred_HGBR,
    index=y_test.index,
    columns=y_test.columns,
)

# Mean absolute error per target column on the hold-out split.
mae_per_kpi = y_test.sub(df_pred_HGBR).abs().mean()
print(mae_per_kpi)

In [5]:
# XGBoost: hyperparameters gathered in one mapping.
xgb_params = {
    "n_estimators": 300,
    "max_depth": 8,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
    "tree_method": "hist",
    "n_jobs": -1,
    "random_state": 42,
}
xgb_base = XGBRegressor(**xgb_params)

# The base estimator is wrapped in MultiOutputRegressor so it can be fitted
# on all three target columns.
model_XGB = MultiOutputRegressor(xgb_base)
model_XGB.fit(X_train, y_train)

pred_XGB = model_XGB.predict(X_test)

# Re-attach target names and test-row labels to the raw prediction array.
df_pred_XGB = pd.DataFrame(
    data=pred_XGB,
    index=y_test.index,
    columns=y_test.columns,
)

# Mean absolute error per target column on the hold-out split.
mae_per_kpi = y_test.sub(df_pred_XGB).abs().mean()
print(mae_per_kpi)

wip               1.809183e+06
investissement    1.539287e+03
satisfaction      4.647920e-02
dtype: float64


# Metric

In [6]:
# Satisfaction KPI only: XGBoost predictions against the hold-out ground truth.
df_pred = df_pred_XGB["satisfaction"]
df_true = y_test["satisfaction"].copy()

# Absolute error for each test row.
df_error = df_true.sub(df_pred).abs()

# Side-by-side table: truth, prediction, absolute error.
df_results = pd.DataFrame(
    {
        "satisfaction": df_true,
        "satisfaction_pred": df_pred,
        "satisfaction_error": df_error,
    }
)

df_results.head()

Unnamed: 0,satisfaction,satisfaction_pred,satisfaction_error
4731,0.873203,0.874193,0.000991
53154,0.723735,0.7497,0.025965
50581,0.362903,0.352761,0.010142
67231,0.339268,0.238172,0.101096
13403,0.703752,0.762012,0.05826


In [7]:
# Rows whose absolute satisfaction error is strictly below the tolerance.
error_tolerance = 0.05
mask = df_results["satisfaction_error"].lt(error_tolerance)

# Absolute count and share of rows within tolerance.
count_ok, ratio_ok = mask.sum(), mask.mean()

count_ok, ratio_ok

(1987, 0.6457588560285993)