# Imports

In [109]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.metrics import mean_absolute_error

# Data

In [110]:
# Load the raw training table from the local data directory.
df = pd.read_csv(filepath_or_buffer="data/train.csv")

# 7595 colonnes

# Preprocessing

#### On retire les colonnes non informatives (constantes, dupliquées, déséquilibrées)

In [111]:
# Constant columns: at most one distinct non-null value (nunique ignores NaN
# by default), so they cannot help separate rows.
distinct_counts = df.nunique()
unique_cols = distinct_counts[distinct_counts <= 1].index.tolist()

df = df.drop(columns=unique_cols)

In [112]:
# Identical columns: columns are compared as rows of the transposed frame.
# The first occurrence is kept; every later copy is flagged as a duplicate.
duplicate_mask = df.T.duplicated(keep="first")
duplicate_cols = duplicate_mask[duplicate_mask].index.tolist()

keep_mask = ~duplicate_mask
df = df.loc[:, keep_mask]

In [113]:
# Imbalanced columns: a single value (NaN counts as a value here) fills at
# least `threshold` of the rows.
threshold = 0.95


def _top_value_share(col):
    """Return the fraction of rows taken by the most frequent value of `col`."""
    # value_counts sorts by descending frequency, so the first entry is the mode.
    shares = col.value_counts(normalize=True, dropna=False)
    return shares.iloc[0]


dominant_ratio = df.apply(_top_value_share)

is_imbalanced = dominant_ratio >= threshold
imbalanced_cols = dominant_ratio[is_imbalanced].index.tolist()

df = df.drop(columns=imbalanced_cols)

In [114]:
# Number of columns dropped because they hold at most one distinct value.
len(unique_cols)

1484

In [115]:
# Number of columns dropped because they repeat an earlier column exactly.
len(duplicate_cols)

3003

In [116]:
# Number of columns dropped because one value reaches the dominance threshold.
len(imbalanced_cols)

651

In [117]:
# Persist the column-filtered frame so later sections can reload it directly.
df.to_parquet(path="data/train_V1.parquet")

# Simplification du modèle : échantillon stratifié de 20 %

In [118]:
# Reload the filtered dataset written by the preprocessing section.
df = pd.read_parquet("data/train_V1.parquet")

# Bucket the continuous target into quantile bins so the subsample can be
# stratified on it; bins with identical edges are merged (duplicates="drop").
n_bins = 10
satisfaction_bins = pd.qcut(x=df["satisfaction"], q=n_bins, duplicates="drop")

# Keep a 20% subsample whose bin proportions follow the full dataset;
# the remaining 80% is discarded.
sampling_options = {
    "train_size": 0.2,
    "random_state": 42,
    "stratify": satisfaction_bins,
}
df_sample, _ = train_test_split(df, **sampling_options)

In [119]:
# Persist the stratified subsample used by the modelling section.
df_sample.to_parquet(path="data/train_sample_V1.parquet")

# Model

In [120]:
# Work from the stratified subsample saved earlier.
df = pd.read_parquet("data/train_sample_V1.parquet")

# All three target columns are removed from the features; only
# "satisfaction" is modelled in this section.
targets = ["wip", "investissement", "satisfaction"]

y = df.loc[:, "satisfaction"]
X = df.drop(labels=targets, axis="columns")

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [121]:
# Random forest: 100 trees, fixed seed, n_jobs=-1 to use every processor.
rfr_params = {"n_estimators": 100, "random_state": 42, "n_jobs": -1}
model_RFR = RandomForestRegressor(**rfr_params)
model_RFR.fit(X_train, y_train)

# Score on the held-out rows with mean absolute error.
pred_RFR = model_RFR.predict(X_test)
mae_RFR = mean_absolute_error(y_test, pred_RFR)

print(mae_RFR)

0.05311779298789356


In [122]:
# Histogram-based gradient boosting: 200 boosting iterations, trees limited
# to depth 6, learning rate 0.1, fixed seed.
hgbr_params = {
    "max_iter": 200,
    "max_depth": 6,
    "learning_rate": 0.1,
    "random_state": 42,
}
model_HGBR = HistGradientBoostingRegressor(**hgbr_params)
model_HGBR.fit(X_train, y_train)

# Score on the same held-out rows as the random forest.
pred_HGBR = model_HGBR.predict(X_test)
mae_HGBR = mean_absolute_error(y_test, pred_HGBR)

print(mae_HGBR)

0.04869239140620158


# Metric

In [123]:
# Random-forest predictions as a one-column frame aligned on the test index.
df_pred = pd.DataFrame({"satisfaction": pred_RFR}, index=y_test.index)

# Ground-truth values, copied so the held-out series is left untouched.
df_true = y_test.copy().to_frame()

# Absolute error per row.
df_error = df_true.sub(df_pred).abs()

# Final table: truth, prediction and error side by side.
result_parts = [
    df_true,
    df_pred.add_suffix("_pred"),
    df_error.add_suffix("_error"),
]
df_results = pd.concat(result_parts, axis="columns")

df_results.head()

Unnamed: 0,satisfaction,satisfaction_pred,satisfaction_error
4731,0.873203,0.856723,0.01648
53154,0.723735,0.752968,0.029233
50581,0.362903,0.375051,0.012148
67231,0.339268,0.291662,0.047607
13403,0.703752,0.792937,0.089186


In [124]:
# Flag the test rows whose absolute error is strictly below 0.05.
mask = df_results["satisfaction_error"].lt(0.05)

# Number and share of rows within that tolerance.
count_ok, ratio_ok = mask.sum(), mask.mean()

(count_ok, ratio_ok)

(1793, 0.582710432239194)