In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score, mean_squared_error


from lightgbm import LGBMRegressor

RANDOM_STATE = 42

In [10]:
df_train_raw = pd.read_csv("data/train.csv")
df_test_raw  = pd.read_csv("data/test.csv")

print("Train shape:", df_train_raw.shape)
print("Test shape :", df_test_raw.shape)


Train shape: (76905, 7595)
Test shape : (8346, 7592)


In [11]:
df_train = df_train_raw.drop(columns=["id"])
df_test  = df_test_raw.drop(columns=["id"])

In [12]:
def remove_duplicate_features(df):
    duplicated_mask = df.T.duplicated()
    return df.loc[:, ~duplicated_mask]

df_train = remove_duplicate_features(df_train)
df_test  = remove_duplicate_features(df_test)

print("After duplicate removal:")
print("Train:", df_train.shape)
print("Test :", df_test.shape)

After duplicate removal:
Train: (76905, 3109)
Test : (8346, 3106)


In [13]:
constant_columns = df_train.columns[df_train.nunique() <= 1]

df_train = df_train.drop(columns=constant_columns)
df_test  = df_test.drop(columns=constant_columns, errors="ignore")

print("After constant removal:")
print("Train:", df_train.shape)
print("Test :", df_test.shape)

After constant removal:
Train: (76905, 3107)
Test : (8346, 3104)


In [14]:
KPI_TARGETS = ["wip", "investissement", "satisfaction"]

In [15]:
X_features = df_train.drop(columns=KPI_TARGETS)
y_kpis = df_train[KPI_TARGETS]

print("X shape:", X_features.shape)
print("y shape:", y_kpis.shape)

X shape: (76905, 3104)
y shape: (76905, 3)


In [16]:
X_train, X_val, y_train, y_val = train_test_split(
    X_features,
    y_kpis,
    test_size=0.2,
    random_state=RANDOM_STATE
)


In [None]:
lgbm_base_model = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

model_kpis = MultiOutputRegressor(lgbm_base_model)

In [18]:
model_kpis.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.288408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6428
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 3104
[LightGBM] [Info] Start training from score 29137002.444087
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.317222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6428
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 3104
[LightGBM] [Info] Start training from score 1000345.393668
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.274429 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM

In [19]:
y_val_pred = model_kpis.predict(X_val)

In [None]:
r2_per_kpi = r2_score(y_val, y_val_pred, multioutput="raw_values")
r2_global = r2_score(y_val, y_val_pred, multioutput="uniform_average")

rmse_per_kpi = np.sqrt(
    mean_squared_error(y_val, y_val_pred, multioutput="raw_values")
)

mae_per_kpi = np.mean(np.abs(y_val - y_val_pred), axis=0)

for kpi, r2, rmse, mae in zip(KPI_TARGETS, r2_per_kpi, rmse_per_kpi, mae_per_kpi):
    print(f"{kpi:15s} | R2={r2:.4f} | RMSE={rmse:.4f} | MAE={mae:.4f}")

print("\nOverall R2:", round(r2_global, 4))

wip             | R2=0.7716 | RMSE=2215029.1940 | MAE=1404380.8137
investissement  | R2=0.9999 | RMSE=1532.8459 | MAE=879.0294
satisfaction    | R2=0.9328 | RMSE=0.0537 | MAE=0.0390

Overall R2: 0.9014


In [22]:
idx_satisfaction = KPI_TARGETS.index("satisfaction")

satisfaction_true = y_val.iloc[:, idx_satisfaction]
satisfaction_pred = y_val_pred[:, idx_satisfaction]

score = (np.abs(satisfaction_true - satisfaction_pred) < 0.05).mean()

print(" score (±0.05 on satisfaction):", round(score, 4))

 score (±0.05 on satisfaction): 0.7346


In [23]:
X_test_final = df_test.reindex(columns=X_features.columns, fill_value=0)
X_test_final = X_test_final[X_features.columns]

In [24]:
kpi_predictions = model_kpis.predict(X_test_final)


In [25]:
df_submission = pd.DataFrame(
    kpi_predictions,
    columns=KPI_TARGETS
)

df_submission.insert(0, "id", df_test_raw["id"].values)

df_submission.head()

Unnamed: 0,id,wip,investissement,satisfaction
0,0,26894360.0,999316.6,0.378354
1,1,22644910.0,1249122.0,0.482598
2,2,24381000.0,749252.3,0.575819
3,3,27222230.0,1249442.0,0.821705
4,4,28672090.0,1125213.0,0.804899


In [26]:
df_submission.to_csv("submission_lightgbm_multi_kpi.csv", index=False)