In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score

from lightgbm import LGBMRegressor

# Global seed so that stochastic steps (e.g. the train/validation split) are reproducible.
RANDOM_STATE = 42


In [None]:
# Load the raw train/test tables. The untouched copies are kept under the
# *_raw names so that later cells can still reach the original columns.
df_train_raw, df_test_raw = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

# Quick sanity check on the dimensions of both tables.
for label, frame in (("Train:", df_train_raw), ("Test :", df_test_raw)):
    print(label, frame.shape)


In [None]:
# Working copies without the "id" identifier column (the raw frames are left intact).
df_train = df_train_raw.drop("id", axis=1)
df_test = df_test_raw.drop("id", axis=1)


In [None]:
def remove_duplicate_columns(df):
    """Return `df` restricted to the first occurrence of each distinct column.

    Two columns count as duplicates when they hold the same values in every
    row; column names play no role in the comparison. The input frame is not
    modified.
    """
    # Transposing turns columns into rows, so the row-wise duplicate detection
    # flags every column whose content repeats an earlier column.
    is_repeat = df.T.duplicated()
    keep = ~is_repeat
    return df.loc[:, keep]

# Decide which columns are duplicates on the TRAINING data only, then drop
# exactly those columns from the test frame as well. De-duplicating each frame
# independently is unsafe: two features can coincide on the test rows without
# coinciding on the training rows, in which case the test frame would lose a
# column the model was trained on.
df_train_dedup = remove_duplicate_columns(df_train)
duplicate_cols = df_train.columns.difference(df_train_dedup.columns)

# errors="ignore": some dropped columns may not exist in the test frame at all.
df_test  = df_test.drop(columns=duplicate_cols, errors="ignore")
df_train = df_train_dedup


In [None]:
# Columns with at most one distinct value are identified on the training frame
# and removed from both frames; errors="ignore" tolerates names that are not
# present in the test frame.
n_distinct = df_train.nunique()
constant_cols = n_distinct[n_distinct <= 1].index

df_train = df_train.drop(columns=constant_cols)
df_test  = df_test.drop(columns=constant_cols, errors="ignore")


In [None]:
# Names of the KPI columns to predict.
KPI_TARGETS = ["wip", "investissement", "satisfaction"]

# Targets are the KPI columns; every remaining column is used as a feature.
y = df_train.loc[:, KPI_TARGETS]
X = df_train.drop(KPI_TARGETS, axis=1)

print("X:", X.shape)
print("y:", y.shape)


In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_train, X_val, y_train, y_val = split_parts


In [None]:
# First ensemble member.
lgbm_1 = LGBMRegressor(
    n_estimators=1500,
    learning_rate=0.025,
    num_leaves=96,
    min_child_samples=40,
    subsample=0.85,
    # LightGBM ignores `subsample` unless row bagging is switched on with a
    # positive `subsample_freq` (its default is 0); resample at every iteration.
    subsample_freq=1,
    colsample_bytree=0.85,
    # Same value as before (42), now taken from the shared seed constant.
    random_state=RANDOM_STATE,
    n_jobs=-1
)

# MultiOutputRegressor fits one independent copy of the estimator per target column.
model_1 = MultiOutputRegressor(lgbm_1)
model_1.fit(X_train, y_train)

# Validation predictions, one column per target.
pred_val_1 = np.asarray(model_1.predict(X_val))


In [None]:
# Second ensemble member: more trees, more leaves, a smaller learning rate
# and a different seed than model_1.
lgbm_2 = LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.02,
    num_leaves=128,
    min_child_samples=60,
    subsample=0.9,
    # LightGBM ignores `subsample` unless row bagging is switched on with a
    # positive `subsample_freq` (its default is 0); resample at every iteration.
    subsample_freq=1,
    colsample_bytree=0.9,
    # Same value as before (43), expressed relative to the shared seed constant.
    random_state=RANDOM_STATE + 1,
    n_jobs=-1
)

# MultiOutputRegressor fits one independent copy of the estimator per target column.
model_2 = MultiOutputRegressor(lgbm_2)
model_2.fit(X_train, y_train)

# Validation predictions, one column per target.
pred_val_2 = np.asarray(model_2.predict(X_val))


In [None]:
# Equal-weight average of the two models' validation predictions.
y_val_pred_ens = 0.5 * (pred_val_1 + pred_val_2)


In [None]:
# Ground-truth validation targets as a plain array, to line up with the predictions.
y_val_np = np.asarray(y_val)

# Floor the ensemble predictions at zero.
y_val_pred_ens = np.maximum(y_val_pred_ens, 0)

# Per-target mean residual on the validation set, added back as a constant offset.
# NOTE(review): the offset is estimated on the same rows it is applied to, so
# metrics computed on these corrected predictions are likely optimistic; the
# shift can also move values back below zero after the floor - confirm intended.
bias = (y_val_np - y_val_pred_ens).mean(axis=0)

y_val_pred_ens = y_val_pred_ens + bias


In [None]:
# Per-target R2 on the validation set ("raw_values" returns one score per column).
r2_per_kpi = r2_score(y_val_np, y_val_pred_ens, multioutput="raw_values")

for kpi, r2 in zip(KPI_TARGETS, r2_per_kpi):
    print(f"{kpi:15s} | R2 = {r2:.4f}")

# Share of validation predictions whose absolute error is below 0.05, per target.
errors = np.abs(y_val_np - y_val_pred_ens)
score_per_kpi = (errors < 0.05).mean(axis=0)

for kpi, score in zip(KPI_TARGETS, score_per_kpi):
    # \u00b1 is the plus-minus sign; written as an escape so the label cannot be
    # corrupted again if the file is re-saved with a different encoding.
    print(f"{kpi:15s} | Score \u00b10.05 = {score:.4f}")

# Unweighted mean of the per-target hit rates.
print("\nOverall score:", score_per_kpi.mean())


In [None]:
# Surface any training feature that the test frame lacks: reindex() below would
# otherwise invent it as an all-zero column without any notice.
missing_in_test = X.columns.difference(df_test.columns)
if len(missing_in_test) > 0:
    print(
        f"WARNING: {len(missing_in_test)} training feature(s) missing from the "
        f"test set and filled with 0: {list(missing_in_test)}"
    )

# reindex() both selects and orders the columns exactly like X, so no further
# column re-selection is needed.
X_test_final = df_test.reindex(columns=X.columns, fill_value=0)


In [None]:
# Predict the test set with both fitted models.
test_pred_1, test_pred_2 = (
    np.asarray(member.predict(X_test_final)) for member in (model_1, model_2)
)

# Same post-processing as on validation: equal-weight average, floor at zero,
# then add the offset estimated on the validation set.
test_pred_ens = 0.5 * (test_pred_1 + test_pred_2)
test_pred_ens = np.maximum(test_pred_ens, 0) + bias


In [None]:
# Label the prediction columns from KPI_TARGETS, the same list that defined the
# column order of `y`, so the labels cannot drift from the fitted target order.
df_submission = pd.DataFrame(test_pred_ens, columns=KPI_TARGETS)

# Put the identifiers from the raw test file in front; .values bypasses index alignment.
df_submission.insert(0, "id", df_test_raw["id"].values)

df_submission.head()


In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
df_submission.to_csv("data/submission_lgbm_ensemble_v1.csv", index=False)


