In [2]:

import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor

from lightgbm import LGBMRegressor

# Seed passed to both the train/validation split and the LightGBM base model
# so that repeated runs produce the same split and the same fitted models.
RANDOM_STATE = 42


In [None]:

# Read the raw train and test files (train first, then test). The raw frames
# are kept as-is; later cells derive working copies from them.
df_train_raw, df_test_raw = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

print(f"Train shape: {df_train_raw.shape}")
print(f"Test shape : {df_test_raw.shape}")


MemoryError: Unable to allocate 601. KiB for an array with shape (76905,) and data type float64

In [None]:
# Working copies without the "id" column, which is excluded from modelling.
# The raw frames still carry it for building the submission later.
df_train = df_train_raw.drop("id", axis="columns")
df_test = df_test_raw.drop("id", axis="columns")

In [None]:
def remove_duplicate_features(df):
    """Return `df` without columns whose values repeat an earlier column.

    Columns are compared by their full content; for each group of identical
    columns only the first occurrence (in column order) is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are checked for duplicated content.

    Returns
    -------
    pandas.DataFrame
        The same rows, restricted to the non-duplicated columns.
    """
    # Transposing turns columns into rows, so the row-wise duplicate check
    # ends up comparing whole columns with each other.
    is_first_occurrence = ~df.transpose().duplicated(keep="first")
    return df.loc[:, is_first_occurrence]

# Detect duplicated columns on the training set only, then drop exactly the
# same columns from the test set. Deduplicating each frame independently can
# remove different columns from each one, which leaves train and test with
# mismatched feature sets.
columns_before_dedup = df_train.columns
df_train = remove_duplicate_features(df_train)

# Columns that the training-set deduplication removed.
duplicate_columns = columns_before_dedup.difference(df_train.columns)
# errors="ignore": the test frame may not contain every training column.
df_test = df_test.drop(columns=duplicate_columns, errors="ignore")

print("After duplicate removal:")
print("Train:", df_train.shape)
print("Test :", df_test.shape)

In [None]:
# Columns of the training frame with at most one distinct value. Note that
# nunique() does not count NaN by default.
unique_counts = df_train.nunique()
constant_columns = unique_counts.index[unique_counts <= 1]

# Drop them from both frames; the test frame tolerates columns it does not have.
df_train = df_train.drop(columns=constant_columns)
df_test = df_test.drop(columns=constant_columns, errors="ignore")

print("After constant removal:")
print(f"Train: {df_train.shape}")
print(f"Test : {df_test.shape}")

In [None]:
# Names of the target columns: they are split off from the training frame as
# the prediction targets and reused as the column names of the submission.
KPI_TARGETS = ["wip", "investissement", "satisfaction"]

In [None]:
# Separate the KPI target columns from the remaining (feature) columns.
y_kpis = df_train[KPI_TARGETS]
X_features = df_train.drop(columns=KPI_TARGETS)

print(f"X shape: {X_features.shape}")
print(f"y shape: {y_kpis.shape}")

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
split_options = {"test_size": 0.2, "random_state": RANDOM_STATE}
X_train, X_val, y_train, y_val = train_test_split(X_features, y_kpis, **split_options)


In [None]:
# Base gradient-boosting regressor; MultiOutputRegressor fits one copy of it
# per target column.
lgbm_base_model = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    num_leaves=64,
    subsample=0.8,
    # LightGBM only subsamples rows (bagging) when the bagging frequency is
    # non-zero; with the default of 0 the `subsample` value above is ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

model_kpis = MultiOutputRegressor(lgbm_base_model)

In [None]:
# Train the multi-output model on the training split.
model_kpis.fit(X_train, y_train)

In [None]:
# Predict the KPI targets for the held-out validation rows.
y_val_pred = model_kpis.predict(X_val)

In [None]:
# Validation metrics, computed per KPI target (multioutput="raw_values") plus
# an unweighted average R2 across the targets.
r2_per_kpi = r2_score(y_val, y_val_pred, multioutput="raw_values")
r2_global = r2_score(y_val, y_val_pred, multioutput="uniform_average")

# RMSE is the square root of the per-target mean squared error.
rmse_per_kpi = np.sqrt(
    mean_squared_error(y_val, y_val_pred, multioutput="raw_values")
)

# Use the library metric (same per-target convention as the metrics above)
# rather than hand-rolled DataFrame/ndarray arithmetic.
mae_per_kpi = mean_absolute_error(y_val, y_val_pred, multioutput="raw_values")

# The metric arrays follow the column order of y_val, which was selected with
# KPI_TARGETS, so zipping against KPI_TARGETS labels each row correctly.
for kpi, r2, rmse, mae in zip(KPI_TARGETS, r2_per_kpi, rmse_per_kpi, mae_per_kpi):
    print(f"{kpi:15s} | R2={r2:.4f} | RMSE={rmse:.4f} | MAE={mae:.4f}")

print("\nOverall R2:", round(r2_global, 4))

In [None]:
# Share of validation rows whose "satisfaction" prediction lies strictly
# within 0.05 of the true value.
idx_satisfaction = KPI_TARGETS.index("satisfaction")

# y_val is a DataFrame and y_val_pred an array; both are indexed by the
# position of "satisfaction" in KPI_TARGETS.
satisfaction_true = y_val.iloc[:, idx_satisfaction]
satisfaction_pred = y_val_pred[:, idx_satisfaction]

score = (np.abs(satisfaction_true - satisfaction_pred) < 0.05).mean()

# The label previously contained a mis-decoded "±" character.
print(" score (±0.05 on satisfaction):", round(score, 4))

In [None]:
# Align the test features with the training feature set: same columns, same order.
missing_in_test = X_features.columns.difference(df_test.columns)
if len(missing_in_test) > 0:
    # Zero-filling keeps the pipeline running, but these features were not
    # constant zeros during training, so make the substitution visible.
    print(
        f"Warning: {len(missing_in_test)} training feature(s) absent from test, filled with 0:",
        list(missing_in_test),
    )

# reindex already returns exactly X_features.columns in that order, so no
# further column selection is needed afterwards.
X_test_final = df_test.reindex(columns=X_features.columns, fill_value=0)

In [None]:
# Predict the KPI targets for the aligned test features.
kpi_predictions = model_kpis.predict(X_test_final)


In [None]:
# Assemble the submission: the "id" values from the raw test file come first,
# followed by one prediction column per KPI target.
predictions_frame = pd.DataFrame(kpi_predictions, columns=KPI_TARGETS)
df_submission = predictions_frame.assign(id=df_test_raw["id"].values)[
    ["id", *KPI_TARGETS]
]

df_submission.head()

In [None]:
# Write the submission to CSV; index=False keeps the row index out of the file.
df_submission.to_csv("submission_lightgbm_multi_kpi.csv", index=False)