In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [2]:


df_train_raw = pd.read_csv("train.csv")
df_test_raw  = pd.read_csv("test.csv")

print("Train:", df_train_raw.shape)
print("Test :", df_test_raw.shape)


Train: (76905, 7595)
Test : (8346, 7592)


In [4]:
df_train = df_train_raw.drop(columns=["id"])
df_test  = df_test_raw.drop(columns=["id"])



In [5]:
def remove_duplicate_columns(df):
    mask = df.T.duplicated()
    return df.loc[:, ~mask]

df_train = remove_duplicate_columns(df_train)
df_test  = remove_duplicate_columns(df_test)


In [6]:
constant_cols = df_train.columns[df_train.nunique() <= 1]

df_train = df_train.drop(columns=constant_cols)
df_test  = df_test.drop(columns=constant_cols, errors="ignore")



In [7]:
KPI_TARGETS = ["wip", "investissement", "satisfaction"]


In [8]:
X = df_train.drop(columns=KPI_TARGETS)
y = df_train[KPI_TARGETS]

print("X:", X.shape)
print("y:", y.shape)



X: (76905, 3104)
y: (76905, 3)


In [9]:
RANDOM_STATE = 42


In [10]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_STATE
)


In [12]:
from sklearn.multioutput import MultiOutputRegressor


In [13]:
lgbm_base = LGBMRegressor(
    n_estimators=1500,
    learning_rate=0.025,
    num_leaves=96,
    min_child_samples=40,
    subsample=0.85,
    colsample_bytree=0.85,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

model_kpis = MultiOutputRegressor(lgbm_base)


In [19]:
model_kpis.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.248263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6428
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 3104
[LightGBM] [Info] Start training from score 29137002.444087
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.245017 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6428
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 3104
[LightGBM] [Info] Start training from score 1000345.393668
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.245201 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if me

0,1,2
,estimator  estimator: estimator object An estimator object implementing :term:`fit` and :term:`predict`.,LGBMRegressor...ubsample=0.85)
,"n_jobs  n_jobs: int or None, optional (default=None) The number of jobs to run in parallel. :meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported by the passed estimator) will be parallelized for each target. When individual estimators are fast to train or predict, using ``n_jobs > 1`` can result in slower performance due to the parallelism overhead. ``None`` means `1` unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all available processes / threads. See :term:`Glossary ` for more details. .. versionchanged:: 0.20  `n_jobs` default changed from `1` to `None`.",

0,1,2
,boosting_type,'gbdt'
,num_leaves,96
,max_depth,-1
,learning_rate,0.025
,n_estimators,1500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [14]:

sat_model = model_kpis.estimators_[2]

importances = pd.Series(
    sat_model.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

TOP_N = 800
top_features = importances.head(TOP_N).index


AttributeError: 'MultiOutputRegressor' object has no attribute 'estimators_'

In [47]:
X_train_top = X_train[top_features]
X_val_top   = X_val[top_features]


In [22]:
model_kpis.fit(X_train_top, y_train)

y_val_pred = model_kpis.predict(X_val_top)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066577 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1820
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 800
[LightGBM] [Info] Start training from score 29137002.444087
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1820
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 800
[LightGBM] [Info] Start training from score 1000345.393668
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072989 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memo

In [23]:

y_val_pred_np = np.asarray(y_val_pred)
y_val_np = np.asarray(y_val)
y_val_pred_np = np.clip(y_val_pred_np, 0, None)
bias = np.mean(y_val_np - y_val_pred_np, axis=0)
y_val_pred_np = y_val_pred_np + bias



In [26]:
from sklearn.metrics import r2_score


In [27]:
r2_per_kpi = r2_score(y_val, y_val_pred_np, multioutput="raw_values")


for kpi, r2 in zip(KPI_TARGETS, r2_per_kpi):
    print(f"{kpi:15s} | R2 = {r2:.4f}")


wip             | R2 = 0.7884
investissement  | R2 = 0.9999
satisfaction    | R2 = 0.9378


In [28]:
errors = np.abs(y_val - y_val_pred_np)
score_per_kpi = (errors < 0.05).mean(axis=0)

for kpi, score in zip(KPI_TARGETS, score_per_kpi):
    print(f"{kpi:15s} | Score ±0.05 = {score:.4f}")

print("\nOverall score:", score_per_kpi.mean())


wip             | Score ±0.05 = 0.0000
investissement  | Score ±0.05 = 0.0001
satisfaction    | Score ±0.05 = 0.7536

Overall score: 0.2512190364735713


In [29]:
X_test_final = df_test.reindex(columns=top_features, fill_value=0)
X_test_final = X_test_final[top_features]


In [30]:
test_pred_np = model_kpis.predict(X_test_final)

test_pred_np = np.clip(test_pred_np, 0, None)
test_pred_np = test_pred_np + bias


In [31]:
df_submission = pd.DataFrame(
    test_pred_np,
    columns=["wip", "investissement", "satisfaction"]
)

df_submission.insert(0, "id", df_test_raw["id"].values)


In [32]:
df_submission.to_csv(
    "data/submission_lgbm_improved.csv",
    index=False
)



In [34]:
X_test = df_test_raw[X.columns]


In [36]:
X_train.shape


(61524, 3104)

In [37]:
X_train.columns


Index(['demand_1_0', 'demand_1_1', 'demand_2_0', 'demand_2_1', 'demand_3_0',
       'demand_3_1', 'demand_3_2', 'demand_3_6', 'demand_4_0', 'demand_4_2',
       ...
       'param_year0_f13', 'param_year0_f14', 'param_year0_f15',
       'param_year0_f16', 'param_year1_f14', 'param_year1_f15',
       'param_year1_f16', 'param_year2_f14', 'param_year2_f15',
       'param_year2_f16'],
      dtype='object', length=3104)

In [38]:
X_test = df_test_raw[X_train.columns]


In [40]:
X_test = df_test_raw[X_train.columns]


In [41]:
print(X_train.shape)
print(X_test.shape)


(61524, 3104)
(8346, 3104)


In [42]:
# RITRAJNO MODELIN ME X_train AKTUAL
model_kpis.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.285636 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6428
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 3104
[LightGBM] [Info] Start training from score 29137002.444087
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.249186 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6428
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 3104
[LightGBM] [Info] Start training from score 1000345.393668
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.252729 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if me

0,1,2
,estimator  estimator: estimator object An estimator object implementing :term:`fit` and :term:`predict`.,LGBMRegressor...ubsample=0.85)
,"n_jobs  n_jobs: int or None, optional (default=None) The number of jobs to run in parallel. :meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported by the passed estimator) will be parallelized for each target. When individual estimators are fast to train or predict, using ``n_jobs > 1`` can result in slower performance due to the parallelism overhead. ``None`` means `1` unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all available processes / threads. See :term:`Glossary ` for more details. .. versionchanged:: 0.20  `n_jobs` default changed from `1` to `None`.",

0,1,2
,boosting_type,'gbdt'
,num_leaves,96
,max_depth,-1
,learning_rate,0.025
,n_estimators,1500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [43]:
pred_test_all = model_kpis.predict(X_test)


In [44]:
import numpy as np
import pandas as pd

pred_satisfaction = pred_test_all[:, 2]
pred_satisfaction = np.clip(pred_satisfaction, 0, 1)


In [45]:
submission = pd.DataFrame({
    "id": df_test_raw["id"],
    "satisfaction": pred_satisfaction
})

submission.to_csv("submission.csv", index=False)
print("DONE: submission.csv u krijua")


DONE: submission.csv u krijua


In [46]:
submission.head()
submission.shape


(8346, 2)