In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [2]:


df_train_raw = pd.read_csv("train.csv")
df_test_raw  = pd.read_csv("test.csv")

print("Train:", df_train_raw.shape)
print("Test :", df_test_raw.shape)


Train: (76905, 7595)
Test : (8346, 7592)


In [3]:
df_train = df_train_raw.drop(columns=["id"])
df_test  = df_test_raw.drop(columns=["id"])



In [5]:
def remove_duplicate_columns(df):
    mask = df.T.duplicated()
    return df.loc[:, ~mask]

df_train = remove_duplicate_columns(df_train)
df_test  = remove_duplicate_columns(df_test)



In [6]:
constant_cols = df_train.columns[df_train.nunique() <= 1]

df_train = df_train.drop(columns=constant_cols)
df_test  = df_test.drop(columns=constant_cols, errors="ignore")



In [7]:
KPI_TARGETS = ["wip", "investissement", "satisfaction"]


In [8]:
X = df_train.drop(columns=KPI_TARGETS)
y = df_train[KPI_TARGETS]

print("X:", X.shape)
print("y:", y.shape)



X: (76905, 3104)
y: (76905, 3)


In [9]:
RANDOM_STATE = 42


In [10]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_STATE
)



In [11]:
from sklearn.multioutput import MultiOutputRegressor


In [12]:
lgbm_base = LGBMRegressor(
    n_estimators=1500,
    learning_rate=0.025,
    num_leaves=96,
    min_child_samples=40,
    subsample=0.85,
    colsample_bytree=0.85,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

model_kpis = MultiOutputRegressor(lgbm_base)


In [13]:
model_kpis.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.274407 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6428
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 3104
[LightGBM] [Info] Start training from score 29137002.444087
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.279306 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6428
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 3104
[LightGBM] [Info] Start training from score 1000345.393668
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.289820 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if me

0,1,2
,estimator  estimator: estimator object An estimator object implementing :term:`fit` and :term:`predict`.,LGBMRegressor...ubsample=0.85)
,"n_jobs  n_jobs: int or None, optional (default=None) The number of jobs to run in parallel. :meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported by the passed estimator) will be parallelized for each target. When individual estimators are fast to train or predict, using ``n_jobs > 1`` can result in slower performance due to the parallelism overhead. ``None`` means `1` unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all available processes / threads. See :term:`Glossary ` for more details. .. versionchanged:: 0.20  `n_jobs` default changed from `1` to `None`.",

0,1,2
,boosting_type,'gbdt'
,num_leaves,96
,max_depth,-1
,learning_rate,0.025
,n_estimators,1500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [14]:

sat_model = model_kpis.estimators_[2]

importances = pd.Series(
    sat_model.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

TOP_N = 800
top_features = importances.head(TOP_N).index


In [15]:
X_train_top = X_train[top_features]
X_val_top   = X_val[top_features]


In [16]:
model_kpis.fit(X_train_top, y_train)

y_val_pred = model_kpis.predict(X_val_top)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068099 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1820
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 800
[LightGBM] [Info] Start training from score 29137002.444087
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066635 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1820
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 800
[LightGBM] [Info] Start training from score 1000345.393668
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067361 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memo

In [17]:

y_val_pred_np = np.asarray(y_val_pred)
y_val_np = np.asarray(y_val)
y_val_pred_np = np.clip(y_val_pred_np, 0, None)
bias = np.mean(y_val_np - y_val_pred_np, axis=0)
y_val_pred_np = y_val_pred_np + bias



In [18]:
from sklearn.metrics import r2_score


In [19]:
r2_per_kpi = r2_score(y_val, y_val_pred_np, multioutput="raw_values")


for kpi, r2 in zip(KPI_TARGETS, r2_per_kpi):
    print(f"{kpi:15s} | R2 = {r2:.4f}")


wip             | R2 = 0.7884
investissement  | R2 = 0.9999
satisfaction    | R2 = 0.9378


In [21]:
X_test_final = df_test.reindex(columns=top_features, fill_value=0)
X_test_final = X_test_final[top_features]


In [22]:
test_pred_np = model_kpis.predict(X_test_final)

test_pred_np = np.clip(test_pred_np, 0, None)
test_pred_np = test_pred_np + bias


In [23]:
df_submission = pd.DataFrame(
    test_pred_np,
    columns=["wip", "investissement", "satisfaction"]
)

df_submission.insert(0, "id", df_test_raw["id"].values)


In [24]:
df_submission.to_csv(
    "data/submission_lgbm_improved.csv",
    index=False
)



In [25]:
X_test = df_test_raw[X.columns]


In [26]:
X_train.shape


(61524, 3104)

In [27]:
X_train.columns


Index(['demand_1_0', 'demand_1_1', 'demand_2_0', 'demand_2_1', 'demand_3_0',
       'demand_3_1', 'demand_3_2', 'demand_3_6', 'demand_4_0', 'demand_4_2',
       ...
       'param_year0_f13', 'param_year0_f14', 'param_year0_f15',
       'param_year0_f16', 'param_year1_f14', 'param_year1_f15',
       'param_year1_f16', 'param_year2_f14', 'param_year2_f15',
       'param_year2_f16'],
      dtype='object', length=3104)

In [28]:
X_test = df_test_raw[X_train.columns]


In [29]:
X_test = df_test_raw[X_train.columns]


In [30]:
print(X_train.shape)
print(X_test.shape)


(61524, 3104)
(8346, 3104)


In [31]:
# RITRAJNO MODELIN ME X_train AKTUAL
model_kpis.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.255983 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6428
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 3104
[LightGBM] [Info] Start training from score 29137002.444087
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.230356 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6428
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 3104
[LightGBM] [Info] Start training from score 1000345.393668
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.284199 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if me

0,1,2
,estimator  estimator: estimator object An estimator object implementing :term:`fit` and :term:`predict`.,LGBMRegressor...ubsample=0.85)
,"n_jobs  n_jobs: int or None, optional (default=None) The number of jobs to run in parallel. :meth:`fit`, :meth:`predict` and :meth:`partial_fit` (if supported by the passed estimator) will be parallelized for each target. When individual estimators are fast to train or predict, using ``n_jobs > 1`` can result in slower performance due to the parallelism overhead. ``None`` means `1` unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all available processes / threads. See :term:`Glossary ` for more details. .. versionchanged:: 0.20  `n_jobs` default changed from `1` to `None`.",

0,1,2
,boosting_type,'gbdt'
,num_leaves,96
,max_depth,-1
,learning_rate,0.025
,n_estimators,1500
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [32]:
pred_test_all = model_kpis.predict(X_test)


In [33]:
import numpy as np
import pandas as pd

pred_satisfaction = pred_test_all[:, 2]
pred_satisfaction = np.clip(pred_satisfaction, 0, 1)


In [None]:
submission = pd.DataFrame({
    "id": df_test_raw["id"],
    "satisfaction": pred_satisfaction
})

submission.to_csv("submission.csv", index=False)
print("DONE: submission.csv u krijua")


DONE: submission.csv u krijua


In [34]:
from lightgbm import LGBMRegressor
import numpy as np

model_sat = LGBMRegressor(
    n_estimators=1800,
    learning_rate=0.025,
    num_leaves=32,
    min_child_samples=150,
    subsample=0.95,
    colsample_bytree=0.95,
    reg_alpha=0.3,
    reg_lambda=0.3,
    random_state=42,
    n_jobs=-1
)


In [35]:
model_sat.fit(X, y["satisfaction"])


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.273728 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6424
[LightGBM] [Info] Number of data points in the train set: 76905, number of used features: 3102
[LightGBM] [Info] Start training from score 0.743514


0,1,2
,boosting_type,'gbdt'
,num_leaves,32
,max_depth,-1
,learning_rate,0.025
,n_estimators,1800
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [36]:
pred_sat = model_sat.predict(X_test)

# clip bazë
pred_sat = np.clip(pred_sat, 0, 1)

# smoothing i fortë
mean_sat = y["satisfaction"].mean()
pred_sat = 0.85 * pred_sat + 0.15 * mean_sat

submission = pd.DataFrame({
    "id": df_test_raw["id"],
    "satisfaction": pred_sat
})

submission.to_csv("submission_try2.csv", index=False)
print("submission_try2.csv u krijua")


submission_try2.csv u krijua


In [37]:
import numpy as np

# predict në validation
pred_val = model_sat.predict(X_val)

# clip
pred_val = np.clip(pred_val, 0, 1)

# smoothing i njëjtë si për test
mean_sat = y["satisfaction"].mean()
pred_val = 0.85 * pred_val + 0.15 * mean_sat

# score ±0.05
y_true = y_val["satisfaction"] if hasattr(y_val, "__getitem__") else y_val
score_sat = np.mean(np.abs(pred_val - y_true) <= 0.05)

print(f"Satisfaction | Score ±0.05 = {score_sat:.4f}")


Satisfaction | Score ±0.05 = 0.5493


In [38]:
model_sat = LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.02,
    num_leaves=64,
    min_child_samples=100,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    n_jobs=-1
)


In [39]:
model_sat.fit(X_train, y_train["satisfaction"])


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.253649 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6428
[LightGBM] [Info] Number of data points in the train set: 61524, number of used features: 3104
[LightGBM] [Info] Start training from score 0.743393


0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,-1
,learning_rate,0.02
,n_estimators,2000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [40]:
import numpy as np

pred_val = model_sat.predict(X_val)
pred_val = np.clip(pred_val, 0, 1)

score_sat = np.mean(
    np.abs(pred_val - y_val["satisfaction"]) <= 0.05
)

print(f"Satisfaction | Score ±0.05 = {score_sat:.4f}")


Satisfaction | Score ±0.05 = 0.7399


In [41]:
# train final me gjithë data
model_sat.fit(X, y["satisfaction"])

# predict test
pred_test = model_sat.predict(X_test)
pred_test = np.clip(pred_test, 0, 1)

submission = pd.DataFrame({
    "id": df_test_raw["id"],
    "satisfaction": pred_test
})

submission.to_csv("submission_3.csv", index=False)
print("submission_final.csv u krijua")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.282413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6428
[LightGBM] [Info] Number of data points in the train set: 76905, number of used features: 3104
[LightGBM] [Info] Start training from score 0.743514
submission_final.csv u krijua


In [42]:
import numpy as np

pred_val = model_sat.predict(X_val)

# clip bazë
pred_val = np.clip(pred_val, 0, 1)

# quantile clip (heq outliers)
q_low, q_high = np.quantile(pred_val, [0.02, 0.98])
pred_val = np.clip(pred_val, q_low, q_high)

# smoothing shumë i lehtë
mean_sat = y["satisfaction"].mean()
pred_val = 0.95 * pred_val + 0.05 * mean_sat

# score ±0.05
score_sat = np.mean(np.abs(pred_val - y_val["satisfaction"]) <= 0.05)
print(f"Satisfaction | Score ±0.05 = {score_sat:.4f}")


Satisfaction | Score ±0.05 = 0.7657


In [43]:
model_sat.fit(X, y["satisfaction"])


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.304941 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6428
[LightGBM] [Info] Number of data points in the train set: 76905, number of used features: 3104
[LightGBM] [Info] Start training from score 0.743514


0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,-1
,learning_rate,0.02
,n_estimators,2000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [44]:
pred_test = model_sat.predict(X_test)

# clip bazë
pred_test = np.clip(pred_test, 0, 1)

# quantile clip
q_low, q_high = np.quantile(pred_test, [0.02, 0.98])
pred_test = np.clip(pred_test, q_low, q_high)

# smoothing i lehtë
mean_sat = y["satisfaction"].mean()
pred_test = 0.95 * pred_test + 0.05 * mean_sat


In [45]:
submission = pd.DataFrame({
    "id": df_test_raw["id"],
    "satisfaction": pred_test
})
submission.to_csv("submission_4.csv", index=False)
print("submission_4.csv")


submission_4.csv


# ==============================
# DEEP LEARNING MODEL (EXPERIMENTAL)
# ==============================


In [51]:
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression

# model 1: LGBM (që e ke)
pred_lgbm_val = model_sat.predict(X_val)

# model 2: MLP
mlp = MLPRegressor(
    hidden_layer_sizes=(256,128,64),
    activation="relu",
    max_iter=50,
    random_state=42
)
mlp.fit(X_train, y_train["satisfaction"])
pred_mlp_val = mlp.predict(X_val)

# stacking (meta-model)
stack_X = np.column_stack([pred_lgbm_val, pred_mlp_val])
meta = LinearRegression()
meta.fit(stack_X, y_val["satisfaction"])

# score
pred_stack = meta.predict(stack_X)
pred_stack = np.clip(pred_stack, 0, 1)

score_stack = np.mean(np.abs(pred_stack - y_val["satisfaction"]) <= 0.05)
print("STACK SCORE:", score_stack)


STACK SCORE: 0.8006631558416227


In [52]:
# retrain MLP në gjithë data
mlp.fit(X, y["satisfaction"])

# retrain LGBM në gjithë data
model_sat.fit(X, y["satisfaction"])


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.294601 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6428
[LightGBM] [Info] Number of data points in the train set: 76905, number of used features: 3104
[LightGBM] [Info] Start training from score 0.743514


0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,-1
,learning_rate,0.02
,n_estimators,2000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [53]:
pred_lgbm_test = model_sat.predict(X_test)
pred_mlp_test  = mlp.predict(X_test)

stack_test = np.column_stack([pred_lgbm_test, pred_mlp_test])


In [54]:
pred_stack_test = meta.predict(stack_test)
pred_stack_test = np.clip(pred_stack_test, 0, 1)


In [55]:
mean_sat = y["satisfaction"].mean()
pred_stack_test = 0.97 * pred_stack_test + 0.03 * mean_sat


In [57]:
pred_lgbm_test = model_sat.predict(X_test)
pred_mlp_test  = mlp.predict(X_test)

stack_test = np.column_stack([pred_lgbm_test, pred_mlp_test])
pred_stack_test = meta.predict(stack_test)
pred_stack_test = np.clip(pred_stack_test, 0, 1)

mean_sat = y["satisfaction"].mean()
pred_stack_test = 0.97 * pred_stack_test + 0.03 * mean_sat

submission = pd.DataFrame({
    "id": df_test_raw["id"],
    "satisfaction": pred_stack_test
})
submission.to_csv("submission_STACK.csv", index=False)


In [50]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import numpy as np


ModuleNotFoundError: No module named 'torch'

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

model_dl = models.Sequential([
    layers.Input(shape=(X_dl.shape[1],)),
    layers.Dense(512, activation="relu"),
    layers.Dropout(0.4),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(64, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

model_dl.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.Huber(delta=0.05)
)


ModuleNotFoundError: No module named 'tensorflow'