In [1]:
import sys

# Put the parent and grand-parent directories on the import path so that
# packages living above this notebook's folder can be imported.
sys.path.extend(["../", "../.."])

In [3]:
# Run configuration
import random

VAL_SIZE = 38
SUBMISSION_NAME = "first_lightgbm"

# For reproducibility.
# NOTE: this seeds only Python's built-in `random` module; other random
# number generators (e.g. NumPy's) are not affected by this call.
random.seed(0)

In [None]:
# %% Imports
import pandas as pd
import sys
from metrics.metric_participants import ComputeMetrics
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor
import random

from eda.checker import check_train_test

# read data

In [None]:
# Tables read from data_raw/.
sales_train = pd.read_csv("../data/data_raw/sales_train.csv")
df_region = pd.read_csv("../data/data_raw/regions.csv")
regions_hcps = pd.read_csv("../data/data_raw/regions_hcps.csv")

# Base table that the feature frame is built from.
# NOTE(review): presumably this is where the `validation` flag comes from — confirm.
df_full = pd.read_csv("../data/split.csv")

# Pre-computed feature tables.
activity_features = pd.read_csv("../data/features/activity_features.csv")
train_correlation_features = pd.read_csv(
    "../data/features/train_correlation_features_for_validation.csv"
)
test_correlation_features = pd.read_csv(
    "../data/features/test_correlation_features_for_validation.csv"
)

# Stack the train and test correlation features into one frame with a fresh
# 0..n-1 index.
correlation_features = pd.concat(
    [train_correlation_features, test_correlation_features],
    ignore_index=True,
)

# merge

In [None]:
# %% Add region data
# Left-join every feature table onto the base frame so that all rows of
# `df_full` are kept; keys without a match get NaN in the joined columns.
df_feats = (
    df_full
    .merge(df_region, how="left", on="region")
    .merge(regions_hcps, how="left", on="region")
    .merge(activity_features, how="left", on=["month", "region", "brand"])
)

# The correlation-feature join is currently disabled:
# df_feats = df_feats.merge(
#     correlation_features, on=["month", "region"], how="left"
# )

In [None]:
# Columns kept out of the feature matrices: the row keys
# (month / region / brand), the target (`sales`) and the split flag
# (`validation`).
cols_to_drop = ["month", "region", "brand", "sales", "validation"]


# %% Split train val test
# Select each split once and derive both the features and the target from it.
train_rows = df_feats.query("validation == 0")
val_rows = df_feats.query("validation == 1")

X_train = train_rows.drop(columns=cols_to_drop)
y_train = train_rows["sales"]

X_val = val_rows.drop(columns=cols_to_drop)
y_val = val_rows["sales"]

In [None]:
# Run the project's `eda.checker.check_train_test` helper on the training and
# validation feature matrices before fitting.
# NOTE(review): what exactly is checked is not visible from this notebook —
# presumably train/validation consistency; confirm in eda/checker.
check_train_test(X_train, X_val)

# fit

In [None]:
# %%
# One regressor / pipeline / prediction array per quantile level, keyed by
# the level itself. `test_preds` is created here but not filled in this cell.
lgbms, pipes = {}, {}
train_preds, val_preds, test_preds = {}, {}, {}

for quantile in [0.5, 0.1, 0.9]:
    # LightGBM regressor with the quantile objective; `alpha` is the
    # quantile level being estimated.
    booster = LGBMRegressor(
        objective="quantile",
        alpha=quantile,
        n_estimators=50,
        n_jobs=-1,
    )
    lgbms[quantile] = booster

    # Median-impute missing feature values before they reach the regressor.
    pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("lgb", booster),
        ]
    )
    pipes[quantile] = pipeline

    # Fit on the training split only (no cross-validation happens here),
    # then predict on both the training and the validation split.
    pipeline.fit(X_train, y_train)
    train_preds[quantile] = pipeline.predict(X_train)
    val_preds[quantile] = pipeline.predict(X_val)

In [None]:
# %% Train prediction
# Key columns of the training rows plus the three quantile predictions:
# 0.5 -> `sales`, 0.1 -> `lower` (floored at 0), 0.9 -> `upper`.
train_preds_df = df_feats.query("validation == 0")[
    ["month", "region", "brand"]
].assign(
    sales=train_preds[0.5],
    lower=train_preds[0.1].clip(0),
    upper=train_preds[0.9],
)

In [None]:
# Observed sales of the training rows, together with their key columns.
truth_columns = ["month", "region", "brand", "sales"]
ground_truth_train = df_feats.query("validation == 0")[truth_columns]

In [None]:
# Score the training-set predictions with the project's `ComputeMetrics`
# helper; the recorded output below is a pair of numbers.
# NOTE(review): the meaning of each number is defined in
# metrics/metric_participants — confirm there.
ComputeMetrics(train_preds_df, sales_train, ground_truth_train)

(33.38105759717777, 102.31712168606201)

In [None]:
# %% Validation prediction
# Select the validation rows once; both frames below are derived from them.
val_rows_all = df_feats.query("validation == 1")

# Key columns plus the three quantile predictions:
# 0.5 -> `sales`, 0.1 -> `lower` (floored at 0), 0.9 -> `upper`.
val_preds_df = val_rows_all[["month", "region", "brand"]].assign(
    sales=val_preds[0.5],
    lower=val_preds[0.1].clip(0),
    upper=val_preds[0.9],
)

# Observed sales of the validation rows, together with their key columns.
ground_truth_val = val_rows_all[["month", "region", "brand", "sales"]]

# Last expression of the cell, so its value is what the notebook displays.
ComputeMetrics(val_preds_df, sales_train, ground_truth_val)

(68.04061865181535, 184.6997685505819)
