<a href="https://www.kaggle.com/code/davidhalim2004/data-cleaning?scriptVersionId=280113860" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Library imports

In [1]:
!pip install xgboost
import pandas as pd
import joblib
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import gc
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer



# Data pre-processing

## Data reading

In [2]:
# Read the CSV lazily in 5,000,000-row chunks, then stitch the pieces into a
# single frame with a fresh index. `chunks` stays bound so the cleanup cell
# that follows can delete it.
chunks = pd.read_csv(
    "/kaggle/input/depi-dataset/data.csv",
    chunksize=5_000_000,
)
final_data = pd.concat(objs=chunks, ignore_index=True)

In [3]:
# Drop the reference to the chunk reader now that final_data holds the concatenated frame.
del chunks

In [4]:
# Load a previously saved preprocessor object from the input dataset.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts from a trusted source.
# NOTE(review): `preprocessor` is rebound to a freshly built ColumnTransformer
# in the next code cell before it is used anywhere, so this load looks
# redundant — confirm and remove if so.
preprocessor=joblib.load("/kaggle/input/depi-dataset/pre-proccess.pkl")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [5]:
# Column groups routed through the ColumnTransformer below.
categorical_cols = ["item_category", "item_subcategory", "item_number"]
numeric_cols = [
    "wm_yr_wk", "wday", "snap", "year", "month", "day", "sell_price",
    "lag_1", "price_flag", "lag_7", "snap_weekend", "wday_x_snap",
    "is_weekend", "event_impact", "event_count", "is_event",
]

# Categorical columns: integer-encode (categories unseen at fit time become -1),
# then rescale the codes with MinMaxScaler.
categorical_transformer = Pipeline([
    ("encoder", OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value")),
    ("scaler", MinMaxScaler()),
])

# Numeric columns are forwarded unchanged.
numeric_transformer = "passthrough"

preprocessor = ColumnTransformer([
    ("cat", categorical_transformer, categorical_cols),
    ("num", numeric_transformer, numeric_cols),
])

# Persist the (not yet fitted) transformer definition to the working directory.
# Left as the cell's last expression so the written path is displayed.
joblib.dump(preprocessor, "/kaggle/working/pre-proccess.pkl")

['/kaggle/working/pre-proccess.pkl']

## Data splitting and model training

In [6]:
# Imported once at the top of the cell instead of on every loop iteration.
from sklearn.metrics import mean_absolute_error, r2_score

# One fitted regressor per store, keyed by store_id; later cells read this dict.
store_models = {}

stores = final_data["store_id"].unique()

# Hyperparameters are identical for every store, so build them once.
xgb_params = dict(
    objective="reg:tweedie",
    tweedie_variance_power=1.35,
    booster="gbtree",
    device="cuda",
    tree_method="hist",
    random_state=42,
    eval_metric="mae",
    learning_rate=0.03,
    max_depth=7,
    min_child_weight=15,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_lambda=6,
    reg_alpha=2,
    n_estimators=5000,
    early_stopping_rounds=40,
    gamma=1,
    max_delta_step=1,
)

for store in stores:
    print(f"\n==== Training model for store: {store} ====")

    # 1. Rows of this store only. Boolean indexing already yields a new frame
    #    and nothing below mutates it, so no extra .copy() is needed.
    df_store = final_data[final_data["store_id"] == store]

    # 2. Target and features; the store identifier is constant within the
    #    subset and is not a model feature.
    y = df_store["sales"]
    X = df_store.drop(columns=["sales", "store_id"])

    # NOTE(review): the shared `preprocessor` is refit here for every store, on
    # the full store frame *before* the split. That means (a) validation/test
    # rows influence the fitted encoder/scaler, and (b) after the loop the
    # object only reflects the last store. Kept as-is so the evaluation cell,
    # which refits the same way, reproduces identical features.
    X = preprocessor.fit_transform(X)

    # 3. Random 80/20 split, then 85/15 of the remainder for train/validation.
    # NOTE(review): the features include lag columns, so a random (non-temporal)
    # split may give optimistic scores — consider a time-based split.
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=0.15, random_state=42
    )

    # 4. Fit; the validation set drives early stopping.
    model = XGBRegressor(**xgb_params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )

    # Predict once per split and reuse the result for both metrics.
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    train_mae = mean_absolute_error(y_train, pred_train)
    test_mae = mean_absolute_error(y_test, pred_test)
    train_r2 = r2_score(y_train, pred_train)
    test_r2 = r2_score(y_test, pred_test)

    print(f"MAE train: {train_mae:.4f}, test: {test_mae:.4f}")
    print(f"R2  train: {train_r2:.4f}, test: {test_r2:.4f}")

    # 5. Keep the fitted model for the later evaluation and export cells.
    store_models[store] = model
    print(f"Model for {store} trained and stored.")

    # Release every per-store intermediate before the next iteration.
    del (
        df_store, X, y,
        X_temp, X_test, X_val, X_train,
        y_temp, y_test, y_val, y_train,
        pred_train, pred_test,
    )
    gc.collect()


==== Training model for store: CA_1 ====


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




MAE train: 0.3922, test: 0.4169
R2  train: 0.9237, test: 0.8832
Model for CA_1 trained and stored.

==== Training model for store: CA_2 ====
MAE train: 0.3162, test: 0.3354
R2  train: 0.8884, test: 0.8452
Model for CA_2 trained and stored.

==== Training model for store: CA_3 ====
MAE train: 0.5363, test: 0.5693
R2  train: 0.9357, test: 0.9138
Model for CA_3 trained and stored.

==== Training model for store: CA_4 ====
MAE train: 0.2345, test: 0.2482
R2  train: 0.8845, test: 0.8437
Model for CA_4 trained and stored.

==== Training model for store: TX_1 ====
MAE train: 0.2963, test: 0.3184
R2  train: 0.9249, test: 0.8559
Model for TX_1 trained and stored.

==== Training model for store: TX_2 ====
MAE train: 0.3609, test: 0.3849
R2  train: 0.9407, test: 0.9101
Model for TX_2 trained and stored.

==== Training model for store: TX_3 ====
MAE train: 0.3139, test: 0.3361
R2  train: 0.9352, test: 0.9025
Model for TX_3 trained and stored.

==== Training model for store: WI_1 ====
MAE train: 0.

In [7]:
import numpy as np

# For each store, print up to 25 rounded test-set predictions next to the
# rounded actual sales values.
for store in stores:
    # Rows of this store only (boolean indexing already returns a new frame).
    df_store = final_data[final_data["store_id"] == store]

    # Same feature/target separation as the training cell.
    y = df_store["sales"]
    X = df_store.drop(columns=["sales", "store_id"])

    # Refit on the full store frame, mirroring the training cell, so the
    # transformed features match what the model was trained on.
    X = preprocessor.fit_transform(X)

    # Same test_size and random_state as in training, so this recovers the same
    # held-out rows. Only the test part is needed here; the train/validation
    # re-split was unused and has been removed.
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )
    del X_rest, y_rest

    y_pred = store_models[store].predict(X_test)

    # Round predictions and ground truth to whole units for display.
    y_pred_rounded = np.round(y_pred).astype(int)
    y_test_rounded = np.round(y_test).astype(int)

    # The header names the store so the output blocks can be told apart.
    print(f"\n=== Sample Predictions vs Actuals ({store}) ===\n")
    n_show = min(25, len(y_test))
    for pred, actual in zip(y_pred_rounded[:n_show], y_test_rounded.iloc[:n_show]):
        print(f"Pred: {pred:>3}   Actual: {actual:>3}")

    # Release per-store intermediates before the next iteration.
    del df_store, X, y, X_test, y_test, y_pred, y_pred_rounded, y_test_rounded
    gc.collect()


=== Sample Predictions vs Actuals ===

Pred:   2   Actual:   3
Pred:   2   Actual:   2
Pred:  11   Actual:   8
Pred:   3   Actual:   3
Pred:   0   Actual:   0
Pred:   0   Actual:   0
Pred:   0   Actual:   0
Pred:   0   Actual:   0
Pred:   2   Actual:   2
Pred:   6   Actual:   6
Pred:   0   Actual:   0
Pred:   0   Actual:   0
Pred:   0   Actual:   0
Pred:   2   Actual:   2
Pred:   0   Actual:   0
Pred:   0   Actual:   0
Pred:   0   Actual:   0
Pred:   0   Actual:   0
Pred:   3   Actual:   3
Pred:   6   Actual:   6
Pred:   0   Actual:   0
Pred:   0   Actual:   0
Pred:   2   Actual:   2
Pred:   0   Actual:   0
Pred:   0   Actual:   0

=== Sample Predictions vs Actuals ===

Pred:   0   Actual:   0
Pred:   3   Actual:   3
Pred:  12   Actual:   6
Pred:   2   Actual:   0
Pred:   1   Actual:   1
Pred:   2   Actual:   3
Pred:   0   Actual:   0
Pred:   1   Actual:   1
Pred:   1   Actual:   1
Pred:   1   Actual:   1
Pred:   0   Actual:   0
Pred:   0   Actual:   0
Pred:   0   Actual:   0
Pred:   

In [8]:
# Write each store's fitted model to the working directory as <store_id>.pkl.
for store_id, fitted_model in store_models.items():
    joblib.dump(fitted_model, f"/kaggle/working/{store_id}.pkl")