In [1]:
# loading data + prerocssing + splitting

import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, StackingRegressor

def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred) ** 0.5

df = pd.read_csv("retail_store_inventory.csv")
# delete 'df = df[(df["Region"] == "North") & (df["Category"] == "Groceries")].copy()'

df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date").reset_index(drop=True)

split_idx = int(len(df) * 0.8)
train = df.iloc[:split_idx].copy()
test  = df.iloc[split_idx:].copy()

In [2]:
# -------------------------
# Feature definition
# -------------------------

numeric_features = [
    "Inventory Level",
    "Price",
    "Discount",
    "Competitor Pricing"
]

categorical_features = [
    "Weather Condition",
    "Holiday/Promotion",
    "Seasonality",
    "Region",
    "Category"
]

# 날짜 파생 변수 (이미 Date는 datetime으로 변환됨)
for d in [train, test]:
    d["DayOfWeek"] = d["Date"].dt.day_name()
    d["Month"] = d["Date"].dt.month

categorical_features += ["DayOfWeek", "Month"]

features = numeric_features + categorical_features

In [3]:
# Sparating Feature / Target 

X_train = train[features]
X_test  = test[features]

y_train = train["Units Sold"]
y_test  = test["Units Sold"]

In [4]:
# Baseline 2 types
pred_mean = pd.Series([y_train.mean()] * len(test), index=test.index)

pred_forecast = test["Demand Forecast"]

In [5]:
# features

numeric_features = [
    "Inventory Level",
    "Price",
    "Discount",
    "Competitor Pricing",
]
categorical_features = [
    "Weather Condition",
    "Holiday/Promotion",
    "Seasonality",
]

# Date 파생 변수 추가 (DayOfWeek, Month)

for d in [train, test]:
    d["DayOfWeek"] = d["Date"].dt.day_name()
    d["Month"] = d["Date"].dt.month.astype(int)

categorical_features += ["DayOfWeek", "Month"]

feature_cols = numeric_features + categorical_features

X_train = train[feature_cols]
X_test  = test[feature_cols]

In [6]:
# Preprocess (One-hot for categorical)
preprocess = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

In [7]:
# Linear Regression
lr_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LinearRegression())
])

# Random Forest
rf_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", RandomForestRegressor(
        n_estimators=300,
        random_state=42,
        n_jobs=-1
    ))
])

# Stacking (LR + RF -> meta LR)
#     ✅ 각 base estimator도 "전처리 포함 Pipeline"이어야 함
stack_model = StackingRegressor(
    estimators=[
        ("lr", lr_model),
        ("rf", rf_model),
    ],
    final_estimator=LinearRegression(),
    passthrough=False  # True로 하면 원래 feature도 meta에 같이 들어감(원하면 True로 바꿔도 됨)
)


In [8]:
# Train + Predict + Evaluate
# -----------------------
# Train
lr_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
stack_model.fit(X_train, y_train)

# Predict
pred_lr = lr_model.predict(X_test)
pred_rf = rf_model.predict(X_test)
pred_stack = stack_model.predict(X_test)

# Compare table
results = pd.DataFrame({
    "Model": [
        "Baseline 1 (Mean)",
        "Baseline 2 (Demand Forecast)",
        "Linear Regression",
        "Random Forest",
        "Stacking (LR + RF)"
    ],
    "RMSE": [
        rmse(y_test, pred_mean),
        rmse(y_test, pred_forecast),
        rmse(y_test, pred_lr),
        rmse(y_test, pred_rf),
        rmse(y_test, pred_stack)
    ],
    "MAE": [
        mean_absolute_error(y_test, pred_mean),
        mean_absolute_error(y_test, pred_forecast),
        mean_absolute_error(y_test, pred_lr),
        mean_absolute_error(y_test, pred_rf),
        mean_absolute_error(y_test, pred_stack)
    ]
}).sort_values("RMSE")

print(results)

                          Model        RMSE        MAE
1  Baseline 2 (Demand Forecast)   10.052654   8.364171
4            Stacking (LR + RF)   87.663157  68.675955
2             Linear Regression   87.666167  68.676636
3                 Random Forest   89.298671  69.631864
0             Baseline 1 (Mean)  108.241251  88.808149


In [9]:
scope_compare = pd.DataFrame({
    "Scope": ["North + Groceries", "All Data"],
    "RMSE": [87.66, 88.10],
    "MAE": [68.68, 69.20]
})

scope_compare.to_csv("scope_comparison.csv", index=False)