In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred) ** 0.5

# 1) Load + scope
df = pd.read_csv("retail_store_inventory.csv")
df = df[(df["Region"] == "North") & (df["Category"] == "Groceries")].copy()
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date").reset_index(drop=True)

# 2) Date features (+ cyclical month)
df["DayOfWeek"] = df["Date"].dt.day_name()
df["Month"] = df["Date"].dt.month.astype(int)
df["Month_sin"] = np.sin(2*np.pi*df["Month"]/12)
df["Month_cos"] = np.cos(2*np.pi*df["Month"]/12)

# 3) Train/Test split (time-based)
split_idx = int(len(df) * 0.8)
train = df.iloc[:split_idx].copy()
test  = df.iloc[split_idx:].copy()

y_train = train["Units Sold"].astype(float)
y_test  = test["Units Sold"].astype(float)

numeric_features = ["Inventory Level", "Price", "Discount", "Competitor Pricing", "Month_sin", "Month_cos"]
categorical_features = ["Weather Condition", "Holiday/Promotion", "Seasonality", "DayOfWeek"]

X_train = train[numeric_features + categorical_features]
X_test  = test[numeric_features + categorical_features]

# 4) Preprocess
numeric_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2, include_bias=False))  # 상호작용/곡선
])

cat_pipe = OneHotEncoder(handle_unknown="ignore")

preprocess = ColumnTransformer([
    ("num", numeric_pipe, numeric_features),
    ("cat", cat_pipe, categorical_features)
])

# 5) Model: Ridge (규제)
model = Pipeline([
    ("preprocess", preprocess),
    ("ridge", Ridge())
])

# 6) Log transform target (train only), and inverse transform predictions
y_train_log = np.log1p(y_train)

# 7) TimeSeries CV + alpha 튜닝
tscv = TimeSeriesSplit(n_splits=5)
param_grid = {"ridge__alpha": [0.1, 1, 10, 50, 100, 200]}

search = GridSearchCV(
    model, param_grid=param_grid,
    cv=tscv, scoring="neg_mean_absolute_error",
    n_jobs=-1
)

search.fit(X_train, y_train_log)

best_model = search.best_estimator_
pred_log = best_model.predict(X_test)
pred = np.expm1(pred_log)  # back to original scale

# 8) Evaluate
print("Best alpha:", search.best_params_)
print("RMSE:", rmse(y_test, pred))
print("MAE :", mean_absolute_error(y_test, pred))

Best alpha: {'ridge__alpha': 200}
RMSE: 94.00841334554181
MAE : 73.22650560865598


In [4]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred) ** 0.5

df = pd.read_csv("retail_store_inventory.csv")
df = df[(df["Region"] == "North") & (df["Category"] == "Groceries")].copy()
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values(["Product ID", "Date"]).reset_index(drop=True)

# Lag features per Product
df["lag_1"] = df.groupby("Product ID")["Units Sold"].shift(1)
df["rolling_7_mean"] = df.groupby("Product ID")["Units Sold"].shift(1).rolling(7).mean().reset_index(level=0, drop=True)

# Date features
df["DayOfWeek"] = df["Date"].dt.day_name()
df["Month"] = df["Date"].dt.month.astype(int)

# Drop rows where lag is missing (초기 구간)
df = df.dropna(subset=["lag_1", "rolling_7_mean"]).copy()

# Time split (after feature creation)
split_idx = int(len(df) * 0.8)
train = df.iloc[:split_idx].copy()
test  = df.iloc[split_idx:].copy()

y_train = train["Units Sold"].astype(float)
y_test  = test["Units Sold"].astype(float)

numeric_features = ["Inventory Level", "Price", "Discount", "Competitor Pricing", "lag_1", "rolling_7_mean"]
categorical_features = ["Weather Condition", "Holiday/Promotion", "Seasonality", "DayOfWeek", "Month"]

X_train = train[numeric_features + categorical_features]
X_test  = test[numeric_features + categorical_features]

preprocess = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

model = Pipeline([
    ("preprocess", preprocess),
    ("ridge", Ridge(alpha=10))
])

# (선택) log target도 같이 쓰면 더 좋아질 수 있음
y_train_log = np.log1p(y_train)
model.fit(X_train, y_train_log)

pred_log = model.predict(X_test)
pred = np.expm1(pred_log)

print("RMSE:", rmse(y_test, pred))
print("MAE :", mean_absolute_error(y_test, pred))

RMSE: 96.8133049761552
MAE : 73.51041323052071
