In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Paths (the notebook lives in rossmann/notebooks, so the project root is the
# parent of the current working directory)
PROJECT_DIR = Path.cwd().parent
DATA_DIR, ARTIFACTS_DIR, SUBMISSION_DIR = (
    PROJECT_DIR / folder for folder in ("data", "artifacts", "submission")
)

# Make sure both output folders exist before any later cell writes to them.
for output_dir in (ARTIFACTS_DIR, SUBMISSION_DIR):
    output_dir.mkdir(exist_ok=True)

print("PROJECT_DIR:", PROJECT_DIR)
print("DATA_DIR:", DATA_DIR)


PROJECT_DIR: C:\Users\USER\Desktop\rossman
DATA_DIR: C:\Users\USER\Desktop\rossman\data


In [2]:
# Both CSVs are read with the same options: StateHoliday is forced to the pandas
# string dtype and low_memory is disabled so each file is parsed in one pass.
csv_options = {"dtype": {"StateHoliday": "string"}, "low_memory": False}

train = pd.read_csv(DATA_DIR / "train_processed.csv", **csv_options)
test = pd.read_csv(DATA_DIR / "test_processed.csv", **csv_options)

# Convert the Date column to datetime so the .dt accessor can be used later.
for frame in (train, test):
    frame["Date"] = pd.to_datetime(frame["Date"])

print("train:", train.shape)
print("test:", test.shape)
train.head()


train: (1017209, 24)
test: (41088, 23)


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,...,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,CompetitionDistance_missing,CompetitionOpenSinceMonth_missing,CompetitionOpenSinceYear_missing,Promo2SinceWeek_missing,Promo2SinceYear_missing,PromoInterval_missing
0,1,5,2015-07-31,5263,555,1,1,0,1,c,...,0,0,0,,0,0,0,1,1,1
1,2,5,2015-07-31,6064,625,1,1,0,1,a,...,1,13,2010,"Jan,Apr,Jul,Oct",0,0,0,0,0,0
2,3,5,2015-07-31,8314,821,1,1,0,1,a,...,1,14,2011,"Jan,Apr,Jul,Oct",0,0,0,0,0,0
3,4,5,2015-07-31,13995,1498,1,1,0,1,c,...,0,0,0,,0,0,0,1,1,1
4,5,5,2015-07-31,4822,559,1,1,0,1,a,...,0,0,0,,0,0,0,1,1,1


In [3]:
def add_date_features(df):
    """Return a copy of ``df`` with calendar features derived from ``Date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a datetime-typed ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with four extra columns:
        ``Year``, ``Month``, ``Day`` and ``WeekOfYear`` (ISO week number,
        cast to a plain integer dtype).
    """
    dates = df["Date"]
    return df.assign(
        Year=dates.dt.year,
        Month=dates.dt.month,
        Day=dates.dt.day,
        WeekOfYear=dates.dt.isocalendar().week.astype(int),
    )

# Derive the same calendar columns on both splits so their feature sets match.
train, test = (add_date_features(frame) for frame in (train, test))


In [4]:
# Time-based hold-out: rows dated after the cutoff (the final 42 days of the
# training data) become the validation set; everything up to and including the
# cutoff is used for fitting.
train = train.sort_values("Date")

last_day = train["Date"].max()
cutoff_date = last_day - pd.Timedelta(days=42)

on_or_before_cutoff = train["Date"] <= cutoff_date
after_cutoff = train["Date"] > cutoff_date
train_part = train.loc[on_or_before_cutoff].copy()
valid_part = train.loc[after_cutoff].copy()

for label, part in (("Train period:", train_part), ("Valid period:", valid_part)):
    print(label, part["Date"].min(), "->", part["Date"].max(), part.shape)
print("Cutoff date:", cutoff_date)


Train period: 2013-01-01 00:00:00 -> 2015-06-19 00:00:00 (970379, 28)
Valid period: 2015-06-20 00:00:00 -> 2015-07-31 00:00:00 (46830, 28)
Cutoff date: 2015-06-19 00:00:00


In [5]:
# The target plus the columns that must not be used as model inputs.
# NOTE(review): Customers is presumably excluded because it is not available
# at prediction time -- confirm against the test file's columns.
target = "Sales"
drop_cols = ["Sales", "Customers", "Date"]

excluded = set(drop_cols)
feature_cols = [name for name in train.columns if name not in excluded]

X_train, y_train = train_part[feature_cols], train_part[target]
X_valid, y_valid = valid_part[feature_cols], valid_part[target]

print("X_train:", X_train.shape, "X_valid:", X_valid.shape)


X_train: (970379, 25) X_valid: (46830, 25)


In [6]:
# Split the feature columns into text-like and numeric groups: a column counts
# as categorical when its dtype is object or a pandas string dtype; every other
# column is treated as numeric.
cat_cols, num_cols = [], []
for col_name, col_dtype in X_train.dtypes.items():
    is_text = (col_dtype == "object") or ("string" in str(col_dtype))
    (cat_cols if is_text else num_cols).append(col_name)

# Numeric columns: fill gaps with the column median.
# NOTE(review): no scaling step is applied before Ridge, so the L2 penalty acts
# on features with very different ranges -- consider whether that is intended.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

pipe = Pipeline([
    ("prep", preprocess),
    ("model", Ridge(alpha=1.0)),
])

# Fit on the training window, then score the held-out window.
# NOTE(review): these metrics are computed on the raw predictions for every
# validation row, with no clipping or closed-store override applied here.
pred_valid = pipe.fit(X_train, y_train).predict(X_valid)

mae, rmse = (
    metric(y_valid, pred_valid)
    for metric in (mean_absolute_error, root_mean_squared_error)
)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")


MAE: 1741.88
RMSE: 2456.73


In [7]:
# Persist the validation cutoff and both metrics as a small UTF-8 text file.
metrics_path = ARTIFACTS_DIR / "metrics_baseline.txt"
report = (
    f"Validation cutoff_date: {cutoff_date}\n"
    f"MAE: {mae:.2f}\n"
    f"RMSE: {rmse:.2f}\n"
)
metrics_path.write_text(report, encoding="utf-8")

print("Saved metrics to:", metrics_path)


Saved metrics to: C:\Users\USER\Desktop\rossman\artifacts\metrics_baseline.txt


In [8]:
# Final model: same pipeline configuration, refit on the full training data.
# A clone is fitted instead of `pipe` itself so that `pipe` stays the exact
# model whose validation MAE/RMSE were reported; refitting it in place would
# silently replace the validated model for any later cell that uses it.
X_full = train[feature_cols]
y_full = train[target]

final_pipe = clone(pipe)
final_pipe.fit(X_full, y_full)

test_pred = final_pipe.predict(test[feature_cols])

# Guard against negative sales predictions.
test_pred = np.clip(test_pred, 0, None)

# A closed store (Open == 0) sells nothing, so force Sales to 0 there.
# Missing or unparseable Open values are treated as open (1).
open_flag = pd.to_numeric(test["Open"], errors="coerce").fillna(1).values
test_pred[open_flag == 0] = 0

submission = pd.DataFrame({
    "Id": test["Id"],
    "Sales": test_pred,
})

# Cheap invariant check before writing the file.
assert (submission["Sales"] >= 0).all(), "negative or NaN Sales in submission"

sub_path = SUBMISSION_DIR / "submission_baseline.csv"
submission.to_csv(sub_path, index=False)

print("Saved submission to:", sub_path)
submission.head()


Saved submission to: C:\Users\USER\Desktop\rossman\submission\submission_baseline.csv


Unnamed: 0,Id,Sales
0,1,8351.089345
1,2,7449.24187
2,3,8849.9575
3,4,8265.413047
4,5,9157.782389


## Baseline Model Summary

- Model: Ridge Regression + OneHotEncoder + SimpleImputer
- Validation: Time-based split (last 42 days as validation)
- Metrics:
  - MAE: 1741.88
  - RMSE: 2456.73

Output files:
- artifacts/metrics_baseline.txt
- submission/submission_baseline.csv
