In [1]:
# =========================
# Imports & Setup
# =========================
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_absolute_error


In [2]:

# =========================
# Paths
# =========================
# Input dataset read by the loading cell.
DATA_FILE = "data/processed_Data.csv"

# Single source of truth for the artifact directory: both artifact paths and
# the directory creation below derive from it, so they cannot drift apart.
MODEL_DIR = "model_files"
MODEL_FILE = f"{MODEL_DIR}/forecast_model.pkl"
PIPELINE_FILE = f"{MODEL_DIR}/feature_pipeline.pkl"

# Create the artifact directory up front; exist_ok makes re-running the cell safe.
os.makedirs(MODEL_DIR, exist_ok=True)


In [3]:

# =========================
# Load Data
# =========================
# Read the processed dataset and cast the three calendar columns to integers
# in one chained expression.
df = pd.read_csv(DATA_FILE).astype({"year": int, "month": int, "day": int})


In [4]:

# =========================
# Feature Engineering
# =========================
# Day zero of the trend axis: "t" counts whole days elapsed since this date.
ORIGIN_DATE = pd.Timestamp("2019-01-01")


def feature_engineering(X):
    """Derive trend and seasonality features from year/month/day columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Must expose ``year``, ``month`` and ``day`` columns. The frame is not
        modified; all work happens on a copy produced by ``assign``.

    Returns
    -------
    pandas.DataFrame
        Same index as ``X`` with the columns ``t``, ``sin_year``, ``cos_year``,
        ``sin_wom``, ``cos_wom``, ``t_sin_wom`` and ``t_cos_wom``, in that order.
    """
    full_turn = 2 * np.pi

    enriched = X.assign(
        # Assemble a proper timestamp from the calendar components.
        date=pd.to_datetime({"year": X.year, "month": X.month, "day": X.day})
    ).assign(
        # Monotonic time axis (trend): days since ORIGIN_DATE.
        t=lambda frame: (frame["date"] - ORIGIN_DATE).dt.days,
        # Yearly seasonality encoded on a 365.25-day circle.
        sin_year=lambda frame: np.sin(full_turn * frame["t"] / 365.25),
        cos_year=lambda frame: np.cos(full_turn * frame["t"] / 365.25),
        # Week of month: days 1-7 -> 1, 8-14 -> 2, ...; days 29-31 would be
        # week 5 and are clipped into week 4.
        week_of_month=lambda frame: (((frame["day"] - 1) // 7) + 1).clip(1, 4),
        # Week-of-month seasonality encoded on a 4-step circle.
        sin_wom=lambda frame: np.sin(full_turn * frame["week_of_month"] / 4),
        cos_wom=lambda frame: np.cos(full_turn * frame["week_of_month"] / 4),
        # Optional interaction with trend (drop these two if unwanted).
        t_sin_wom=lambda frame: frame["t"] * frame["sin_wom"],
        t_cos_wom=lambda frame: frame["t"] * frame["cos_wom"],
    )

    # Only the model inputs are returned; helper columns stay behind.
    model_inputs = [
        "t",
        "sin_year", "cos_year",
        "sin_wom", "cos_wom",
        "t_sin_wom", "t_cos_wom",
    ]
    return enriched[model_inputs]


In [5]:

# =========================
# Pipeline
# =========================
# Wrap the feature function as a transformer so the same preprocessing can be
# applied to training, hold-out and future frames.
# NOTE(review): this pipeline is later persisted with joblib; pickle stores
# functions by reference, so loading it presumably requires
# `feature_engineering` to be importable in the loading process - confirm.
feature_step = FunctionTransformer(feature_engineering)
pipeline = Pipeline(steps=[("features", feature_step)])


In [6]:

# =========================
# Train / Test Split (TIME-AWARE)
# =========================
# Order rows chronologically so the hold-out set is strictly the latest data.
df = df.sort_values(["year", "month", "day"])

# First 80% of the rows train the model; the remaining 20% are held out.
split_index = int(0.8 * len(df))
train_df, test_df = df.iloc[:split_index], df.iloc[split_index:]

# Features come from the pipeline; the target is the "price" column.
X_train, y_train = pipeline.fit_transform(train_df), train_df["price"]
X_test, y_test = pipeline.transform(test_df), test_df["price"]


In [10]:
# Inspect the engineered training features (rendered by the notebook as a table).
X_train

Unnamed: 0,t,sin_year,cos_year,sin_wom,cos_wom,t_sin_wom,t_cos_wom
2455,0,0.000000,1.000000,1.000000e+00,6.123234e-17,0.000000e+00,0.000000e+00
2454,1,0.017202,0.999852,1.000000e+00,6.123234e-17,1.000000e+00,6.123234e-17
2453,2,0.034398,0.999408,1.000000e+00,6.123234e-17,2.000000e+00,1.224647e-16
2452,3,0.051584,0.998669,1.000000e+00,6.123234e-17,3.000000e+00,1.836970e-16
2451,4,0.068755,0.997634,1.000000e+00,6.123234e-17,4.000000e+00,2.449294e-16
...,...,...,...,...,...,...,...
496,1986,0.383428,-0.923571,1.224647e-16,-1.000000e+00,2.432149e-13,-1.986000e+03
495,1987,0.367485,-0.930030,1.224647e-16,-1.000000e+00,2.433373e-13,-1.987000e+03
494,1988,0.351432,-0.936213,1.224647e-16,-1.000000e+00,2.434598e-13,-1.988000e+03
493,1989,0.335276,-0.942120,1.224647e-16,-1.000000e+00,2.435822e-13,-1.989000e+03


In [7]:

# =========================
# Train Model
# =========================
# Fix the seed so training is reproducible: scikit-learn permutes the features
# at every split, so without a fixed random_state, splits that tie on the
# criterion can be chosen differently from run to run, changing the fitted
# tree and the reported MAE.
RANDOM_STATE = 42

model = DecisionTreeRegressor(random_state=RANDOM_STATE)
model.fit(X_train, y_train)


In [8]:

# =========================
# Evaluation
# =========================
# Score the fitted model on the chronologically later hold-out rows.
preds = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=preds)

print(f"MAE on hold-out future data: {mae:.2f}")


MAE on hold-out future data: 414.80


In [9]:

# =========================
# Save Artifacts
# =========================
# Persist the fitted model and the feature pipeline to their configured paths.
for artifact, destination in ((model, MODEL_FILE), (pipeline, PIPELINE_FILE)):
    joblib.dump(artifact, destination)

print("Model and pipeline saved successfully.")


Model and pipeline saved successfully.


In [None]:

# =========================
# Forecast Example (Future Dates)
# =========================
# One (year, month, day) row per date to forecast.
future_dates = pd.DataFrame(
    [
        (2026, 2, 5),
        (2026, 2, 12),
        (2026, 2, 20),
        (2027, 7, 12),
    ],
    columns=["year", "month", "day"],
)

# Reuse the training-time feature pipeline, then predict.
# NOTE(review): these dates lie beyond the training range of "t"; a decision
# tree predicts piecewise-constant values, so it presumably cannot extend the
# trend past the last training rows - confirm the forecasts are meaningful.
X_future = pipeline.transform(future_dates)
future_predictions = model.predict(X_future)

future_dates = future_dates.assign(predicted_price=future_predictions)
print("\nFuture Forecasts:")
print(future_dates)
