# Section 1: Project Introduction

# Sales Forecasting Project
#### This project forecasts sales using time-series data from Rossmann stores. It includes data preprocessing, feature engineering, training an XGBoost model, and generating forecasts.

# Section 2: Import and Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [None]:
# Load the two input tables, then attach the store table's columns to every
# training row.
train_df = pd.read_csv("../data/train.csv")
store_df = pd.read_csv("../data/store.csv")
# Left join on "Store": every row of train_df is kept, and rows without a
# matching store entry get NaN in the store columns.
df = train_df.merge(store_df, how="left", on="Store")

In [None]:
# Filter and preprocess
# Keep only rows where the store was open and sales were positive. The explicit
# .copy() makes the result an independent frame, so the column assignments
# below do not write into a slice of the merged data (SettingWithCopyWarning).
df = df[(df["Open"] != 0) & (df["Sales"] > 0)].copy()

# Break the date into numeric calendar features; the raw Date column is
# dropped afterwards.
df["Date"] = pd.to_datetime(df["Date"])
df["Month"] = df["Date"].dt.month
df["Day"] = df["Date"].dt.day
df["Year"] = df["Date"].dt.year

# "Customers" is removed together with "Date" so neither is used as a feature.
df = df.drop(columns=["Date", "Customers"])

# Missing values become 0 in every column.
df = df.fillna(0)

# XGBoost's scikit-learn wrapper only accepts numeric/bool (or category)
# columns, so encode whatever non-numeric columns remain as integer codes.
# Casting to str first collapses mixed values (e.g. the int 0 inserted by
# fillna next to string labels) into a single consistent label set.
# NOTE(review): these are presumably categorical labels from the store
# metadata -- verify against the CSVs. The codes follow sorted label order and
# carry no numeric meaning beyond identifying the category.
for column in df.select_dtypes(exclude=["number", "bool"]).columns:
    df[column] = df[column].astype(str).astype("category").cat.codes

# Section 3: Model Training

In [None]:
# Separate the features from the "Sales" target, then hold out 20% of the
# rows for evaluation. random_state pins the shuffle so the split is
# reproducible.
y = df["Sales"]
X = df.drop("Sales", axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the gradient-boosted regressor on the training split.
xgb_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 6,
    "random_state": 42,
}
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Persist the fitted model as a pickle file.
with open("../notebooks/final_xgboost_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# Section 4: Evaluation & Visualization

In [None]:
# Score the held-out rows and report RMSE and R^2 against the true sales.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the mean squared error
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

In [None]:
# Scatter the actual and predicted sales against their row position in the
# test split, overlaid on one set of axes.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))

actual_positions = range(len(y_test))
predicted_positions = range(len(y_pred))

ax.scatter(actual_positions, y_test, label='Actual')
# Predictions are drawn semi-transparent so the actual points stay visible.
ax.scatter(predicted_positions, y_pred, label='Predicted', alpha=0.6)

ax.set_title('Actual vs Predicted Sales')
ax.legend()
plt.show()