In [None]:
# Retail Sales Forecasting - Jupyter Notebook Version

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the raw sales history and the per-store feature table from the working directory.
sales_df, features_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "features.csv")
)

In [None]:
# Convert the 'Date' column of both frames to datetime64 so they can be
# grouped, merged and used as a time index later in the notebook.
for frame in (sales_df, features_df):
    frame['Date'] = pd.to_datetime(frame['Date'])

In [None]:
# 1. Total Weekly Sales Visualization
# Sum Weekly_Sales over all rows sharing a date; the result keeps "Date" as a
# regular column (not the index) because the next section sets the index itself.
weekly_sales = sales_df.groupby("Date", as_index=False)["Weekly_Sales"].sum()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    weekly_sales["Date"],
    weekly_sales["Weekly_Sales"],
    label="Total Weekly Sales",
    color="blue",
)
ax.set_title("Total Weekly Sales Over Time (All Stores)")
ax.set_xlabel("Date")
ax.set_ylabel("Weekly Sales (USD)")
ax.grid(True)
fig.tight_layout()
# Persist the figure next to the notebook before rendering it inline.
fig.savefig("plot_sales.png")
plt.show()

In [None]:
# 2. Seasonal Decomposition
# Split the total weekly sales series into trend, seasonal and residual parts.

# Guarded so the cell survives a re-run: after the first execution "Date" is
# already the index and an unconditional set_index("Date") would raise KeyError.
if "Date" in weekly_sales.columns:
    weekly_sales = weekly_sales.set_index("Date")
weekly_sales = weekly_sales.sort_index()

# Anchor the weekly grid to the weekday the data is actually reported on.
# A bare "W" is an alias for "W-SUN"; when the observations fall on any other
# weekday, asfreq("W") reindexes onto Sundays and every value becomes NaN.
week_anchor = weekly_sales.index[0].day_name()[:3].upper()
weekly_sales = weekly_sales.asfreq(f"W-{week_anchor}")

# Fill weeks missing from the regular grid: interpolate interior gaps, then pad
# the edges. .ffill()/.bfill() replace the deprecated fillna(method=...).
weekly_sales["Weekly_Sales"] = weekly_sales["Weekly_Sales"].interpolate().ffill().bfill()

# NOTE(review): period=26 models a half-year cycle; a yearly cycle on weekly
# data would be period=52 (needs at least 104 observations) - confirm intent.
decomposition = seasonal_decompose(weekly_sales["Weekly_Sales"], model="additive", period=26)

fig = decomposition.plot()
fig.set_size_inches(12, 8)
# Re-run the layout after resizing so the four panels do not overlap.
fig.tight_layout()
fig.savefig("plot_decomposition.png")
plt.show()

In [None]:
# 3. Forecast vs Actual using rolling average
# Naive baseline: the forecast for a week is the mean of the preceding weeks.
ROLLING_WINDOW = 8  # number of prior weeks averaged into each forecast

# shift(1) keeps the current week's actual out of its own forecast. Without it
# the rolling mean includes the value being "predicted", which is smoothing
# with look-ahead leakage rather than a forecast.
weekly_sales["Forecast"] = (
    weekly_sales["Weekly_Sales"].shift(1).rolling(window=ROLLING_WINDOW).mean()
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(weekly_sales.index, weekly_sales["Weekly_Sales"], label="Actual Sales")
ax.plot(weekly_sales.index, weekly_sales["Forecast"], label="Rolling Forecast")
ax.set_title("Forecast vs Actual Sales")
ax.set_xlabel("Date")
ax.set_ylabel("Weekly Sales (USD)")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig("plot_forecast.png")
plt.show()

In [None]:
# 4. Feature Importance with Gradient Boosting
# Join sales with the store-level features, fit a gradient-boosted regressor on
# Weekly_Sales and rank the input columns by the model's feature importances.
merged_df = pd.merge(sales_df, features_df, on=['Store', 'Date', 'IsHoliday'])
model_df = merged_df.drop(columns=['Weekly_Sales', 'Date'])

# Integer-encode any text columns so the regressor receives numeric input only.
for col in model_df.select_dtypes(include='object').columns:
    model_df[col] = LabelEncoder().fit_transform(model_df[col].astype(str))

# NOTE(review): dropna() removes every row with a missing value in any feature;
# if some columns are sparsely populated this can discard most of the data -
# compare X.shape with model_df.shape before trusting the ranking.
X = model_df.dropna()
# Select the target by X's surviving index labels so rows stay aligned.
y = merged_df.loc[X.index, 'Weekly_Sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Seed the model as well as the split so importances are reproducible run to run.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)
# Use the held-out split that was previously created but never evaluated.
print(f"Hold-out R^2: {model.score(X_test, y_test):.3f}")

importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
# One viridis colour per bar, drawn with matplotlib directly: passing
# palette= to sns.barplot without hue= is deprecated in seaborn 0.13.
bar_colors = sns.color_palette("viridis", n_colors=len(importances))
ax.barh(importances.index, importances.values, color=bar_colors)
# barh draws the first entry at the bottom; flip so the top feature is on top.
ax.invert_yaxis()
ax.set_title("Feature Importance (Gradient Boosting)")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Feature")
fig.tight_layout()
fig.savefig("plot_importance.png")
plt.show()