In [45]:
import pandas as pd

def preprocess_sales_data_flexible(df, n_past_week=6, n_future_week=1, start_date=None, end_date=None):
    """
    Preprocesses the sales dataset to generate time-based features and future sales aggregation.

    Parameters:
    df (pd.DataFrame): The sales data
    n_past_week (int): Number of past months to use for features (default=6)
    n_future_week (int): Number of future months to predict aggregated sales (default=1)
    start_date (str): Optional start date for filtering data (format: 'YYYY-MM')
    end_date (str): Optional end date for filtering data (format: 'YYYY-MM')

    Returns:
    pd.DataFrame: Processed dataset with time-series features and future target sales
    """
    # Convert Transaction Date to datetime format
    df["Transaction Date"] = pd.to_datetime(df["Transaction Date"])

    # Extract Week (starting on Sunday) for aggregation
    df["Year_Week"] = df["Transaction Date"].dt.to_period("W-SUN").astype(str)

    # Convert Discount Applied to binary values (True=1, False=0)
    df["Discount Applied"] = df["Discount Applied"].fillna(False).astype(int)

    # Aggregate basic features
    grouped = df.groupby("Year_Week").agg(
        avg_repeat_purchases=("Customer ID", lambda x: x.duplicated().mean()),
        total_sales=("Total Spent", "sum"),
        unique_transactions=("Transaction ID", "nunique"),
        unique_customers=("Customer ID", "nunique"),
        discounted_transactions=("Discount Applied", "sum"),
    ).reset_index()

    # Aggregate sales per category
    category_sales = df.groupby(["Year_Week", "Category"])["Total Spent"].sum().unstack(fill_value=0)

    # Aggregate unique transactions per payment method
    payment_transactions = df.groupby(["Year_Week", "Payment Method"])["Transaction ID"].nunique().unstack(fill_value=0)

    # Aggregate unique transactions per location
    location_transactions = df.groupby(["Year_Week", "Location"])["Transaction ID"].nunique().unstack(fill_value=0)

    # Merge all aggregated data
    final_df = grouped.merge(category_sales, on="Year_Week", how="left") \
        .merge(payment_transactions, on="Year_Week", how="left") \
        .merge(location_transactions, on="Year_Week", how="left")

    # Fill NaNs resulting from merging
    final_df.fillna(0, inplace=True)

    # Generate past weeks features (t-1 to t-N)
    feature_cols = [col for col in final_df.columns if col != "Year_Week"]
    past_week_cols = ["unique_transactions", "unique_customers", "discounted_transactions"] + feature_cols
    for col in feature_cols:
        for i in range(1, n_past_week + 1):
            final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
    
    # Generate future target sales (T+1 to T+n_future_week)
    for i in range(1, n_future_week + 1):
        final_df[f"y_{i}"] = (
            final_df["total_sales"].shift(-i).fillna(0)
        )
    
    # Apply date filtering if specified
    if start_date:
        final_df = final_df[final_df["Year_Week"] >= start_date]
    if end_date:
        final_df = final_df[final_df["Year_Week"] <= end_date]
    
    return final_df


In [46]:
data = pd.read_csv('data/retail_store_sales.csv')

In [47]:
preprocess_sales_data_flexible(data,30,15)

  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_df[f"{col}_t-{i}"] = final_df[col].shift(i).fillna(0)
  final_

Unnamed: 0,Year_Week,avg_repeat_purchases,total_sales,unique_transactions,unique_customers,discounted_transactions,Beverages,Butchers,Computers and electric accessories,Electric household essentials,...,y_6,y_7,y_8,y_9,y_10,y_11,y_12,y_13,y_14,y_15
0,2021-12-27/2022-01-02,0.263158,2778.0,19,14,8,461.0,604.5,150.0,350.0,...,11102.5,12539.0,8757.0,12574.5,8211.0,8896.5,8229.5,8254.5,8885.0,10365.5
1,2022-01-03/2022-01-09,0.714286,9666.5,84,24,26,747.0,1239.5,1823.0,1028.0,...,12539.0,8757.0,12574.5,8211.0,8896.5,8229.5,8254.5,8885.0,10365.5,9071.5
2,2022-01-10/2022-01-16,0.747368,12803.0,95,24,39,1718.5,2277.5,1657.5,1682.0,...,8757.0,12574.5,8211.0,8896.5,8229.5,8254.5,8885.0,10365.5,9071.5,10984.5
3,2022-01-17/2022-01-23,0.744898,13441.0,98,25,38,1739.0,1789.0,1442.0,1637.5,...,12574.5,8211.0,8896.5,8229.5,8254.5,8885.0,10365.5,9071.5,10984.5,7874.0
4,2022-01-24/2022-01-30,0.687500,11912.5,80,25,31,471.5,2127.5,2193.0,601.0,...,8211.0,8896.5,8229.5,8254.5,8885.0,10365.5,9071.5,10984.5,7874.0,9517.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,2024-12-16/2024-12-22,0.689189,9124.5,74,23,21,457.0,467.0,816.5,1611.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
156,2024-12-23/2024-12-29,0.728261,12620.5,92,25,34,2084.0,576.0,2589.5,1326.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
157,2024-12-30/2025-01-05,0.744898,11858.0,98,25,25,1887.0,1819.0,1708.5,1332.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
158,2025-01-06/2025-01-12,0.679487,9902.5,78,25,24,467.0,1790.0,1551.0,943.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
