# Dataset Resolution Pipeline
Generates **weekly (v7)**, **monthly (v8)**, and **yearly (v9)** aggregated datasets from `customer_shopping_data_v6.csv`.


## Feature Transformation Summary

| Feature | Weekly / Monthly | Yearly |
|---------|------------------|--------|
| `invoice_no`, `customer_id`, `is_weekday` | **Dropped** | **Dropped** |
| `gender`, `category`, `payment_method`, `shopping_mall` | Frequency count (one column per category) | Same |
| `age` | Mean age | Mean age |
| `quantity`, `price`, `total_price` | Sum | Sum |
| `invoice_date` | Converted to the period start: `week_key` (weekly) or `month_key` (monthly) | Converted to the period start: `year_key` |
| `season` | Single season label (mode) | **Excluded** |
| `is_holiday` | **`holiday_days`** = count of distinct holiday dates within the period (at most 7 for weekly, at most 31 for monthly) | Same logic (at most 366) |
| Economic & confidence indices (`Econ_Conf`, `Cons_Conf`, `RealSec_Conf`, `Serv_Conf`, `Retail_Conf`, `Constr_Conf`) | Mean value | Mean value |


In [6]:
# 1️⃣ Imports & load source
import pandas as pd
import numpy as np
from pathlib import Path

# Transaction-level (v6) dataset that every aggregated resolution is built from.
SOURCE_FILE = Path("../datasets/customer_shopping_data_v6.csv")

df = pd.read_csv(SOURCE_FILE)
# errors='coerce' turns unparseable dates into NaT instead of raising.
df = df.assign(invoice_date=pd.to_datetime(df['invoice_date'], errors='coerce'))

# Derive total_price only when the source file does not already provide it.
if 'total_price' not in df:
    df['total_price'] = df['quantity'].mul(df['price'])


In [7]:
# 2️⃣ Helper functions
def add_time_keys(df):
    """Return a copy of ``df`` with period-start columns for each resolution.

    Adds ``week_key``, ``month_key`` and ``year_key``, each holding the start
    timestamp of the week / month / year period that contains the row's
    ``invoice_date``. The input frame is left untouched.
    """
    keyed = df.copy()
    invoice_dt = keyed['invoice_date'].dt
    # (output column, pandas period frequency) for each target resolution.
    resolutions = (('week_key', 'W'), ('month_key', 'M'), ('year_key', 'Y'))
    for column, freq in resolutions:
        keyed[column] = invoice_dt.to_period(freq).dt.start_time
    return keyed

def season_from_date(d):
    """Return the season label for the month of date ``d``.

    Months map as Dec-Feb -> 'Winter', Mar-May -> 'Spring',
    Jun-Aug -> 'Summer', Sep-Nov -> 'Fall'.

    Missing dates (NaT / None / NaN) return NaN instead of raising, because
    ``invoice_date`` is parsed with ``errors='coerce'`` and may contain NaT.
    """
    if pd.isna(d):
        return np.nan
    # month % 12 folds December onto 0 so Dec, Jan, Feb share the first bucket.
    return ('Winter', 'Spring', 'Summer', 'Fall')[(d.month % 12) // 3]


In [8]:
# 3️⃣ Add period keys + season
# Attach the week/month/year keys, then label every row with its season.
df = add_time_keys(df).assign(
    season=lambda frame: frame['invoice_date'].apply(season_from_date)
)


In [9]:
# 4️⃣ Aggregation function
def _first_mode(values):
    """Return the most frequent value of ``values``, or NaN when there is none."""
    modes = values.mode()
    return np.nan if modes.empty else modes.iat[0]


def aggregate_resolution(df, time_col, outfile, include_season=True):
    """Aggregate transactions to one row per ``time_col`` value and save as CSV.

    Parameters
    ----------
    df : DataFrame
        Transaction-level data. Must contain ``time_col``, ``invoice_date``,
        ``is_holiday``, ``season`` (when ``include_season`` is true), the
        numeric columns aggregated below, and the categorical columns
        ``gender``, ``category``, ``payment_method`` and ``shopping_mall``.
    time_col : str
        Name of the period-key column to group by.
    outfile : str or path-like
        Destination CSV path; the period key is written as the index column.
    include_season : bool, default True
        When true, add a ``season`` column holding the most frequent season
        label of each period.

    Output columns: mean ``age`` and confidence indices; summed ``quantity``,
    ``price`` and ``total_price``; integer ``holiday_days``; optional
    ``season``; and one count column per distinct value of each categorical.
    """
    grouped = df.groupby(time_col)

    # Numeric aggregations
    numeric_aggs = {
        'age': 'mean',
        'quantity': 'sum',
        'price': 'sum',
        'total_price': 'sum',
        'Econ_Conf': 'mean',
        'Cons_Conf': 'mean',
        'RealSec_Conf': 'mean',
        'Serv_Conf': 'mean',
        'Retail_Conf': 'mean',
        'Constr_Conf': 'mean'
    }
    summary = grouped.agg(numeric_aggs)

    # Holiday days count (unique calendar dates where is_holiday == 1)
    holiday_days = (df[df['is_holiday'] == 1]
                    .assign(day=lambda x: x['invoice_date'].dt.normalize())
                    .groupby(time_col)['day']
                    .nunique()
                    .rename('holiday_days'))
    summary = summary.join(holiday_days, how='left')
    # Periods without any holiday come out of the left join as NaN, which
    # turns the column into float; fill and cast back so the count is always
    # an integer regardless of the data.
    summary['holiday_days'] = summary['holiday_days'].fillna(0).astype('int64')

    # Season label (mode) for weekly/monthly
    if include_season:
        summary['season'] = grouped['season'].agg(_first_mode)

    # Frequency counts for categoricals: one column per distinct value.
    # NOTE(review): columns are named by the raw category value, so a value
    # shared by two categoricals (or equal to an existing column name) would
    # make the join fail on overlapping columns - confirm labels are unique.
    for col in ('gender', 'category', 'payment_method', 'shopping_mall'):
        counts = pd.crosstab(df[time_col], df[col])
        summary = summary.join(counts, how='left')

    summary.to_csv(outfile)
    print(f"Saved {outfile}")


In [10]:
# 5️⃣ Generate v7 (weekly), v8 (monthly), v9 (yearly)
# (period key, output file, keep season column) for each target resolution.
_RESOLUTIONS = (
    ('week_key', '../datasets/customer_shopping_data_v7_weekly.csv', True),
    ('month_key', '../datasets/customer_shopping_data_v8_monthly.csv', True),
    ('year_key', '../datasets/customer_shopping_data_v9_yearly.csv', False),
)
for _time_col, _outfile, _with_season in _RESOLUTIONS:
    aggregate_resolution(df, _time_col, _outfile, include_season=_with_season)

Saved ../datasets/customer_shopping_data_v7_weekly.csv
Saved ../datasets/customer_shopping_data_v8_monthly.csv
Saved ../datasets/customer_shopping_data_v9_yearly.csv
