# In[18]:
import pandas as pd

from mlxtend.frequent_patterns import fpgrowth, association_rules

# In[19]:
# Load the cleaned order lines, reading only the columns this analysis uses:
# the four basket fields plus the three exclusion flags.
basket_cols = ['order_id', 'sku_name', 'order_date_only', 'quantity']
flag_cols = ['order_id_cancelled', 'order_id_adjustment', 'sku_id_no_digit']

df = pd.read_parquet(
    'data/preprocessed/cleaned_data.parquet',
    columns=basket_cols + flag_cols,
)

# Keep a row only when none of the exclusion flags is set.
# NOTE(review): the `~` negation assumes the flag columns are boolean — confirm.
unflagged = (
    ~df['order_id_cancelled']
    & ~df['order_id_adjustment']
    & ~df['sku_id_no_digit']
)

# One .loc selection plus an explicit copy, so `df` is an independent frame:
# the later in-place write to `order_date_only` then does not land on a slice
# of the loaded data (which is what triggers pandas' SettingWithCopyWarning).
df = df.loc[unflagged, basket_cols].copy()

# --- Parameters
MIN_SUPPORT     = 0.01   # 1% minimum support
MIN_CONFIDENCE  = 0.30   # 30% minimum confidence
MIN_LIFT        = 1.20   # 1.2 minimum lift
LOOKBACK_MONTHS = 3      # length of the trailing analysis window, in calendar months

# --- Step 1: Filter last LOOKBACK_MONTHS months
# `assign` returns a new frame instead of writing a column into `df`, which at
# this point is a filtered selection of the loaded data; an in-place write
# there can raise SettingWithCopyWarning.
df = df.assign(order_date_only=pd.to_datetime(df['order_date_only']))

# The window is anchored on the most recent date present in the data, not on
# today's date, and the cutoff day itself is included (>=).
max_date    = df['order_date_only'].max()
cutoff_date = max_date - pd.DateOffset(months=LOOKBACK_MONTHS)
df_recent   = df[df['order_date_only'] >= cutoff_date]

# --- Step 2: Build boolean basket
# Total quantity for every (order, SKU) pair that occurs in the window.
qty_by_order_sku = df_recent.groupby(['order_id', 'sku_name'])['quantity'].sum()

# Spread SKUs across the columns: one row per order, one column per SKU.
# Pairs that never occur are filled with 0, so after the bool cast a cell is
# True exactly when the summed quantity for that order/SKU is non-zero.
basket = qty_by_order_sku.unstack(level='sku_name', fill_value=0).astype(bool)

# --- Step 3: Mine frequent itemsets up to size MAX_ITEMSET_LEN
# Largest itemset size to mine. Named like the thresholds in the Parameters
# section so the cap can be tuned without hunting for a literal in the call.
MAX_ITEMSET_LEN = 3

frequent_itemsets = fpgrowth(
    basket,
    min_support=MIN_SUPPORT,
    use_colnames=True,          # report itemsets by basket column label (SKU name)
    max_len=MAX_ITEMSET_LEN,
)

# --- Step 4: Generate association rules with min_confidence
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=MIN_CONFIDENCE,
)

# --- Step 5: Optional lift filter & size filter
# Each condition is named so the final selection reads as a sentence.
meets_lift = rules['lift'] >= MIN_LIFT

# Antecedents of one or two items: 1→1 gives 2-item rules; 2→1 gives 3-item.
# NOTE(review): only the antecedent size is checked here; consequent size is
# not restricted, so a 1→2 rule also passes this filter — confirm intended.
antecedent_size = rules['antecedents'].map(len)
small_antecedent = antecedent_size.isin([1, 2])

# Copy so that later edits to `rules` work on an independent frame.
rules = rules.loc[meets_lift & small_antecedent].copy()

# --- Step 6: Sort by confidence descending
# ignore_index=True renumbers the rows 0..n-1 as part of the sort, so the
# result carries a fresh positional index instead of the pre-sort labels.
rules_sorted = rules.sort_values(by='confidence', ascending=False, ignore_index=True)

# --- Step 7: Tidy up for display
def _format_itemset(items):
    """Render an itemset as a comma-separated string with members in sorted order.

    The rule columns hold set-like collections, and set iteration order for
    strings can differ between interpreter runs (string hash randomization).
    Sorting first makes the same rule serialize to the same text on every
    run, e.g. always "A, B" and never "B, A".
    """
    return ', '.join(sorted(map(str, items)))


rules_sorted['antecedents'] = rules_sorted['antecedents'].apply(_format_itemset)
rules_sorted['consequents'] = rules_sorted['consequents'].apply(_format_itemset)

# Not referenced by the export below; kept because later cells may use it.
display_cols = ['antecedents','consequents','support','confidence','lift']

# Persist the formatted rules. Lift is used for filtering upstream but is not
# part of the exported columns.
rules_sorted[['antecedents','consequents','support','confidence']].to_parquet('data/preprocessed/market_basket.parquet')