# 5) Compare Apriori vs FP-Growth

So sánh thời gian chạy, số lượng itemsets, rules theo các mức min_support.

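> Các biến tham số (để papermill inject):
- `DATA_PATH`: đường dẫn file CSV dữ liệu thô.
- `COUNTRY`: lọc theo quốc gia (tuỳ chọn, để giảm kích thước).
- `MIN_SUPPORT`, `MIN_CONFIDENCE`, `MIN_LIFT`: ngưỡng mining.

> Lưu ý: notebook này đọc trực tiếp `data/processed/basket_bool.parquet`, nên `DATA_PATH` và `COUNTRY` không được dùng trong các cell bên dưới; chúng được giữ lại để thống nhất với các notebook khác trong pipeline.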


In [None]:
# Parameters (papermill)
# Default values for the notebook parameters; papermill injects overrides
# at execution time. Keep one plain `NAME = value` assignment per line.
DATA_PATH = "data/online_retail.csv"  # path to the raw CSV data file
COUNTRY = None  # optional country filter (to reduce data size)
MIN_SUPPORT = 0.01  # support threshold used for mining
MIN_CONFIDENCE = 0.5  # confidence threshold used for mining
MIN_LIFT = 1.0  # lift threshold applied to the mined rules


In [None]:
import pandas as pd
import numpy as np

from src.apriori_library import AssociationRulesMiner, FPGrowthMiner, DataVisualizer

# Load the basket matrix from the processed parquet file, then cast every
# column to bool before handing it to the miners.
basket_bool = pd.read_parquet("data/processed/basket_bool.parquet")
basket_bool = basket_bool.astype(bool)

# One miner instance per algorithm under comparison.
apriori_miner, fp_miner = AssociationRulesMiner(), FPGrowthMiner()


## Thực nghiệm độ nhạy min_support

In [None]:
# Sweep several min_support levels and collect one summary row per
# (support level, algorithm) pair. Extend `supports` to probe more levels.
supports = [0.02, 0.01, 0.005]
labelled_miners = (("Apriori", apriori_miner), ("FP-Growth", fp_miner))
rows = []

for support_level in supports:
    for algo_name, miner in labelled_miners:
        # Only the rules and the run statistics are needed here; the first
        # element returned by run() is discarded.
        _, mined_rules, run_stats = miner.run(
            basket_bool,
            min_support=support_level,
            metric="confidence",
            min_threshold=MIN_CONFIDENCE,
        )

        # run() is asked to threshold on confidence; the lift floor is
        # applied afterwards on the returned rules.
        kept_rules = mined_rules[mined_rules["lift"] >= MIN_LIFT]

        # Mean number of items on the antecedent side; 0 when nothing is left.
        if len(kept_rules):
            avg_antecedent_len = kept_rules["antecedents"].apply(len).mean()
        else:
            avg_antecedent_len = 0

        rows.append(
            {
                "min_support": support_level,
                "algo": algo_name,
                "time_sec": run_stats.elapsed_seconds,
                "n_itemsets": run_stats.n_frequent_itemsets,
                "n_rules": len(kept_rules),
                "avg_antecedent_len": avg_antecedent_len,
            }
        )

compare_df = pd.DataFrame(rows)
compare_df


## Vẽ scatter rules để so sánh

In [None]:
# Mine rules with both algorithms at the current MIN_SUPPORT so they can be
# compared visually. Both runs share exactly the same thresholds.
mining_args = {
    "min_support": MIN_SUPPORT,
    "metric": "confidence",
    "min_threshold": MIN_CONFIDENCE,
}
_, rules_a, _ = apriori_miner.run(basket_bool, **mining_args)
_, rules_f, _ = fp_miner.run(basket_bool, **mining_args)

# Keep only the rules whose lift reaches MIN_LIFT, for each algorithm in turn.
rules_a, rules_f = (
    rule_table[rule_table["lift"] >= MIN_LIFT] for rule_table in (rules_a, rules_f)
)

# Hand both filtered rule tables to the project's scatter-plot helper.
viz = DataVisualizer()
viz.plot_support_confidence_scatter(rules_a, rules_f)


## Lưu bảng so sánh

In [None]:
from pathlib import Path

# Make sure the output folder exists; an already-existing folder is fine.
report_dir = Path("reports")
report_dir.mkdir(exist_ok=True)

# Write the comparison table without the DataFrame index column.
compare_df.to_csv("reports/compare_apriori_fpgrowth.csv", index=False)
print("Saved reports/compare_apriori_fpgrowth.csv")
