In [3]:
import pandas as pd

# 1. Load data
df = pd.read_csv('Groceries data.csv')
df['Date'] = pd.to_datetime(df['Date'])

# 2. Filter tahun 2014
df_2014 = df[df['Date'].dt.year == 2014].copy()
df_2014['Transaction_ID'] = df_2014['Member_number'].astype(str) + '_' + df_2014['Date'].dt.strftime('%Y-%m-%d')

# 3. Group jadi basket
basket_2014 = df_2014.groupby(['Transaction_ID', 'Date']).agg({'itemDescription': list}).reset_index()
basket_2014['n_items'] = basket_2014['itemDescription'].apply(len)
basket_2014['month'] = basket_2014['Date'].dt.month

# 4. Target distribusi global (total 120 transaksi)
target_dist = {2: 24, 3: 20, 4: 31, 5: 12}
target_total = 10  # per bulan

sampled_baskets = []

for month in range(1, 13):
    month_baskets = basket_2014[basket_2014['month'] == month]
    sampled = []
    used_ids = set()
    # 2 item
    group2 = month_baskets[month_baskets['n_items'] == 2]
    n2 = min(target_dist[2] // 12, len(group2))  # rata-rata per bulan
    if n2 > 0:
        s2 = group2.sample(n=n2, random_state=month*10)
        sampled.append(s2)
        used_ids.update(s2['Transaction_ID'])
    # 3 item
    group3 = month_baskets[month_baskets['n_items'] == 3]
    n3 = min(target_dist[3] // 12, len(group3))
    if n3 > 0:
        s3 = group3.sample(n=n3, random_state=month*20)
        sampled.append(s3)
        used_ids.update(s3['Transaction_ID'])
    # 4 item
    group4 = month_baskets[month_baskets['n_items'] == 4]
    n4 = min(target_dist[4] // 12 + (1 if month <= target_dist[4] % 12 else 0), len(group4))
    if n4 > 0:
        s4 = group4.sample(n=n4, random_state=month*30)
        sampled.append(s4)
        used_ids.update(s4['Transaction_ID'])
    # 5 item
    group5 = month_baskets[month_baskets['n_items'] >= 5]
    n5 = min(target_dist[5] // 12 + (1 if month <= target_dist[5] % 12 else 0), len(group5))
    if n5 > 0:
        s5 = group5.sample(n=n5, random_state=month*40)
        sampled.append(s5)
        used_ids.update(s5['Transaction_ID'])
    # Gabung
    combined = pd.concat(sampled) if sampled else pd.DataFrame()
    # Jika masih kurang dari 10, isi random dari yang belum terambil
    sisa = target_total - len(combined)
    if sisa > 0:
        remaining = month_baskets[~month_baskets['Transaction_ID'].isin(used_ids)]
        if not remaining.empty:
            combined = pd.concat([combined, remaining.sample(min(sisa, len(remaining)), random_state=month*99)])
    # Pastikan hanya 10
    combined = combined.sample(n=min(target_total, len(combined)), random_state=month+123)
    sampled_baskets.append(combined)

# 5. Gabungkan semua bulan
final_sample = pd.concat(sampled_baskets, ignore_index=True)

# 6. Format long
rows = []
for _, row in final_sample.iterrows():
    for item in row['itemDescription']:
        rows.append({
            'Transaction_ID': row['Transaction_ID'],
            'Date': row['Date'],
            'itemDescription': item
        })
sampled_long = pd.DataFrame(rows)

# 7. Simpan ke CSV
sampled_long.to_csv('sampled_groceries_2014_custom.csv', index=False)
print('✅ Sampling selesai! File: sampled_groceries_2014_custom.csv')
print('Jumlah transaksi:', final_sample.shape[0])
print('Jumlah baris (item):', sampled_long.shape[0])
print('Distribusi jumlah item per transaksi:')
print(final_sample['n_items'].value_counts().sort_index())

✅ Sampling selesai! File: sampled_groceries_2014_custom.csv
Jumlah transaksi: 120
Jumlah baris (item): 351
Distribusi jumlah item per transaksi:
n_items
2    58
3    22
4    34
5     3
6     3
Name: count, dtype: int64


In [4]:
# 1. Load hasil sampling (long format)
df = pd.read_csv('sampled_groceries_2014_custom.csv')

# 2. Grouping: satu baris per transaksi, itemDescription jadi list
basket_ready = df.groupby(['Transaction_ID', 'Date'])['itemDescription'].apply(list).reset_index()

# 3. (Opsional) Lihat contoh hasil
print(basket_ready.head())

# 4. Simpan ke CSV baru (format: Transaction_ID, Date, Items)
basket_ready.to_csv('basket_ready_for_apriori.csv', index=False)
print('✅ Data siap untuk Apriori! File: basket_ready_for_apriori.csv')

    Transaction_ID        Date                             itemDescription
0  1020_2014-07-19  2014-07-19  [frozen meals, butter, newspapers, yogurt]
1  1025_2014-06-02  2014-06-02                     [coffee, shopping bags]
2  1035_2014-08-15  2014-08-15  [pip fruit, white wine, white bread, salt]
3  1108_2014-06-25  2014-06-25  [frozen potato products, specialty cheese]
4  1190_2014-05-18  2014-05-18               [pip fruit, rolls/buns, soda]
✅ Data siap untuk Apriori! File: basket_ready_for_apriori.csv


In [33]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [34]:
# 1. Load data hasil sampling
df = pd.read_csv('sampled_groceries_2014_custom.csv')
df['Date'] = pd.to_datetime(df['Date'])
df['month'] = df['Date'].dt.month

In [35]:
# 2. Group jadi basket (list per transaksi)
basket = df.groupby('Transaction_ID')['itemDescription'].apply(list).reset_index()

In [36]:
# 3. APRIORI OVERALL (SETAHUN)
transactions = basket['itemDescription'].tolist()
te = TransactionEncoder()
te_array = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(te_array, columns=te.columns_)

In [44]:
# Apriori
min_support = 0.01
min_confidence = 0.05

frequent_itemsets = apriori(df_encoded, min_support=min_support, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

In [45]:
print("=== TOP 10 BARANG PALING SERING DIBELI BERSAMA (SETAHUN) ===")
if len(rules) == 0:
    print("Tidak ada rules ditemukan. Coba cek data atau turunkan threshold lagi.")
else:
    top_rules = rules.sort_values('lift', ascending=False).head(10)
    for idx, rule in top_rules.iterrows():
        ant = ', '.join(list(rule['antecedents']))
        cons = ', '.join(list(rule['consequents']))
        print(f"• {ant} + {cons} | Support: {rule['support']:.2f}, Confidence: {rule['confidence']:.2f}, Lift: {rule['lift']:.2f}")

=== TOP 10 BARANG PALING SERING DIBELI BERSAMA (SETAHUN) ===
• chocolate marshmallow + cream cheese  | Support: 0.02, Confidence: 1.00, Lift: 40.00
• cream cheese  + chocolate marshmallow | Support: 0.02, Confidence: 0.67, Lift: 40.00
• white bread + bottled beer | Support: 0.02, Confidence: 0.50, Lift: 10.00
• bottled beer + white bread | Support: 0.02, Confidence: 0.33, Lift: 10.00
• flour + chocolate | Support: 0.02, Confidence: 0.25, Lift: 10.00
• chocolate + flour | Support: 0.02, Confidence: 0.67, Lift: 10.00
• domestic eggs + canned beer | Support: 0.02, Confidence: 0.33, Lift: 8.00
• canned beer + domestic eggs | Support: 0.02, Confidence: 0.40, Lift: 8.00
• margarine + domestic eggs | Support: 0.02, Confidence: 0.40, Lift: 8.00
• domestic eggs + margarine | Support: 0.02, Confidence: 0.33, Lift: 8.00


In [46]:
# 4. APRIORI PER BULAN
print("\n=== TOP 3 BARANG PALING SERING DIBELI BERSAMA PER BULAN ===")
month_names = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
for month in range(1, 13):
    month_basket = basket[basket['Transaction_ID'].isin(
        df[df['month'] == month]['Transaction_ID'].unique()
    )]
    if len(month_basket) < 2:
        continue
    transactions_month = month_basket['itemDescription'].tolist()
    te_month = TransactionEncoder()
    te_array_month = te_month.fit(transactions_month).transform(transactions_month)
    df_encoded_month = pd.DataFrame(te_array_month, columns=te_month.columns_)
    min_support_month = max(0.01, 1/len(transactions_month))
    freq_itemsets_month = apriori(df_encoded_month, min_support=min_support_month, use_colnames=True)
    rules_month = association_rules(freq_itemsets_month, metric="confidence", min_threshold=0.05)
    # Tidak filter lift agar rules lebih banyak
    if len(rules_month) == 0:
        continue
    print(f"\n{month_names[month]}:")
    for idx, rule in rules_month.sort_values('lift', ascending=False).head(3).iterrows():
        ant = ', '.join(list(rule['antecedents']))
        cons = ', '.join(list(rule['consequents']))
        print(f"  • {ant} + {cons} | Support: {rule['support']:.2f}, Confidence: {rule['confidence']:.2f}, Lift: {rule['lift']:.2f}")


=== TOP 3 BARANG PALING SERING DIBELI BERSAMA PER BULAN ===

Jan:
  • yogurt, chocolate marshmallow + nut snack, cream cheese , curd, female sanitary products | Support: 0.10, Confidence: 1.00, Lift: 10.00
  • yogurt, female sanitary products + nut snack, cream cheese , curd, chocolate marshmallow | Support: 0.10, Confidence: 1.00, Lift: 10.00
  • yogurt, nut snack + cream cheese , curd, female sanitary products, chocolate marshmallow | Support: 0.10, Confidence: 1.00, Lift: 10.00

Feb:
  • long life bakery product, UHT-milk + whole milk, chocolate, sausage, flour | Support: 0.10, Confidence: 1.00, Lift: 10.00
  • long life bakery product, flour + whole milk, chocolate, sausage, UHT-milk | Support: 0.10, Confidence: 1.00, Lift: 10.00
  • long life bakery product, sausage + whole milk, chocolate, flour, UHT-milk | Support: 0.10, Confidence: 1.00, Lift: 10.00

Mar:
  • bottled beer, frozen dessert + rice, berries, rolls/buns, white bread | Support: 0.10, Confidence: 1.00, Lift: 10.00
  

In [47]:
print("\n=== REKOMENDASI BUNDLING PRODUK ===")
if len(rules) > 0:
    for idx, rule in top_rules.iterrows():
        ant = ', '.join(list(rule['antecedents']))
        cons = ', '.join(list(rule['consequents']))
        print(f"Bundling: {ant} + {cons} (Lift: {rule['lift']:.2f}, Confidence: {rule['confidence']:.1%})")
else:
    print("Tidak ada rekomendasi bundling yang bisa ditampilkan.")


=== REKOMENDASI BUNDLING PRODUK ===
Bundling: chocolate marshmallow + cream cheese  (Lift: 40.00, Confidence: 100.0%)
Bundling: cream cheese  + chocolate marshmallow (Lift: 40.00, Confidence: 66.7%)
Bundling: white bread + bottled beer (Lift: 10.00, Confidence: 50.0%)
Bundling: bottled beer + white bread (Lift: 10.00, Confidence: 33.3%)
Bundling: flour + chocolate (Lift: 10.00, Confidence: 25.0%)
Bundling: chocolate + flour (Lift: 10.00, Confidence: 66.7%)
Bundling: domestic eggs + canned beer (Lift: 8.00, Confidence: 33.3%)
Bundling: canned beer + domestic eggs (Lift: 8.00, Confidence: 40.0%)
Bundling: margarine + domestic eggs (Lift: 8.00, Confidence: 40.0%)
Bundling: domestic eggs + margarine (Lift: 8.00, Confidence: 33.3%)
