In [1]:
import random
import pandas as pd

# -----------------------------
# Generate Synthetic Data
# -----------------------------
# Builds `transactions`: a list of market baskets, each a list of unique item
# names. The random generator is seeded, so every run draws the same baskets.

items_pool = [
    'milk', 'bread', 'eggs', 'beer', 'diapers', 'cheese', 'butter', 'apples',
    'bananas', 'chicken', 'rice', 'pasta', 'cereal', 'coffee', 'tea', 'yogurt',
    'juice', 'chips', 'cookies', 'fish'
]

random.seed(42)

num_transactions = random.randint(20, 50)

transactions = []

for _ in range(num_transactions):
    # Target size only: the pattern items below can exceed it, and
    # de-duplication can shrink the basket, so the final length may differ.
    basket_size = random.randint(3, 8)

    # introduce some realistic patterns
    basket = []

    # Common co-occurrences. Each pair is added independently, so 'beer' can
    # be appended twice; duplicates are removed further down.
    if random.random() < 0.4:
        basket += ['milk', 'bread']
    if random.random() < 0.3:
        basket += ['beer', 'diapers']
    if random.random() < 0.3:
        basket += ['chips', 'beer']

    # fill remaining items randomly; the draw is from the full pool, so it
    # may repeat an item that a pattern already added
    remaining = basket_size - len(basket)
    if remaining > 0:
        basket += random.sample(items_pool, remaining)

    # Unique items per basket, kept in first-seen order. dict.fromkeys is used
    # instead of set() because the iteration order of a set of strings depends
    # on per-process hash randomization, which made the stored/printed item
    # order differ between kernel restarts even with a fixed random seed.
    transactions.append(list(dict.fromkeys(basket)))

# Display sample
for t in transactions[:5]:
    print(t)


['diapers', 'beer', 'milk', 'chips', 'bread']
['eggs', 'cookies', 'beer', 'milk', 'coffee', 'bread']
['beer', 'chips', 'diapers']
['eggs', 'diapers', 'beer', 'milk', 'coffee', 'cheese', 'bananas', 'rice']
['pasta', 'milk', 'bananas', 'bread']


In [2]:
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the baskets: the encoder first learns the item vocabulary
# from `transactions`, then maps each basket onto that vocabulary.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)

# Wrap the encoded array in a DataFrame whose columns are the learned items
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df.head()


Unnamed: 0,apples,bananas,beer,bread,butter,cereal,cheese,chicken,chips,coffee,cookies,diapers,eggs,fish,juice,milk,pasta,rice,tea,yogurt
0,False,False,True,True,False,False,False,False,True,False,False,True,False,False,False,True,False,False,False,False
1,False,False,True,True,False,False,False,False,False,True,True,False,True,False,False,True,False,False,False,False
2,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False
3,False,True,True,False,False,False,True,False,False,True,False,True,True,False,False,True,False,True,False,False
4,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False


In [3]:
from mlxtend.frequent_patterns import apriori, association_rules

# Mine frequent itemsets with support >= 0.2 from the one-hot frame,
# labelling itemsets with the frame's column names
freq_items = apriori(
    df,
    min_support=0.2,
    use_colnames=True,
)
freq_items


Unnamed: 0,support,itemsets
0,0.7,(beer)
1,0.425,(bread)
2,0.2,(butter)
3,0.45,(chips)
4,0.225,(cookies)
5,0.525,(diapers)
6,0.3,(eggs)
7,0.2,(juice)
8,0.475,(milk)
9,0.3,"(beer, bread)"


In [4]:
# Derive association rules from the frequent itemsets, thresholding on
# confidence at 0.5
rules = association_rules(freq_items, metric="confidence", min_threshold=0.5)

# Order the rules from highest to lowest lift
rules_sorted = rules.sort_values("lift", ascending=False)

# Keep the first five rows of the sorted table and print them
top5 = rules_sorted.iloc[:5]
print(top5)


      antecedents    consequents  antecedent support  consequent support  \
17  (beer, bread)         (milk)               0.300               0.475   
19         (milk)  (beer, bread)               0.475               0.300   
20        (bread)   (beer, milk)               0.425               0.350   
16   (beer, milk)        (bread)               0.350               0.425   
9          (milk)        (bread)               0.475               0.425   

    support  confidence      lift  representativity  leverage  conviction  \
17    0.250    0.833333  1.754386               1.0  0.107500    3.150000   
19    0.250    0.526316  1.754386               1.0  0.107500    1.477778   
20    0.250    0.588235  1.680672               1.0  0.101250    1.578571   
16    0.250    0.714286  1.680672               1.0  0.101250    2.012500   
9     0.325    0.684211  1.609907               1.0  0.123125    1.820833   

    zhangs_metric   jaccard  certainty  kulczynski  
17       0.614286  0.476190