In [14]:
import pandas as pd
from itertools import combinations
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

<h1>ARM for clusters</h1>

In [2]:
# load and merge all appropriate datasets
df_products   = pd.read_csv("../data_versions/clusters.csv")        # Description  | Cluster
df_labels     = pd.read_csv("../data_versions/cluster_labels.csv")  # Cluster | Label
df_transactions = pd.read_csv('../data_versions/cleaned_1.csv')


df_products_labeled   = df_products.merge(df_labels, on="Cluster", how="outer")  
df_transactions_labeled = df_transactions.merge(df_products_labeled, on="Description", how="outer")

In [3]:
df_transactions_labeled.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Cluster,Label
0,536522,72800B,4 PURPLE FLOCK DINNER CANDLES,2,2010-12-01 12:49:00,2.55,15012.0,United Kingdom,22,CANDLE
1,537044,72800B,4 PURPLE FLOCK DINNER CANDLES,12,2010-12-05 10:52:00,2.55,18055.0,United Kingdom,22,CANDLE
2,539595,72800B,4 PURPLE FLOCK DINNER CANDLES,2,2010-12-20 13:43:00,5.06,,United Kingdom,22,CANDLE
3,540247,72800B,4 PURPLE FLOCK DINNER CANDLES,1,2011-01-05 15:56:00,2.55,15464.0,United Kingdom,22,CANDLE
4,542226,72800B,4 PURPLE FLOCK DINNER CANDLES,1,2011-01-26 13:20:00,2.55,17075.0,United Kingdom,22,CANDLE


In [4]:
df_transactions_labeled.to_csv('../data_versions/whole_data.csv', index=False) # save finalized dataset

In [5]:
df_transactions_grouped = df_transactions_labeled.groupby("InvoiceNo")['Label'].agg(list).reset_index()
df_transactions_grouped.head()

Unnamed: 0,InvoiceNo,Label
0,536365,"[HEART, T-LIGHT, MUG, PINK, BOX, T-LIGHT, CANDLE]"
1,536366,"[MUG, MUG]"
2,536367,"[EGG, SET, BOX, BOX, HOME, PINK, HOME, MUG, HO..."
3,536368,"[STAND, BOX, STAND, STAND]"
4,536369,[HOME]


In [6]:
# generate size 2 itemsets
labels = list(df_labels['Label'])
labels[:5]

['MUG', 'GLASS', 'DRAWER', 'CUSHION', 'WRAP']

In [7]:
result = list(combinations(sorted(labels), 2))
result[:5]

[('BAG', 'BOX'),
 ('BAG', 'CANDLE'),
 ('BAG', 'CARD'),
 ('BAG', 'CHRISTMAS'),
 ('BAG', 'CLOCK')]

In [8]:
# construct dictionary that will hold all 2-itemsets and their "count"
rules_count = {}
for r in result:
    rules_count.update({r: 0})

In [9]:
#iterate through the dataset and update rule counts accordingly
for row in range(df_transactions_grouped.shape[0]):
    t = df_transactions_grouped.iloc[row, 1]
    itemsets = combinations(sorted(t), 2)
    for i in itemsets:
        if len(set(i)) > 1: #ignore when 2 items from same cluster are bought
            rules_count[i] += 1

In [10]:
# extract rule frequencies
rules_strength = [rules_count[rule] for rule in rules_count]
rules_strength[:5]

[414945, 125245, 104736, 153349, 33943]

In [11]:
# calculate proportion of transactions that a given rule appears in - this will be the 'strength' factor
rules_strength_array = np.array(rules_strength)
rules_strength_prop = rules_strength_array / df_transactions_grouped.shape[0]

In [12]:
# save rules and their strength to csv
df_rule_strengths = pd.DataFrame(
    {
        'Rule': list(rules_count.keys()),
        'Strength': rules_strength_prop
    }
)

In [13]:
df_rule_strengths.to_csv('../data_versions/rules_strengths.csv', index=False)

<h1>ARM for products</h1>

In [58]:
basket = (df_transactions_labeled
          .groupby("InvoiceNo")["Description"]
          .apply(lambda items: sorted(set(items)))       
          .reset_index())

basket.head()

Unnamed: 0,InvoiceNo,Description
0,536365,"[CREAM CUPID HEARTS COAT HANGER, GLASS STAR FR..."
1,536366,"[HAND WARMER RED POLKA DOT, HAND WARMER UNION ..."
2,536367,"[ASSORTED COLOUR BIRD ORNAMENT, BOX OF 6 ASSOR..."
3,536368,"[BLUE COAT RACK PARIS FASHION, JAM MAKING SET ..."
4,536369,[BATH BUILDING BLOCK WORD]


In [59]:
# 3. Convert to one‑hot encoded matrix for Apriori
# explode so each row = one (invoice, label) pair
explode = basket.explode("Description")

# one‑hot encode
basket_ohe = (explode
              .assign(value=1)
              .pivot_table(index="InvoiceNo",
                           columns="Description",
                           values="value",
                           fill_value=0)
             )

In [65]:
# 4. Frequent itemsets & association rules
freq_sets = apriori(basket_ohe,
                    min_support=0.025,
                    use_colnames=True)



In [71]:
# generate rules with minimum confidence threshold
rules = association_rules(freq_sets,
                          metric="confidence",
                          min_threshold=0.35)   # adjust as desired

In [72]:
rules.shape

(58, 14)