In [1]:
import pandas as pd
from itertools import combinations
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

<h1>ARM for clusters</h1>

In [3]:
# load and merge all appropriate datasets
df_products   = pd.read_csv("../data_versions/clusters.csv")        # Description  | Cluster
df_labels     = pd.read_csv("../data_versions/cluster_labels.csv")  # Cluster | Label
df_transactions = pd.read_csv('../data_versions/cleaned_1.csv')


df_products_labeled   = df_products.merge(df_labels, on="Cluster", how="outer")  
df_transactions_labeled = df_transactions.merge(df_products_labeled, on="Description", how="outer")

In [4]:
df_transactions.loc[df_transactions.CustomerID.isna(),:]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
613,536414,22139,RETROSPOT TEA SET CERAMIC 11 PC,56,2010-12-01 11:52:00,0.00,,United Kingdom
1431,536544,21773,DECORATIVE ROSE BATHROOM BOTTLE,1,2010-12-01 14:32:00,2.51,,United Kingdom
1432,536544,21774,DECORATIVE CATS BATHROOM BOTTLE,2,2010-12-01 14:32:00,2.51,,United Kingdom
1433,536544,21786,POLKADOT RAIN HAT,4,2010-12-01 14:32:00,0.85,,United Kingdom
1434,536544,21787,RAIN PONCHO RETROSPOT,2,2010-12-01 14:32:00,1.66,,United Kingdom
...,...,...,...,...,...,...,...,...
530263,581498,85099B,JUMBO BAG RED RETROSPOT,5,2011-12-09 10:26:00,4.13,,United Kingdom
530264,581498,85099C,JUMBO BAG BAROQUE BLACK WHITE,4,2011-12-09 10:26:00,4.13,,United Kingdom
530265,581498,85150,LADIES & GENTLEMEN METAL SIGN,1,2011-12-09 10:26:00,4.96,,United Kingdom
530266,581498,85174,S/4 CACTI CANDLES,1,2011-12-09 10:26:00,10.79,,United Kingdom


In [5]:
df_transactions_labeled.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Cluster,Label
0,536522,72800B,4 PURPLE FLOCK DINNER CANDLES,2,2010-12-01 12:49:00,2.55,15012.0,United Kingdom,22,CANDLE
1,537044,72800B,4 PURPLE FLOCK DINNER CANDLES,12,2010-12-05 10:52:00,2.55,18055.0,United Kingdom,22,CANDLE
2,539595,72800B,4 PURPLE FLOCK DINNER CANDLES,2,2010-12-20 13:43:00,5.06,,United Kingdom,22,CANDLE
3,540247,72800B,4 PURPLE FLOCK DINNER CANDLES,1,2011-01-05 15:56:00,2.55,15464.0,United Kingdom,22,CANDLE
4,542226,72800B,4 PURPLE FLOCK DINNER CANDLES,1,2011-01-26 13:20:00,2.55,17075.0,United Kingdom,22,CANDLE


In [6]:
df_transactions_labeled.to_csv('../data_versions/whole_data.csv', index=False) # save finalized dataset

In [7]:
df_transactions_grouped = df_transactions_labeled.groupby("InvoiceNo")['Label'].agg(list).reset_index()
df_transactions_grouped.head()

Unnamed: 0,InvoiceNo,Label
0,536365,"[HEART, T-LIGHT, MUG, PINK, BOX, T-LIGHT, CANDLE]"
1,536366,"[MUG, MUG]"
2,536367,"[EGG, SET, BOX, BOX, HOME, PINK, HOME, MUG, HO..."
3,536368,"[STAND, BOX, STAND, STAND]"
4,536369,[HOME]


In [8]:
# generate size 2 itemsets
labels = list(df_labels['Label'])
labels[:5]

['MUG', 'GLASS', 'DRAWER', 'CUSHION', 'WRAP']

In [9]:
result = list(combinations(sorted(labels), 2))
result[:5]

[('BAG', 'BOX'),
 ('BAG', 'CANDLE'),
 ('BAG', 'CARD'),
 ('BAG', 'CHRISTMAS'),
 ('BAG', 'CLOCK')]

In [10]:
# construct dictionary that will hold all 2-itemsets and their "count"
rules_count = {}
for r in result:
    rules_count.update({r: 0})

In [11]:
#iterate through the dataset and update rule counts accordingly
for row in range(df_transactions_grouped.shape[0]):
    t = df_transactions_grouped.iloc[row, 1]
    itemsets = combinations(sorted(t), 2)
    for i in itemsets:
        if len(set(i)) > 1: #ignore when 2 items from same cluster are bought
            rules_count[i] += 1

In [12]:
# extract rule frequencies
rules_strength = [rules_count[rule] for rule in rules_count]
rules_strength[:5]

[414945, 125245, 104736, 153349, 33943]

In [13]:
# calculate proportion of transactions that a given rule appears in - this will be the 'strength' factor
rules_strength_array = np.array(rules_strength)
rules_strength_prop = rules_strength_array / df_transactions_grouped.shape[0]

In [14]:
# save rules and their strength to csv
df_rule_strengths = pd.DataFrame(
    {
        'Rule': list(rules_count.keys()),
        'Strength': rules_strength_prop
    }
)

In [15]:
df_rule_strengths.to_csv('../data_versions/rules_strengths.csv', index=False)

<h1>ARM for products</h1>

In [16]:
basket = (df_transactions_labeled
          .groupby("InvoiceNo")["Description"]
          .apply(lambda items: sorted(set(items)))       
          .reset_index())

basket.head()

Unnamed: 0,InvoiceNo,Description
0,536365,"[CREAM CUPID HEARTS COAT HANGER, GLASS STAR FR..."
1,536366,"[HAND WARMER RED POLKA DOT, HAND WARMER UNION ..."
2,536367,"[ASSORTED COLOUR BIRD ORNAMENT, BOX OF 6 ASSOR..."
3,536368,"[BLUE COAT RACK PARIS FASHION, JAM MAKING SET ..."
4,536369,[BATH BUILDING BLOCK WORD]


In [17]:
# 3. Convert to one‑hot encoded matrix for Apriori
# explode so each row = one (invoice, label) pair
explode = basket.explode("Description")

# one‑hot encode
basket_ohe = (explode
              .assign(value=1)
              .pivot_table(index="InvoiceNo",
                           columns="Description",
                           values="value",
                           fill_value=0)
             )

In [18]:
# 4. Frequent itemsets & association rules
freq_sets = apriori(basket_ohe,
                    min_support=0.025,
                    use_colnames=True)



In [19]:
freq_sets.shape

(213, 2)

In [20]:
# generate rules with minimum confidence threshold
rules = association_rules(freq_sets,
                          metric="confidence",
                          min_threshold=0.35)   # adjust as desired

In [21]:
rules.shape

(58, 14)

In [22]:
rules.to_csv("../data_versions/raw_rules.csv", index=False)

In [23]:
rules2 = rules.iloc[:, [0,1]]
rules2.head()

Unnamed: 0,antecedents,consequents
0,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE GREEN)
1,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED )
2,(CHARLOTTE BAG PINK POLKADOT),(RED RETROSPOT CHARLOTTE BAG)
3,(RED RETROSPOT CHARLOTTE BAG),(CHARLOTTE BAG PINK POLKADOT)
4,(SPACEBOY LUNCH BOX ),(DOLLY GIRL LUNCH BOX)


In [29]:
df_transactions.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [24]:
rules2['ante_cluster'] = [list(df_products_labeled.loc[df_products_labeled.Description==list(s)[0], 'Label'])[0] for s in rules2.antecedents]
rules2['conse_cluster'] = [list(df_products_labeled.loc[df_products_labeled.Description==list(s)[0], 'Label'])[0] for s in rules2.consequents]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rules2['ante_cluster'] = [list(df_products_labeled.loc[df_products_labeled.Description==list(s)[0], 'Label'])[0] for s in rules2.antecedents]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rules2['conse_cluster'] = [list(df_products_labeled.loc[df_products_labeled.Description==list(s)[0], 'Label'])[0] for s in rules2.consequents]


In [25]:
rules2['cluster_rel'] = rules2.ante_cluster + ' ' + rules2.conse_cluster

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rules2['cluster_rel'] = rules2.ante_cluster + ' ' + rules2.conse_cluster


In [26]:
pd.unique(rules2.cluster_rel)

array(['CLOCK CLOCK', 'BAG HOME', 'HOME BAG', 'BOX BOX', 'FLOWER FLOWER',
       'MUG MUG', 'BAG BAG', 'HOME HOME', 'BOX HOME', 'HOME BOX',
       'WRAP WRAP', 'MUG STAND', 'WALL WALL'], dtype=object)

In [27]:
rules2.head()

Unnamed: 0,antecedents,consequents,ante_cluster,conse_cluster,cluster_rel
0,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE GREEN),CLOCK,CLOCK,CLOCK CLOCK
1,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED ),CLOCK,CLOCK,CLOCK CLOCK
2,(CHARLOTTE BAG PINK POLKADOT),(RED RETROSPOT CHARLOTTE BAG),BAG,HOME,BAG HOME
3,(RED RETROSPOT CHARLOTTE BAG),(CHARLOTTE BAG PINK POLKADOT),HOME,BAG,HOME BAG
4,(SPACEBOY LUNCH BOX ),(DOLLY GIRL LUNCH BOX),BOX,BOX,BOX BOX
