In [1]:
import pandas as pd
from itertools import combinations
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

<h1>ARM for clusters</h1>

In [2]:
# load and merge all appropriate datasets
df_products   = pd.read_csv("../data_versions/clusters.csv")        # Description  | Cluster
df_labels     = pd.read_csv("../data_versions/cluster_labels.csv")  # Cluster | Label
df_transactions = pd.read_csv('../data_versions/cleaned_1.csv')


df_products_labeled   = df_products.merge(df_labels, on="Cluster", how="outer")  
df_transactions_labeled = df_transactions.merge(df_products_labeled, on="Description", how="outer")

In [3]:
df_transactions.loc[df_transactions.CustomerID.isna(),:]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
613,536414,22139,RETROSPOT TEA SET CERAMIC 11 PC,56,2010-12-01 11:52:00,0.00,,United Kingdom
1431,536544,21773,DECORATIVE ROSE BATHROOM BOTTLE,1,2010-12-01 14:32:00,2.51,,United Kingdom
1432,536544,21774,DECORATIVE CATS BATHROOM BOTTLE,2,2010-12-01 14:32:00,2.51,,United Kingdom
1433,536544,21786,POLKADOT RAIN HAT,4,2010-12-01 14:32:00,0.85,,United Kingdom
1434,536544,21787,RAIN PONCHO RETROSPOT,2,2010-12-01 14:32:00,1.66,,United Kingdom
...,...,...,...,...,...,...,...,...
530263,581498,85099B,JUMBO BAG RED RETROSPOT,5,2011-12-09 10:26:00,4.13,,United Kingdom
530264,581498,85099C,JUMBO BAG BAROQUE BLACK WHITE,4,2011-12-09 10:26:00,4.13,,United Kingdom
530265,581498,85150,LADIES & GENTLEMEN METAL SIGN,1,2011-12-09 10:26:00,4.96,,United Kingdom
530266,581498,85174,S/4 CACTI CANDLES,1,2011-12-09 10:26:00,10.79,,United Kingdom


In [4]:
df_transactions_labeled.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Cluster,Label
0,536522,72800B,4 PURPLE FLOCK DINNER CANDLES,2,2010-12-01 12:49:00,2.55,15012.0,United Kingdom,22,CANDLE
1,537044,72800B,4 PURPLE FLOCK DINNER CANDLES,12,2010-12-05 10:52:00,2.55,18055.0,United Kingdom,22,CANDLE
2,539595,72800B,4 PURPLE FLOCK DINNER CANDLES,2,2010-12-20 13:43:00,5.06,,United Kingdom,22,CANDLE
3,540247,72800B,4 PURPLE FLOCK DINNER CANDLES,1,2011-01-05 15:56:00,2.55,15464.0,United Kingdom,22,CANDLE
4,542226,72800B,4 PURPLE FLOCK DINNER CANDLES,1,2011-01-26 13:20:00,2.55,17075.0,United Kingdom,22,CANDLE


In [5]:
df_transactions_labeled.to_csv('../data_versions/whole_data.csv', index=False) # save finalized dataset

In [6]:
df_transactions_grouped = df_transactions_labeled.groupby("InvoiceNo")['Label'].agg(list).reset_index()
df_transactions_grouped.head()

Unnamed: 0,InvoiceNo,Label
0,536365,"[HEART, T-LIGHT, MUG, PINK, BOX, T-LIGHT, CANDLE]"
1,536366,"[MUG, MUG]"
2,536367,"[EGG, SET, BOX, BOX, HOME, PINK, HOME, MUG, HO..."
3,536368,"[STAND, BOX, STAND, STAND]"
4,536369,[HOME]


In [7]:
# generate size 2 itemsets
labels = list(df_labels['Label'])
labels[:5]

['MUG', 'GLASS', 'DRAWER', 'CUSHION', 'WRAP']

In [8]:
result = list(combinations(sorted(labels), 2))
result[:5]

[('BAG', 'BOX'),
 ('BAG', 'CANDLE'),
 ('BAG', 'CARD'),
 ('BAG', 'CHRISTMAS'),
 ('BAG', 'CLOCK')]

In [9]:
# construct dictionary that will hold all 2-itemsets and their "count"
rules_count = {}
for r in result:
    rules_count.update({r: 0})

In [10]:
#iterate through the dataset and update rule counts accordingly
for row in range(df_transactions_grouped.shape[0]):
    t = df_transactions_grouped.iloc[row, 1]
    itemsets = combinations(sorted(t), 2)
    for i in itemsets:
        if len(set(i)) > 1: #ignore when 2 items from same cluster are bought
            rules_count[i] += 1

In [11]:
# extract rule frequencies
rules_strength = [rules_count[rule] for rule in rules_count]
rules_strength[:5]

[414945, 125245, 104736, 153349, 33943]

In [12]:
# calculate proportion of transactions that a given rule appears in - this will be the 'strength' factor
rules_strength_array = np.array(rules_strength)
rules_strength_prop = rules_strength_array / df_transactions_grouped.shape[0]

In [13]:
# save rules and their strength to csv
df_rule_strengths = pd.DataFrame(
    {
        'Rule': list(rules_count.keys()),
        'Strength': rules_strength_prop
    }
)

In [14]:
df_rule_strengths.to_csv('../data_versions/rules_strengths.csv', index=False)

<h1>ARM for products</h1>

In [15]:
basket = (df_transactions_labeled
          .groupby("InvoiceNo")["Description"]
          .apply(lambda items: sorted(set(items)))       
          .reset_index())

basket.head()

Unnamed: 0,InvoiceNo,Description
0,536365,"[CREAM CUPID HEARTS COAT HANGER, GLASS STAR FR..."
1,536366,"[HAND WARMER RED POLKA DOT, HAND WARMER UNION ..."
2,536367,"[ASSORTED COLOUR BIRD ORNAMENT, BOX OF 6 ASSOR..."
3,536368,"[BLUE COAT RACK PARIS FASHION, JAM MAKING SET ..."
4,536369,[BATH BUILDING BLOCK WORD]


In [16]:
# 3. Convert to one‑hot encoded matrix for Apriori
# explode so each row = one (invoice, label) pair
explode = basket.explode("Description")

# one‑hot encode
basket_ohe = (explode
              .assign(value=1)
              .pivot_table(index="InvoiceNo",
                           columns="Description",
                           values="value",
                           fill_value=0)
             )

In [17]:
# 4. Frequent itemsets & association rules
freq_sets = apriori(basket_ohe,
                    min_support=0.025,
                    use_colnames=True)



In [18]:
freq_sets.shape

(213, 2)

In [19]:
# generate rules with minimum confidence threshold
rules = association_rules(freq_sets,
                          metric="confidence",
                          min_threshold=0.35)   # adjust as desired

In [20]:
rules.shape

(58, 14)

In [21]:
rules.to_csv("../data_versions/raw_rules.csv", index=False)

In [63]:
rules2 = rules.iloc[:, [0,1]]
rules2.head()

Unnamed: 0,antecedents,consequents
0,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED )
1,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE GREEN)
2,(CHARLOTTE BAG PINK POLKADOT),(RED RETROSPOT CHARLOTTE BAG)
3,(RED RETROSPOT CHARLOTTE BAG),(CHARLOTTE BAG PINK POLKADOT)
4,(SPACEBOY LUNCH BOX ),(DOLLY GIRL LUNCH BOX)


In [41]:
list(rules2.iloc[0,0])[0]

'ALARM CLOCK BAKELIKE GREEN'

In [68]:
for s in rules2.antecedents:
    print(list(s)[0])
    print(list(df_products_labeled.loc[df_products_labeled.Description==list(s)[0], 'Label'])[0])
    print()

ALARM CLOCK BAKELIKE GREEN
CLOCK

ALARM CLOCK BAKELIKE RED 
CLOCK

CHARLOTTE BAG PINK POLKADOT
BAG

RED RETROSPOT CHARLOTTE BAG
HOME

SPACEBOY LUNCH BOX 
BOX

DOLLY GIRL LUNCH BOX
BOX

GARDENERS KNEELING PAD CUP OF TEA 
FLOWER

GARDENERS KNEELING PAD KEEP CALM 
FLOWER

PINK REGENCY TEACUP AND SAUCER
MUG

GREEN REGENCY TEACUP AND SAUCER
MUG

GREEN REGENCY TEACUP AND SAUCER
MUG

ROSES REGENCY TEACUP AND SAUCER 
MUG

JUMBO  BAG BAROQUE BLACK WHITE
BAG

JUMBO BAG APPLES
BAG

JUMBO BAG RED RETROSPOT
HOME

JUMBO BAG PINK POLKADOT
BAG

JUMBO STORAGE BAG SUKI
BAG

JUMBO BAG PINK POLKADOT
BAG

JUMBO BAG PINK VINTAGE PAISLEY
BAG

JUMBO BAG STRAWBERRY
BAG

JUMBO BAG WOODLAND ANIMALS
BAG

JUMBO SHOPPER VINTAGE RED PAISLEY
BAG

JUMBO STORAGE BAG SUKI
BAG

LUNCH BAG RED RETROSPOT
HOME

JUMBO STORAGE BAG SUKI
BAG

JUMBO SHOPPER VINTAGE RED PAISLEY
BAG

LUNCH BAG CARS BLUE
BAG

LUNCH BAG  BLACK SKULL.
HOME

LUNCH BAG PINK POLKADOT
BAG

LUNCH BAG  BLACK SKULL.
HOME

LUNCH BAG RED RETROSPOT
HOME

LUNCH 

In [69]:
rules2['ante_cluster'] = [list(df_products_labeled.loc[df_products_labeled.Description==list(s)[0], 'Label'])[0] for s in rules2.antecedents]
rules2['conse_cluster'] = [list(df_products_labeled.loc[df_products_labeled.Description==list(s)[0], 'Label'])[0] for s in rules2.consequents]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rules2['ante_cluster'] = [list(df_products_labeled.loc[df_products_labeled.Description==list(s)[0], 'Label'])[0] for s in rules2.antecedents]


In [70]:
rules2

Unnamed: 0,antecedents,consequents,ante_cluster
0,(ALARM CLOCK BAKELIKE GREEN),(ALARM CLOCK BAKELIKE RED ),CLOCK
1,(ALARM CLOCK BAKELIKE RED ),(ALARM CLOCK BAKELIKE GREEN),CLOCK
2,(CHARLOTTE BAG PINK POLKADOT),(RED RETROSPOT CHARLOTTE BAG),BAG
3,(RED RETROSPOT CHARLOTTE BAG),(CHARLOTTE BAG PINK POLKADOT),HOME
4,(SPACEBOY LUNCH BOX ),(DOLLY GIRL LUNCH BOX),BOX
5,(DOLLY GIRL LUNCH BOX),(SPACEBOY LUNCH BOX ),BOX
6,(GARDENERS KNEELING PAD CUP OF TEA ),(GARDENERS KNEELING PAD KEEP CALM ),FLOWER
7,(GARDENERS KNEELING PAD KEEP CALM ),(GARDENERS KNEELING PAD CUP OF TEA ),FLOWER
8,(PINK REGENCY TEACUP AND SAUCER),(GREEN REGENCY TEACUP AND SAUCER),MUG
9,(GREEN REGENCY TEACUP AND SAUCER),(PINK REGENCY TEACUP AND SAUCER),MUG
