# Lab07 - Frequent Pattern Mining


## Dataset import
- Online Retail Dataset
- COCO Dataset

In [1]:
!curl -L https://github.com/dbdmg/data-science-lab/raw/master/datasets/online_retail.csv -o online_retail.csv
!curl -L https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/modified_coco.json -o modified_coco.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 43.4M  100 43.4M    0     0  5129k      0  0:00:08  0:00:08 --:--:-- 6195k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  851k  100  851k    0     0  2137k      0 --:--:-- --:--:-- --:--:-- 2140k


In [2]:
!pip install mlxtend



## Association rules from frequent itemsets

In [3]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder


df = pd.read_csv("online_retail.csv")
df = df.loc[~df['InvoiceNo'].astype(str).str.startswith('C')]
df = df.dropna()

df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [4]:
invoices_agg = df.groupby('InvoiceNo')['Description'].agg(list).reset_index()

invoices_agg.head()

Unnamed: 0,InvoiceNo,Description
0,536365,"[WHITE HANGING HEART T-LIGHT HOLDER, WHITE MET..."
1,536366,"[HAND WARMER UNION JACK, HAND WARMER RED POLKA..."
2,536367,"[ASSORTED COLOUR BIRD ORNAMENT, POPPY'S PLAYHO..."
3,536368,"[JAM MAKING SET WITH JARS, RED COAT RACK PARIS..."
4,536369,[BATH BUILDING BLOCK WORD]


In [5]:
itemsets = invoices_agg['Description'].tolist()
te = TransactionEncoder()
itemsets_encoded = te.fit_transform(itemsets)

df_basket = pd.DataFrame(itemsets_encoded, columns=te.columns_)

df_basket.head()


Unnamed: 0,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,I LOVE LONDON MINI RUCKSACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,...,ZINC STAR T-LIGHT HOLDER,ZINC SWEETHEART SOAP DISH,ZINC SWEETHEART WIRE LETTER RACK,ZINC T-LIGHT HOLDER STAR LARGE,ZINC T-LIGHT HOLDER STARS LARGE,ZINC T-LIGHT HOLDER STARS SMALL,ZINC TOP 2 DOOR WOODEN SHELF,ZINC WILLIE WINKIE CANDLE STICK,ZINC WIRE KITCHEN ORGANISER,ZINC WIRE SWEETHEART LETTER TRAY
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
from mlxtend.frequent_patterns import fpgrowth, association_rules, apriori

fi = fpgrowth(df_basket, min_support=0.02, use_colnames=True)  

print(len(fi))
print(fi.to_string())


243
      support                                                                                             itemsets
0    0.106334                                                                 (WHITE HANGING HEART T-LIGHT HOLDER)
1    0.023792                                                                             (HAND WARMER UNION JACK)
2    0.074180                                                                      (ASSORTED COLOUR BIRD ORNAMENT)
3    0.037279                                                                           (HOME BUILDING BLOCK WORD)
4    0.029942                                                                           (LOVE BUILDING BLOCK WORD)
5    0.028647                                                                                (DOORMAT NEW ENGLAND)
6    0.021634                                                                  (FELTCRAFT PRINCESS CHARLOTTE DOLL)
7    0.047421                                                               

In [7]:
rules = association_rules(fi, metric="confidence", min_threshold=0.1)

selected = rules[
    rules['consequents'].apply(lambda x: 'WHITE HANGING HEART T-LIGHT HOLDER' in x)
]

selected[['antecedents', 'consequents', 'support', 'confidence']]

Unnamed: 0,antecedents,consequents,support,confidence
6,(RED HANGING HEART T-LIGHT HOLDER),(WHITE HANGING HEART T-LIGHT HOLDER),0.024547,0.670103


In [8]:
frequent_itemsets = fpgrowth(df_basket, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.85)

print("Numero di regole:", len(rules))
print(rules[['antecedents', 'consequents', 'support', 'confidence']].to_string())

Numero di regole: 10
                                                                                    antecedents                         consequents   support  confidence
0                                                               (POPPY'S PLAYHOUSE LIVINGROOM )         (POPPY'S PLAYHOUSE KITCHEN)  0.011599    0.853175
1                                    (POPPY'S PLAYHOUSE KITCHEN, POPPY'S PLAYHOUSE LIVINGROOM )        (POPPY'S PLAYHOUSE BEDROOM )  0.010035    0.865116
2                                   (POPPY'S PLAYHOUSE BEDROOM , POPPY'S PLAYHOUSE LIVINGROOM )         (POPPY'S PLAYHOUSE KITCHEN)  0.010035    0.907317
3                                    (REGENCY CAKESTAND 3 TIER, PINK REGENCY TEACUP AND SAUCER)   (GREEN REGENCY TEACUP AND SAUCER)  0.014620    0.877023
4                                    (REGENCY CAKESTAND 3 TIER, PINK REGENCY TEACUP AND SAUCER)  (ROSES REGENCY TEACUP AND SAUCER )  0.014297    0.857605
5  (REGENCY CAKESTAND 3 TIER, ROSES REGENCY TEACUP AND 

## Apriori implementation

In [9]:
def my_apriori(transactions, min_support=0.05):
    num_transactions = len(transactions)
    
    # Itemset di lunghezza 1
    item_counts = {}
    for t in transactions:
        for item in t:
            item_counts[item] = item_counts.get(item, 0) + 1
    frequent_1 = {item for item, count in item_counts.items() if count / num_transactions >= min_support}
    
    # Itemset di lunghezza 2
    pair_counts = {}
    for t in transactions:
        filtered = [item for item in t if item in frequent_1]
        for i in range(len(filtered)):
            for j in range(i+1, len(filtered)):
                pair = tuple(sorted([filtered[i], filtered[j]]))
                pair_counts[pair] = pair_counts.get(pair, 0) + 1
    frequent_2 = {pair for pair, count in pair_counts.items() if count / num_transactions >= min_support}
    
    return frequent_1, frequent_2

# Esempio d'uso:
import json
with open("modified_coco.json") as f:
    coco = json.load(f)
transactions = [list(set(img['annotations'])) for img in coco]

f1, f2 = my_apriori(transactions, min_support=0.05)
print("Frequenti lunghezza 1:", f1)
print("Frequenti lunghezza 2:", f2)

Frequenti lunghezza 1: {'truck', 'traffic light', 'car', 'person', 'bicycle', 'bus', 'chair', 'handbag', 'bench', 'stop sign', 'backpack', 'parking meter', 'fire hydrant'}
Frequenti lunghezza 2: {('car', 'handbag'), ('car', 'stop sign'), ('bus', 'traffic light'), ('person', 'truck'), ('bus', 'person'), ('bench', 'car'), ('traffic light', 'truck'), ('car', 'truck'), ('person', 'traffic light'), ('bicycle', 'person'), ('bench', 'person'), ('handbag', 'traffic light'), ('backpack', 'person'), ('handbag', 'person'), ('bus', 'car'), ('fire hydrant', 'person'), ('bench', 'chair'), ('car', 'traffic light'), ('bench', 'handbag'), ('car', 'person'), ('car', 'fire hydrant')}


In [10]:
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
min_support = 0.05  

apriori_itemsets = apriori(df, min_support=min_support, use_colnames=True)
fpgrowth_itemsets = fpgrowth(df, min_support=min_support, use_colnames=True)

merged = pd.merge(apriori_itemsets, fpgrowth_itemsets, on=['itemsets', 'support'], how='outer', indicator=True)
print(merged)

    support                          itemsets _merge
0    0.0852                        (backpack)   both
1    0.3704                             (car)   both
2    0.0576                  (stop sign, car)   both
3    0.1978              (traffic light, car)   both
4    0.1032                      (truck, car)   both
5    0.0538            (fire hydrant, person)   both
6    0.1224                 (handbag, person)   both
7    0.0518          (traffic light, handbag)   both
8    0.1902           (traffic light, person)   both
9    0.0828                   (truck, person)   both
10   0.0778            (truck, traffic light)   both
11   0.2386                     (car, person)   both
12   0.0604              (person, car, bench)   both
13   0.0544                (person, car, bus)   both
14   0.0526         (traffic light, car, bus)   both
15   0.0558      (person, traffic light, bus)   both
16   0.1346                    (fire hydrant)   both
17   0.0572            (handbag, car, person) 

In [11]:
import timeit

start = timeit.default_timer()
frequent_itemsets = my_apriori(transactions, min_support=0.05)
print('My Apriori implementation: ', timeit.default_timer() - start, "sec")

start = timeit.default_timer()
fi = apriori(df_basket, min_support=0.05, use_colnames=True)
print('Mlxtend\'s Apriori implementation: ',timeit.default_timer() - start, "sec")

My Apriori implementation:  0.015507250092923641 sec
Mlxtend's Apriori implementation:  0.08603704208508134 sec
