# Lab07 - Frequent Pattern Mining


## Dataset import
- Online Retail Dataset
- COCO Dataset

In [1]:
# Download the Online Retail CSV into the working directory as online_retail.csv
# (-L follows HTTP redirects, -o names the output file).
!curl -L https://github.com/dbdmg/data-science-lab/raw/master/datasets/online_retail.csv -o online_retail.csv
# Download the modified COCO JSON into the working directory as modified_coco.json.
!curl -L https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/modified_coco.json -o modified_coco.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 43.4M  100 43.4M    0     0  5129k      0  0:00:08  0:00:08 --:--:-- 6195k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  851k  100  851k    0     0  2137k      0 --:--:-- --:--:-- --:--:-- 2140k


In [2]:
!pip install mlxtend



## Association rules from frequent itemsets

In [None]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder


# Load the retail data, discard invoices whose number starts with 'C',
# then drop every row that still has a missing value.
df = (
    pd.read_csv("online_retail.csv")
    .loc[lambda frame: ~frame['InvoiceNo'].astype(str).str.startswith('C')]
    .dropna()
)

df.head()

In [None]:
# Collect, for every invoice, the list of item descriptions it contains.
descriptions_by_invoice = df.groupby('InvoiceNo')['Description'].agg(list)
invoices_agg = descriptions_by_invoice.reset_index()

invoices_agg.head()

In [None]:
# Encode the per-invoice description lists with TransactionEncoder and wrap the
# result in a DataFrame whose columns are the encoder's item names.
itemsets = list(invoices_agg['Description'])
te = TransactionEncoder()
itemsets_encoded = te.fit(itemsets).transform(itemsets)

df_basket = pd.DataFrame(data=itemsets_encoded, columns=te.columns_)

df_basket.head()


In [None]:
from mlxtend.frequent_patterns import fpgrowth, association_rules, apriori

# Mine the frequent itemsets of the retail baskets with FP-Growth (minimum
# support 0.02), then print how many were found followed by the full table.
fi = fpgrowth(df_basket, use_colnames=True, min_support=0.02)

print(fi.shape[0])
print(fi.to_string())


In [None]:
# Derive rules with confidence >= 0.1 from the frequent itemsets, then keep only
# the rules whose consequent contains the target item.
rules = association_rules(fi, min_threshold=0.1, metric="confidence")

target_item = 'WHITE HANGING HEART T-LIGHT HOLDER'
consequent_has_target = rules['consequents'].apply(lambda consequent: target_item in consequent)
selected = rules[consequent_has_target]

shown_columns = ['antecedents', 'consequents', 'support', 'confidence']
selected[shown_columns]

In [None]:
# Mine again with a lower minimum support (0.01) and keep only the rules whose
# confidence is at least 0.85; print their count and the full rule table.
frequent_itemsets = fpgrowth(df_basket, use_colnames=True, min_support=0.01)
rules = association_rules(frequent_itemsets, min_threshold=0.85, metric="confidence")

rule_columns = ['antecedents', 'consequents', 'support', 'confidence']
print("Numero di regole:", len(rules))
print(rules[rule_columns].to_string())

## Apriori implementation

In [None]:
def my_apriori(transactions, min_support=0.05):
    """Find the frequent itemsets of length 1 and 2 (first two Apriori levels).

    The support of an itemset is the fraction of transactions that contain it,
    so an item repeated inside one transaction is counted only once for that
    transaction.

    Parameters
    ----------
    transactions : iterable of iterables
        One inner iterable of items per transaction. Items must be hashable
        and mutually orderable (pairs are stored in sorted order).
    min_support : float, default 0.05
        Minimum support (inclusive) an itemset needs to be considered frequent.

    Returns
    -------
    tuple of (set, set)
        ``(frequent_1, frequent_2)``: the frequent single items and the
        frequent pairs, each pair being a sorted 2-tuple. Both sets are empty
        when there are no transactions.
    """
    from collections import Counter
    from itertools import combinations

    # Deduplicate items per transaction so support counts transactions, not
    # occurrences; materialising also allows `transactions` to be a generator.
    baskets = [set(t) for t in transactions]
    num_transactions = len(baskets)
    if num_transactions == 0:
        # Nothing can be frequent, and dividing by zero below must be avoided.
        return set(), set()

    def _keep_frequent(counts):
        # Keep the keys whose relative frequency reaches the threshold.
        return {
            itemset
            for itemset, count in counts.items()
            if count / num_transactions >= min_support
        }

    # Level 1: single items.
    item_counts = Counter(item for basket in baskets for item in basket)
    frequent_1 = _keep_frequent(item_counts)

    # Level 2: pairs built only from frequent items (Apriori pruning).
    # Sorting first makes every pair a canonical (smaller, larger) tuple.
    pair_counts = Counter()
    for basket in baskets:
        candidates = sorted(basket & frequent_1)
        pair_counts.update(combinations(candidates, 2))
    frequent_2 = _keep_frequent(pair_counts)

    return frequent_1, frequent_2

# Usage example: build one transaction per entry of the COCO file, made of the
# distinct values of its 'annotations' field, and mine it with my_apriori.
import json
# JSON text is UTF-8; be explicit so decoding does not depend on the OS locale.
with open("modified_coco.json", encoding="utf-8") as f:
    coco = json.load(f)
transactions = [list(set(img['annotations'])) for img in coco]

f1, f2 = my_apriori(transactions, min_support=0.05)
print("Frequenti lunghezza 1:", f1)
print("Frequenti lunghezza 2:", f2)

In [None]:
# Encode the COCO transactions, run mlxtend's Apriori and FP-Growth with the
# same minimum support, and outer-merge the two results: the indicator column
# added by the merge shows any itemset reported by only one algorithm.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
min_support = 0.05

apriori_itemsets = apriori(df, use_colnames=True, min_support=min_support)
fpgrowth_itemsets = fpgrowth(df, use_colnames=True, min_support=min_support)

merged = apriori_itemsets.merge(
    fpgrowth_itemsets,
    how='outer',
    on=['itemsets', 'support'],
    indicator=True,
)
print(merged)

In [None]:
import timeit

# Time both Apriori implementations on the SAME data: `transactions` is the COCO
# transaction list and `df` is its encoded DataFrame. Timing mlxtend on
# `df_basket` (the retail baskets) would compare two different datasets.
start = timeit.default_timer()
frequent_itemsets = my_apriori(transactions, min_support=0.05)
print('My Apriori implementation: ', timeit.default_timer() - start, "sec")

# max_len=2 keeps the comparison fair: my_apriori only mines itemsets of
# length 1 and 2, so mlxtend is limited to the same search depth.
start = timeit.default_timer()
fi = apriori(df, min_support=0.05, use_colnames=True, max_len=2)
print('Mlxtend\'s Apriori implementation: ',timeit.default_timer() - start, "sec")