In [1]:
import itertools

from collections import defaultdict

In [2]:
# Minimum support, expressed as a fraction of all transactions (0.01 == 1%);
# it is multiplied by the transaction count to obtain the absolute threshold.
support_threshold_percentage = 0.01
# Largest itemset size to compute (3 = triplets).
k_max = 3  # triplets

## Frequent itemsets

In [3]:
data_file = "./data/T10I4D100K.dat"

# k_list[i] maps each itemset of size i + 1 (a frozenset of item tokens) to the
# set of ids of the transactions that contain it.
k_list = [None] * k_max

tot_transactions = 0
singleton_to_transactions = defaultdict(set)
# Bind the handle to its own name so `data_file` keeps holding the path; the
# previous `as data_file` left the variable pointing at a closed file object.
with open(data_file, 'r') as transactions_file:
    for transaction_id, transaction in enumerate(transactions_file):
        # split() with no separator discards leading/trailing whitespace and
        # collapses repeated blanks, so a blank line or a doubled space can
        # never register an empty-string "item".
        for item in transaction.split():
            singleton_to_transactions[frozenset({item})].add(transaction_id)
        tot_transactions += 1
k_list[0] = singleton_to_transactions


print("Total number of transactions: {}".format(tot_transactions))
print("Total number of distinct items: {}".format(len(k_list[0])))
# Absolute support threshold: the fraction above scaled by the transaction
# count and truncated towards zero.
support_threshold = int(support_threshold_percentage * tot_transactions)
print("Support percentage thr {}, equivalent to at leat {} transactions".format(
    support_threshold_percentage, support_threshold))

Total number of transactions: 100000
Total number of distinct items: 870
Support percentage thr 0.01, equivalent to at leat 1000 transactions


In [4]:
def filter_and_remove(set_to_transactions, threshold=None):
    """Delete, in place, every itemset supported by too few transactions.

    Parameters
    ----------
    set_to_transactions : dict
        Maps an itemset to the collection of transaction ids that contain it.
        The mapping is mutated: entries whose collection has fewer than
        ``threshold`` elements are removed.
    threshold : int, optional
        Minimum number of transactions an itemset needs in order to be kept.
        When omitted, the notebook-level ``support_threshold`` is used.

    Returns
    -------
    None
    """
    if threshold is None:
        # Fall back to the global computed when the data was loaded.
        threshold = support_threshold
    # Collect the keys first: deleting entries while iterating over the dict
    # itself would raise a RuntimeError.
    items_below_threshold = [
        item
        for item, transactions in set_to_transactions.items()
        if len(transactions) < threshold
    ]
    for item in items_below_threshold:
        del set_to_transactions[item]
        
# Prune the infrequent singletons in place, then report how many survive.
filter_and_remove(k_list[0])
print("Remaining singletons {}".format(len(k_list[0])))

Remaining singletons 375


In [5]:
# The frequent singletons are the building blocks at every level, so look them
# up once instead of on every iteration.
singletons = k_list[0]
for k in range(2, k_max + 1):
    print("** Computing itemsets of size {} **".format(k))
    k_minus_one_itemsets = k_list[k - 2]
    k_item_set_to_transactions = defaultdict(set)
    # Grow every frequent (k-1)-itemset by one frequent singleton.
    for keyA, keyB in itertools.product(singletons.keys(), k_minus_one_itemsets.keys()):
        if keyA <= keyB:
            # The singleton is already part of keyB, so the union would still
            # have k - 1 elements rather than k.
            continue
        # Both operands are frozensets, so the union is already a frozenset.
        k_item_set = keyA | keyB
        if k_item_set in k_item_set_to_transactions:
            # The same candidate is reachable from several (singleton, itemset)
            # pairs; each yields the same transaction set, so compute it once.
            continue
        common_txs = singletons[keyA].intersection(k_minus_one_itemsets[keyB])
        k_item_set_to_transactions[k_item_set] = common_txs
    filter_and_remove(k_item_set_to_transactions)
    print("Number of sets with support {} = {}".format(support_threshold, len(k_item_set_to_transactions)))
    k_list[k - 1] = k_item_set_to_transactions

** Computing itemsets of size 2 **
Number of sets with support 1000 = 9
** Computing itemsets of size 3 **
Number of sets with support 1000 = 1
