In [63]:
from collections import Counter
from itertools import combinations
from functools import partial
import logging

In [64]:
# Toy market-basket data: each inner list is one transaction (a basket of items).
transactions = [
    ["cookies", "milk", "juice", "bread", "butter"],
    ["milk", "bread", "butter"],
    ["cookies", "milk", "bread", "butter"],
    ["cookies", "juice"],
    ["milk"],
]

# Number of partitions the transactions are split into for the first pass.
npartitions = 2
# Global minimum support, expressed as an absolute number of transactions.
support = 4
# Threshold applied inside each partition during the first pass:
# the global threshold divided evenly over the partitions (a float).
partition_support = support / npartitions

In [65]:
def count_itemsets_in_partition(transactions, itemset_size, support):
    """Find the itemsets that are frequent within one partition of transactions.

    Written as a generator so it can be handed to ``mapPartitions``: it
    consumes the partition's transactions and yields exactly one value, the
    set of locally frequent itemsets.

    Args:
        transactions: Iterable of transactions; each transaction is an
            iterable of hashable, mutually orderable items (e.g. strings).
        itemset_size: Number of items per itemset to enumerate.
        support: Minimum number of transactions in this partition that must
            contain an itemset for it to be kept (compared with ``>=``).

    Yields:
        A single ``set`` of itemsets. Each itemset is a tuple of
        ``itemset_size`` items in sorted order.
    """
    itemset_counter = Counter()
    for transaction in transactions:
        # An itemset is a *set*: sort the distinct items so that the same
        # itemset always maps to the same tuple regardless of the order in
        # which the transaction lists its items, and so that an item repeated
        # inside one transaction cannot be counted more than once.
        unique_items = sorted(set(transaction))
        itemset_counter.update(combinations(unique_items, itemset_size))

    yield {itemset for itemset, count in itemset_counter.items() if count >= support}

def reduce_frequent_itemsets(itemsets1, itemsets2):
    """Combine two sets of itemsets into their union.

    The union is accumulated into ``itemsets1`` itself (in-place ``|=``), and
    that same object is returned; ``itemsets2`` is left untouched.
    """
    merged = itemsets1
    merged |= itemsets2
    return merged


In [66]:
# First pass: find the 1-itemsets that reach the per-partition threshold in
# at least one partition, then union the per-partition sets on the driver.
# NOTE(review): `sc` is not defined in the visible cells — presumably the
# SparkContext supplied by the notebook kernel; confirm before Run All.
find_local_candidates = partial(
    count_itemsets_in_partition,
    itemset_size=1,
    support=partition_support,
)

candidate_itemsets = (
    sc.parallelize(transactions, npartitions)
    .mapPartitions(find_local_candidates, preservesPartitioning=True)
    .reduce(reduce_frequent_itemsets)
)

In [75]:
# Show every candidate itemset from the first pass, one per line.
for candidate in candidate_itemsets:
    print(candidate)

('butter',)
('bread',)
('milk',)
('cookies',)


In [67]:
def count_itemsets_in_transaction(transaction, itemset_size, candidate_itemsets):
    """Emit the candidate itemsets that occur in a single transaction.

    Intended for ``flatMap``: every emitted pair contributes one occurrence,
    so summing the counts per key gives the number of transactions that
    contain each itemset.

    Args:
        transaction: Iterable of hashable, mutually orderable items.
        itemset_size: Number of items per itemset to enumerate.
        candidate_itemsets: Container of itemset tuples supporting ``in``.
            A candidate is recognised whether its items are in sorted order
            or in the order they appear in the transaction.

    Yields:
        ``(itemset, 1)`` for each distinct candidate itemset contained in the
        transaction, where ``itemset`` is a tuple of items in sorted order.
    """
    # Drop repeated items (keeping first-seen order) so an itemset is counted
    # at most once per transaction and never contains the same item twice.
    unique_items = list(dict.fromkeys(transaction))
    for ordered_itemset in combinations(unique_items, itemset_size):
        # Use a sorted tuple as the key so the same itemset is aggregated
        # under one key no matter how each transaction orders its items.
        canonical_itemset = tuple(sorted(ordered_itemset))
        if canonical_itemset in candidate_itemsets or ordered_itemset in candidate_itemsets:
            yield (canonical_itemset, 1)

In [68]:
# Second pass: count every first-pass candidate over all transactions and
# keep those whose total count reaches the global `support` threshold.
# NOTE(review): `sc` is not defined in the visible cells — presumably the
# SparkContext supplied by the notebook kernel; confirm before Run All.
count_candidates = partial(
    count_itemsets_in_transaction,
    itemset_size=1,
    candidate_itemsets=candidate_itemsets,
)

frequent_itemsets = (
    sc.parallelize(transactions)
    .flatMap(count_candidates)
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda pair: pair[1] >= support)
    .collect()
)

In [73]:
# Report each globally frequent itemset together with its total count.
for entry in frequent_itemsets:
    itemset, itemset_support = entry
    print(f"itemset: {itemset} with support {itemset_support}")

itemset: ('milk',) with support 4
