# Loading data

In [86]:
import csv, collections, itertools

# Read the transaction matrix: the first CSV row holds the item names and
# every following row is one transaction, with the cell 't' marking the items
# that the transaction contains.
# newline='' is what the csv module requires of the file object, so that
# quoted fields and line endings are handled by the csv parser itself.
with open('myDataFile.csv', 'r', newline='') as csvfile:
    csvreader = csv.reader(csvfile)

    # Header row: items[i] is the name of column i.
    items = next(csvreader)

    # One set of item names per remaining row.
    transactions = [
        {items[i] for i, v in enumerate(row) if v == 't'}
        for row in csvreader
    ]

N_TRANSACTIONS = len(transactions)

print('Transactions:', N_TRANSACTIONS)

Transactions: 9835


# Parameters

In [87]:
# Smallest fraction of all transactions (count / N_TRANSACTIONS) in which an
# itemset has to occur for it to be kept as a frequent itemset.
MIN_SUPPORT = 0.005
# Smallest ratio supports[itemset] / supports[antecedent] for which a rule
# "antecedent => rest of the itemset" is kept.
MIN_CONFIDENCE = 0.6

# Apriori algorithm

In [88]:
# L[i] is the set of frequent itemsets made of i + 1 items; every itemset is
# a tuple of item names. supports maps each frequent itemset to its support.
L = []
supports = {}

# Count the occurrences of every single item over all transactions, keyed by
# the corresponding 1-tuple.
histogram = collections.defaultdict(int)
for item in itertools.chain.from_iterable(transactions):
    histogram[(item,)] += 1

# Keep the 1-itemsets whose support reaches the threshold.
layer_1 = set()
for itemset, count in histogram.items():
    support = count / N_TRANSACTIONS
    if support < MIN_SUPPORT:
        continue
    supports[itemset] = support
    layer_1.add(itemset)

L.append(layer_1)

In [89]:
# Start again from the 1-itemset layer. Without this, re-running the cell
# walks over the layers of the previous run and appends duplicates of them to
# L. The deletion is done in place so that L stays the same list object.
del L[1:]

# Grow L one layer at a time until a layer comes out empty; that empty layer
# is appended as well and terminates the loop.
k = 0
while len(L[k]) != 0:
    k += 1

    # Join step: two itemsets of k items that share their first k - 1 items
    # are merged into one candidate of k + 1 items. Requiring the last item
    # of itemset1 to be strictly smaller keeps the tuples sorted and produces
    # every candidate exactly once.
    candidates = []
    for itemset1 in L[k-1]:
        for itemset2 in L[k-1]:
            if itemset1[:-1] != itemset2[:-1] or itemset1[-1] >= itemset2[-1]:
                continue

            candidate = itemset1 + (itemset2[-1],)

            # Prune step: a candidate survives only if each of its k-item
            # subsets is itself a frequent itemset of the previous layer.
            if all(subset in L[k-1] for subset in itertools.combinations(candidate, k)):
                candidates.append(candidate)

    # Build the set form of every candidate once, instead of once per
    # (transaction, candidate) pair inside the counting loop.
    candidate_sets = [(candidate, frozenset(candidate)) for candidate in candidates]

    # Count in how many transactions each candidate is fully contained.
    histogram = collections.defaultdict(int)
    for transaction in transactions:
        # Candidates have k + 1 items, so a transaction with at most k items
        # cannot contain any of them.
        if len(transaction) <= k:
            continue
        for candidate, candidate_set in candidate_sets:
            if candidate_set.issubset(transaction):
                histogram[candidate] += 1

    # Keep the candidates whose support reaches the threshold.
    new_layer = set()
    for itemset, count in histogram.items():
        support = count / N_TRANSACTIONS
        if support >= MIN_SUPPORT:
            new_layer.add(itemset)
            supports[itemset] = support

    L.append(new_layer)

In [90]:
# Report the number of itemsets held by every layer, labelling the layers
# from 1 upwards.
for k, layer in enumerate(L):
    layer_label = f'L({k+1}):'
    print(layer_label, len(layer))

L(1): 120
L(2): 605
L(3): 264
L(4): 12
L(5): 0


# Rule generation

In [91]:
# Every rule found, stored as the tuple
# (antecedent tuple, '=>', consequent frozenset, formatted confidence).
rules = set()

def subsets(size, itemset, antecedent=None):
    """Add the confident rules ``A => itemset - A`` to the global ``rules``.

    Every antecedent ``A`` of ``size`` items is tried, and the confidence of
    its rule is ``supports[itemset] / supports[A]``. When a rule reaches
    ``MIN_CONFIDENCE`` the search continues with the smaller antecedents
    contained in ``A``, still for the same ``itemset``. Antecedents below a
    rule that is not confident are skipped: a smaller antecedent cannot have
    a lower support, so its rule cannot have a higher confidence.

    The recursion used to pass ``A`` itself as the new itemset. That only
    re-derived rules of the smaller itemset and never produced a rule of
    ``itemset`` with more than one item in its consequent.

    Parameters:
        size: number of items in the antecedents to try; 0 ends the search.
        itemset: tuple of item names that the rules are derived from. It and
            every sub-tuple of it that gets tried must be a key of
            ``supports``.
        antecedent: tuple that the antecedents are drawn from. Defaults to
            ``itemset``; the recursive calls pass the confident antecedent
            that was just found.

    Returns:
        None. The rules are added to ``rules``, which is a set, so a rule
        reached along several paths is stored once.
    """
    if size == 0:
        return

    pool = itemset if antecedent is None else antecedent

    for subset in itertools.combinations(pool, size):
        confidence = supports[itemset] / supports[subset]

        if confidence >= MIN_CONFIDENCE:
            rules.add((subset, '=>', frozenset(itemset) - frozenset(subset), f'(confidence={confidence:.3f})'))

            # Keep the original itemset; only the antecedent shrinks.
            subsets(size - 1, itemset, subset)

# Go through the layers from the largest itemsets down to the 1-itemsets.
# L[k - 1] holds the itemsets of k items, and each of them is searched for
# rules starting with antecedents of k - 1 items.
for k in range(len(L), 0, -1):
    layer = L[k - 1]
    for itemset in layer:
        subsets(k - 1, itemset)

# Print the number of rules followed by one rule per line.
print('Rules:', len(rules))
for rule in rules:
    print(*rule)

Rules: 22
('fruit_vegetable_juice', 'other_vegetables', 'yogurt') => frozenset({'whole_milk'}) (confidence=0.617)
('butter', 'root_vegetables') => frozenset({'whole_milk'}) (confidence=0.638)
('butter', 'yogurt') => frozenset({'whole_milk'}) (confidence=0.639)
('domestic_eggs', 'tropical_fruit') => frozenset({'whole_milk'}) (confidence=0.607)
('other_vegetables', 'root_vegetables', 'yogurt') => frozenset({'whole_milk'}) (confidence=0.606)
('other_vegetables', 'root_vegetables', 'whipped_sour_cream') => frozenset({'whole_milk'}) (confidence=0.607)
('butter', 'domestic_eggs') => frozenset({'whole_milk'}) (confidence=0.621)
('butter', 'tropical_fruit') => frozenset({'whole_milk'}) (confidence=0.622)
('domestic_eggs', 'pip_fruit') => frozenset({'whole_milk'}) (confidence=0.624)
('pip_fruit', 'root_vegetables', 'whole_milk') => frozenset({'other_vegetables'}) (confidence=0.614)
('other_vegetables', 'tropical_fruit', 'yogurt') => frozenset({'whole_milk'}) (confidence=0.620)
('butter', 'whipp