# Loading data

In [50]:
import csv, collections, itertools, time

with open('myDataFile.csv', 'r') as csvfile:
    csvreader = csv.reader(csvfile)

    items = next(csvreader)
    transactions = [
        set(items[i] for i, v in enumerate(row) if v == 't')
        for row in csvreader
    ]

N_TRANSACTIONS = len(transactions)

print('Transactions:', N_TRANSACTIONS)

Transactions: 9835


# Parameters

In [51]:
MIN_SUPPORT = 0.005
MIN_CONFIDENCE = 0.6

# Apriori algorithm

In [52]:
L = []
supports = {}

histogram = collections.defaultdict(int)
for transaction in transactions:
    for item in transaction:
        histogram[(item,)] += 1

layer_1 = set()
for itemset, count in histogram.items():
    support = count / N_TRANSACTIONS
    if support >= MIN_SUPPORT:
        layer_1.add(itemset)
        supports[itemset] = support

L.append(layer_1)

In [53]:
k = 0
while len(L[k]) != 0:
    k += 1

    candidates = []
    for itemset1 in L[k-1]:
        for itemset2 in L[k-1]:
            if itemset1[:-1] != itemset2[:-1] or itemset1[-1] >= itemset2[-1]:
                continue

            candidate = itemset1 + (itemset2[-1],)
            include = True

            for i in range(len(candidate)):
                subset = candidate[:i] + candidate[i+1:]
                if subset not in L[k-1]:
                    include = False
                    break

            if include:
                candidates.append(candidate)

    histogram = collections.defaultdict(int)
    for transaction in transactions:
        for candidate in candidates:
            if set(candidate).issubset(transaction):
                histogram[candidate] += 1

    new_layer = set()
    for itemset, count in histogram.items():
        support = count / N_TRANSACTIONS
        if support >= MIN_SUPPORT:
            new_layer.add(itemset)
            supports[itemset] = support

    L.append(new_layer)

In [54]:
for k, layer in enumerate(L):
    print(f'L({k+1}):', len(layer))

L(1): 120
L(2): 605
L(3): 264
L(4): 12
L(5): 0


# Rule generation

In [59]:
# rules = []

# for k in range(1, len(L)):
#     for itemset in L[k]:
#         itemset_as_set = set(itemset)

#         for i in range(1, 1 << (k + 1)):
#             subset = tuple(
#                 itemset[j]
#                 for j in range(k + 1) if (i & (1 << j))
#             )

#             print(subset)

#             if subset == itemset:
#                 break

#             confidence = supports[itemset] / supports[subset]

#             if confidence >= MIN_CONFIDENCE:
#                 subset_as_set = set(subset)

#                 rules.append(
#                     (subset_as_set, '=>',
#                     itemset_as_set - subset_as_set,
#                     f'(confidence={confidence:.3f})')
#                 )

# print('Rules:', len(rules))
# for rule in rules:
#     print(*rule)

t0 = time.time()

for _ in range(1000):
    rules = set()

    def subsets(size, itemset):
        if size == 0:
            return

        for subset in itertools.combinations(itemset, size):
            confidence = supports[itemset] / supports[subset]

            if confidence >= MIN_CONFIDENCE:
                rules.add((subset, '=>', frozenset(set(itemset) - set(subset)), f'(confidence={confidence:.3f})'))

                subsets(size - 1, subset)


    for k, layer in zip(range(len(L), -1, -1), reversed(L)):
        for itemset in layer:
            subsets(k - 1, itemset)

    print(len(rules))
    for rule in rules:
        print(*rule)

t1 = time.time()

print((t1 - t0) / 1000)



22
('fruit_vegetable_juice', 'other_vegetables', 'yogurt') => frozenset({'whole_milk'}) (confidence=0.617)
('butter', 'root_vegetables') => frozenset({'whole_milk'}) (confidence=0.638)
('butter', 'yogurt') => frozenset({'whole_milk'}) (confidence=0.639)
('domestic_eggs', 'tropical_fruit') => frozenset({'whole_milk'}) (confidence=0.607)
('other_vegetables', 'root_vegetables', 'yogurt') => frozenset({'whole_milk'}) (confidence=0.606)
('other_vegetables', 'root_vegetables', 'whipped_sour_cream') => frozenset({'whole_milk'}) (confidence=0.607)
('butter', 'domestic_eggs') => frozenset({'whole_milk'}) (confidence=0.621)
('butter', 'tropical_fruit') => frozenset({'whole_milk'}) (confidence=0.622)
('domestic_eggs', 'pip_fruit') => frozenset({'whole_milk'}) (confidence=0.624)
('pip_fruit', 'root_vegetables', 'whole_milk') => frozenset({'other_vegetables'}) (confidence=0.614)
('other_vegetables', 'tropical_fruit', 'yogurt') => frozenset({'whole_milk'}) (confidence=0.620)
('butter', 'whipped_sour