# Loading data

In [1]:
import csv
from collections import defaultdict

with open('myDataFile.csv', 'r') as csvfile:
    csvreader = csv.reader(csvfile)

    items = next(csvreader)
    transactions = []

    for row in csvreader:
        row_items = []
        for i, value in enumerate(row):
            if value == 't':
                row_items.append(items[i])
        transactions.append(sorted(row_items))

N_ITEMS = len(items)
N_TRANSACTIONS = len(transactions)

# Parameters

In [2]:
MIN_SUPPORT = 0.005
MIN_CONFIDENCE = 0.6

# Apriori algorithm

In [3]:
L = []
supports = {}

histogram = defaultdict(int)
for t in transactions:
    for item in t:
        histogram[item] += 1

new_layer = []
for item, count in histogram.items():
    if (count / N_TRANSACTIONS) >= MIN_SUPPORT:
        new_layer.append([item])
        supports[(item,)] = count

L.append(new_layer)

k = 0
while len(L[k]) != 0:
    k += 1

    candidates = []
    for set1 in L[k-1]:
        for set2 in L[k-1]:
            if set1[:-1] != set2[:-1] or set1[-1] >= set2[-1]:
                continue

            candidate = set1 + [set2[-1]]
            include = True

            for i in range(len(candidate)):
                subset = candidate[:i] + candidate[i+1:]
                if subset not in L[k - 1]:
                    include = False
                    break

            if include:
                candidates.append(candidate)

    histogram = defaultdict(int)
    for t in transactions:
        for c in candidates:
            if set(c).issubset(set(t)):
                histogram[tuple(c)] += 1

    new_layer = []
    for itemset, count in histogram.items():
        if (count / N_TRANSACTIONS) >= MIN_SUPPORT:
            new_layer.append(list(itemset))
            supports[itemset] = count

    L.append(new_layer)

In [4]:
for k, layer in enumerate(L):
    print(f'L({k+1}):', len(layer))

L(1): 120
L(2): 605
L(3): 264
L(4): 12
L(5): 0


# Rule generation

In [6]:
for k in range(1, len(L)):
    for itemset in L[k]:
        x = set(itemset)

        for i in range(1, 2**k):
            s = (itemset[j] for j in range(k) if (i & (1 << j)))
            confidence = supports[tuple(itemset)] / supports[tuple(s)]

            if confidence >= MIN_CONFIDENCE:
                print(s, '=>', x - set(s), f'(confidence={confidence})')

<generator object <genexpr> at 0x119b78890> => {'butter', 'tropical_fruit', 'whole_milk'} (confidence=0.6224489795918368)
<generator object <genexpr> at 0x119b78cf0> => {'bottled_water', 'butter', 'whole_milk'} (confidence=0.6022727272727273)
<generator object <genexpr> at 0x119b78cf0> => {'curd', 'tropical_fruit', 'whole_milk'} (confidence=0.6336633663366337)
<generator object <genexpr> at 0x119b78890> => {'tropical_fruit', 'domestic_eggs', 'whole_milk'} (confidence=0.6071428571428571)
<generator object <genexpr> at 0x119b78d60> => {'domestic_eggs', 'margarine', 'whole_milk'} (confidence=0.6219512195121951)
<generator object <genexpr> at 0x119b78d60> => {'pip_fruit', 'domestic_eggs', 'whole_milk'} (confidence=0.6235294117647059)
<generator object <genexpr> at 0x119b78d60> => {'butter', 'root_vegetables', 'whole_milk'} (confidence=0.6377952755905512)
<generator object <genexpr> at 0x119b78890> => {'butter', 'whipped_sour_cream', 'whole_milk'} (confidence=0.66)
<generator object <genexp