# Homework 2: Discovery of Frequent Itemsets and Association Rules

## Libraries

In [2]:
from collections import defaultdict
import itertools
import time

## Functions

In [165]:
def filterSupport(L, s):
  """
  Keep only the candidates whose count reaches the support threshold.

  L: dictionary mapping a candidate key to its count.
  s: minimum count (inclusive) a candidate needs in order to be kept.

  Return: new dictionary {key: count, ...} containing only the
          entries of L with count >= s, in L's iteration order.
  """
  frequent = {}
  for candidate, count in L.items():
    if count < s:
      # Below the threshold: drop this candidate
      continue
    frequent[candidate] = count
  return frequent

In [166]:
def reduceByKey(L):
  """
  Count how many times each item occurs in L.

  L: iterable of hashable items.

  Return: defaultdict(int) of the form {(item,): count, ...}; every
          key is the item wrapped in a 1-tuple so that it has the
          same shape as the larger itemset keys.
  """
  counts = defaultdict(int)
  for item in L:
    key = (item,)
    counts[key] = counts[key] + 1
  return counts

In [176]:
def apriori_gen(C_tak, single_item):
  """
  Generate candidate k-itemsets from (k-1)-itemsets and 1-itemsets.

  C_tak:       iterable of (k-1)-itemsets, each a tuple of items
               (a dictionary keyed by such tuples works as well).
  single_item: iterable of 1-itemsets, each a 1-tuple (item,)
               (a dictionary keyed by such tuples works as well).

  Return: dictionary {candidate: 0, ...} where every candidate is a
          sorted tuple made of one (k-1)-itemset extended with one
          single item that it did not already contain.
  """
  Ck = {}
  # Loop through all (k-1)-itemsets
  for item_set in C_tak:
    # Use the argument that was passed in rather than a module-level
    # name, so the function does not depend on global state
    for item in single_item:
      # Only extend with an item that is not already in the itemset
      if item[0] not in item_set:
        # Sorting gives each itemset one canonical key, so the same
        # combination reached in a different order is stored once
        new_item = tuple(sorted(item_set + item))
        Ck.setdefault(new_item, 0)

  # Return: New k-itemsets
  # Structure: {item0: 0, item1: 0, ...}
  return Ck

In [162]:
def read_file(path="T10I4D100K.dat"):
  """
  Read database file of sale transactions

  path: file to read; defaults to the data set used by this notebook.
        Each non-blank line is one transaction written as
        whitespace-separated integer item ids.

  Blank lines are skipped, and runs of whitespace count as a single
  separator, so stray spaces or a trailing empty line do not raise
  ValueError.
  """

  transactions = []
  with open(path) as file:
    for line in file:
      # split() without an argument ignores leading/trailing whitespace
      # and collapses repeated separators
      fields = line.split()
      if fields:
        transactions.append(tuple(map(int, fields)))
  # Flatten all transactions into one list of item occurrences
  single_items = list(itertools.chain.from_iterable(transactions))

  # Return: transactions: list of transactions in the form of tuples
  #         single_items: list of total items bought
  return transactions, single_items

## Algorithm

### Creating baskets

In [173]:
# Load every transaction (basket) plus the flat list of all item occurrences
transactions, single_items = read_file()
# Minimum support: an itemset must reach this count to be kept
s  = 1000

### First iteration of the algorithm

In [174]:
# Count every item occurrence and keep the items whose count is >= s.
# From here on single_items is the dictionary of frequent 1-itemsets,
# no longer the flat list of items; L refers to the same dictionary.
L = filterSupport(reduceByKey(single_items), s)
single_items = L
# frequent_k_itemsets[i] holds the frequent itemsets of size i + 1
frequent_k_itemsets = [L]

### Apriori main algorithm

In [175]:
k = 2
start = time.time()
print("Frequent itemsets ( k =", k-1, ")", L)

# Iterate until there are no frequent itemsets left for size k
while L:
  # Generate candidate k-itemsets from the frequent (k-1)-itemsets and
  # the frequent 1-itemsets (single_items was rebound to that
  # dictionary in the first iteration)
  Ck = apriori_gen(L, single_items)
  new_transactions = []
  # Iterate over the transactions and count, for every candidate
  # k-itemset, the transactions that contain it
  for transaction in transactions:
    relevant = False
    # Make all possible transaction combinations of size k
    transaction_combinations = itertools.combinations(transaction, k)
    # Find which candidate k-itemsets are within this transaction
    # and count each occurrence.
    # NOTE(review): candidate keys are sorted tuples, so this lookup
    # only matches when the items of a transaction are stored in
    # ascending order -- confirm for the input file.
    for combination in transaction_combinations:
      if combination in Ck:
        Ck[combination] += 1
        relevant = True

    # If the transaction contains any candidate k-itemset,
    # it will be used for the next (k+1) iteration
    if relevant:
      new_transactions.append(transaction)

  # Filter out k-itemsets with fewer than s occurrences
  L = filterSupport(Ck, s)
  # Keep only the transactions that contained at least one candidate
  transactions = new_transactions
  # Add frequent k-itemsets (the final, empty level is appended too)
  frequent_k_itemsets.append(L)

  # Report before incrementing k so the label matches the itemset size
  # that was just computed
  print("Frequent itemsets ( k =", k, ")", L)
  k += 1
print("Time: ", time.time()- start, "s")
    

Frequent itemsets ( k = 1 ) {(25,): 1395, (52,): 1983, (240,): 1399, (274,): 2628, (368,): 7828, (448,): 1370, (538,): 3982, (561,): 2783, (630,): 1523, (687,): 1762, (775,): 3771, (825,): 3085, (834,): 1373, (39,): 4258, (120,): 4973, (205,): 3605, (401,): 3667, (581,): 2943, (704,): 1794, (814,): 1672, (35,): 1984, (674,): 2527, (733,): 1141, (854,): 2847, (950,): 1463, (422,): 1255, (449,): 1890, (857,): 1588, (895,): 3385, (937,): 4681, (964,): 1518, (229,): 2281, (283,): 4082, (294,): 1445, (381,): 2959, (708,): 1090, (738,): 2129, (766,): 6265, (853,): 1804, (883,): 4902, (966,): 3921, (978,): 1141, (104,): 1158, (143,): 1417, (569,): 2835, (620,): 2100, (798,): 3103, (185,): 1529, (214,): 1893, (350,): 3069, (529,): 7057, (658,): 1881, (682,): 4132, (782,): 2767, (809,): 2163, (947,): 3690, (970,): 2086, (227,): 1818, (390,): 2685, (71,): 3507, (192,): 2004, (208,): 1483, (279,): 3014, (280,): 2108, (496,): 1428, (530,): 1263, (597,): 2883, (618,): 1337, (675,): 2976, (720,): 38