In [1]:
from numpy import *

In [2]:
# Create a simple dataset of transactions
def createDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

In [3]:
def createC1(dataSet):
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
                
    C1.sort()
    return list(map(frozenset, C1)) # use frozen set so we
                            #can use it as a key in a dict

In [4]:
# Ck - List of candidates
# minSupport - Minimum support value that the candidate sets need to meet
# This function generates the first set of candidates (L1) that meet the required support values
# It also returns a dictionary with support values of the L1 set
def scanDataSet(dataset, Ck, minSupport):
    ssCnt = {}
    for tid in dataset:
        for can in Ck:
            if can.issubset(tid):
                if not can in ssCnt: 
                    ssCnt[can] = 1
                else: 
                    ssCnt[can] += 1
    numItems = float(len(dataset))
    retList = []
    supportData = {}
    for key in ssCnt:
        support = ssCnt[key] / numItems
        if support >= minSupport:
            retList.insert(0,key)
        # add the support for items that did not make the cut for debugging purposes
        supportData[key] = support
    return retList, supportData

In [5]:
dataset = createDataSet()

In [6]:
dataset

[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

In [7]:
# Converts the list of transactions into a list of single item sets
# It returns a frozenset to ensure that the entries in the set can be used as keys in a dictionary
# since frozenset values are immutable
C1 = createC1(dataset)

In [8]:
C1

[frozenset({1}),
 frozenset({2}),
 frozenset({3}),
 frozenset({4}),
 frozenset({5})]

In [9]:
# Converts the transactions into a set and returns a list of sets
# This is needed because it is easier to work with sets in the apriori algorithm
# rather than with a list of items in a transaction
# This is the dataset we will be working with from now
D = list(map(set, dataset))

In [10]:
D

[{1, 3, 4}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5}]

In [11]:
# We have set minimum support value as 0.5
L1, supportData0 = scanDataSet(D, C1, 0.5)

In [12]:
L1

[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]

In [13]:
# We can see that since 4 has a support of less than 0.5, it has not been included in the L1 candidate set
supportData0

{frozenset({1}): 0.5,
 frozenset({3}): 0.75,
 frozenset({4}): 0.25,
 frozenset({2}): 0.75,
 frozenset({5}): 0.75}

In [14]:
# This function takes a list of frequent itemsets (Lk) and the size of itemsets (k)
# It produces the new candidate set Ck
# For example, it will take itemsets {0}, {1}, {2} and produce {0, 1}, {0, 2} and {1, 2}
def CandidateSetGenerator(Lk, k):
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk): # Consider one item and a time
        for j in range(i+1, lenLk): # Considers all items after "ith" item in the list
            L1 = list(Lk[i])[:k-2]
            L2 = list(Lk[j])[:k-2]
            L1.sort()
            L2.sort()
            if L1==L2: #if first k-2 elements are equal
                retList.append(Lk[i] | Lk[j]) # This is the set union operator in Python
    return retList

In [15]:
def aprioriAlgorithm(dataset, minSupport = 0.5):
    C1 = createC1(dataset)
    D = list(map(set, dataset))
    L1, supportData = scanDataSet(D, C1, minSupport)
    L = [L1]
    k = 2
    while (len(L[k-2]) > 0):
        Ck = CandidateSetGenerator(L[k-2], k)
        Lk, supK = scanDataSet(D, Ck, minSupport) # scan transactions to get Lk
        supportData.update(supK) # Replace the support data with 
        L.append(Lk)
        k += 1
    return L, supportData

In [16]:
L,suppData = aprioriAlgorithm(dataset)

In [17]:
L

[[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})],
 [frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})],
 [frozenset({2, 3, 5})],
 []]

In [18]:
L[0]

[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]

In [19]:
L[1]

[frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3})]

In [20]:
L[2]

[frozenset({2, 3, 5})]

In [21]:
L[3] # Since output is empty, we stop here

[]

In [22]:
# This function takes the list of all frequent itemsets and mines the association rules from it
# It returns a list of association rules that meet the given minConf criteria of 0.7
# supportData 
def generateRules(L, supportData, minConf=0.7):  # supportData is a dict coming from scanDataSet
    bigRuleList = []
    for i in range(1, len(L)): # only get the sets with two or more items
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if (i > 1):
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList  

In [23]:
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    prunedH = [] # create new list to return
    for conseq in H:
        temp_val = supportData[freqSet - conseq]
        conf = supportData[freqSet] / supportData[freqSet - conseq]
        if conf >= minConf: 
            print (freqSet-conseq,'-->',conseq,'conf:',conf)
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH

In [24]:
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])
    if (len(freqSet) > (m + 1)): # try further merging
        Hmp1 = aprioriAlgorithm(H, m+1) # create Hm+1 new candidates
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if (len(Hmp1) > 1):    # need at least two sets to merge
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)

In [25]:
L,suppData= aprioriAlgorithm(dataset,minSupport=0.5)

In [26]:
rules= generateRules(L,suppData, minConf=0.7)

frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({1}) --> frozenset({3}) conf: 1.0


TypeError: unsupported operand type(s) for -: 'frozenset' and 'list'

In [27]:

from __future__ import division, print_function
import pandas as pd
import numpy as np
import itertools


class Rule():
    def __init__(self, antecedent, concequent, confidence, support):
        self.antecedent = antecedent
        self.concequent = concequent
        self.confidence = confidence
        self.support = support


class Apriori():
    """A method for determining frequent itemsets in a transactional database and
    also for generating rules for those itemsets.

    Parameters:
    -----------
    min_sup: float
        The minimum fraction of transactions an itemets needs to
        occur in to be deemed frequent
    min_conf: float:
        The minimum fraction of times the antecedent needs to imply
        the concequent to justify rule
    """
    def __init__(self, min_sup=0.3, min_conf=0.81):

        self.min_sup = min_sup
        self.min_conf = min_conf
        self.freq_itemsets = None       # List of freqeuent itemsets
        self.transactions = None        # List of transactions

    def _calculate_support(self, itemset):
        count = 0
        for transaction in self.transactions:
            if self._transaction_contains_items(transaction, itemset):
                count += 1
        support = count / len(self.transactions)
        return support

    # Prunes the candidates that are not frequent
    # => returns list with only frequent itemsets
    def _get_frequent_itemsets(self, candidates):
        frequent = []
        # Find frequent items
        for itemset in candidates:
            support = self._calculate_support(itemset)
            if support >= self.min_sup:
                frequent.append(itemset)
        return frequent

    # True or false depending on the candidate has any
    # subset with size k - 1 that is not in the frequent
    # itemset
    def _has_infrequent_itemsets(self, candidate):
        k = len(candidate)
        # Find all combinations of size k-1 in candidate
        # E.g [1,2,3] => [[1,2],[1,3],[2,3]]
        subsets = list(itertools.combinations(candidate, k - 1))
        for t in subsets:
            # t - is tuple. If size == 1 get the element
            subset = list(t) if len(t) > 1 else t[0]
            if not subset in self.freq_itemsets[-1]:
                return True
        return False

    # Joins the elements in the frequent itemset and prunes
    # resulting sets if they contain subsets that have been determined
    # to be infrequent.
    def _generate_candidates(self, freq_itemset):
        candidates = []
        for itemset1 in freq_itemset:
            for itemset2 in freq_itemset:
                # Valid if every element but the last are the same
                # and the last element in itemset1 is smaller than the last
                # in itemset2
                valid = False
                single_item = isinstance(itemset1, int)
                if single_item and itemset1 < itemset2:
                    valid = True
                elif not single_item and np.array_equal(itemset1[:-1], itemset2[:-1]) and itemset1[-1] < itemset2[-1]:
                    valid = True

                if valid:
                    # JOIN: Add the last element in itemset2 to itemset1 to
                    # create a new candidate
                    if single_item:
                        candidate = [itemset1, itemset2]
                    else:
                        candidate = itemset1 + [itemset2[-1]]
                    # PRUNE: Check if any subset of candidate have been determined
                    # to be infrequent
                    infrequent = self._has_infrequent_itemsets(candidate)
                    if not infrequent:
                        candidates.append(candidate)
        return candidates

    # True or false depending on each item in the itemset is
    # in the transaction
    def _transaction_contains_items(self, transaction, items):
        # If items is in fact only one item
        if isinstance(items, int):
            return items in transaction
        # Iterate through list of items and make sure that
        # all items are in the transaction
        for item in items:
            if not item in transaction:
                return False
        return True

    # Returns the set of frequent itemsets in the list of transactions
    def find_frequent_itemsets(self, transactions):
        self.transactions = transactions
        # Get all unique items in the transactions
        unique_items = set(item for transaction in self.transactions for item in transaction)
        # Get the frequent items
        self.freq_itemsets = [self._get_frequent_itemsets(unique_items)]
        while(True):
            # Generate new candidates from last added frequent itemsets
            candidates = self._generate_candidates(self.freq_itemsets[-1])
            # Get the frequent itemsets among those candidates
            frequent_itemsets = self._get_frequent_itemsets(candidates)

            # If there are no frequent itemsets we're done
            if not frequent_itemsets:
                break

            # Add them to the total list of frequent itemsets and start over
            self.freq_itemsets.append(frequent_itemsets)

        # Flatten the array and return every frequent itemset
        frequent_itemsets = [
            itemset for sublist in self.freq_itemsets for itemset in sublist]
        return frequent_itemsets

    # Recursive function which returns the rules where confidence >= min_confidence
    # Starts with large itemset and recursively explores rules for subsets
    def _rules_from_itemset(self, initial_itemset, itemset):
        rules = []
        k = len(itemset)
        # Get all combinations of sub-itemsets of size k - 1 from itemset
        # E.g [1,2,3] => [[1,2],[1,3],[2,3]]
        subsets = list(itertools.combinations(itemset, k - 1))
        support = self._calculate_support(initial_itemset)
        for antecedent in subsets:
            # itertools.combinations returns tuples => convert to list
            antecedent = list(antecedent)
            antecedent_support = self._calculate_support(antecedent)
            # Calculate the confidence as sup(A and B) / sup(B), if antecedent
            # is B in an itemset of A and B
            confidence = float("{0:.2f}".format(support / antecedent_support))
            if confidence >= self.min_conf:
                # The concequent is the initial_itemset except for antecedent
                concequent = [itemset for itemset in initial_itemset if not itemset in antecedent]
                # If single item => get item
                if len(antecedent) == 1:
                    antecedent = antecedent[0]
                if len(concequent) == 1:
                    concequent = concequent[0]
                # Create new rule
                rule = Rule(
                        antecedent=antecedent,
                        concequent=concequent,
                        confidence=confidence,
                        support=support)
                rules.append(rule)

                # If there are subsets that could result in rules
                # recursively add rules from subsets
                if k - 1 > 1:
                    rules += self._rules_from_itemset(initial_itemset, antecedent)
        return rules

    def generate_rules(self, transactions):
        self.transactions = transactions
        frequent_itemsets = self.find_frequent_itemsets(transactions)
        # Only consider itemsets of size >= 2 items
        frequent_itemsets = [itemset for itemset in frequent_itemsets if not isinstance(
                itemset, int)]
        rules = []
        for itemset in frequent_itemsets:
            rules += self._rules_from_itemset(itemset, itemset)
        # Remove empty values
        return rules


def main():
    # Demo transaction set
    # Example 2: https://en.wikipedia.org/wiki/Apriori_algorithm
    transactions = np.array([[1, 2, 3, 4], [1, 2, 4], [1, 2], [2, 3, 4], [2, 3], [3, 4], [2, 4]])
    print ("- Apriori -")
    min_sup = 0.25
    min_conf = 0.8
    print ("Minimum Support: %.2f, Minimum Confidence: %s" % (min_sup, min_conf))
    print ("Transactions:")
    for transaction in transactions:
        print ("\t%s" % transaction)

    apriori = Apriori(min_sup=min_sup, min_conf=min_conf)

    # Get and print the frequent itemsets
    frequent_itemsets = apriori.find_frequent_itemsets(transactions)
    print ("Frequent Itemsets:\n\t%s" % frequent_itemsets)

    # Get and print the rules
    rules = apriori.generate_rules(transactions)
    print ("Rules:")
    for rule in rules:
        print ("\t%s -> %s (support: %.2f, confidence: %s)" % (rule.antecedent, rule.concequent, rule.support, rule.confidence,))


if __name__ == "__main__":
    main()


- Apriori -
Minimum Support: 0.25, Minimum Confidence: 0.8
Transactions:
	[1, 2, 3, 4]
	[1, 2, 4]
	[1, 2]
	[2, 3, 4]
	[2, 3]
	[3, 4]
	[2, 4]
Frequent Itemsets:
	[1, 2, 3, 4, [1, 2], [1, 4], [2, 3], [2, 4], [3, 4], [1, 2, 4], [2, 3, 4]]
Rules:
	1 -> 2 (support: 0.43, confidence: 1.0)
	4 -> 2 (support: 0.57, confidence: 0.8)
	[1, 4] -> 2 (support: 0.29, confidence: 1.0)
