In [0]:
# Standard library
import itertools
import unittest
from copy import deepcopy
from timeit import timeit
from collections import Counter

# Third-party library
from pandas import DataFrame as df
from pandas import read_csv

In [0]:
# https://www-users.cs.umn.edu/~kumar001/dmbook/ch6.pdf
class AprioriAlgorithm(object):
    """Apriori frequent-itemset mining and association-rule generation.

    Transactions are kept in ``self.dataset`` as a list of item lists.

    * ``generate_all_frequent_itemsets()`` fills ``self.all_frequent_itemsets``,
      a dict mapping each frequent ``frozenset`` of items to its support
      (rounded to 7 decimals).
    * ``generate_all_rules()`` fills ``self.all_rules``, a dict mapping
      ``(antecedent, consequent)`` pairs of frozensets to a dict with the keys
      ``'support'``, ``'confident'`` (the rule confidence) and ``'lift'``.

    Setters and ``generate_*`` methods return ``self`` so calls can be chained.
    """

    def __init__(self, *, dataset = None, minsup = None, minconf = None):
        """Create a miner.

        Args:
            dataset: optional list of transactions (each an iterable of
                hashable items).
            minsup: minimum support, as a fraction of the transactions.
            minconf: minimum confidence a rule needs to be kept.
        """
        # Attributes
        self.all_frequent_itemsets = None
        self.all_rules = None
        self.dataset = None
        self.minconf = None
        self.minsup = None
        self.n = None # number of transactions

        # An explicit None check (instead of truthiness) also accepts an
        # empty list and containers whose truth value is ambiguous.
        if dataset is not None:
            self.set_dataset(dataset)
        self.set_minsup(minsup)
        self.set_minconf(minconf)


    def set_dataset(self, dataset):
        """Store the transaction list and its length; returns ``self``."""
        self.dataset = dataset
        self.n = len(self.dataset)
        return self


    def set_minconf(self, minconf):
        """Set the minimum rule confidence; returns ``self``."""
        self.minconf = minconf
        return self


    def set_minsup(self, minsup):
        """Set the minimum itemset support; returns ``self``."""
        self.minsup = minsup
        return self


    def set_transaction_dataset(self, *, transaction_dataset, transaction_id_column, itemset_column):
        """Build ``self.dataset`` from a long-format pandas DataFrame.

        Args:
            transaction_dataset: DataFrame with one row per (transaction, item).
            transaction_id_column: name of the column identifying a transaction.
            itemset_column: name of the column holding the item.

        Returns:
            ``self``. ``self.dataset`` holds one list of distinct items per
            transaction id, ordered by transaction id and then by item.
        """
        # Column selection must use a list: a set is rejected as an indexer
        # by pandas >= 2.0 and has no defined column order anyway. The
        # selection already yields a new frame, so the input is not modified.
        pairs = (
            transaction_dataset[[transaction_id_column, itemset_column]]
            .drop_duplicates()
            .sort_values([transaction_id_column, itemset_column])
        )
        # One grouped pass instead of one index lookup per transaction id.
        # sort=False keeps the order established by sort_values above;
        # dropna=False keeps rows whose transaction id is missing.
        grouped = pairs.groupby(
            transaction_id_column, sort = False, dropna = False
        )[itemset_column]
        return self.set_dataset([items.tolist() for _, items in grouped])


    def _calculate_support(self, itemset):
        """Return the fraction of transactions that contain every item of ``itemset``."""
        items = frozenset(itemset)
        # issubset accepts any iterable, so events need no conversion.
        hits = sum(1 for event in self.dataset if items.issubset(event))
        # An empty dataset has no support for anything (avoids dividing by 0).
        return hits / self.n if self.n else 0.0


    def generate_all_frequent_itemsets(self):
        """Mine every frequent itemset level by level; returns ``self``.

        Starts from the frequent single items, then repeatedly merges the
        frequent k-itemsets into (k+1)-candidates and keeps those meeting
        ``minsup`` until a level yields nothing.
        """
        self.all_frequent_itemsets = dict()

        previous_frequent_itemsets = self._generate_frequent_single_itemsets()
        self.all_frequent_itemsets.update(previous_frequent_itemsets)

        while True:
            candidate_itemsets = self._generate_candidate_itemsets(
                list(previous_frequent_itemsets.keys())
            )
            frequent_itemsets = self._prune_frequent_itemsets(candidate_itemsets)
            if not frequent_itemsets:
                break
            self.all_frequent_itemsets.update(frequent_itemsets)
            previous_frequent_itemsets = frequent_itemsets
        return self


    def generate_all_rules(self):
        """Generate the rules of every frequent itemset with 2+ items; returns ``self``.

        Frequent itemsets are mined first when none are available yet.
        """
        self.all_rules = dict()

        if not self.all_frequent_itemsets:
            self.generate_all_frequent_itemsets()

        for frequent_itemset in self.all_frequent_itemsets:
            if len(frequent_itemset) > 1:
                self._generate_rules(itemset=frequent_itemset)
        return self


    @staticmethod
    def _generate_candidate_itemsets(itemsets):
        """Merge k-itemsets into (k+1)-item candidates.

        Two itemsets are merged when their sorted items agree on everything
        but the last element. Each itemset must be a collection of items
        (frozenset, set, list, ...), not a bare item.

        Returns:
            list of ``frozenset`` candidates.
        """
        # Sort each itemset once instead of once per pair.
        ordered = [sorted(itemset) for itemset in itemsets]
        candidate_itemsets = list()
        for i, first in enumerate(ordered[:-1]):
            for second in ordered[i+1:]:
                if first[:-1] == second[:-1]:
                    candidate_itemsets.append(frozenset(first).union(second))
        return candidate_itemsets


    def _generate_frequent_single_itemsets(self):
        """Return ``{frozenset({item}): support}`` for every frequent single item."""
        single_itemsets = {
            frozenset([item])
            for event in self.dataset
            for item in event
        }
        return self._prune_frequent_itemsets(single_itemsets)


    def _generate_rules(self, itemset, previous_consequents=None):
        """Add to ``self.all_rules`` the confident rules derived from ``itemset``.

        Consequents are always frozensets. The first call uses the single
        items of ``itemset``; each recursive call merges the consequents that
        were confident at the previous level into consequents one item
        larger, until the antecedent would become empty.

        Args:
            itemset: a frequent itemset (must be a key of
                ``self.all_frequent_itemsets`` once frozen).
            previous_consequents: consequents (collections of items) that
                were confident at the previous level, or ``None`` to start.
        """
        itemset = frozenset(itemset)
        if previous_consequents:
            consequents = self._generate_candidate_itemsets(previous_consequents)
        else:
            # Wrap each item so every level handles the same type; comparing
            # sizes of raw items (e.g. the length of a string) is meaningless.
            consequents = [frozenset([item]) for item in itemset]

        # Stop when nothing is left or the consequent would swallow the
        # whole itemset (empty antecedent).
        if not consequents or len(consequents[0]) >= len(itemset):
            return

        itemset_support = self.all_frequent_itemsets[itemset]
        confident_consequents = []
        for consequent in consequents:
            antecedent = itemset - consequent
            confident = itemset_support / self.all_frequent_itemsets[antecedent]
            if confident < self.minconf:
                # Not carried over to the next level.
                continue
            lift = confident / self.all_frequent_itemsets[consequent]
            self.all_rules[(antecedent, consequent)] = {
                'support': itemset_support,
                'confident': round(confident, 7),
                'lift': round(lift, 7)
            }
            confident_consequents.append(consequent)

        if confident_consequents and len(itemset) > len(confident_consequents[0]) + 1:
            self._generate_rules(itemset, previous_consequents = confident_consequents)


    def _prune_frequent_itemsets(self, itemsets):
        """Return ``{frozenset: support}`` for the itemsets whose support is >= ``minsup``.

        Supports are rounded to 7 decimals in the returned dict.
        """
        frequent_itemsets = dict()
        for itemset in itemsets:
            frozen = frozenset(itemset)
            support = self._calculate_support(frozen)
            if support >= self.minsup:
                frequent_itemsets[frozen] = round(support, 7)
        return frequent_itemsets
        



class TestAprioriAlgorithm(unittest.TestCase):
    """Checks of ``AprioriAlgorithm`` on a fixed 16-transaction toy dataset.

    The three miners below are class attributes, so they are shared by every
    test method and keep whatever state an earlier test left on them.
    """
    maxDiff = None
    dataset =  [
        ["a"], ["a", "b", "c"], ["a", "c"], ["c"],
        ["a"], ["c"], ["b", "c"],
        ["a", "b"], ["d"], ["c"], ["b"], ["c"],
        ["a"], ["c"], ["b"], ["c"]
    ]

    # Same data, increasingly strict thresholds.
    apriori_algorithm_case_1 = AprioriAlgorithm(dataset = dataset, minsup = 2/16, minconf = 0.1)
    apriori_algorithm_case_2 = AprioriAlgorithm(dataset = dataset, minsup = 3/16, minconf = 0.2)
    apriori_algorithm_case_3 = AprioriAlgorithm(dataset = dataset, minsup = 6/16, minconf = 0.3)


    def test__calculate_support(self):
        """Support equals (number of containing transactions) / 16."""
        occurrences = [
            (['a'], 6),
            (['b'], 5),
            (['c'], 9),
            (['d'], 1),
            (['a','b'], 2),
            (['a','c'], 2),
            (['a','b','c'], 1),
            (['a','b','c','d'], 0),
        ]
        for itemset, count in occurrences:
            self.assertEqual(
                self.apriori_algorithm_case_1._calculate_support(itemset),
                count/16
            )


    def test__prune_frequent_itemsets(self):
        """Only single items reaching each miner's minsup survive pruning."""
        a_b_c = {
            frozenset({'a'}): 0.375,
            frozenset({'b'}): 0.3125,
            frozenset({'c'}): 0.5625
        }
        a_c = {
            frozenset({'a'}): 0.375,
            frozenset({'c'}): 0.5625
        }
        scenarios = (
            (self.apriori_algorithm_case_1, a_b_c),
            (self.apriori_algorithm_case_2, a_b_c),
            (self.apriori_algorithm_case_3, a_c),
        )
        for miner, expected in scenarios:
            self.assertEqual(
                miner._prune_frequent_itemsets([['a'],['b'],['c'],['d']]),
                expected
            )


    def test__generate_frequent_single_itemsets(self):
        """Frequent single items are discovered from the dataset itself."""
        a_b_c = {
            frozenset({'a'}): 0.375,
            frozenset({'b'}): 0.3125,
            frozenset({'c'}): 0.5625
        }
        a_c = {
            frozenset({'a'}): 0.375,
            frozenset({'c'}): 0.5625
        }
        scenarios = (
            (self.apriori_algorithm_case_1, a_b_c),
            (self.apriori_algorithm_case_2, a_b_c),
            (self.apriori_algorithm_case_3, a_c),
        )
        for miner, expected in scenarios:
            self.assertEqual(miner._generate_frequent_single_itemsets(), expected)


    def test__generate_candidate_itemsets(self):
        """Candidates are compared as multisets, for frozenset and list inputs."""
        pairs = [frozenset({'a', 'b'}), frozenset({'a', 'c'}), frozenset({'c', 'b'})]
        triple = [frozenset({'c', 'b', 'a'})]
        scenarios = (
            ([frozenset({'a'}), frozenset({'c'}), frozenset({'b'})], pairs),
            ([frozenset({'a', 'b'}), frozenset({'a', 'c'}), frozenset({'c', 'b'})], triple),
            ([['a'], ['b'], ['c']], pairs),
            ([['a','b'], ['b','c'], ['a','c']], triple),
        )
        for itemsets, expected in scenarios:
            candidate_itemsets = self.apriori_algorithm_case_1._generate_candidate_itemsets(itemsets)
            self.assertEqual(Counter(candidate_itemsets), Counter(expected))


    def test_generate_all_frequent_itemsets(self):
        """Full mining result for each of the three threshold settings."""
        scenarios = (
            (
                self.apriori_algorithm_case_1,
                {
                    frozenset({'c'}): 0.5625,
                    frozenset({'b'}): 0.3125,
                    frozenset({'a'}): 0.375,
                    frozenset({'b', 'c'}): 0.125,
                    frozenset({'a', 'c'}): 0.125,
                    frozenset({'a', 'b'}): 0.125
                },
            ),
            (
                self.apriori_algorithm_case_2,
                {
                    frozenset({'c'}): 0.5625,
                    frozenset({'b'}): 0.3125,
                    frozenset({'a'}): 0.375
                },
            ),
            (
                self.apriori_algorithm_case_3,
                {
                    frozenset({'c'}): 0.5625,
                    frozenset({'a'}): 0.375
                },
            ),
        )
        for miner, expected in scenarios:
            miner.generate_all_frequent_itemsets()
            self.assertEqual(miner.all_frequent_itemsets, expected)


    def test_generate_all_rules(self):
        """Rules of case 1, then of copies of it with minconf raised to 0.3 and 0.4."""
        a, b, c = (frozenset({item}) for item in 'abc')

        def metrics(confident, lift):
            # Every rule in this dataset comes from an itemset of support 2/16.
            return {'confident': confident, 'lift': lift, 'support': 0.125}

        every_rule = {
            (a, c): metrics(0.3333333, 0.5925926),
            (c, a): metrics(0.2222222, 0.5925926),
            (a, b): metrics(0.3333333, 1.0666667),
            (b, a): metrics(0.4, 1.0666667),
            (c, b): metrics(0.2222222, 0.7111111),
            (b, c): metrics(0.4, 0.7111111),
        }

        def only(*rule_keys):
            return {rule_key: every_rule[rule_key] for rule_key in rule_keys}

        self.apriori_algorithm_case_1.generate_all_rules()
        self.assertEqual(self.apriori_algorithm_case_1.all_rules, every_rule)

        stricter_variants = (
            ('apriori_algorithm_case_1a', 0.3, only((a, c), (a, b), (b, a), (b, c))),
            ('apriori_algorithm_case_1b', 0.4, only((b, a), (b, c))),
        )
        for attribute_name, minconf, expected in stricter_variants:
            # The copy is stored on the instance under the given attribute name.
            setattr(self, attribute_name, deepcopy(self.apriori_algorithm_case_1))
            variant = getattr(self, attribute_name)
            variant.set_minconf(minconf)
            variant.generate_all_frequent_itemsets().generate_all_rules()
            self.assertEqual(variant.all_rules, expected)


# Run the test methods by hand on a single TestCase instance, in this order
# (no unittest runner is involved, so a failure surfaces as a raised
# AssertionError in the cell).
test_apriori_algorithm = TestAprioriAlgorithm()
for test_name in (
    'test__calculate_support',
    'test__prune_frequent_itemsets',
    'test__generate_frequent_single_itemsets',
    'test__generate_candidate_itemsets',
    'test_generate_all_frequent_itemsets',
    'test_generate_all_rules',
):
    getattr(test_apriori_algorithm, test_name)()

In [33]:
# Toy transaction list: 16 transactions over the items a, b, c and d.
dataset = [
    ["a"], ["a", "b", "c"], ["a", "c"], ["c"], ["a"], ["c"], ["b", "c"], ["a", "b"],
    ["d"], ["c"], ["b"], ["c"], ["a"], ["c"], ["b"], ["c"],
]
apriori_algorithm = AprioriAlgorithm(dataset=dataset, minsup=2/16, minconf=0.1)
# Both generate_* methods return the instance, so the two steps are chained.
apriori_algorithm.generate_all_frequent_itemsets().generate_all_rules()
# Last expression of the cell: the notebook displays the rule dictionary.
apriori_algorithm.all_rules

{(frozenset({'a'}), frozenset({'c'})): {'confident': 0.3333333,
  'lift': 0.5925926,
  'support': 0.125},
 (frozenset({'c'}), frozenset({'a'})): {'confident': 0.2222222,
  'lift': 0.5925926,
  'support': 0.125},
 (frozenset({'a'}), frozenset({'b'})): {'confident': 0.3333333,
  'lift': 1.0666667,
  'support': 0.125},
 (frozenset({'b'}), frozenset({'a'})): {'confident': 0.4,
  'lift': 1.0666667,
  'support': 0.125},
 (frozenset({'c'}), frozenset({'b'})): {'confident': 0.2222222,
  'lift': 0.7111111,
  'support': 0.125},
 (frozenset({'b'}), frozenset({'c'})): {'confident': 0.4,
  'lift': 0.7111111,
  'support': 0.125}}

In [0]:
# Download the coffee-shop transactions; 'transaction_date' is parsed as a date.
coffee_shop_df = read_csv(
    'https://raw.githubusercontent.com/alexjdata/ml_projects/master/ml_projects/sequential_pattern/coffee_shop_dataset.csv',
    parse_dates=['transaction_date'],
)

# Turn the frame into one list of product types per transaction id and keep
# only that list; the throw-away AprioriAlgorithm instance is discarded.
dataset = (
    AprioriAlgorithm()
    .set_transaction_dataset(
        transaction_dataset=coffee_shop_df,
        transaction_id_column='transaction_id',
        itemset_column='product_type',
    )
    .dataset
)

In [39]:
def test():
    """Mine frequent itemsets and rules from the global ``dataset``.

    Returns the ``AprioriAlgorithm`` instance so the results can be inspected.
    """
    miner = AprioriAlgorithm(dataset=dataset, minsup=0.005, minconf=0.02)
    # Each generate_* call returns the miner itself, so the chain yields it.
    return miner.generate_all_frequent_itemsets().generate_all_rules()

# Total time of 10 runs, then one more run whose result is kept in `x`.
print(timeit(stmt=test, number=10))
x = test()

12.36990643900026


In [38]:
# Display the rules held by `x`, the miner returned by the `test()` call of the timing cell.
x.all_rules

{(frozenset({'Brewed Chai tea'}),
  frozenset({'Biscotti'})): {'confident': 0.0477774, 'lift': 0.9895923, 'support': 0.0069517},
 (frozenset({'Biscotti'}),
  frozenset({'Brewed Chai tea'})): {'confident': 0.1439875, 'lift': 0.9895923, 'support': 0.0069517},
 (frozenset({'Barista Espresso'}),
  frozenset({'Biscotti'})): {'confident': 0.0447682, 'lift': 0.9272645, 'support': 0.0062642},
 (frozenset({'Biscotti'}),
  frozenset({'Barista Espresso'})): {'confident': 0.1297476, 'lift': 0.9272645, 'support': 0.0062642},
 (frozenset({'Gourmet brewed coffee'}),
  frozenset({'Biscotti'})): {'confident': 0.0476528, 'lift': 0.9870109, 'support': 0.0069008},
 (frozenset({'Biscotti'}),
  frozenset({'Gourmet brewed coffee'})): {'confident': 0.1429332, 'lift': 0.9870109, 'support': 0.0069008},
 (frozenset({'Scone'}),
  frozenset({'Organic brewed coffee'})): {'confident': 0.0656375, 'lift': 0.9022222, 'support': 0.0056276},
 (frozenset({'Organic brewed coffee'}),
  frozenset({'Scone'})): {'confident': 0

In [0]:
# dataset

In [0]:
"""
print(timeit(test, number=3))
88.58
"""
def transform_transaction_dataset(
    transaction_dataset, *, 
    transaction_id_column, itemset_column
):
    """Turn a long-format DataFrame into a list of per-transaction item lists.

    This variant filters the frame with a boolean mask once per transaction
    id, i.e. it scans all rows for every transaction.

    Args:
        transaction_dataset: DataFrame with one row per (transaction, item).
        transaction_id_column: name of the column identifying a transaction.
        itemset_column: name of the column holding the item.

    Returns:
        A list with one list of distinct items per transaction id, both in
        order of first appearance in ``transaction_dataset``.
    """
    # Select columns with a list: a set is rejected as an indexer by
    # pandas >= 2.0 and has no defined column order. The selection returns a
    # new frame, so the caller's frame is left untouched without a deep copy.
    pairs = transaction_dataset[[transaction_id_column, itemset_column]].drop_duplicates()

    # Look the two columns up once instead of on every iteration.
    transaction_ids = pairs[transaction_id_column]
    items = pairs[itemset_column]

    dataset = []
    for transaction_id in transaction_ids.unique():
        dataset.append(items[transaction_ids == transaction_id].tolist())
    return dataset

def test():
    """Build the transaction lists from ``coffee_shop_df`` (the timed call)."""
    transactions = transform_transaction_dataset(
        coffee_shop_df,
        itemset_column = 'product_type',
        transaction_id_column = 'transaction_id',
    )
    return transactions

In [0]:
"""
print(timeit(test, number=3))
49.56
"""
def transform_transaction_dataset(
    transaction_dataset, *, 
    transaction_id_column, itemset_column
):
    """Turn a long-format DataFrame into a list of per-transaction item lists.

    Args:
        transaction_dataset: DataFrame with one row per (transaction, item).
        transaction_id_column: name of the column identifying a transaction.
        itemset_column: name of the column holding the item.

    Returns:
        A list with one list of distinct items per transaction id, ordered
        by transaction id and then by item.
    """
    # Select columns with a list: a set is rejected as an indexer by
    # pandas >= 2.0 and has no defined column order. The selection returns a
    # new frame, so the caller's frame is left untouched without a deep copy.
    pairs = (
        transaction_dataset[[transaction_id_column, itemset_column]]
        .drop_duplicates()
        .sort_values([transaction_id_column, itemset_column])
    )

    # A single grouped pass replaces one index lookup per transaction id.
    # sort=False keeps the order produced by sort_values above; dropna=False
    # keeps rows whose transaction id is missing as their own group.
    grouped = pairs.groupby(
        transaction_id_column, sort = False, dropna = False
    )[itemset_column]
    return [items.tolist() for _, items in grouped]

def test():
    """Build the transaction lists from ``coffee_shop_df`` (the timed call)."""
    transactions = transform_transaction_dataset(
        coffee_shop_df,
        itemset_column = 'product_type',
        transaction_id_column = 'transaction_id',
    )
    return transactions

In [0]:
# Scratch check: list() applied to a string splits it into single characters.
list('anc')

['a', 'n', 'c']