In [8]:
from typing import List, Tuple
from itertools import combinations
from abc import ABC, abstractmethod, abstractstaticmethod, abstractclassmethod
import pandas as pd
import logging

logging.basicConfig()
logger = logging.getLogger("foobar")


In [9]:
class FileReader(ABC):
    def __init__(self, file_path):
        self.file_path = file_path
        
    @abstractmethod
    def read(self):
        pass
        
        
class CollapsedCsvFileReader(FileReader):
    """the file format is lines, with individual transactinos"""
    """separated by commma - thus calling this collapsed"""
    """file format as it is non-traditional"""
    def read(self):
        file_iter = open(self.file_path, 'r')
        raw_transactions = list()
        for line in file_iter:
            line = line.strip().rstrip(',')
            record = tuple(sorted(line.split(',')))
            raw_transactions.append(record)

        return raw_transactions
    

In [17]:
class Apriori(object):
# following https://en.wikipedia.org/wiki/Apriori_algorithm#Examples
    def __init__(self, transactions: List):
        self.__verify__(transactions)

        self._transactions = transactions

    def __str__(self):
        return str(tuple(self))


    def __verify__(self, transactions):
        if transactions is None:
            raise ValueError("Transaction itemset is none")

        if not isinstance(transactions, List):
            raise ValueError("Transaction itemset is not a List")

        if len(transactions) == 0:
            raise ValueError("Transaction is empty")

        if len(transactions) > 0 and not isinstance(transactions[0], Tuple):
            raise ValueError("Transaction lement is not a Tuple")


    @property
    def transactions(self):
        return self._itemset_transactions
    

    @transactions.setter
    def transactions(self, value):

        self.__verify__(value)
        self._itemset_transactions = value
        
        
    def generate_levels(self, support_level=0.20, drop_below_support=True):
        k = 1
        full_set = list() # this contains a dataframe for each level.
        while True:
            logger.info("k = {0}".format(k))
            item_levels = self.generate_combinination_levels(self._transactions, k)
            sl = self.gen_support_level(self._transactions, item_levels, support = support_level, drop = drop_below_support)
            print(len(sl))
            k += 1

            if len(sl) == 0 or k == 100:
                break

            df = pd.DataFrame.from_dict(sl, orient='index', columns=['count'])
            full_set.append(df)
            
        return full_set
    
        
    def generate_combinination_levels(self, tran_list, level):
        """generate keys that are used for subset checking"""
        """on each transaction"""
        results = list()
        for t in tran_list:
            #logger.debug(t)
            # TODO: figure out why at level 1 trailing comma
            [ (logger.debug("new key: {0}".format(i)), results.append(i)) for i in combinations(t, level)]

        return sorted(set(results))


    def gen_support_level(self, tran_list, items_keys, support = 0.20, drop = True):
        """for each key which can be a set find in transactions"""
        """how many contain the combination"""        
        logger.info('Using support level of {0}'.format(support))
        logger.info('drop below support? {0}'.format(drop))
        tran_count = len(tran_list)
        base_level = tran_count * support
        logger.info('base level count: {0}'.format(base_level))
        itemSet = dict()

        for key in items_keys:
            for t in tran_list:
                if set(key).issubset(t):
                    logger.debug('is subset: {0}'.format(t))
                    if (key) in itemSet:
                        itemSet[key] += 1
                    else:
                        itemSet[key] = 1

        if drop:
            return {key:value for (key,value) in itemSet.items() if value > base_level}
        else:
            return {key:value for (key,value) in itemSet.items()} 



In [18]:
fr = CollapsedCsvFileReader('./data/test4.csv')
t2 = fr.read()
g = Apriori(t2)
output = g.generate_levels()
output

7
14
12
5
1
0


[                count
 (Corn,)             2
 (Eggs,)             4
 (KidneyBeans,)      5
 (Milk,)             3
 (Nutmeg,)           2
 (Onion,)            3
 (Yogurt,)           3,
                        count
 (Corn, KidneyBeans)        2
 (Eggs, KidneyBeans)        4
 (Eggs, Milk)               2
 (Eggs, Nutmeg)             2
 (Eggs, Onion)              3
 (Eggs, Yogurt)             2
 (KidneyBeans, Milk)        3
 (KidneyBeans, Nutmeg)      2
 (KidneyBeans, Onion)       3
 (KidneyBeans, Yogurt)      3
 (Milk, Yogurt)             2
 (Nutmeg, Onion)            2
 (Nutmeg, Yogurt)           2
 (Onion, Yogurt)            2,
                                count
 (Eggs, KidneyBeans, Milk)          2
 (Eggs, KidneyBeans, Nutmeg)        2
 (Eggs, KidneyBeans, Onion)         3
 (Eggs, KidneyBeans, Yogurt)        2
 (Eggs, Nutmeg, Onion)              2
 (Eggs, Nutmeg, Yogurt)             2
 (Eggs, Onion, Yogurt)              2
 (KidneyBeans, Milk, Yogurt)        2
 (KidneyBeans, Nutmeg,

In [None]:
output