In [104]:
from typing import List, Tuple
from itertools import combinations, product, permutations
from abc import ABC, abstractmethod, abstractstaticmethod, abstractclassmethod
import pandas as pd
import logging

logging.basicConfig()
logger = logging.getLogger("apriori")


In [None]:
class FileReader(ABC):
    def __init__(self, file_path):
        self.file_path = file_path
        
    @abstractmethod
    def read(self) -> list:
        pass
        
        
class CollapsedCsvFileReader(FileReader):
    """the file format is lines, with individual transactinos"""
    """separated by commma - thus calling this collapsed"""
    """file format as it is non-traditional"""
    def read(self) -> list:
        file_iter = open(self.file_path, 'r')
        raw_transactions = list()
        for line in file_iter:
            line = line.strip().rstrip(',')
            record = tuple(sorted(line.split(',')))
            raw_transactions.append(record)

        return raw_transactions
    

In [118]:
class Apriori(object):
# following https://en.wikipedia.org/wiki/Apriori_algorithm#Examples
    def __init__(self, transactions: List):
        self.__verify__(transactions)

        self._transactions = transactions

    def __str__(self):
        return str(tuple(self))


    def __verify__(self, transactions):
        if transactions is None:
            raise ValueError("Transaction itemset is none")

        if not isinstance(transactions, List):
            raise ValueError("Transaction itemset is not a List")

        if len(transactions) == 0:
            raise ValueError("Transaction is empty")

        if len(transactions) > 0 and not isinstance(transactions[0], Tuple):
            raise ValueError("Transaction lement is not a Tuple")


    @property
    def transactions(self) -> List:
        return self._transactions
    

    @transactions.setter
    def transactions(self, value: List):

        self.__verify__(value)
        self._transactions  = value
        
        
    def generate_levels(self, support_level=0.20, drop_below_support=True) -> pd.DataFrame:
        k = 1
        full_set = list() # this contains a dataframe for each level.
        while True:
            logger.info("k = {0}".format(k))
            item_levels = self.__generate_combinination_levels(self._transactions, k)
            sl = self.__gen_support_level(self._transactions, item_levels, 
                                          support = support_level, drop = drop_below_support)
            
            #logger.debug("transactions at level {k} are {n}".format(k = k, n = (len(sl))))
            k += 1

            if len(sl) == 0 or k == 100:
                break

            df = pd.DataFrame.from_dict(sl, orient='index', columns=['count'])
            df.index.name = 'itemsets'
            df.reset_index()
            full_set.append(df)
            
        rv = self.__append_colums(full_set)
        return rv
    
    def __append_colums(self, data: List, tran_list = None) -> pd.DataFrame:
        if tran_list == None:
            tran_list = self.transactions
            
        tran_count = len(tran_list)
        
        rows_list = []
        for r in data:
            #logger.debug('type of r is: {0}'.format(type(r)))
            #logger.debug('len of r is: {0}'.format(len(r)))
            #logger.debug('r is: {0}'.format(r))
            for index, row in r.iterrows():
                #d = { 'count' : r['count'], 'support': r['count']/tran_count}
                d = {'itemsets': index, 'count' : row['count'], 'support': row['count']/tran_count}
                #logger.debug("THE DICTd: {0}".format(d))
                rows_list.append(d)
            
        # TODO: fix rv
        
        df = pd.DataFrame(rows_list)
        
        #logger.debug(df)
        return df  #data
        
        
    def __generate_combinination_levels(self, tran_list, level):
        """generate keys that are used for subset checking"""
        """on each transaction"""
        results = list()
        for t in tran_list:
            logger.debug("gen_com_levell: t: {0}  and level: {1}".format(t, level))
            # TODO: figure out why at level 1 trailing comma
            #if level == 1:
            #    results.append(t)
            #else:
            [ results.append(i) for i in combinations(t, level)]


        rv = sorted(set(results))
        logger.debug("combo levels: {0}".format(rv))
        return rv


    def __gen_support_level(self, tran_list, items_keys, support = 0.20, drop = True):
        """for each key which can be a set find in transactions"""
        """how many contain the combination"""        
        logger.info('Using support level of {0}'.format(support))
        logger.info('drop below support? {0}'.format(drop))
        tran_count = len(tran_list)
        base_level = tran_count * support
        logger.debug('base level count: {0}'.format(base_level))
        itemSet = dict()

        for key in items_keys:
            for t in tran_list:
                if set(key).issubset(t):
                    #logger.debug('is subset: {0}'.format(t))
                    if (key) in itemSet:
                        itemSet[key] += 1
                    else:
                        itemSet[key] = 1

        if drop:
            return {key:value for (key,value) in itemSet.items() if value >= base_level}
        else:
            return {key:value for (key,value) in itemSet.items()} 



In [124]:
logging.basicConfig()
logger = logging.getLogger('apriori')
logger.setLevel(logging.INFO)

#fr = CollapsedCsvFileReader('./data/test4.csv')
fr = CollapsedCsvFileReader('./data/test-dm-bookch6.csv')

t2 = fr.read()

#logger.info("tran count: {0}".format(t2))
g = Apriori(t2)
#output = g.generate_levels(support_level=0.60, drop_below_support=False)
output = g.generate_levels(support_level=0.22, drop_below_support=True)

#out2 = g.append_colums(output)



#[(p) for p in out2]

INFO:apriori:k = 1
INFO:apriori:Using support level of 0.22
INFO:apriori:drop below support? True
INFO:apriori:k = 2
INFO:apriori:Using support level of 0.22
INFO:apriori:drop below support? True
INFO:apriori:k = 3
INFO:apriori:Using support level of 0.22
INFO:apriori:drop below support? True
INFO:apriori:k = 4
INFO:apriori:Using support level of 0.22
INFO:apriori:drop below support? True


In [125]:
output

Unnamed: 0,itemsets,count,support
0,"( I2,)",4,0.444444
1,"( I3,)",6,0.666667
2,"( I4,)",2,0.222222
3,"( I5,)",2,0.222222
4,"(I1,)",6,0.666667
5,"(I2,)",3,0.333333
6,"( I2, I3)",2,0.222222
7,"( I2, I5)",2,0.222222
8,"( I2, I1)",4,0.444444
9,"( I3, I1)",4,0.444444


In [97]:
def generate_associations(data: pd.DataFrame, threshold = 0.10):
    rv = None
    items = list()
    for index, row in data.iterrows():
        rv = row['itemsets']
        print(row['itemsets'])
        items.append(row['itemsets'])
        #break
    return items

In [98]:
r = generate_associations(output)

('Eggs',)
('KidneyBeans',)
('Milk',)
('Onion',)
('Yogurt',)
('Eggs', 'KidneyBeans')
('Eggs', 'Onion')
('KidneyBeans', 'Milk')
('KidneyBeans', 'Onion')
('KidneyBeans', 'Yogurt')
('Eggs', 'KidneyBeans', 'Onion')


In [99]:
r

[('Eggs',),
 ('KidneyBeans',),
 ('Milk',),
 ('Onion',),
 ('Yogurt',),
 ('Eggs', 'KidneyBeans'),
 ('Eggs', 'Onion'),
 ('KidneyBeans', 'Milk'),
 ('KidneyBeans', 'Onion'),
 ('KidneyBeans', 'Yogurt'),
 ('Eggs', 'KidneyBeans', 'Onion')]

In [110]:
r

[('Eggs',),
 ('KidneyBeans',),
 ('Milk',),
 ('Onion',),
 ('Yogurt',),
 ('Eggs', 'KidneyBeans'),
 ('Eggs', 'Onion'),
 ('KidneyBeans', 'Milk'),
 ('KidneyBeans', 'Onion'),
 ('KidneyBeans', 'Yogurt'),
 ('Eggs', 'KidneyBeans', 'Onion')]