In [1]:
from typing import List, Tuple
from itertools import combinations, product, permutations
from abc import ABC, abstractmethod, abstractstaticmethod, abstractclassmethod
import pandas as pd
import logging

logging.basicConfig()
logger = logging.getLogger("apriori")


In [2]:
class FileReader(ABC):
    """Abstract base for readers that load transactions from a file path."""

    def __init__(self, file_path) -> None:
        # Only the location is stored here; concrete readers do the I/O in read().
        self.file_path = file_path

    @abstractmethod
    def read(self) -> list:
        """Return the file's contents as a list; subclasses must override this."""
        return None
        
        
class CollapsedCsvFileReader(FileReader):
    """Reader for the "collapsed" CSV transaction format.

    Each line of the file is one transaction whose items are separated by
    commas. It is called "collapsed" because it is a non-traditional layout
    (one transaction per line rather than one item per row).
    """

    def read(self) -> list:
        """Read the file at ``self.file_path`` into a list of transactions.

        Returns:
            A list with one tuple per non-blank line. Each tuple holds the
            line's items with surrounding whitespace removed, sorted
            ascending. Empty fields (e.g. from a trailing comma or ``a,,b``)
            are dropped, and lines that contain no items are skipped so they
            do not count as transactions.
        """
        raw_transactions = list()
        # The context manager guarantees the handle is closed, even on error.
        with open(self.file_path, 'r') as file_iter:
            for line in file_iter:
                # Remove whitespace around items and discard empty fields.
                trimmed = [i.strip() for i in line.strip().split(',')]
                items = [i for i in trimmed if i]
                if not items:
                    # A blank line is not a transaction.
                    continue
                raw_transactions.append(tuple(sorted(items)))

        return raw_transactions
    

In [3]:
class Apriori(object):
    """Frequent-itemset miner over a list of transactions.

    Follows the worked example at
    https://en.wikipedia.org/wiki/Apriori_algorithm#Examples

    For every itemset size ``k`` (a "level") the candidate itemsets are the
    ``k``-combinations found in the transactions; each candidate is counted
    against all transactions and, optionally, dropped when it falls below the
    requested support.
    """

    # Itemset sizes 1 .. _MAX_LEVEL - 1 are explored at most.
    _MAX_LEVEL = 100

    def __init__(self, transactions: List):
        """Store ``transactions`` after validating them (see ``__verify__``)."""
        self.__verify__(transactions)

        self._transactions = transactions

    def __str__(self):
        """Return the transactions rendered as the string form of a tuple."""
        # The instance itself is not iterable, so render the stored transactions.
        return str(tuple(self._transactions))

    def __verify__(self, transactions):
        """Validate a transaction list.

        Raises:
            ValueError: if ``transactions`` is None, is not a list, is empty,
                or its first element is not a tuple. Only the first element
                is inspected.
        """
        if transactions is None:
            raise ValueError("Transaction itemset is none")

        if not isinstance(transactions, list):
            raise ValueError("Transaction itemset is not a List")

        if len(transactions) == 0:
            raise ValueError("Transaction is empty")

        if not isinstance(transactions[0], tuple):
            raise ValueError("Transaction element is not a Tuple")

    @property
    def transactions(self) -> List:
        """The list of transactions currently being mined."""
        return self._transactions

    @transactions.setter
    def transactions(self, value: List):
        self.__verify__(value)
        self._transactions = value

    def generate_levels(self, support_level=0.20, drop_below_support=True) -> pd.DataFrame:
        """Count itemsets of increasing size until a level yields nothing.

        Args:
            support_level: minimum fraction of transactions that must contain
                an itemset for it to be kept (the comparison is inclusive).
            drop_below_support: when False every itemset that occurs at least
                once is kept, regardless of ``support_level``.

        Returns:
            A DataFrame with columns ``itemsets`` (tuple), ``count`` and
            ``support`` (count divided by the number of transactions), ordered
            by level and then by itemset. The frame is empty, but still has
            these columns, when no itemset qualifies.
        """
        full_set = list()  # one DataFrame (index = itemset, column = count) per level
        for k in range(1, self._MAX_LEVEL):
            logger.info("k = {0}".format(k))
            item_levels = self.__generate_combinination_levels(self._transactions, k)
            sl = self.__gen_support_level(self._transactions, item_levels,
                                          support=support_level, drop=drop_below_support)

            # An empty level means no larger itemset can qualify either.
            if len(sl) == 0:
                break

            full_set.append(pd.DataFrame.from_dict(sl, orient='index', columns=['count']))

        return self.__append_colums(full_set)

    def __append_colums(self, data: List, tran_list=None) -> pd.DataFrame:
        """Flatten per-level count frames into one frame and add ``support``.

        Args:
            data: list of DataFrames indexed by itemset with a ``count`` column.
            tran_list: transactions used for the support denominator; defaults
                to this instance's transactions.
        """
        if tran_list is None:
            tran_list = self.transactions

        tran_count = len(tran_list)

        rows_list = [
            {'itemsets': itemset, 'count': count, 'support': count / tran_count}
            for level_frame in data
            for itemset, count in level_frame['count'].items()
        ]

        # Naming the columns keeps the schema stable even when there are no rows.
        return pd.DataFrame(rows_list, columns=['itemsets', 'count', 'support'])

    def __generate_combinination_levels(self, tran_list, level):
        """Return the sorted, distinct ``level``-sized itemsets found in ``tran_list``.

        Each transaction is de-duplicated and sorted first, so item order or
        repeated items inside a transaction cannot produce duplicate or
        invalid candidates.
        """
        candidates = set()
        for t in tran_list:
            logger.debug("gen_com_levell: t: {0}  and level: {1}".format(t, level))
            candidates.update(combinations(sorted(set(t)), level))

        rv = sorted(candidates)
        logger.debug("combo levels: {0}".format(rv))
        return rv

    def __gen_support_level(self, tran_list, items_keys, support=0.20, drop=True):
        """Count, for each candidate key, how many transactions contain it.

        Returns:
            A dict mapping itemset -> count, in the order of ``items_keys``.
            Keys that match no transaction are omitted. When ``drop`` is true,
            keys whose count is below ``len(tran_list) * support`` are omitted
            as well.
        """
        logger.info('Using support level of {0}'.format(support))
        logger.info('drop below support? {0}'.format(drop))
        tran_count = len(tran_list)
        base_level = tran_count * support
        logger.debug('base level count: {0}'.format(base_level))

        # Convert each transaction to a set once instead of once per candidate.
        tran_sets = [frozenset(t) for t in tran_list]

        itemSet = dict()
        for key in items_keys:
            key_items = frozenset(key)
            hits = sum(1 for items in tran_sets if key_items <= items)
            if hits:
                itemSet[key] = hits

        if drop:
            return {key: value for (key, value) in itemSet.items() if value >= base_level}
        return itemSet



In [4]:
# Driver cell: configure logging, load the sample transactions and mine frequent itemsets.
# basicConfig()/getLogger('apriori') repeat the setup from the import cell; the new step here
# is raising the level to INFO so the per-level progress messages are shown.
logging.basicConfig()
logger = logging.getLogger('apriori')
logger.setLevel(logging.INFO)

#fr = CollapsedCsvFileReader('./data/test4.csv')
# Path is relative to the notebook's working directory.
fr = CollapsedCsvFileReader('./data/test-dm-bookch6.csv')

# t2: list of transactions, one sorted tuple of items per line of the file.
t2 = fr.read()

#logger.info("tran count: {0}".format(t2))
g = Apriori(t2)
#output = g.generate_levels(support_level=0.60, drop_below_support=False)
# `output` is the frequent-itemset DataFrame (itemsets, count, support) that later cells use.
output = g.generate_levels(support_level=0.22, drop_below_support=True)

#out2 = g.append_colums(output)



#[(p) for p in out2]

INFO:apriori:k = 1
INFO:apriori:Using support level of 0.22
INFO:apriori:drop below support? True
INFO:apriori:k = 2
INFO:apriori:Using support level of 0.22
INFO:apriori:drop below support? True
INFO:apriori:k = 3
INFO:apriori:Using support level of 0.22
INFO:apriori:drop below support? True
INFO:apriori:k = 4
INFO:apriori:Using support level of 0.22
INFO:apriori:drop below support? True


In [5]:
# Display the frequent-itemset table returned by generate_levels in the previous cell.
output

Unnamed: 0,itemsets,count,support
0,"(I1,)",6,0.666667
1,"(I2,)",7,0.777778
2,"(I3,)",6,0.666667
3,"(I4,)",2,0.222222
4,"(I5,)",2,0.222222
5,"(I1, I2)",4,0.444444
6,"(I1, I3)",4,0.444444
7,"(I1, I5)",2,0.222222
8,"(I2, I3)",4,0.444444
9,"(I2, I4)",2,0.222222


In [212]:
#maybe make a data frame outbound..
def generate_associations(data: pd.DataFrame, threshold = 0.10):
    """Pair every itemset row with every other itemset row.

    Args:
        data: frame with an ``itemsets`` column and a ``support`` column, such
            as the one returned by ``Apriori.generate_levels``. Rows are
            addressed by position, so any index (for example the one left
            behind by filtering) is accepted.
        threshold: currently unused; kept so existing callers keep working.

    Returns:
        A flat list of ``(itemset, support, other_itemset, other_support)``
        tuples, one for each ordered pair of distinct rows: all pairs for the
        first row in row order, then all pairs for the second row, and so on.
        No filtering is applied, so pairs where one itemset contains the other
        are included.
    """
    # Pull both columns out once; pairing by position avoids mixing index
    # labels with positional lookups.
    records = list(zip(data['itemsets'].tolist(), data['support'].tolist()))

    # permutations(..., 2) yields ordered pairs of distinct positions in
    # row-major order, which is exactly "each row against all other rows".
    return [
        (item, support, other_item, other_support)
        for (item, support), (other_item, other_support) in permutations(records, 2)
    ]


In [213]:
#output[output['itemsets'] == ('I2',)]['support']


# Build every ordered pair of distinct itemset rows from the frequent-itemset table.
nnn = generate_associations(output)
#type(nnn[0][0][0])
# Bare expression mid-cell: it is evaluated but not rendered, since only the cell's last expression is displayed.
nnn


# This may work; still need to remove rows where the antecedent appears in the consequent.
# Columns 0-3 are (itemset, its support, other itemset, other itemset's support).
pd.DataFrame(nnn)

#output[output['itemsets'] == nnn[0]]['support']

#len(nnn)




Unnamed: 0,0,1,2,3
0,"(I1,)",0.666667,"(I2,)",0.777778
1,"(I1,)",0.666667,"(I3,)",0.666667
2,"(I1,)",0.666667,"(I4,)",0.222222
3,"(I1,)",0.666667,"(I5,)",0.222222
4,"(I1,)",0.666667,"(I1, I2)",0.444444
...,...,...,...,...
151,"(I1, I2, I5)",0.222222,"(I1, I5)",0.222222
152,"(I1, I2, I5)",0.222222,"(I2, I3)",0.444444
153,"(I1, I2, I5)",0.222222,"(I2, I4)",0.222222
154,"(I1, I2, I5)",0.222222,"(I2, I5)",0.222222


In [185]:
# Display the raw association list.
# NOTE(review): the recorded output below is a list of lists, whereas generate_associations
# as defined above extends one flat list; presumably this cell (In [185]) ran against an
# earlier version of the function - re-run to confirm.
nnn
#output

[[(('I1',), 0.6666666666666666, ('I2',), 0.7777777777777778),
  (('I1',), 0.6666666666666666, ('I3',), 0.6666666666666666),
  (('I1',), 0.6666666666666666, ('I4',), 0.2222222222222222),
  (('I1',), 0.6666666666666666, ('I5',), 0.2222222222222222),
  (('I1',), 0.6666666666666666, ('I1', 'I2'), 0.4444444444444444),
  (('I1',), 0.6666666666666666, ('I1', 'I3'), 0.4444444444444444),
  (('I1',), 0.6666666666666666, ('I1', 'I5'), 0.2222222222222222),
  (('I1',), 0.6666666666666666, ('I2', 'I3'), 0.4444444444444444),
  (('I1',), 0.6666666666666666, ('I2', 'I4'), 0.2222222222222222),
  (('I1',), 0.6666666666666666, ('I2', 'I5'), 0.2222222222222222),
  (('I1',), 0.6666666666666666, ('I1', 'I2', 'I3'), 0.2222222222222222),
  (('I1',), 0.6666666666666666, ('I1', 'I2', 'I5'), 0.2222222222222222)],
 [(('I2',), 0.7777777777777778, ('I1',), 0.6666666666666666),
  (('I2',), 0.7777777777777778, ('I3',), 0.6666666666666666),
  (('I2',), 0.7777777777777778, ('I4',), 0.2222222222222222),
  (('I2',), 0.777

In [154]:
# Scratch: collect the index labels of `output`. The list is discarded because it is not
# the last expression of the cell.
[k  for k,v in output.iterrows() ] 

# Itemset stored in the second row (position 1).
output.iloc[1]['itemsets']

('I2',)

In [105]:
# Scratch: itemset stored in the first row (position 0).
output.iloc[0]['itemsets']

('I1',)

In [76]:
# Scratch: number of rows (frequent itemsets) in `output`.
output.shape[0]

13

In [79]:
# Keep the first row's itemset in `first`; a later scratch cell uses it for subset checks.
first = output.iloc[0]['itemsets']
first

('I1',)

In [92]:
# for an index, get all OTHER rows.


# `s`: the itemsets of every row whose index label is not 0, as a plain Python list.
s = [ v['itemsets'] for k,v in output.iterrows() if k != 0]
s

[('I2',),
 ('I3',),
 ('I4',),
 ('I5',),
 ('I1', 'I2'),
 ('I1', 'I3'),
 ('I1', 'I5'),
 ('I2', 'I3'),
 ('I2', 'I4'),
 ('I2', 'I5'),
 ('I1', 'I2', 'I3'),
 ('I1', 'I2', 'I5')]

In [93]:
# Print each itemset in `s` that contains every item of `first`.
# NOTE(review): the comprehension is used only for print()'s side effect, so the cell also
# displays a list of None values (one per printed itemset).
[ print(i) for i in s if set(first).issubset(i) ]

('I1', 'I2')
('I1', 'I3')
('I1', 'I5')
('I1', 'I2', 'I3')
('I1', 'I2', 'I5')


[None, None, None, None, None]

In [91]:
# NOTE(review): fails with TypeError - `s` is a plain list, and lists do not accept the
# NumPy-style "rows, column" tuple index used here.
s[0:12, 0]

TypeError: list indices must be integers or slices, not tuple

In [62]:
# Scratch: callable indexer - presumably selects the rows of `df` whose index value is even.
# NOTE(review): `df` is not assigned at top level by any cell visible in this notebook -
# TODO confirm where it comes from before relying on Restart & Run All.
df.iloc[lambda x: x.index %2 == 0]

Unnamed: 0,0
0,I1


In [7]:
# NOTE(review): the recorded output below lists one itemset per line, but the print call in
# generate_associations as defined above is commented out; presumably this cell (In [7]) ran
# against an earlier version of the function - re-run to confirm.
r = generate_associations(output)

('I1',)
('I2',)
('I3',)
('I4',)
('I5',)
('I1', 'I2')
('I1', 'I3')
('I1', 'I5')
('I2', 'I3')
('I2', 'I4')
('I2', 'I5')
('I1', 'I2', 'I3')
('I1', 'I2', 'I5')


In [10]:
# Scratch: is every element of r[0] also present in r[5]?
# NOTE(review): the meaning depends on what `r` held when this ran - TODO confirm against
# the current return value of generate_associations.
set(r[0]).issubset(r[5])

True

In [13]:
# NOTE(review): the filter compares each element with itself, so it is always true and every
# element of `r` is printed. The comprehension exists only for print()'s side effect, which
# is why a list of None values is displayed as well.
[ print(i) for i in r if set(i).issubset(i) ]

('I1',)
('I2',)
('I3',)
('I4',)
('I5',)
('I1', 'I2')
('I1', 'I3')
('I1', 'I5')
('I2', 'I3')
('I2', 'I4')
('I2', 'I5')
('I1', 'I2', 'I3')
('I1', 'I2', 'I5')


[None, None, None, None, None, None, None, None, None, None, None, None, None]

In [23]:
# NOTE(review): fails with TypeError - `r` is a plain list, and lists cannot be indexed with
# another list (that form only works on pandas/NumPy objects).
r[[0]]

TypeError: list indices must be integers or slices, not list

In [30]:
# Scratch: the itemsets column as a NumPy object array of tuples.
output['itemsets'].values

array([('I1',), ('I2',), ('I3',), ('I4',), ('I5',), ('I1', 'I2'),
       ('I1', 'I3'), ('I1', 'I5'), ('I2', 'I3'), ('I2', 'I4'),
       ('I2', 'I5'), ('I1', 'I2', 'I3'), ('I1', 'I2', 'I5')], dtype=object)

In [35]:
# Print every unordered pair of elements of the list `r`. The keyword `r=2` is the
# pair-length argument of itertools.combinations, unrelated to the variable `r`.
# NOTE(review): the comprehension is used only for print()'s side effect, so the cell also
# displays one None per pair.
[print(i) for i in combinations(r, r=2)]

(('I1',), ('I2',))
(('I1',), ('I3',))
(('I1',), ('I4',))
(('I1',), ('I5',))
(('I1',), ('I1', 'I2'))
(('I1',), ('I1', 'I3'))
(('I1',), ('I1', 'I5'))
(('I1',), ('I2', 'I3'))
(('I1',), ('I2', 'I4'))
(('I1',), ('I2', 'I5'))
(('I1',), ('I1', 'I2', 'I3'))
(('I1',), ('I1', 'I2', 'I5'))
(('I2',), ('I3',))
(('I2',), ('I4',))
(('I2',), ('I5',))
(('I2',), ('I1', 'I2'))
(('I2',), ('I1', 'I3'))
(('I2',), ('I1', 'I5'))
(('I2',), ('I2', 'I3'))
(('I2',), ('I2', 'I4'))
(('I2',), ('I2', 'I5'))
(('I2',), ('I1', 'I2', 'I3'))
(('I2',), ('I1', 'I2', 'I5'))
(('I3',), ('I4',))
(('I3',), ('I5',))
(('I3',), ('I1', 'I2'))
(('I3',), ('I1', 'I3'))
(('I3',), ('I1', 'I5'))
(('I3',), ('I2', 'I3'))
(('I3',), ('I2', 'I4'))
(('I3',), ('I2', 'I5'))
(('I3',), ('I1', 'I2', 'I3'))
(('I3',), ('I1', 'I2', 'I5'))
(('I4',), ('I5',))
(('I4',), ('I1', 'I2'))
(('I4',), ('I1', 'I3'))
(('I4',), ('I1', 'I5'))
(('I4',), ('I2', 'I3'))
(('I4',), ('I2', 'I4'))
(('I4',), ('I2', 'I5'))
(('I4',), ('I1', 'I2', 'I3'))
(('I4',), ('I1', 'I2', '

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]