In [105]:
# https://medium.com/analytics-vidhya/association-analysis-in-python-2b955d0180c

class AssociationAnalysis:
    
    def __init__(self, D, verbose=True, lemmatization=True, min_support=3/5):
        '''
            D is the data basis containing all transaction sets.
            One transaction for example contains the set of items you bought (e.g. `{'strawberries', 'blueberries'}`).
            The cardinality of D is the amount of transactions in the data basis D.
            D contains items from the statistical population of items P (all possible items).
            
            X is any subset of the population P (k-item-set with k>=1) or a string which will be automatically converted to a 1-element subset (e.g. X='lemon juice' will be converted to X={'lemon juice'}).
            
            Lemmatization is enabled by default and requires downloaded lemmatization packages from nltk:
            If you haven't done so, please execute:
                `
                import nltk
                nltk.download()
                `
        '''
        
        self.verbose = verbose
        self.lemmatization = lemmatization
        self.min_support = min_support
        
        if self.lemmatization:
            import nltk
            # nltk.download() required
            from nltk.stem import WordNetLemmatizer 
            self.lemmatizer = WordNetLemmatizer()
            self.D = self.lemmatize_list_of_sets(D)
        else:
            self.D = D
        
        self.D_cardinality = len(D)

        self.P = set().union(*D)
        self.P_cardinality = len(self.P)
        self.P_remaining_items = []
        self.P_remaining_items = list(self.P)
        self.P_remaining_subsets = {}
        self.P_remaining_subsets[1] = self.P
        self.supports = {}
    
    def n(self, X, prepare=False):
        '''
            Absolute prevalence of X in data basis D
        '''
        if prepare:
            X = self.prepare_subset(X)
        n = 0
        for transaction in self.D:
            if X.issubset(transaction): n += 1
        # if self.verbose: self.latex(rf'$n(${X}$)={n}$')
        return n

    def support(self, *X, prepare=False):
        '''
            Relative prevalence of item(-s) in data basis D
            If `prepare=True`, prepare the parameters `*X` via `self.prepare_subset()`.
        '''
        if prepare:
            X = self.prepare_subset(X)
        else:
            X = self.tuple_to_set(X)
        n_X = self.n(X,prepare=prepare)
        
        support = n_X/self.D_cardinality
        
        if self.verbose:
            output = rf'$support(X = $ {X}$) = \frac{{n(X)}}{{|D|}} = \frac{{{n_X}}}{{{self.D_cardinality}}}$'
            if support < self.min_support:
                output = rf'<font color="#ccc">' + output + rf'</font>'
                
            self.latex(output)
        return support
    
    def confidence(self, *Z, prepare=False):
        '''
            $confidence(X \to Y) = \frac{n(X \cup Y)}{n(X)}$
            
            If `prepare=True`, prepare the parameters `X` and `*y` via `self.prepare_subset()`.
        '''
        if prepare:
            y = self.prepare_subset(Z[-1])
            X = self.prepare_subset(Z[:-1])
        else:
            y = self.tuple_to_set(Z[-1]) # only last element
            X = self.tuple_to_set(Z[:-1]) # every element but the last        

        n_X = self.n(X,prepare=prepare)
        
        X_union_y = set().union(X, y)
        n_X_union_y = self.n(X_union_y)
        if n_X == 0:
            confidence = None
        else:
            confidence = n_X_union_y/n_X
        self.latex(rf'$confidence(X= $ {X} $ \to Y=$ {y}$) = \frac{{n(X \cup Y)}}{{n(X)}} = \frac{{{n_X_union_y}}}{{{n_X}}}$')
        return confidence

    def filter_P_by_min_support(self, k):
        remove_sets = [X for X, support_value in self.supports.items() if support_value < self.min_support]
                
        P_snapshot_k = self.P_remaining_subsets[k]
        self.P_remaining_subsets[k] = []
        for subset in P_snapshot_k:
            if subset not in remove_sets:
                self.P_remaining_subsets[k].append(subset)
        self.P_remaining_items = frozenset().union(*self.P_remaining_subsets[k])
    
    def apriori(self):
        self.apriori_step_1()
        # TODO implement step 2
        
    def apriori_step_1(self):
        import itertools
        
        if self.verbose: self.latex("### Supports")
        k = 1
        while(k <= len(self.P) and self.P_remaining_items):
            if self.verbose: self.latex(f"$k = {k}$")
            
            self.P_remaining_subsets[k] = [frozenset(X) for X in itertools.combinations(list(self.P_remaining_items), k)]
            for X in self.P_remaining_subsets[k]:
                X_set = frozenset(self.tuple_to_set(X))
                self.supports[X_set] = self.support(*X)
            self.filter_P_by_min_support(k)
            k += 1
        frequent_subsets = [set([*X]) for X, support_value in self.supports.items() if support_value >= self.min_support]
        if self.verbose:
            frequent_subsets_output = "### Frequent subsets $F$"
            frequent_subsets_output += '\n' + ', '.join(str(subset) for subset in frequent_subsets)
            self.latex(frequent_subsets_output)
                    
    def tuple_to_set(self, X):
        '''
            Helper function to convert a given n-tuple to an n-element set
        '''
        if isinstance(X, tuple):
            X = set(X)
        return X
    def string_to_set(self, X):
        '''
            Helper function to convert a given string to a 1-element set
        '''
        if isinstance(X, str):
            X = set([X])
        return X
    
    def prepare_subset(self, X, lemmatization=None):
        '''
            Helper function to prepare a subset X.
            If a tuple is given, convert the n-element tuple to an n-element set.
            If a string is given, convert the string to a 1-element set.
            If lemmatization is enabled, lemmatize the set.
        '''
        if lemmatization is None:
            lemmatization = self.lemmatization   
            
        X = self.tuple_to_set(X)        
        X = self.string_to_set(X)
        
        if lemmatization:
            X = self.lemmatize_set(X)
            
        return X

    def lemmatize_list_of_sets(self, D):
        return [self.lemmatize_set(transaction) for transaction in D]
    def lemmatize_set(self, set_of_items):
        import collections
        if(isinstance(set_of_items, collections.Hashable)):
            return {self.l(item) for item in set_of_items}
        else:
            return set_of_items
    def l(self, item):
        '''
         Helper function l(item) = lemmatize(item) returns the lemmatized version of a string item.
        '''
        import collections
        if(isinstance(item, collections.Hashable)):
            l = self.lemmatizer.lemmatize(item)
        else:
            l = item
        return l

    def latex(self, string):
        '''
        Helper function to output latex in verbose mode.
        '''
        if self.verbose:
            from IPython.display import display, Markdown
            display(Markdown(rf"""{string}"""))

In [106]:
D = []
D.append({'strawberries','lemon juice'})
D.append({'strawberries', 'toothbrush', 'oatmilk', 'lentils', 'lemon juice'})
D.append({'lemon juice', 'toothbrush', 'oatmilk'})
D.append({'strawberries', 'lemon juice', 'toothbrush'})
D.append({'strawberries', 'oatmilk'})

association_analysis = AssociationAnalysis(D,lemmatization=True)
association_analysis.support('strawberries', 'lemon juice',prepare=True)
association_analysis.confidence('strawberries', 'lemon juice', 'toothbrush', prepare=True)
association_analysis.apriori()

$support(X = $ {'strawberries', 'lemon juice'}$) = \frac{n(X)}{|D|} = \frac{3}{5}$

$confidence(X= $ {'strawberries', 'lemon juice'} $ \to Y=$ {'toothbrush'}$) = \frac{n(X \cup Y)}{n(X)} = \frac{2}{3}$

### Supports

$k = 1$

$support(X = $ {'strawberries'}$) = \frac{n(X)}{|D|} = \frac{4}{5}$

$support(X = $ {'lemon juice'}$) = \frac{n(X)}{|D|} = \frac{4}{5}$

$support(X = $ {'oatmilk'}$) = \frac{n(X)}{|D|} = \frac{3}{5}$

<font color="#ccc">$support(X = $ {'lentils'}$) = \frac{n(X)}{|D|} = \frac{1}{5}$</font>

$support(X = $ {'toothbrush'}$) = \frac{n(X)}{|D|} = \frac{3}{5}$

$k = 2$

<font color="#ccc">$support(X = $ {'strawberries', 'toothbrush'}$) = \frac{n(X)}{|D|} = \frac{2}{5}$</font>

$support(X = $ {'strawberries', 'lemon juice'}$) = \frac{n(X)}{|D|} = \frac{3}{5}$

<font color="#ccc">$support(X = $ {'strawberries', 'oatmilk'}$) = \frac{n(X)}{|D|} = \frac{2}{5}$</font>

$support(X = $ {'toothbrush', 'lemon juice'}$) = \frac{n(X)}{|D|} = \frac{3}{5}$

<font color="#ccc">$support(X = $ {'toothbrush', 'oatmilk'}$) = \frac{n(X)}{|D|} = \frac{2}{5}$</font>

<font color="#ccc">$support(X = $ {'lemon juice', 'oatmilk'}$) = \frac{n(X)}{|D|} = \frac{2}{5}$</font>

$k = 3$

<font color="#ccc">$support(X = $ {'strawberries', 'toothbrush', 'lemon juice'}$) = \frac{n(X)}{|D|} = \frac{2}{5}$</font>

### Frequent subsets $F$
{'strawberries'}, {'lemon juice'}, {'oatmilk'}, {'toothbrush'}, {'strawberries', 'lemon juice'}, {'toothbrush', 'lemon juice'}