In [52]:
import pandas as pd
import math
import seaborn as sns
import collections
import itertools

# Toy transaction database used to exercise the algorithm below:
# transaction id -> list of single-letter items.
# Note that T500 contains the item 'O' twice within one transaction.
test = {'T100':['M','O','N','K','E','Y'],
        'T200':['D','O','N','K','E','Y'],
        'T300':['M','A','K','E'],
        'T400':['M','U','C','K','Y'], 
        'T500':['C','O','O','K','I','E']}
    
# Location of the "adult.data" file in the UCI machine-learning repository.
url="http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Column names for adult.data, taken from adult.names; intended to be passed
# as `names=` when the file is loaded (see the disabled read_csv call below).
headers = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

# Read the data into pandas.
# Currently disabled, so `url` and `headers` are not used by the code below.
#data=pd.read_csv(url,names=headers)

# Remove duplicates in transactions
def remove_duplicates(data):
    """Drop repeated items from every transaction, keeping first-seen order.

    The dict is modified in place and the same object is returned, so callers
    may use either the return value or the argument afterwards.

    Args:
        data: mapping of transaction id -> iterable of hashable items.

    Returns:
        The same ``data`` mapping, with every value replaced by a list that
        contains each item once, in the order it first appeared.
    """
    # Only existing keys are reassigned, so the dict's size does not change
    # while it is being iterated.
    for key, items in data.items():
        # dict.fromkeys dedupes while preserving order; set() would scramble
        # the items and give a different order from run to run.
        data[key] = list(dict.fromkeys(items))
    return data

# Calculates support (n) from %
def get_min_support(data, support_pct):
    """Convert a fractional minimum support into an absolute transaction count.

    Args:
        data: the transaction collection; only ``len(data)`` is used.
        support_pct: minimum support as a fraction (e.g. ``0.6`` for 60%).

    Returns:
        The smallest integer count that is at least
        ``len(data) * support_pct``.
    """
    # The float product can carry representation noise (100 * 0.07 evaluates
    # to 7.000000000000001), which would make ceil() overshoot by one.
    # Rounding to 10 decimals removes that noise before taking the ceiling.
    return math.ceil(round(len(data) * support_pct, 10))
    
# Generate L(1)
def find_frequent_1_itemsets(data):
    """Count, for every item, how many transactions contain it.

    No support threshold is applied here; the result holds every item seen.

    Args:
        data: mapping of transaction id -> sequence of hashable items.

    Returns:
        dict mapping item -> number of transactions in which it occurs,
        keyed in the order the items are first encountered.
    """
    support = {}
    for items in data.values():
        # Support is a per-transaction count, so an item repeated inside one
        # transaction must contribute only once. dict.fromkeys gives the
        # distinct items in their original order.
        for item in dict.fromkeys(items):
            support[item] = support.get(item, 0) + 1
    return support

# Prune
def prune_itemsets(candidates, support_count):
    """Keep only the candidates whose count reaches the support threshold.

    Args:
        candidates: mapping of itemset -> occurrence count.
        support_count: minimum count; an itemset is kept when its count is
            greater than or equal to this value.

    Returns:
        A new dict with the surviving itemset -> count pairs, in the same
        relative order as ``candidates``.
    """
    return {
        itemset: count
        for itemset, count in candidates.items()
        if count >= support_count
    }

# Generate candidates from L(1)
def candidate_gen_1_itemset(data):
    """Build every 2-item candidate from the given single items.

    Args:
        data: mapping whose keys are the single items; the values are ignored.

    Returns:
        dict mapping each pair ``(a, b)`` with ``a`` sorted before ``b`` to an
        initial count of 0, in lexicographic order of the pairs.
    """
    # Sorting first makes every pair internally ordered and the output
    # deterministic.
    ordered_items = sorted(data)
    return dict.fromkeys(itertools.combinations(ordered_items, 2), 0)

# Generate candidates from L(2+) using (k-1)x(k-1) method
def candidate_gen_k_itemset(candidates):
    """Join itemsets that share all but their last item into larger candidates.

    Args:
        candidates: mapping whose keys are equal-length item tuples; the
            values are ignored.

    Returns:
        dict mapping each joined tuple (the first itemset followed by the last
        item of the second) to an initial count of 0.
    """
    ordered = sorted(candidates)
    # combinations() over the sorted keys visits every pair once, with the
    # smaller itemset on the left, so the appended item always comes from the
    # larger itemset.
    return {
        (*left, right[-1]): 0
        for left, right in itertools.combinations(ordered, 2)
        if left[:-1] == right[:-1]
    }


def find_frequent_k_itemsets(candidates, data):
    """Count how many transactions contain each candidate itemset.

    Args:
        candidates: mapping whose keys are item tuples; the values are ignored.
        data: mapping of transaction id -> sequence of hashable items.

    Returns:
        dict mapping candidate -> number of transactions that contain all of
        its items. Candidates that match no transaction are omitted.
    """
    # Build each candidate's item set once rather than once per transaction.
    candidate_sets = [(itemset, frozenset(itemset)) for itemset in candidates]

    freq_data = {}
    for transaction in data.values():
        # Build the transaction's item set once rather than once per candidate.
        present = set(transaction)
        for itemset, needed in candidate_sets:
            if needed.issubset(present):
                freq_data[itemset] = freq_data.get(itemset, 0) + 1
    return freq_data


def apriori(data, support_pct):
    """Run level-wise Apriori mining over a set of transactions.

    Args:
        data: mapping of transaction id -> sequence of hashable items.
        support_pct: minimum support as a fraction of the number of
            transactions (e.g. ``0.6``).

    Returns:
        A list with one dict per itemset size, starting at size 2, each
        mapping an item tuple to its transaction count. Frequent single items
        are used to seed the search but are not part of the result. Levels
        with no frequent itemset are not included.
    """
    # list of all min_support combinations
    all_freq = []

    # get support count from pct
    support_count = get_min_support(data, support_pct)

    # frequent single items seed the first round of 2-item candidates
    frequent_items = prune_itemsets(find_frequent_1_itemsets(data), support_count)
    candidates = candidate_gen_1_itemset(frequent_items)

    while candidates:
        # count the candidates and keep the ones that meet min_support
        frequent = prune_itemsets(
            find_frequent_k_itemsets(candidates, data), support_count
        )
        # A level may produce candidates yet no frequent itemset; skip it so
        # the result never ends with an empty dict.
        if frequent:
            all_freq.append(frequent)
        # an empty level produces no further candidates, which ends the loop
        candidates = candidate_gen_k_itemset(frequent)

    return all_freq
        
def gen_association_rules(freq_item):
    """Enumerate every antecedent/consequent split of an itemset.

    Args:
        freq_item: sequence of hashable items (e.g. a tuple such as
            ``('E', 'K', 'M')``).

    Returns:
        A list of ``[antecedent, consequent]`` pairs, where the antecedent is
        a tuple of items and the consequent is the set of remaining items.
        Antecedents are listed from largest to smallest and, within one size,
        in the order ``itertools.combinations`` produces them. No confidence
        filtering is applied.
    """
    all_associations = []
    items = set(freq_item)

    # Proper, non-empty subsets only: sizes len-1 down to 1.
    for size in range(len(freq_item) - 1, 0, -1):
        # dict.fromkeys removes repeated subsets (possible only when freq_item
        # itself has repeats) while keeping a deterministic order; a set()
        # here would make the rule order vary between runs.
        for antecedent in dict.fromkeys(itertools.combinations(freq_item, size)):
            consequent = items - set(antecedent)
            all_associations.append([antecedent, consequent])
    return all_associations

#def prune_confidence()

# Mine the toy transactions with a minimum support of 60% and print the
# result: one dict of itemset -> count per itemset size.
candidates = apriori(test, .6)
print(candidates)
# Enumerate the antecedent/consequent splits of a hand-written itemset.
# The return value is not assigned or printed, so it is only visible when this
# is the last expression of a notebook cell.
gen_association_rules(('E','K', 'M'))

[{('E', 'K'): 4, ('E', 'O'): 3, ('K', 'M'): 3, ('K', 'O'): 3, ('K', 'Y'): 3}, {('E', 'K', 'O'): 3}]


[[('E', 'K'), {'M'}],
 [('K', 'M'), {'E'}],
 [('E', 'M'), {'K'}],
 [('K',), {'E', 'M'}],
 [('E',), {'K', 'M'}],
 [('M',), {'E', 'K'}]]