In [1]:
import numpy

In [2]:
def load_dataset(file_name):
    """Parse a transaction file into a list of integer lists.

    Every line of the file yields one transaction: the line is split on
    whitespace and each token is converted with ``int``. A blank line
    yields an empty transaction.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one ``list[int]`` per line, in file order.

    Raises:
        ValueError: If a token cannot be converted to ``int``.
    """
    transactions = []
    with open(file_name, 'r') as handle:
        for raw_line in handle:
            # split() with no argument already discards surrounding whitespace.
            transactions.append([int(token) for token in raw_line.split()])
    return transactions

In [3]:
# Load the small example file and display the parsed transactions.
small_dataset = load_dataset('small_retail.txt')
small_dataset

[[1, 2, 5],
 [2, 4],
 [2, 3],
 [1, 2, 4],
 [1, 3],
 [2, 3],
 [1, 3],
 [1, 2, 3, 5],
 [1, 2, 3]]

In [4]:
def createSet1(dataset):
    """Build the single-item candidate itemsets for a dataset.

    Args:
        dataset: Iterable of transactions, each an iterable of items.
            Items must be hashable and mutually orderable (they are sorted).

    Returns:
        A list with one ``frozenset`` per distinct item, ordered by
        ascending item value.
    """
    distinct_items = {item for transaction in dataset for item in transaction}
    return [frozenset((item,)) for item in sorted(distinct_items)]

In [5]:
def filter_candidates(candidate, dataset, min_sup):
    """Count the support of each candidate itemset and keep the frequent ones.

    Args:
        candidate: Iterable of candidate itemsets. Each must be hashable and
            provide ``issubset`` (e.g. ``frozenset``). Duplicates are
            counted once.
        dataset: Iterable of transactions, each an iterable of hashable items.
        min_sup: Minimum number of transactions (an absolute count) that must
            contain an itemset for it to be kept.

    Returns:
        A tuple ``(retlist, support_data)``. ``retlist`` holds the candidates
        whose count is at least ``min_sup``, in first-seen order.
        ``support_data`` maps every distinct candidate, frequent or not, to
        the number of transactions containing it.
    """
    support_data = {cand: 0 for cand in candidate}
    for transaction in dataset:
        # Build the transaction's item set once instead of once per candidate.
        transaction_items = set(transaction)
        for cand in support_data:
            if cand.issubset(transaction_items):
                support_data[cand] += 1
    retlist = [cand for cand, count in support_data.items() if count >= min_sup]
    return retlist, support_data
#print(filter_candidates(createSet1(small_dataset), small_dataset, 1))

In [6]:
from itertools import combinations

def generateNextItemsets(freq_sets):
    """Generate the next-level candidate itemsets from the frequent itemsets.

    Two itemsets are joined when their intersection has exactly one item
    fewer than the first of them; the candidate is their union. A candidate
    is kept only if ``valid_candidateSet`` accepts it against ``freq_sets``.

    Args:
        freq_sets: Sequence of itemsets (e.g. ``frozenset``) from the
            current level.

    Returns:
        A list of distinct candidate itemsets, in the order in which they
        were first produced by the pairwise join.
    """
    retlist = []
    already_generated = set()
    # combinations() yields each unordered pair once, in the same (i < j)
    # order as a nested index loop.
    for first, second in combinations(freq_sets, 2):
        if len(first & second) != len(first) - 1:
            # Not joinable: skip instead of validating the raw intersection.
            continue
        new_set = first | second
        key = frozenset(new_set)
        if key in already_generated:
            # Several pairs can produce the same union; emit it only once.
            continue
        already_generated.add(key)
        if valid_candidateSet(new_set, freq_sets):
            retlist.append(new_set)
    return retlist

def valid_candidateSet(new_set, old_sets):
    """Check that every one-item-smaller subset of ``new_set`` is in ``old_sets``.

    Args:
        new_set: Candidate itemset to validate.
        old_sets: Iterable of itemsets to look the subsets up in.

    Returns:
        ``True`` if each subset of ``new_set`` obtained by dropping exactly
        one item is present in ``old_sets``; ``False`` otherwise, and also
        ``False`` when ``new_set`` is empty.
    """
    if len(new_set) == 0:
        return False
    # A set of frozensets gives order-independent, constant-time membership
    # tests instead of comparing sorted lists one by one.
    known_sets = {frozenset(old_cand) for old_cand in old_sets}
    return all(
        frozenset(subset) in known_sets
        for subset in combinations(new_set, len(new_set) - 1)
    )

#freq, di = filter_candidates(createSet1(small_dataset), small_dataset, 6)
#print(freq)
#new_list = generateNextItemsets(freq)
#print(new_list)
#print(valid_candidateSet(frozenset([1,2,3]), [frozenset([1,2]), frozenset([2,3]), frozenset([1,3])]))
#print(list(frozenset([1,2])) == [1,2])

In [7]:
def aprioriFreqItemsets(dataset, minsup):
    """Run the level-wise Apriori search over ``dataset``.

    Starting from the single-item candidates, each round keeps the itemsets
    whose transaction count reaches ``minsup`` and derives the next round's
    candidates from them. The search stops at the first round that yields
    no frequent itemset.

    Args:
        dataset: List of transactions, each an iterable of items.
        minsup: Minimum transaction count (absolute, not a fraction).

    Returns:
        A tuple ``(itemsets, counts)``. ``itemsets`` lists every frequent
        itemset, level by level. ``counts`` maps the candidates of every
        round that produced at least one frequent itemset to their
        transaction counts; counts from the final, empty round are omitted.
    """
    all_frequent = []
    all_counts = {}
    level_frequent, level_counts = filter_candidates(createSet1(dataset), dataset, minsup)
    while level_frequent:
        all_frequent.extend(level_frequent)
        all_counts.update(level_counts)
        next_candidates = generateNextItemsets(level_frequent)
        level_frequent, level_counts = filter_candidates(next_candidates, dataset, minsup)
    return all_frequent, all_counts
#i, j = aprioriFreqItemsets(small_dataset, 4)
#print(i)

In [8]:
# Mine the large retail dataset, then write and print every frequent itemset
# together with its support expressed as a fraction of all transactions.
MIN_SUPPORT_COUNT = 300  # minimum number of transactions containing an itemset

data = load_dataset('large_retail.txt')
totalTransactions = len(data)
freqItemsets, support_data = aprioriFreqItemsets(data, MIN_SUPPORT_COUNT)

# Turn each itemset into a sorted list, then order by size and item values.
tempset = [sorted(i) for i in freqItemsets]
sorted_freqItemsets = sorted(tempset, key=lambda x: (len(x), x))

output = "Sup\tFreq Itemset \n" + "".join(
    str(round(support_data[frozenset(itemSet)] / totalTransactions, 2))
    + "\t" + str(itemSet) + "\n"
    for itemSet in sorted_freqItemsets
)

# The context manager closes the file even if the write fails.
with open('apriori_itemsets.txt', 'w') as file:
    file.write(output)
print(output)

Sup	Freq Itemset 
0.1	[31]
0.23	[32]
0.11	[36]
0.26	[38]
0.6	[39]
0.31	[41]
0.47	[48]
0.11	[60]
0.11	[65]
0.11	[89]
0.14	[32, 39]
0.12	[32, 48]
0.17	[38, 39]
0.11	[38, 41]
0.13	[38, 48]
0.23	[39, 41]
0.33	[39, 48]
0.18	[41, 48]
0.14	[39, 41, 48]



In [9]:
# NOTE(review): presumably this cell is meant to list the closed frequent
# itemsets, i.e. those with no proper frequent superset of equal support;
# confirm the intent. The previous loop compared each itemset with itself
# (j started at i), so its condition was always true and the whole list was
# reprinted, with repeats for any itemset that had an equal-support superset.
output = "Sup\tFreq Itemset \n"
for itemSet in sorted_freqItemsets:
    items = frozenset(itemSet)
    count = support_data[items]
    # `<` on frozensets tests for a proper subset, so an itemset is never
    # compared against itself.
    has_equal_superset = any(
        items < frozenset(other) and support_data[frozenset(other)] >= count
        for other in sorted_freqItemsets
    )
    if not has_equal_superset:
        support_fraction = count / totalTransactions
        output += str(round(support_fraction, 2)) + "\t" + str(itemSet) + "\n"
print(output)

Sup	Freq Itemset 
0.1	[31]
0.23	[32]
0.11	[36]
0.26	[38]
0.6	[39]
0.31	[41]
0.47	[48]
0.11	[60]
0.11	[65]
0.11	[89]
0.14	[32, 39]
0.12	[32, 48]
0.17	[38, 39]
0.11	[38, 41]
0.13	[38, 48]
0.23	[39, 41]
0.33	[39, 48]
0.18	[41, 48]
0.14	[39, 41, 48]

