In [1]:
import sys

from itertools import chain, combinations
from collections import defaultdict
from optparse import OptionParser

In [2]:
def subsets(arr):
    """ Returns non empty subsets of arr"""
    return chain(*[combinations(arr, i + 1) for i, a in enumerate(arr)])

In [3]:
def returnItemsWithMinSupport(itemSet, transactionList, minSupport, freqSet):
    """calculates the support for items in the itemSet and returns a subset
    of the itemSet each of whose elements satisfies the minimum support"""
    _itemSet = set()
    localSet = defaultdict(int)

    for item in itemSet:
        for transaction in transactionList:
            if item.issubset(transaction):
                freqSet[item] += 1
                localSet[item] += 1

    for item, count in localSet.items():
        support = float(count) / len(transactionList)

        if support >= minSupport:
            _itemSet.add(item)

    return _itemSet

In [4]:
def joinSet(itemSet, length):
    """Join a set with itself and returns the n-element itemsets"""
    return set(
        [i.union(j) for i in itemSet for j in itemSet if len(i.union(j)) == length]
    )

In [5]:
def getItemSetTransactionList(data_iterator):
    transactionList = list()
    itemSet = set()
    for record in data_iterator:
        transaction = frozenset(record)
        transactionList.append(transaction)
        for item in transaction:
            itemSet.add(frozenset([item]))  # Generate 1-itemSets
    return itemSet, transactionList

In [6]:
def runApriori(data_iter, minSupport, minConfidence):
    """
    run the apriori algorithm. data_iter is a record iterator
    Return both:
     - items (tuple, support)
     - rules ((pretuple, posttuple), confidence)
    """
    itemSet, transactionList = getItemSetTransactionList(data_iter)

    freqSet = defaultdict(int)
    largeSet = dict()
    # Global dictionary which stores (key=n-itemSets,value=support)
    # which satisfy minSupport

    assocRules = dict()
    # Dictionary which stores Association Rules

    oneCSet = returnItemsWithMinSupport(itemSet, transactionList, minSupport, freqSet)

    currentLSet = oneCSet
    k = 2
    while currentLSet != set([]):
        largeSet[k - 1] = currentLSet
        currentLSet = joinSet(currentLSet, k)
        currentCSet = returnItemsWithMinSupport(
            currentLSet, transactionList, minSupport, freqSet
        )
        currentLSet = currentCSet
        k = k + 1
    def getSupport(item):
        """local function which Returns the support of an item"""
        return float(freqSet[item]) / len(transactionList)

    toRetItems = []
    for key, value in largeSet.items():
        toRetItems.extend([(tuple(item), getSupport(item)) for item in value])

    toRetRules = []
    for key, value in list(largeSet.items())[1:]:
        for item in value:
            _subsets = map(frozenset, [x for x in subsets(item)])
            for element in _subsets:
                remain = item.difference(element)
                if len(remain) > 0:
                    confidence = getSupport(item) / getSupport(element)
                    if confidence >= minConfidence:
                        toRetRules.append(((tuple(element), tuple(remain)), confidence))
    return toRetItems, toRetRules

In [7]:
def printResults(items, rules):
    """prints the generated itemsets sorted by support and the confidence rules sorted by confidence"""
    for item, support in sorted(items, key=lambda x: x[1]):
        print("item: %s , %.3f" % (str(item), support))
    print("\n------------------------ RULES:")
    for rule, confidence in sorted(rules, key=lambda x: x[1]):
        pre, post = rule
        print("Rule: %s ==> %s , %.3f" % (str(pre), str(post), confidence))

In [8]:
def to_str_results(items, rules):
    """prints the generated itemsets sorted by support and the confidence rules sorted by confidence"""
    i, r = [], []
    for item, support in sorted(items, key=lambda x: x[1]):
        x = "item: %s , %.3f" % (str(item), support)
        i.append(x)

    for rule, confidence in sorted(rules, key=lambda x: x[1]):
        pre, post = rule
        x = "Rule: %s ==> %s , %.3f" % (str(pre), str(post), confidence)
        r.append(x)

    return i, r

In [9]:
def dataFromFile(fname):
    """Function which reads from the file and yields a generator"""
    with open(fname, "rU") as file_iter:
        for line in file_iter:
            line = line.strip().rstrip(",")  # Remove trailing comma
            record = frozenset(line.split(","))
            yield record

In [11]:
inFile=dataFromFile('basket.csv')
minSupport = 0.15
minConfidence = 0.5

items, rules = runApriori(inFile, minSupport, minConfidence)

printResults(items, rules)

item: ('Brooklyn',) , 0.152
item: ('HISPANIC',) , 0.164
item: ('HISPANIC', 'MBE') , 0.164
item: ('WBE', 'MBE') , 0.169
item: ('New York', 'MBE') , 0.170
item: ('New York', 'WBE') , 0.175
item: ('ASIAN', 'MBE') , 0.200
item: ('ASIAN',) , 0.202
item: ('New York',) , 0.295
item: ('NON-MINORITY',) , 0.300
item: ('NON-MINORITY', 'WBE') , 0.300
item: ('BLACK',) , 0.301
item: ('MBE', 'BLACK') , 0.301
item: ('WBE',) , 0.477
item: ('MBE',) , 0.671

------------------------ RULES:
Rule: ('New York',) ==> ('MBE',) , 0.578
Rule: ('New York',) ==> ('WBE',) , 0.594
Rule: ('WBE',) ==> ('NON-MINORITY',) , 0.628
Rule: ('ASIAN',) ==> ('MBE',) , 0.990
Rule: ('BLACK',) ==> ('MBE',) , 1.000
Rule: ('NON-MINORITY',) ==> ('WBE',) , 1.000
Rule: ('HISPANIC',) ==> ('MBE',) , 1.000


  with open(fname, "rU") as file_iter:


In [12]:
inFile=dataFromFile('basket.csv')
minSupport = 0.3
minConfidence = 0.5

items, rules = runApriori(inFile, minSupport, minConfidence)

printResults(items, rules)

item: ('NON-MINORITY',) , 0.300
item: ('NON-MINORITY', 'WBE') , 0.300
item: ('BLACK',) , 0.301
item: ('MBE', 'BLACK') , 0.301
item: ('WBE',) , 0.477
item: ('MBE',) , 0.671

------------------------ RULES:
Rule: ('WBE',) ==> ('NON-MINORITY',) , 0.628
Rule: ('NON-MINORITY',) ==> ('WBE',) , 1.000
Rule: ('BLACK',) ==> ('MBE',) , 1.000


  with open(fname, "rU") as file_iter:


In [13]:
inFile=dataFromFile('basket.csv')
minSupport = 0.15
minConfidence = 0.3

items, rules = runApriori(inFile, minSupport, minConfidence)

printResults(items, rules)

item: ('Brooklyn',) , 0.152
item: ('HISPANIC',) , 0.164
item: ('HISPANIC', 'MBE') , 0.164
item: ('WBE', 'MBE') , 0.169
item: ('New York', 'MBE') , 0.170
item: ('New York', 'WBE') , 0.175
item: ('ASIAN', 'MBE') , 0.200
item: ('ASIAN',) , 0.202
item: ('New York',) , 0.295
item: ('NON-MINORITY',) , 0.300
item: ('NON-MINORITY', 'WBE') , 0.300
item: ('BLACK',) , 0.301
item: ('MBE', 'BLACK') , 0.301
item: ('WBE',) , 0.477
item: ('MBE',) , 0.671

------------------------ RULES:
Rule: ('WBE',) ==> ('MBE',) , 0.354
Rule: ('WBE',) ==> ('New York',) , 0.367
Rule: ('MBE',) ==> ('BLACK',) , 0.448
Rule: ('New York',) ==> ('MBE',) , 0.578
Rule: ('New York',) ==> ('WBE',) , 0.594
Rule: ('WBE',) ==> ('NON-MINORITY',) , 0.628
Rule: ('ASIAN',) ==> ('MBE',) , 0.990
Rule: ('BLACK',) ==> ('MBE',) , 1.000
Rule: ('NON-MINORITY',) ==> ('WBE',) , 1.000
Rule: ('HISPANIC',) ==> ('MBE',) , 1.000


  with open(fname, "rU") as file_iter:


In [14]:
inFile=dataFromFile('basket.csv')
minSupport = 0.6
minConfidence = 0.25

items, rules = runApriori(inFile, minSupport, minConfidence)

printResults(items, rules)

item: ('MBE',) , 0.671

------------------------ RULES:


  with open(fname, "rU") as file_iter:


In [17]:
minSupport = 0.05
minConfidence = 0
Rules_max = 0
SupportValues=defaultdict(set)

while minSupport <= 1:
    minConfidence = 0.05
    while minConfidence <= 1:
        inFile=dataFromFile('basket.csv')
        items, rules = runApriori(inFile, minSupport, minConfidence)
        Rules_max = max(Rules_max, len(rules))
        if len(rules) == 0:
            SupportValues[round(minSupport,2)].add(round(minConfidence,2))
        minConfidence += 0.05
        minSupport += 0.05

print("Maximum Number Of Rules Are: ")
print(Rules_max)
print("Confidence Values where number of rules are minimum i.e. 0")

print("Support Values\tConfidence Values")
for i in SupportValues.keys():
    print(str(i)+" :\t\t",SupportValues[i])

  with open(fname, "rU") as file_iter:


Maximum Number Of Rules Are: 
74
Confidence Values where number of rules are minimum i.e. 0
Support Values	Confidence Values
0.35 :		 {0.35}
0.4 :		 {0.4}
0.45 :		 {0.45}
0.5 :		 {0.5}
0.55 :		 {0.55}
0.6 :		 {0.6}
0.65 :		 {0.65}
0.7 :		 {0.7}
0.75 :		 {0.75}
0.8 :		 {0.8}
0.85 :		 {0.85}
0.9 :		 {0.9}
0.95 :		 {0.95}
