In [3]:
'''Implementation of SON'''
import pandas as pd
import numpy as np
import itertools
import time
import matplotlib.pyplot as plt 

#pass 1 - iterate through basket array, count quantity for each item and store in dict
def pass1(basketArray):
    itemCount = {}
    for i in range(len(basketArray)):
        lines = str(basketArray[i]).strip(' \n').split(' ') #string of items in basket

        for j in lines:
            if j in itemCount:
                itemCount[j] += 1
            else:
                itemCount[j] = 1
    return itemCount


#pass 2 - find pairs where both elements are frequent
def pass2(basketArray, frequentItems):
    candidateCount = {}

    for a in range(len(basketArray)):
        lines = str(basketArray[a]).strip(' \n').split(' ') #string of items in basket
        freqItems = [i for i in lines if i in frequentItems]
        pairsList = list(itertools.combinations (freqItems, 2)) #iterable list of pairs

        #compares pairs made from frequent items and pairs made from whole basket list
        for i in pairsList:
            if i in candidateCount:
                candidateCount[i] += 1
            else:
                candidateCount[i] = 1         
    return candidateCount


#check if each item in list is frequent
def checkFrequent(countList, support):
    frequentList = []
    for i in countList:
        if countList[i] >= support:
            frequentList.append(i)
    return set(frequentList)


def SON(fileName, supp, chunkNum): 
    #read in file
    baskets = pd.read_csv(fileName, sep = '\t', header = None) 
    basketArray = baskets.values.ravel()
    candidateSet = [] #stores all candidate pairs for eval in 2nd phase
     
    #getting subsets 
    subsets = np.array_split(basketArray, chunkNum)
    
    #SON pass 1
    for i in range(len(subsets)):
        #pass 1
        itemCount = pass1(subsets[i])
                    
        #getting frequent items
        frequentItems = checkFrequent(itemCount, int(supp * len(subsets[i])))
    
        #pass 2
        candidateCount = pass2(basketArray, frequentItems)
        
        # getting candidate set
        frequentPairs = checkFrequent(candidateCount, int(supp * len(subsets[i]))) #returns set
        candidateSet.extend(frequentPairs)

    #SON pass 2 - find frequent pairs from candidate set by selecting those that are above support in the whole dataset
    pairsCount = {}
    for i in candidateSet:
        if i in pairsCount:
            pairsCount[i] += 1
        else:       
            pairsCount[i] = 1
            
    frequentPairs = checkFrequent(pairsCount, int(supp * len(pairsCount)))
   
    #writing list to txt file
    with open('frequentPairsSON.txt','w+') as f:
        f.write(" ".join(map(str, frequentPairs)))
    return frequentPairs


st = time.time()   
frequentPairs = SON("retail.txt", 0.01, 5)
print("frequent pairs:", len(frequentPairs))
et = time.time()
execution = et - st
print(execution)

frequent pairs: 240
1.8180270195007324
