In [3]:
'''Implementation of random sampling on Apriori algorithm'''
import pandas as pd
import numpy as np
import itertools
import time

#pass 1 - iterate through basket array, count quantity for each item and store in dict
def pass1(basketArray):
    itemCount = {}
    for i in range(len(basketArray)):
        lines = str(basketArray[i]).strip(' \n').split(' ') #string of items in basket
        
        for j in lines:
            if j in itemCount:
                itemCount[j] += 1
            else:
                itemCount[j] = 1
    return itemCount


#pass 2 - find pairs where both elements are frequent
def pass2(basketArray, frequentItems):
    candidateCount = {}

    for a in range(len(basketArray)):
        lines = str(basketArray[a]).strip(' \n').split(' ') #string of items in basket
        freqItems = [i for i in lines if i in frequentItems]
        pairsList = list(itertools.combinations (freqItems, 2)) #iterable list of pairs

        #compares pairs made from frequent items and pairs made from whole basket list
        for i in pairsList:
            if i in candidateCount:
                candidateCount[i] += 1
            else:
                candidateCount[i] = 1         
    return candidateCount


#check if each item in list is frequent
def checkFrequent(countList, support):
    frequentList = []
    for i in countList:
        if countList[i] >= support:
            frequentList.append(i)
    return set(frequentList)
    

def randomSampling(fileName, supp, samplePercent): 
    #read in file
    baskets = pd.read_csv(fileName, sep = '\t', header = None) 
    basketArray = baskets.values.ravel()

    #take % random sample from original data
    randomSubset = np.random.choice(basketArray, int(len(baskets)*samplePercent), replace = False)
    support = int(supp * len(randomSubset)) 
   
    #pass 1
    itemCount = pass1(randomSubset)
                
    #getting frequent items
    frequentItems = checkFrequent(itemCount, support)

    #if no items are above support threshold
    if len(frequentItems) == 0:
        print("No frequent items")
        return

    #pass 2
    candidateCount = pass2(basketArray, frequentItems)
        
    #getting frequent pairs 
    frequentPairs = checkFrequent(candidateCount, support)

    #writing list to txt file
    with open('frequentPairsRandomSampling.txt','w+') as f:
        f.write(" ".join(map(str, frequentPairs)))
    print(len(frequentItems))
    return frequentPairs
 
st = time.time()   
frequentPairs = randomSampling("retail.txt", 0.02, 0.3)
print(len(frequentPairs))
et = time.time()
execution = et - st
print(execution)

2092
1598098
15674.471752643585
