In [1]:
from datetime import date
import csv

In [20]:
import sys

from itertools import chain, combinations
from collections import defaultdict
from optparse import OptionParser


def subsets(arr):
    """ Returns non empty subsets of arr"""
    return chain(*[combinations(arr, i + 1) for i, a in enumerate(arr)])


def returnItemsWithMinSupport(itemSet, transactionList, minSupport, freqSet):
        """calculates the support for items in the itemSet and returns a subset
       of the itemSet each of whose elements satisfies the minimum support"""
        _itemSet = set()
        localSet = defaultdict(int)

        for item in itemSet:
                for transaction in transactionList:
                        if item.issubset(transaction):
                                freqSet[item] += 1
                                localSet[item] += 1

        for item, count in localSet.items():
                support = float(count)/len(transactionList)

                if support >= minSupport:
                        _itemSet.add(item)

        return _itemSet


def joinSet(itemSet, length):
        """Join a set with itself and returns the n-element itemsets"""
        return set([i.union(j) for i in itemSet for j in itemSet if len(i.union(j)) == length])


def getItemSetTransactionList(data_iterator):
    transactionList = list()
    itemSet = set()
    for record in data_iterator:
        transaction = frozenset(record)
        transactionList.append(transaction)
        for item in transaction:
            itemSet.add(frozenset([item]))              # Generate 1-itemSets
    return itemSet, transactionList


def runApriori(data_iter, minSupport, minConfidence):
    """
    run the apriori algorithm. data_iter is a record iterator
    Return both:
     - items (tuple, support)
     - rules ((pretuple, posttuple), confidence)
    """
    itemSet, transactionList = getItemSetTransactionList(data_iter)

    freqSet = defaultdict(int)
    largeSet = dict()
    # Global dictionary which stores (key=n-itemSets,value=support)
    # which satisfy minSupport

    assocRules = dict()
    # Dictionary which stores Association Rules

    oneCSet = returnItemsWithMinSupport(itemSet,
                                        transactionList,
                                        minSupport,
                                        freqSet)

    currentLSet = oneCSet
    k = 2
    while(currentLSet != set([])):
        largeSet[k-1] = currentLSet
        currentLSet = joinSet(currentLSet, k)
        currentCSet = returnItemsWithMinSupport(currentLSet,
                                                transactionList,
                                                minSupport,
                                                freqSet)
        currentLSet = currentCSet
        k = k + 1

    def getSupport(item):
            """local function which Returns the support of an item"""
            return float(freqSet[item])/len(transactionList)

    toRetItems = []
    for key, value in largeSet.items():
        toRetItems.extend([(tuple(item), getSupport(item))
                           for item in value])

    toRetRules = []
    for key, value in largeSet.items()[1:]:
        for item in value:
            _subsets = map(frozenset, [x for x in subsets(item)])
            for element in _subsets:
                remain = item.difference(element)
                if len(remain) > 0:
                    confidence = getSupport(item)/getSupport(element)
                    if confidence >= minConfidence:
                        toRetRules.append(((tuple(element), tuple(remain)),
                                           confidence))
    return toRetItems, toRetRules



def dataFromFile(fname):
        """Function which reads from the file and yields a generator"""
        file_iter = open(fname, 'rU')
        for line in file_iter:
                line = line.strip().rstrip(',')                         # Remove trailing comma
                record = frozenset(line.split(','))
                yield record
                
def printResults(items, rules):
    """prints the generated itemsets sorted by support and the confidence rules sorted by confidence"""
    if len(rules)!=0:
        rs.write("For time boundaries on: "+it+"\n") #показывает периоды какого пака исследуются
        #rs.write("Periods: "+str(len(start_time)))
        #for item, support in sorted(items, key=lambda (item, support): support):
            #rs.write("item: %s , %.3f+\n" % (str(item), support))
            #print "item: %s , %.3f" % (str(item), support)
        rs.write("\n------------------------ RULES:+\n")
        for rule, confidence in sorted(rules, key=lambda (rule, confidence): confidence):
            pre, post = rule
            rs.write("Rule: %s ==> %s , %.3f+\n" % (str(pre), str(post), confidence))
        rs.write("\n"+"\n")


if __name__ == "__main__":

    optparser = OptionParser()
    optparser.add_option('-f', '--inputFile',
                         dest='input',
                         help='filename containing csv',
                         default=None)
    optparser.add_option('-s', '--minSupport',
                         dest='minS',
                         help='minimum support value',
                         default=0.15,
                         type='float')
    optparser.add_option('-c', '--minConfidence',
                         dest='minC',
                         help='minimum confidence value',
                         default=0.6,
                         type='float')

    (options, args) = optparser.parse_args()


    inFile = dataFromFile("without_bound.csv")

    minSupport = options.minS
    minConfidence = options.minC

    items, rules = runApriori(inFile, minSupport, minConfidence)

    #printResults(items, rules)

In [3]:
#social_netork='fb'
pcks=[]
with open('trans.csv', 'rb') as csvfile:
    spamreader = csv.reader(csvfile, delimiter='|', quotechar='|')
    pcks=[]
    spamreader.next()
    for row in spamreader:
        if (row[5] not in pcks):
            pcks.append(row[5])
    print len(pcks)

214


In [21]:
from datetime import datetime
now = datetime.now()
minSupport = 0.08 # res4
#minSupport = 0.15 #res3      ## выбираем сами
#minSupport = 0.2       #res2
minConfidence = 0.6 #res3/res4    ## minsup и minconf
#minConfidence = 0.7  #res2
count=0
with open('res3.txt', 'w') as rs:
    for it in pcks:
        times=[]
        # 1) составляем временные промежутки для паков
        
        with open('trans.csv', 'rb') as csvfile:
            spamreader = csv.reader(csvfile, delimiter='|', quotechar='|')
            spamreader.next()
            for row in spamreader:
                if row[5]==it:
                    times.append(int(row[2]))

        difference=172800+43200 # = 2.5 days in seconds
        # сравниваем даты по timestamp
        # если покупки наблюдались в окрестности 2х дней, то
        # будем считать, что в эти дни акция действовала
        times.sort()

        start_time=[]
        end_time=[]

        beg=times[0]

        start_time.append(beg)
        for item in times:
            if beg+difference>=item:
                beg=item
            else:
                start_time.append(item)
                end_time.append(beg)
                beg=item
        end_time.append(beg)

        # 2) проверяем временные промежутки; если все промежутки лежат в рамках 2.5 недель,
        # то считаем пак акционным и проходимся априори алгоритмом по выявленным промежуткам
        
        week_time=604800 # week time
        access=True
        for i in range(0,len(start_time)):
            if end_time[i]-start_time[i] > week_time*2.5: # предполагаем, что акции длятся не более 2х с половиной недель
                access=False
                count=count+1

        if access==True:
            for bound in range(0,len(start_time)):
                ids={}
                # 3) если пак акционный, то забиваем данные для составления ассоциативных правил
                
                with open('trans.csv', 'rb') as csvfile:
                    spam = csv.reader(csvfile, delimiter='|', quotechar='|')
                    spam.next()
                    for row in spam:
                        if (int(row[2])>=start_time[bound] and int(row[2])<=end_time[bound]):
                            if ids.has_key(int(row[0]))==True:
                                val=ids.get(int(row[0]))
                                if row[5] not in val:
                                    val.append(row[5])
                            else:
                                ids[int(row[0])]=[]# каждый id принимает в значение свои транзакции
                                ids[int(row[0])].append(row[5])

                with open('file_for_transactions.csv', 'w') as csf:
                    for key, value in ids.items():
                        st=""
                        for i in range(0,len(value)):
                            st=st+value[i]+','
                        csf.write(st+'\n')
                #now1 = datetime.now()
                #print "файлы: ","  ",(now1-now)

                inFile = dataFromFile("file_for_transactions.csv")
                items, rules = runApriori(inFile, minSupport, minConfidence)    
                printResults(items, rules)

                    
# 4) время работы алгоритма
now1 = datetime.now()
print (now1-now)

0:05:04.332062
