# Package import

In [9]:
import csv
import sys
import time
from datetime import timedelta

import numpy as np
import pandas as pd
import pyfpgrowth

from PrefixSpan import *

# 1. Read Data

In [23]:
# Load the trade records (trade_new.csv here; trade.csv is the alternative dataset).
# The first CSV column is consumed as the index and then discarded, so rows end up
# labelled 0..n-1.
raw_data = (
    pd.read_csv('../../data/trade_new.csv', index_col=0)
    .reset_index(drop=True)
)

# raw_data = pd.read_csv('../../data/trade.csv')
# print("Read data...")

In [25]:
# Map every vipno to the list of its records as (sldatime, row label) pairs.
# The row label is kept so that later cells can look the full record up in
# raw_data again.
# Timestamp column name: 'sldat' in trade.csv, 'sldatime' in trade_new.csv.
l_data = len(raw_data)
vip_transaction = {}
vip_set = set()
# Walk the two columns directly instead of doing two scalar
# raw_data.loc[i, col] lookups per row, which is very slow on large frames.
for i, v, t in zip(raw_data.index, raw_data['vipno'], raw_data['sldatime']):
    if v not in vip_transaction:
        vip_transaction[v] = []
        vip_set.add(v)
    vip_transaction[v].append((t, i))

In [26]:
# For every vip: order the records by time, keep the earliest 60 %,
# and store only the raw_data row labels of the kept records.
vip_list = list(vip_set)
for v in vip_list:
    by_time = sorted(vip_transaction[v], key=lambda rec: rec[0])
    n_keep = round(len(by_time) * 0.6)
    # rec[0] is the time, rec[1] the row label in raw_data
    vip_transaction[v] = [rec[1] for rec in by_time[:n_keep]]

In [27]:
def split_transaction(indice, item_label, transactions):
    """Append one item sequence per `uid` to `transactions`.

    The rows of the global `raw_data` frame selected by `indice` are grouped
    by their `uid` value. Every group becomes one sequence of single-item
    elements, ``[[item], [item], ...]``, in the row order given by `indice`.

    Parameters
    ----------
    indice : list
        Row labels of `raw_data` to use.
    item_label : str
        Name of the item column ('dptno', 'pluno' or 'bndno').
    transactions : list
        Output list; it is extended in place and nothing is returned.
    """
    split_data = raw_data.loc[indice, ['uid', item_label]]  # uid plus the item column
    if item_label == 'bndno':
        # Drop rows with a missing value, then cast the item column to int.
        # astype() on the dropna() result builds a new frame, so no column is
        # assigned on a possibly-copied slice (avoids SettingWithCopyWarning).
        split_data = split_data.dropna(how='any').astype({item_label: 'int'})
    # Iterate the groups directly instead of re-selecting each one through
    # .groups and .loc; group order (sorted uid) and row order are the same.
    for _, items in split_data.groupby('uid')[item_label]:
        transactions.append([[x] for x in items])

In [28]:
# Mine sequential patterns with PrefixSpan for every item column and every
# minimum support, writing each result to sp_<item_label>_<support>.txt and
# recording how long each run took.
item_labels = ['dptno', 'pluno', 'bndno']
supports = [64, 32, 16, 8, 4, 2]
time_duration = []  # one entry per (item_label, support), in loop order
for item_label in item_labels:
    print("item -- {}".format(item_label))
    # build the sequence database: one sequence per uid, over all vips
    transactions = []
    for v in vip_list:
        split_transaction(vip_transaction[v], item_label, transactions)
    print(len(transactions)) # print the transaction number
    # call the PrefixSpan algorithm once per minimum support
    for s in supports:
        # perf_counter() is monotonic and high-resolution, so the measured
        # duration cannot be distorted by system clock adjustments
        time_start = time.perf_counter()
        # NOTE(review): SquencePattern([], sys.maxsize) is presumably the empty
        # starting prefix -- confirm against the PrefixSpan module
        patterns = prefixSpan(SquencePattern([], sys.maxsize), transactions, s)
        with open('sp_{0}_{1}.txt'.format(item_label, s), 'w') as f:
            for p in patterns:
                f.write("pattern:{0}, support:{1}\n".format(p.squence, p.support))
        # the measured time covers both the mining and the writing of the result file
        time_end = time.perf_counter()
        print('support = {} done. totally cost = '.format(s), timedelta(seconds=time_end-time_start))
        time_duration.append(time_end-time_start)

item -- dptno
2787
support = 64 done. totally cost =  0:00:00.141549
support = 32 done. totally cost =  0:00:00.361789
support = 16 done. totally cost =  0:00:00.625829
support = 8 done. totally cost =  0:00:01.357823
support = 4 done. totally cost =  0:00:02.415359
support = 2 done. totally cost =  0:00:03.952705
item -- pluno
2787
support = 64 done. totally cost =  0:00:00.070485
support = 32 done. totally cost =  0:00:00.189177
support = 16 done. totally cost =  0:00:00.520623
support = 8 done. totally cost =  0:00:01.387272
support = 4 done. totally cost =  0:00:02.879114
support = 2 done. totally cost =  0:00:08.074477
item -- bndno
2024
support = 64 done. totally cost =  0:00:00.029142
support = 32 done. totally cost =  0:00:00.077997
support = 16 done. totally cost =  0:00:00.220577
support = 8 done. totally cost =  0:00:00.389557
support = 4 done. totally cost =  0:00:00.738868
support = 2 done. totally cost =  0:00:01.281443


In [29]:
# Write the run times to time.csv: one row per item label
# (entries 0-5, 6-11 and 12 onwards of time_duration), comma separated.
with open('time.csv', 'w') as f:
    for lo, hi in ((0, 6), (6, 12), (12, None)):
        f.write(','.join(map(str, time_duration[lo:hi])) + '\n')

In [21]:
# Show the recorded run times, one line per item label
for lo, hi in ((0, 6), (6, 12), (12, None)):
    print(time_duration[lo:hi])

[0.07019591331481934, 0.11803483963012695, 0.23120594024658203, 0.30949926376342773, 0.5743751525878906, 1.1606569290161133]
[0.02225208282470703, 0.06181192398071289, 0.13244986534118652, 0.356701135635376, 0.5810708999633789, 1.580782175064087]
[0.0110321044921875, 0.014880180358886719, 0.028963088989257812, 0.06637215614318848, 0.14502811431884766, 0.36875486373901367]


In [34]:
# Count the lines of every result file; the mining cell writes one line per pattern.
# NOTE(review): the mining cell writes sp_*.txt into the working directory while
# this cell reads from result_trade/ -- presumably the files were moved by hand; confirm.
item_labels = ['dptno', 'pluno', 'bndno']
numbers = []
for item_label in item_labels:
    supports = [64, 32, 16, 8, 4, 2]
    for s in supports:
        with open('result_trade/sp_{0}_{1}.txt'.format(item_label, s), 'r') as f:
            item_number = sum(1 for _ in f)
        print(item_number)
        numbers.append(item_number)

16
40
117
288
896
4998
7
22
57
151
479
2666
6
10
35
94
266
1226


In [35]:
# Write the pattern counts to number.csv: one row per item label
# (entries 0-5, 6-11 and 12 onwards of numbers), comma separated.
with open('number.csv', 'w') as f:
    for lo, hi in ((0, 6), (6, 12), (12, None)):
        f.write(','.join(map(str, numbers[lo:hi])) + '\n')