# Package import

In [17]:
import csv
import sys
import time
from datetime import timedelta

import numpy as np
import pandas as pd

from PrefixSpan import *

# 1. Read Data

In [18]:
# Load the trade records (trade_new.csv here; trade.csv is the other dataset).
# The first CSV column is read as the index and then dropped in favour of a
# fresh default integer index.
raw_data = (
    pd.read_csv('../../data/trade_new.csv', index_col=0)
    .reset_index(drop=True)
)

# raw_data = pd.read_csv('../../data/trade.csv')
# print("Read data...")

In [20]:
# Group the rows of raw_data by vip, remembering each row's sale time.
#   vip_transaction: vipno -> list of (sldatime, row label) tuples, in file order
#   vip_set:         every vipno encountered
# The three columns are walked once with zip(); a scalar raw_data.loc[i, col]
# lookup per row builds the same mapping but is far slower.
l_data = len(raw_data)
vip_transaction = {}
vip_set = set()
# 'sldatime' is the time column of trade_new.csv (trade.csv names it 'sldat')
for i, v, t in zip(raw_data.index, raw_data['vipno'], raw_data['sldatime']):
    if v not in vip_set:
        vip_transaction[v] = []
        vip_set.add(v)
    vip_transaction[v].append((t, i))

In [21]:
# For every vip, order the recorded rows by sale time and keep only the
# earliest 60% of them, storing just the raw_data row labels.
# NOTE: vip_transaction is overwritten in place (tuples -> bare row labels), so
# this cell cannot be re-run without rebuilding vip_transaction first.
vip_list = list(vip_set)
for vip in vip_list:
    # each record is (time, row label of raw_data)
    by_time = sorted(vip_transaction[vip], key=lambda record: record[0])
    keep = round(len(by_time) * 0.6)
    vip_transaction[vip] = [row_label for _, row_label in by_time[:keep]]

In [22]:
def split_transaction(indice, item_label, transactions, data=None):
    """Append one purchase sequence, built from the given rows, to ``transactions``.

    The rows selected by ``indice`` are grouped by ``uid``. Each group becomes
    one itemset (the list of its ``item_label`` values) and the itemsets, in
    order of first appearance in ``indice``, form the sequence. Nothing is
    appended when no itemset is produced.

    Parameters
    ----------
    indice : list
        Row labels to select with ``.loc``. Callers pass them ordered by sale
        time, and that order is preserved in the resulting sequence.
    item_label : str
        Name of the item column ('pluno', 'dptno' or 'bndno').
    transactions : list
        Output list of sequences; mutated in place.
    data : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``raw_data``.

    Returns
    -------
    None
    """
    if data is None:
        data = raw_data
    # keep only the uid and the requested item column
    split_data = data.loc[indice, ['uid', item_label]]
    if item_label == 'bndno':
        # rows with a missing value are dropped for bndno and the column is
        # cast to int; chaining avoids assigning into a derived frame
        split_data = split_data.dropna(how='any').astype({'bndno': 'int'})

    sequence = []
    # sort=False keeps the baskets in the order the rows were supplied; the
    # default sort=True would reorder them by uid value instead
    for _, basket in split_data.groupby('uid', sort=False)[item_label]:
        items = list(basket)  # one transaction
        if items:
            sequence.append(items)
    if sequence:
        transactions.append(sequence)

In [23]:
# Mine sequential patterns with PrefixSpan for every item column and every
# minimum support, write the patterns to sp_<item>_<support>.txt in the working
# directory, and record how long each run took.
item_labels = ['bndno', 'dptno', 'pluno']
supports = [10, 8, 6, 4]
time_duration = []  # seconds per run, ordered by item_labels, then by supports
for item_label in item_labels:
    print("item -- {}".format(item_label))
    # build at most one purchase sequence per vip
    transactions = []
    for v in vip_list:
        split_transaction(vip_transaction[v], item_label, transactions)
    print(len(transactions)) # print the transaction number
    # call the PrefixSpan algorithm once per minimum support
    for s in supports:
        # perf_counter() is a monotonic clock meant for measuring intervals;
        # the measured span covers both the mining and the writing of the file
        time_start = time.perf_counter()
        patterns = prefixSpan(SquencePattern([], sys.maxsize), transactions, s)
        with open('sp_{0}_{1}.txt'.format(item_label, s), 'w') as f:
            for p in patterns:
                f.write("{0} -- s={1}\n".format(p.squence, p.support))
        time_end = time.perf_counter()
        elapsed = time_end - time_start
        print('support = {} done. totally cost = '.format(s), timedelta(seconds=elapsed))
        time_duration.append(elapsed)

item -- bndno
467
support = 10 done. totally cost =  0:00:00.257557
support = 8 done. totally cost =  0:00:00.399923
support = 6 done. totally cost =  0:00:00.732862
support = 4 done. totally cost =  0:00:01.926044
item -- dptno
486
support = 10 done. totally cost =  0:00:01.380294
support = 8 done. totally cost =  0:00:01.616023
support = 6 done. totally cost =  0:00:03.502696
support = 4 done. totally cost =  0:00:09.546950
item -- pluno
486
support = 10 done. totally cost =  0:00:00.467200
support = 8 done. totally cost =  0:00:00.504950
support = 6 done. totally cost =  0:00:00.763768
support = 4 done. totally cost =  0:00:01.542819


In [24]:
# Save the timings to time.csv as three comma-separated rows: the first four
# values, the next four, and whatever remains.
with open('time.csv', 'w') as f:
    for row in (time_duration[:4], time_duration[4:8], time_duration[8:]):
        f.write(','.join(map(str, row)) + '\n')

In [25]:
# Show the timings in the same three groups that are written to time.csv.
for row in (time_duration[:4], time_duration[4:8], time_duration[8:]):
    print(row)

[0.2575571537017822, 0.39992308616638184, 0.7328622341156006, 1.9260437488555908]
[1.380293846130371, 1.616023063659668, 3.5026960372924805, 9.546950340270996]
[0.46719980239868164, 0.5049498081207275, 0.7637677192687988, 1.5428187847137451]


In [26]:
# Count the lines of every result file; the mining cell writes one pattern per
# line, so each count is the number of patterns found.
# NOTE(review): the label order here (dptno, pluno, bndno) differs from the
# mining cell (bndno, dptno, pluno), so rows of number.csv and time.csv do not
# line up by position -- confirm this is intended.
# NOTE(review): files are read from result_new/, whereas the mining cell writes
# to the working directory -- presumably they were moved by hand; verify.
item_labels = ['dptno', 'pluno', 'bndno']
supports = [10, 8, 6, 4]
numbers = []
for item_label in item_labels:
    for s in supports:
        result_path = 'result_new/sp_{0}_{1}.txt'.format(item_label, s)
        with open(result_path, 'r') as f:
            item_number = sum(1 for _ in f)
        print(item_number)
        numbers.append(item_number)

3100
6171
16219
117399
350
555
1144
4143
893
1676
4426
26122


In [27]:
# Save the pattern counts to number.csv as three comma-separated rows: the
# first four values, the next four, and whatever remains.
with open('number.csv', 'w') as f:
    for row in (numbers[:4], numbers[4:8], numbers[8:]):
        f.write(','.join(map(str, row)) + '\n')