## <font color='green'> Retail - Frequent Pattern Mining</font>

### <font color='green'> 1. Description</font>

This notebook runs frequent pattern mining (FP-Growth) on the Retail dataset and compares the execution time of PySpark and Frovedis.
The dataset can be downloaded from http://fimi.uantwerpen.be/data/retail.dat


### <font color='green'> 2. Data Preprocessing</font>

In [1]:
import os
import time
import pandas as pd
import numpy as np
import random
from collections import OrderedDict

In [2]:
# Re-encode every item of every transaction as a 1-based integer code
def encode_data(data):
    """Replace each item in `data` with an integer code.

    Parameters
    ----------
    data : iterable of iterables
        One inner iterable of items per transaction. It is traversed twice
        (once to collect the distinct items, once to encode).

    Returns
    -------
    list of list of int
        Same nesting as `data`; each item is replaced by its code. Codes are
        the 1-based positions of the items in the sorted array returned by
        `np.unique`.
    """
    import itertools

    flattened = list(itertools.chain.from_iterable(data))
    distinct_items = np.unique(flattened)
    # Position in the sorted unique array (starting at 1) becomes the code.
    code_of = {item: code for code, item in enumerate(distinct_items, start=1)}
    return [[int(code_of[item]) for item in transaction] for transaction in data]

In [3]:
import pyspark
from pyspark.sql import SQLContext
def create_spark_df(data):
    """Build a Spark DataFrame with columns "id" and "items" from `data`.

    Each element of `data` becomes one row; "id" is the element's 0-based
    position and "items" is the element itself.

    Relies on a module-level SparkContext named `sc` existing at call time.
    """
    # Pair every transaction with its running index.
    indexed_rows = list(enumerate(data))
    # Construct spark dataframe
    sql_ctx = SQLContext(sc)
    return sql_ctx.createDataFrame(data=indexed_rows, schema=["id", "items"])

In [4]:
def get_names(fname):
    """Generate column names wide enough for the longest line of `fname`.

    The file is scanned once; each line is split on a single space and the
    largest resulting field count determines how many names are produced.

    Parameters
    ----------
    fname : str
        Path of the space-separated text file to scan.

    Returns
    -------
    list of str
        ["item_0", "item_1", ...], one name per field of the widest line
        (an empty list for an empty file).
    """
    max_ncol = 0
    # The context manager guarantees the handle is closed, even on error.
    with open(fname, 'r') as fp:
        for line in fp:
            max_ncol = max(max_ncol, len(line.split(" ")))
    return ["item_" + str(i) for i in range(max_ncol)]

In [5]:
def preprocess_data(fname):
    """Load a space-separated transaction file and integer-encode its items.

    Parameters
    ----------
    fname : str
        Path of the transaction file (one transaction per line).

    Returns
    -------
    list of list of int
        One encoded item list per line, as produced by `encode_data`.
    """
    # Lines have a variable number of fields, so explicit column names sized
    # to the widest line are supplied to the parser.
    column_names = get_names(fname)
    # engine='python': older pandas versions have parsing issues with the c-engine.
    raw = pd.read_csv(fname, sep=" ", names=column_names, engine='python')
    # Drop the cells whose string form is 'nan' (the missing fields of lines
    # shorter than the widest one).
    transactions = [
        [value for value in row if str(value) != 'nan']
        for row in raw.values.tolist()
    ]
    return encode_data(transactions)

In [6]:
#---- Data Preparation ----
# Please download the dataset from the link below and save it at the
# location given by DATA_FILE.
# http://fimi.uantwerpen.be/data/retail.dat

# Location of the downloaded dataset (a relative path).
DATA_FILE = "datasets/retail.dat"
# Parse the file and integer-encode every transaction.
item_list = preprocess_data(DATA_FILE)
print("Dataset contains {} item sets".format(len(item_list)))

Dataset contains 88162 item sets


### <font color='green'> 3. Algorithm Evaluation</font>

In [7]:
# Parallel result lists filled by evaluate(): entry i of each list belongs
# to the i-th evaluated estimator.
train_time = []       # seconds spent fitting and counting frequent itemsets
test_time = []        # seconds spent generating and counting association rules
estimator_name = []   # label passed to evaluate() for each estimator

In [8]:
def evaluate(estimator, estimator_nm, data):
    """Time FP-Growth training and rule generation for one estimator.

    Prints the number of frequent itemsets and association rules, then
    appends the estimator label and both timings (rounded to 4 decimals) to
    the module-level lists `estimator_name`, `train_time` and `test_time`.

    The three lists are updated together, and only after both phases have
    succeeded. A failing run therefore leaves them untouched instead of
    leaving them with different lengths, which would break building the
    summary table from them.

    Parameters
    ----------
    estimator : object
        Object exposing `fit(data)`; the fitted model must expose
        `freqItemsets` and `associationRules`.
    estimator_nm : str
        Label recorded for this run. If it contains "pyspark" the results
        are sized with `.count()`, otherwise with `len()`.
    data : object
        Training input passed unchanged to `estimator.fit`.
    """
    is_spark = "pyspark" in estimator_nm

    def _size(result):
        # For Spark, count() is required to actually invoke the (lazy)
        # operation; other results are sized with len().
        return result.count() if is_spark else len(result)

    start_time = time.time()
    model = estimator.fit(data)
    print("total FIS count: %d" % (_size(model.freqItemsets)))
    fit_elapsed = round(time.time() - start_time, 4)

    start_time = time.time()
    rules = model.associationRules
    print("total Rule count: %d" % (_size(rules)))
    rules_elapsed = round(time.time() - start_time, 4)

    # Record everything at once so the parallel lists always stay aligned.
    estimator_name.append(estimator_nm)
    train_time.append(fit_elapsed)
    test_time.append(rules_elapsed)

#### 3.1 FPGrowth

In [9]:
import pyspark
from pyspark.ml.fpm import FPGrowth as pysparkFPGrowth
# create_spark_df() reads this module-level context by name.
sc = pyspark.SparkContext(appName="fpgrowth")
try:
    s_est = pysparkFPGrowth(minSupport=0.001, minConfidence=0.05)
    e_nm = "fpgrowth_pyspark_" + pyspark.__version__
    evaluate(s_est, e_nm, create_spark_df(item_list))
finally:
    # Stop the context even if the evaluation fails, so the cell can be
    # re-run without a leftover active SparkContext.
    sc.stop()

total FIS count: 7589
total Rule count: 8668


In [10]:
import frovedis
from frovedis.exrpc.server import FrovedisServer
# The server launch command is built from the FROVEDIS_SERVER environment variable.
FrovedisServer.initialize("mpirun -np 8 " +  os.environ["FROVEDIS_SERVER"])
try:
    from frovedis.mllib.fpm import FPGrowth as frovFPGrowth
    f_est = frovFPGrowth(minSupport=0.001, minConfidence=0.05, mem_opt_level = 1)
    e_nm = "fpgrowth_frovedis_" + frovedis.__version__
    evaluate(f_est, e_nm, item_list)
    f_est.release()
finally:
    # Shut the server down even if the evaluation fails, so it is not left
    # running after an error.
    FrovedisServer.shut_down()

total FIS count: 7589
total Rule count: 8668


### <font color='green'> 4. Performance summary</font>

In [11]:
# One row per evaluated estimator; columns keep the order listed here.
summary = pd.DataFrame(OrderedDict([
    ("estimator", estimator_name),
    ("train time", train_time),
    ("test time", test_time),
]))
summary

Unnamed: 0,estimator,train time,test time
0,fpgrowth_pyspark_3.0.2,37.5539,22.8813
1,fpgrowth_frovedis_0.9.10,1.1794,0.4434


In [12]:
# Ratio of the first recorded training time to the second. This assumes the
# PySpark run was evaluated first (index 0) and the Frovedis run second
# (index 1), which is the order of the evaluation cells in this notebook.
speed_up = train_time[0] / train_time[1]
print("Frovedis Speed-up: %.2f" % (speed_up))

Frovedis Speed-up: 31.84
