## <font color='green'> Groceries - Frequent Pattern Mining</font>

### <font color='green'> 1. Description</font>

Frequent pattern mining using the Groceries dataset.
The dataset can be downloaded from https://www.kaggle.com/irfanasrullah/groceries

The **Groceries Market Basket Dataset** contains groceries data with the list of items bought by customers.

### <font color='green'> 2. Data Preprocessing</font>

In [1]:
import os
import time
import pandas as pd
import numpy as np
from collections import OrderedDict

In [2]:
def encode_data(data):
    '''
    Replace every item label with a 1-based integer code.

    The distinct labels found across all transactions are collected with
    np.unique (which returns them sorted), and each label is numbered by its
    position in that sorted sequence, starting at 1.

    Returns a list with one list of ints per transaction in `data`, in the
    same order and with the same item positions as the input.
    '''
    from itertools import chain

    labels = np.unique(list(chain.from_iterable(data)))
    code_of = {label: code for code, label in enumerate(labels, start=1)}
    return [[int(code_of[item]) for item in transaction] for transaction in data]

In [3]:
import pyspark
from pyspark.sql import SQLContext
def create_spark_df(data):
    '''
    Build a Spark DataFrame with columns "id" and "items" from `data`.

    Each element of `data` becomes one row; its zero-based position in
    `data` is used as the "id" value. The function reads the module-level
    name `sc`, so a SparkContext must be bound to `sc` before it is called.
    '''
    # NOTE(review): SQLContext is reported as deprecated in Spark 3.x in
    # favour of SparkSession -- confirm before upgrading pyspark.
    indexed_rows = list(enumerate(data))
    sql_context = SQLContext(sc)
    return sql_context.createDataFrame(data=indexed_rows,
                                       schema=["id", "items"])

In [4]:
def preprocess_data(fname):
    '''
    Load the groceries CSV and return its rows as integer-encoded item lists.

    Parameters
    ----------
    fname : str
        Path of the CSV file. The file must have an 'Item(s)' column; it is
        dropped and every remaining cell of a row is treated as an item.

    Returns
    -------
    list of list of int
        One list per CSV row, holding the codes that encode_data assigned to
        the row's non-missing cells.
    '''
    df = pd.read_csv(fname)
    df = df.drop(['Item(s)'], axis=1)
    # Skip cells pandas read as missing. pd.notna tests for that directly,
    # instead of comparing each cell's string form against 'nan'.
    item_list = [[itm for itm in row if pd.notna(itm)]
                 for row in df.values.tolist()]
    return encode_data(item_list)

In [5]:
#---- Data Preparation ----
# Download the dataset from the link below and store it at DATA_FILE
# (a relative path, so it is resolved against the kernel's working directory).
# https://www.kaggle.com/irfanasrullah/groceries?select=groceries+-+groceries.csv

DATA_FILE = "datasets/groceries - groceries.csv"
# item_list holds one list of integer item codes per CSV row; it is the
# input handed to both estimators in the evaluation cells.
item_list = preprocess_data(DATA_FILE)
print("Dataset contains {} item sets".format(len(item_list)))

Dataset contains 9835 item sets


### <font color='green'> 3. Algorithm Evaluation</font>

In [6]:
# Parallel result lists filled by evaluate(): index i of each list
# describes the same evaluated estimator.
estimator_name = []  # label passed to evaluate()
train_time = []      # seconds spent fitting and counting frequent itemsets
test_time = []       # seconds spent generating and counting association rules

In [7]:
def evaluate(estimator, estimator_nm, data):
    '''
    Fit a frequent-pattern estimator, print its result sizes and record timings.

    Works for both the pyspark and the frovedis estimators used in this
    notebook. The time taken by fit() plus counting the frequent itemsets is
    recorded as the train time; the time taken to obtain and count the
    association rules is recorded as the test time.

    Parameters
    ----------
    estimator : object
        Must provide fit(data) returning a model that exposes `freqItemsets`
        and `associationRules`.
    estimator_nm : str
        Label stored in `estimator_name`. If it contains "pyspark" the results
        are sized with .count(), otherwise with len().
    data : object
        Passed unchanged to estimator.fit().

    Side effects
    ------------
    Appends one entry each to the module-level lists `estimator_name`,
    `train_time` and `test_time`. The entries are appended together only after
    both phases succeeded, so the three lists stay aligned if a step raises.
    '''
    is_spark = "pyspark" in estimator_nm

    def _size(result):
        # Spark results are lazy: count() both sizes them and forces the
        # computation to run, so the work is included in the measured time.
        return result.count() if is_spark else len(result)

    # perf_counter is monotonic, so the interval cannot be distorted by
    # system clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()
    model = estimator.fit(data)
    print("total FIS count: %d" % (_size(model.freqItemsets)))
    fit_elapsed = round(time.perf_counter() - start_time, 4)

    start_time = time.perf_counter()
    rules = model.associationRules
    print("total Rule count: %d" % (_size(rules)))
    rules_elapsed = round(time.perf_counter() - start_time, 4)

    estimator_name.append(estimator_nm)
    train_time.append(fit_elapsed)
    test_time.append(rules_elapsed)

#### 3.1 FPGrowth

In [8]:
import pyspark
from pyspark.ml.fpm import FPGrowth as pysparkFPGrowth
# create_spark_df() reads the module-level name `sc`, so the context must be
# bound to exactly that name before the DataFrame is built.
sc = pyspark.SparkContext(appName="fpgrowth")
try:
    s_est = pysparkFPGrowth(minSupport=0.01, minConfidence=0.05)
    E_NM = "fpgrowth_pyspark_" + pyspark.__version__
    evaluate(s_est, E_NM, create_spark_df(item_list))
finally:
    # Stop the context even when evaluation fails; a context left running
    # would prevent this cell from creating a new one on re-execution.
    sc.stop()

total FIS count: 333
total Rule count: 513


In [9]:
import frovedis
from frovedis.exrpc.server import FrovedisServer
# The server launch command is taken from the FROVEDIS_SERVER environment
# variable and started with 8 MPI processes.
FrovedisServer.initialize("mpirun -np 8 " +  os.environ["FROVEDIS_SERVER"])
try:
    from frovedis.mllib.fpm import FPGrowth as frovFPGrowth
    f_est = frovFPGrowth(minSupport=0.01, minConfidence=0.05)
    E_NM = "fpgrowth_frovedis_" + frovedis.__version__
    evaluate(f_est, E_NM, item_list)
    f_est.release()
finally:
    # Shut the server down even when evaluation fails, so a failed run does
    # not leave the server processes behind.
    FrovedisServer.shut_down()

total FIS count: 333
total Rule count: 513


### <font color='green'> 4. Performance summary</font>

In [10]:
# One row per evaluated estimator; the columns appear in the order the keys
# are written in the dict literal.
summary = pd.DataFrame({
    "estimator": estimator_name,
    "train time": train_time,
    "test time": test_time,
})
summary

Unnamed: 0,estimator,train time,test time
0,fpgrowth_pyspark_3.0.2,4.8609,0.594
1,fpgrowth_frovedis_0.9.10,0.2955,0.1177


In [11]:
# Look the timings up by estimator label instead of by list position, so the
# ratio stays pyspark / frovedis even if an evaluation cell was re-run and
# appended further entries. When a label occurs more than once, dict(zip(...))
# keeps the timing of its latest run.
latest_train_time = dict(zip(estimator_name, train_time))
pyspark_train_time = next(t for nm, t in latest_train_time.items() if "pyspark" in nm)
frovedis_train_time = next(t for nm, t in latest_train_time.items() if "frovedis" in nm)
speed_up = pyspark_train_time / frovedis_train_time
print("Frovedis Speed-up: %.2f" % (speed_up))

Frovedis Speed-up: 16.45
