# Init Spark

In [1]:
import os
from pyspark.sql import SparkSession, Row

In [2]:
# Reuse the active SparkSession if one exists, otherwise start a new one.
# NOTE(review): the app name is misspelled ("Reccomendation"); it is a runtime
# string shown by Spark, so it is left untouched here.
spark = (
    SparkSession.builder
    .appName("Product Reccomendation")
    .getOrCreate()
)

# Load Batch Dataset

In [3]:
# Load the batch CSV. No schema is supplied or inferred, so every column is
# read as a string (see the printSchema() output further down).
df = (
    spark.read
    .format("csv")
    .option("header", "true")          # first line holds the column names
    .option("mode", "DROPMALFORMED")   # rows that fail to parse are dropped
    .load("batch1.csv")
)

In [4]:
# Number of rows in the loaded DataFrame (triggers a full read of the CSV).
df.count()

100000

In [5]:
# Print column names and types to confirm what the CSV reader produced.
df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- city_category: string (nullable = true)
 |-- stay_in_current_city_years: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- product_category_1: string (nullable = true)
 |-- product_category_2: string (nullable = true)
 |-- product_category_3: string (nullable = true)
 |-- purchase: string (nullable = true)



# Slice Table

In [6]:
# Keep only the columns needed to build one product basket per user.
userProductDf = df.select("user_id", "product_id")

In [7]:
# Age / product projection.
# NOTE(review): not referenced again in the visible cells of this notebook.
ageProductDf = df.select("age", "product_id")

# Group Table

In [8]:
from pyspark.sql import functions as F
import pyspark.sql.types as T

In [9]:
def unique(list1):
    """Return the stripped elements of ``list1`` with duplicates removed.

    Each element has surrounding whitespace removed with ``str.strip()``
    before it is compared, so ``" a"`` and ``"a "`` count as the same item.
    The order of first occurrence is preserved.

    Args:
        list1: Iterable of strings.

    Returns:
        A new list holding each distinct stripped string once, in the order
        in which it was first seen.
    """
    # initialize an empty result list
    unique_list = []
    # Set mirror of unique_list: membership checks are O(1) instead of
    # rescanning the whole result list for every element.
    seen = set()

    # traverse all elements
    for x in list1:
        x = x.strip()
        # keep only the first occurrence of each stripped value
        if x not in seen:
            seen.add(x)
            unique_list.append(x)

    return unique_list
def fudf(val):
    """Python body of the Spark UDF: de-duplicate ``val`` via ``unique``."""
    deduped = unique(val)
    return deduped


# Spark UDF wrapper around fudf; the declared return type is array<string>.
flattenUdf = F.udf(fudf, returnType=T.ArrayType(T.StringType()))

## By UserID

In [10]:
# Build one basket per user: collect every product_id of the user into a list,
# then run it through flattenUdf to strip whitespace and drop repeated ids.
groupedUserProductDf = (
    userProductDf
    .groupBy("user_id")
    .agg(F.collect_list("product_id").alias("ListItem"))
    .select("user_id", flattenUdf("ListItem").alias("ListItem"))
)
groupedUserProductDf.show()

+-------+--------------------+
|user_id|            ListItem|
+-------+--------------------+
|1000240|[P00106742, P0022...|
|1000280|[P00003442, P0030...|
|1000665|[P00116742, P0007...|
|1000795|[P00289942, P0011...|
|1000839|[P00184942, P0011...|
|1000888|[P00147742, P0011...|
|1001866|[P00154642, P0029...|
|1002011|[P00142942, P0011...|
|1002185|[P00284642, P0017...|
|1002442|[P00236942, P0028...|
|1002783|[P00070542, P0011...|
|1002883|[P00249542, P0021...|
|1002887|[P00157642, P0025...|
|1003202|[P00157642, P0007...|
|1003366|[P00351142, P0019...|
|1003397|[P00025442, P0008...|
|1003644|         [P00009542]|
|1003663|[P00080342, P0012...|
|1003665|[P00080342, P0011...|
|1004042|[P00351342, P0034...|
+-------+--------------------+
only showing top 20 rows



# Prediction with FPGrowth

In [11]:
from pyspark.ml.fpm import FPGrowth

In [46]:
# Configure FP-Growth over the per-user baskets in the "ListItem" column.
fpGrowth = FPGrowth(
    minSupport=0.004,     # minimum support threshold for a frequent itemset
    minConfidence=0.05,   # minimum confidence threshold for a generated rule
    itemsCol="ListItem",
)
# Fit on the grouped baskets built earlier.
model = fpGrowth.fit(groupedUserProductDf)

In [47]:
# Show the first rows of the mined association rules (antecedent, consequent, confidence).
model.associationRules.show()

+-----------+-----------+-------------------+
| antecedent| consequent|         confidence|
+-----------+-----------+-------------------+
|[P00046742]|[P00112142]|0.10606060606060606|
|[P00046742]|[P00034742]|0.09090909090909091|
|[P00220442]|[P00112142]|0.09881422924901186|
|[P00220442]|[P00031042]|0.09486166007905138|
|[P00025442]|[P00112142]|0.08196721311475409|
|[P00025442]|[P00110742]|0.09180327868852459|
|[P00025442]|[P00057642]|0.10819672131147541|
|[P00025442]|[P00058042]|0.08524590163934426|
|[P00025442]|[P00110942]|0.07868852459016394|
|[P00000142]|[P00117442]|0.12077294685990338|
|[P00278642]|[P00184942]| 0.1111111111111111|
|[P00278642]|[P00265242]| 0.1282051282051282|
|[P00110942]|[P00025442]|0.09561752988047809|
|[P00010742]|[P00145042]| 0.1016949152542373|
|[P00028842]|[P00058042]|0.11059907834101383|
|[P00117942]|[P00112142]|0.10588235294117647|
|[P00145042]|[P00010742]|0.09195402298850575|
|[P00031042]|[P00220442]|0.09795918367346938|
|[P00031042]|[P00058042]|0.1061224

In [50]:
from pyspark.sql import Row
# One-row DataFrame holding a sample basket to score; the column is named
# "ListItem" to match the itemsCol the model was configured with.
temp = spark.createDataFrame(
    [Row(ListItem=["P00220442", "P00112142"])]
)

In [51]:
# Apply the fitted model to the sample basket; the output gains a "prediction"
# column of suggested product ids. truncate=False prints the arrays in full.
model.transform(temp).show(truncate = False)

+----------------------+--------------------------------------------+
|ListItem              |prediction                                  |
+----------------------+--------------------------------------------+
|[P00220442, P00112142]|[P00031042, P00025442, P00046742, P00117942]|
+----------------------+--------------------------------------------+

