#### Dataset : https://www.kaggle.com/carrie1/ecommerce-data/version/1

# Init Spark

In [1]:
from pyspark.sql import SparkSession, Row

spark = SparkSession.builder.appName("ecommerce prediction with FP-Growth").getOrCreate()

# Load Dataset

In [2]:
df = spark.read.format("csv").option("header", "true").option("mode", "DROPMALFORMED").load("data.csv")
df = df.na.drop()

In [3]:
df.count()

406829

In [4]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)



In [5]:
df.show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/2010 8:26|     4.

# Slice Table

In [6]:
historyDf = df[['InvoiceNo','StockCode']]
itemDf = df[['StockCode','Description']]

In [7]:
itemDf = itemDf.select("StockCode","Description").distinct()

In [8]:
itemDf.show(20, truncate = False)

+---------+----------------------------------+
|StockCode|Description                       |
+---------+----------------------------------+
|21249    |WOODLAND  HEIGHT CHART STICKERS   |
|84987    |SET OF 36 TEATIME PAPER DOILIES   |
|84279P   |CHERRY BLOSSOM  DECORATIVE FLASK  |
|20671    |BLUE TEATIME PRINT BOWL           |
|85015    |SET OF 12  VINTAGE POSTCARD SET   |
|22690    |DOORMAT HOME SWEET HOME BLUE      |
|22708    |WRAP DOLLY GIRL                   |
|90184A   |AMBER CHUNKY BEAD BRACELET W STRAP|
|22236    |CAKE STAND 3 TIER MAGIC GARDEN    |
|21285    |RETROSPOT CANDLE  MEDIUM          |
|47593A   |CAROUSEL PONIES BABY BIB          |
|21002    |ROSE DU SUD DRAWSTRING BAG        |
|35637C   |PINK STRING CURTAIN WITH POLE     |
|37423    |WHITE WITH BLACK CATS PLATE       |
|23432    |PRETTY HANGING QUILTED HEARTS     |
|10002    |INFLATABLE POLITICAL GLOBE        |
|37444A   |YELLOW BREAKFAST CUP AND SAUCER   |
|22197    |SMALL POPCORN HOLDER              |
|22423    |RE

In [9]:
itemDf.printSchema()

root
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)



In [10]:
historyDf.show()

+---------+---------+
|InvoiceNo|StockCode|
+---------+---------+
|   536365|   85123A|
|   536365|    71053|
|   536365|   84406B|
|   536365|   84029G|
|   536365|   84029E|
|   536365|    22752|
|   536365|    21730|
|   536366|    22633|
|   536366|    22632|
|   536367|    84879|
|   536367|    22745|
|   536367|    22748|
|   536367|    22749|
|   536367|    22310|
|   536367|    84969|
|   536367|    22623|
|   536367|    22622|
|   536367|    21754|
|   536367|    21755|
|   536367|    21777|
+---------+---------+
only showing top 20 rows



In [11]:
historyDf.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)



# Group table by InvoiceNo

In [12]:
from pyspark.sql import functions as F
import pyspark.sql.types as T

In [13]:
def unique(list1): 
  
    # intilize a null list 
    unique_list = [] 
      
    # traverse for all elements 
    for x in list1:
        x = x.strip()
        # check if exists in unique_list or not 
        if x not in unique_list: 
            unique_list.append(x)
      
    return unique_list

In [14]:
def fudf(val):
    return unique(val)
flattenUdf = F.udf(fudf, T.ArrayType(T.StringType()))

In [15]:
groupedDf = historyDf.groupBy('InvoiceNo').agg(F.collect_list('StockCode').alias("ListItem"))
groupedDf = groupedDf.select("InvoiceNo", flattenUdf("ListItem").alias("ListItem"))

In [16]:
groupedDf.show()

+---------+--------------------+
|InvoiceNo|            ListItem|
+---------+--------------------+
|   536938|[22386, 85099C, 2...|
|   537691|[22791, 22171, 82...|
|   538184|[22585, 21481, 22...|
|   538517|[22491, 21232, 21...|
|   538879|[84819, 22150, 21...|
|   539275|[22909, 22423, 22...|
|   539630|[21484, 85099B, 2...|
|   540499|[21868, 22697, 22...|
|   540540|[21877, 21868, 21...|
|   540976|[22394, 21890, 22...|
|   541432|[21485, 22457, 84...|
|   541518|[21880, 21881, 21...|
|   541783|[22423, 22854, 22...|
|   542026|[21754, 82600, 22...|
|   542375|[21731, 22367, 22...|
|  C540850|             [21231]|
|   543641|[85123A, 21833, 2...|
|   544303|[22660, 48138, 48...|
|   545583|[85099B, 21931, 2...|
|   547122|[21172, 21876, 21...|
+---------+--------------------+
only showing top 20 rows



# Prediction with FPGrowth

In [17]:
from pyspark.ml.fpm import FPGrowth
itemDf.select("Description").show(30 ,truncate = False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WOODLAND  HEIGHT CHART STICKERS   |
|SET OF 36 TEATIME PAPER DOILIES   |
|CHERRY BLOSSOM  DECORATIVE FLASK  |
|BLUE TEATIME PRINT BOWL           |
|SET OF 12  VINTAGE POSTCARD SET   |
|DOORMAT HOME SWEET HOME BLUE      |
|WRAP DOLLY GIRL                   |
|AMBER CHUNKY BEAD BRACELET W STRAP|
|CAKE STAND 3 TIER MAGIC GARDEN    |
|RETROSPOT CANDLE  MEDIUM          |
|CAROUSEL PONIES BABY BIB          |
|ROSE DU SUD DRAWSTRING BAG        |
|PINK STRING CURTAIN WITH POLE     |
|WHITE WITH BLACK CATS PLATE       |
|PRETTY HANGING QUILTED HEARTS     |
|INFLATABLE POLITICAL GLOBE        |
|YELLOW BREAKFAST CUP AND SAUCER   |
|SMALL POPCORN HOLDER              |
|REGENCY CAKESTAND 3 TIER          |
|12 PENCIL SMALL TUBE WOODLAND     |
|SET OF 6 T-LIGHTS WEDDING CAKE    |
|TEA TIME OVEN GLOVE               |
|PINK PADDED MOBILE                |
|BEADED CRYSTAL HEART GREEN SMALL  |
|

### Prediksi Pertama

In [19]:
fpGrowth = FPGrowth(itemsCol="ListItem", minSupport=0.01, minConfidence=0.03)
model = fpGrowth.fit(groupedDf)

In [20]:
model.freqItemsets.show()

+----------------+----+
|           items|freq|
+----------------+----+
|        [15056N]| 384|
|         [22846]| 266|
|        [85123A]|2020|
|         [22423]|1884|
| [22423, 85123A]| 239|
|         [22616]| 384|
|         [21154]| 266|
|        [85099B]|1643|
|[85099B, 85123A]| 255|
|         [21479]| 382|
|         [22840]| 265|
|         [22690]| 265|
|         [47566]|1399|
|  [47566, 22423]| 232|
| [47566, 85123A]| 280|
|         [21314]| 382|
|         [84879]|1385|
|  [84879, 22423]| 231|
| [84879, 85123A]| 271|
|         [22750]| 380|
+----------------+----+
only showing top 20 rows



In [21]:
model.associationRules.show()

+---------------+----------+-------------------+
|     antecedent|consequent|         confidence|
+---------------+----------+-------------------+
|        [22554]|   [22551]| 0.4966216216216216|
|        [22554]|   [22556]|0.43243243243243246|
|        [22960]|   [22720]| 0.2559585492227979|
|        [22960]|   [22961]|  0.322279792746114|
| [20726, 22382]|   [20725]| 0.6356968215158925|
|        [21977]|   [21212]| 0.5007407407407407|
|        [21977]|   [84991]| 0.4148148148148148|
|        [22699]|   [22423]| 0.5167464114832536|
|        [22699]|   [22697]|  0.666267942583732|
|        [22699]|   [22698]| 0.5394736842105263|
|        [22866]|   [22867]| 0.5073684210526316|
|        [22866]|   [22865]| 0.5915789473684211|
|        [20723]|   [22355]|0.47023809523809523|
|        [20723]|   [20724]| 0.5952380952380952|
|[22386, 85099B]|   [21931]|0.40252707581227437|
|[22386, 85099B]|  [85099F]|0.42057761732851984|
| [23202, 23203]|  [85099B]| 0.5302325581395348|
| [20726, 22384]|   

In [22]:
itemInput = "ASSORTED COLOUR BIRD ORNAMENT"
checkAvailableItem = itemDf.filter(itemDf['Description'] ==  itemInput).collect()

In [23]:
item = []
if(not checkAvailableItem):
    print("Item Not Found")
else:
    item = checkAvailableItem[0]
    print("Found Item " + str(item[0]))

Found Item 84879


In [26]:
temp = [Row(ListItem=[item[0]])]
temp = spark.createDataFrame(temp)

In [27]:
model.transform(temp).show(truncate = False)

+--------+----------------------+
|ListItem|prediction            |
+--------+----------------------+
|[84879] |[22423, 85123A, 21136]|
+--------+----------------------+



In [28]:
res = model.transform(temp).collect()[0]

In [29]:
res = res[1]

In [30]:
for prediction in res:
    checkAvailableItem = itemDf.filter(itemDf['StockCode'] ==  prediction).collect()[0]
    print("Prediksi Item : " + checkAvailableItem[1] + "(" + checkAvailableItem[0]+")")

Prediksi Item : REGENCY CAKESTAND 3 TIER(22423)
Prediksi Item : WHITE HANGING HEART T-LIGHT HOLDER(85123A)
Prediksi Item : PAINTED METAL PEARS ASSORTED(21136)


### Prediksi 2

In [31]:
fpGrowth = FPGrowth(itemsCol="ListItem", minSupport=0.02, minConfidence=0.02)
model = fpGrowth.fit(groupedDf)

In [32]:
model.freqItemsets.show()

+---------------+----+
|          items|freq|
+---------------+----+
|       [85123A]|2020|
|        [22423]|1884|
|       [85099B]|1643|
|        [47566]|1399|
|        [84879]|1385|
|        [20725]|1330|
|        [22720]|1218|
|         [POST]|1194|
|        [23203]|1097|
|[23203, 85099B]| 473|
|        [22197]|1085|
|        [20727]|1073|
| [20727, 20725]| 523|
|        [22383]|1063|
| [22383, 20725]| 526|
| [22383, 20727]| 467|
|        [21212]|1041|
|        [23298]|1021|
|        [23209]|1017|
| [23209, 23203]| 444|
+---------------+----+
only showing top 20 rows



In [33]:
model.associationRules.show()

+----------+----------+-------------------+
|antecedent|consequent|         confidence|
+----------+----------+-------------------+
|   [22699]|   [22697]|  0.666267942583732|
|   [22699]|   [22698]| 0.5394736842105263|
|   [22386]|  [85099B]| 0.6266968325791855|
|   [20727]|   [20725]| 0.4874184529356943|
|   [20727]|   [22383]|0.43522833178005593|
|   [20727]|   [22384]|0.43522833178005593|
|   [22382]|   [22383]|0.45054945054945056|
|   [22382]|   [20725]| 0.4695304695304695|
|   [20725]|   [20727]|0.39323308270676693|
|   [20725]|   [22383]| 0.3954887218045113|
|   [20725]|   [22382]| 0.3533834586466165|
|   [20725]|   [20728]|0.35037593984962406|
|   [20725]|   [22384]|0.39849624060150374|
|   [22384]|   [20725]| 0.5573080967402734|
|   [22384]|   [20727]|  0.491062039957939|
|   [22910]|   [22086]| 0.6403385049365303|
|   [23209]|   [23203]| 0.4365781710914454|
|  [85099B]|   [23203]| 0.2878880097382836|
|  [85099B]|   [22386]| 0.3371880706025563|
|   [22726]|   [22727]| 0.662546

In [34]:
itemInput = "LUNCH BAG PINK POLKADOT"
checkAvailableItem = itemDf.filter(itemDf['Description'] ==  itemInput).collect()

In [35]:
item = []
if(not checkAvailableItem):
    print("Item Not Found")
else:
    item = checkAvailableItem[0]
    print("Found Item " + str(item[0]))

Found Item 22384


In [36]:
from pyspark.sql import Row
temp = [Row(ListItem=[item[0]])]
temp = spark.createDataFrame(temp)

In [37]:
model.transform(temp).show(truncate = False)

+--------+--------------+
|ListItem|prediction    |
+--------+--------------+
|[22384] |[20725, 20727]|
+--------+--------------+



In [38]:
res = model.transform(temp).collect()[0]

In [39]:
res = res[1]

In [40]:
for prediction in res:
    checkAvailableItem = itemDf.filter(itemDf['StockCode'] ==  prediction).collect()[0]
    print("Prediksi Item : " + checkAvailableItem[1] + "(" + checkAvailableItem[0]+")")

Prediksi Item : LUNCH BAG RED SPOTTY(20725)
Prediksi Item : LUNCH BAG  BLACK SKULL.(20727)


### Prediksi 3

In [41]:
fpGrowth = FPGrowth(itemsCol="ListItem", minSupport=0.015, minConfidence=0.025)
model = fpGrowth.fit(groupedDf)

In [42]:
model.freqItemsets.show()

+---------------+----+
|          items|freq|
+---------------+----+
|       [15056N]| 384|
|       [85123A]|2020|
|        [22423]|1884|
|        [22616]| 384|
|       [85099B]|1643|
|        [21479]| 382|
|        [47566]|1399|
|        [21314]| 382|
|        [84879]|1385|
|        [22750]| 380|
|        [20725]|1330|
|[20725, 85099B]| 434|
|        [22900]| 379|
|        [22720]|1218|
|        [82582]| 378|
|         [POST]|1194|
|        [21908]| 377|
|        [23203]|1097|
|[23203, 85099B]| 473|
|        [22729]| 376|
+---------------+----+
only showing top 20 rows



In [43]:
model.associationRules.show()

+----------+----------+-------------------+
|antecedent|consequent|         confidence|
+----------+----------+-------------------+
|   [21977]|   [21212]| 0.5007407407407407|
|   [22699]|   [22423]| 0.5167464114832536|
|   [22699]|   [22697]|  0.666267942583732|
|   [22699]|   [22698]| 0.5394736842105263|
|   [22386]|  [85099B]| 0.6266968325791855|
|   [20727]|   [20725]| 0.4874184529356943|
|   [20727]|   [22383]|0.43522833178005593|
|   [20727]|   [23209]|0.31127679403541475|
|   [20727]|   [22382]|0.39049394221808015|
|   [20727]|   [20728]| 0.3960857409133271|
|   [20727]|   [22384]|0.43522833178005593|
|   [20727]|   [20726]|0.32805219012115566|
|   [22382]|   [22383]|0.45054945054945056|
|   [22382]|   [20725]| 0.4695304695304695|
|   [22382]|   [20727]|0.41858141858141856|
|   [22382]|   [20728]| 0.3946053946053946|
|   [22382]|   [22384]| 0.3696303696303696|
|   [22382]|   [20726]| 0.4085914085914086|
|   [22382]|   [22662]|0.35564435564435565|
|   [20725]|  [85099B]| 0.326315

In [44]:
itemInput = "JUMBO SHOPPER VINTAGE RED PAISLEY"
checkAvailableItem = itemDf.filter(itemDf['Description'] ==  itemInput).collect()

In [45]:
item = []
if(not checkAvailableItem):
    print("Item Not Found")
else:
    item = checkAvailableItem[0]
    print("Found Item " + str(item[0]))

Found Item 22411


In [46]:
temp1 = [Row(ListItem=[item[0]])]
temp2 = spark.createDataFrame(temp1)

In [47]:
model.transform(temp).show(truncate = False)

+--------+-----------------------------------+
|ListItem|prediction                         |
+--------+-----------------------------------+
|[22384] |[20728, 22383, 20725, 20727, 22382]|
+--------+-----------------------------------+



In [48]:
res = model.transform(temp).collect()[0]

In [49]:
res = res[1]

In [50]:
for prediction in res:
    checkAvailableItem = itemDf.filter(itemDf['StockCode'] ==  prediction).collect()[0]
    print("Prediksi Item : " + checkAvailableItem[1] + "(" + checkAvailableItem[0]+")")

Prediksi Item : LUNCH BAG CARS BLUE(20728)
Prediksi Item : LUNCH BAG SUKI  DESIGN (22383)
Prediksi Item : LUNCH BAG RED SPOTTY(20725)
Prediksi Item : LUNCH BAG  BLACK SKULL.(20727)
Prediksi Item : LUNCH BAG SPACEBOY DESIGN (22382)
