In [1]:
# Locate the Spark installation, then import PySpark and start a session.
# findspark.init() must run before any pyspark import so the package is on sys.path.
# SPARK_HOME (when set) takes precedence so the notebook is not tied to a single
# machine's directory layout; the original install path remains the fallback.
import os

import findspark
findspark.init(os.environ.get("SPARK_HOME", "/usr/local/spark"))
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# getOrCreate() reuses an already-running session instead of starting a second one.
spark = SparkSession.builder.appName("Python Spark SQL basic example").getOrCreate()

In [2]:
# Read the retail CSV into a DataFrame: the first row supplies the column
# names and Spark infers each column's type from the data.
df2 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("OnlineRetail.csv")
)

In [3]:
# Expose the loaded DataFrame to Spark SQL under the view name "retail".
df2.createOrReplaceTempView(name="retail")


In [4]:
# Print the column names and inferred types of the loaded retail DataFrame.
df2.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [5]:
# Select every row and column of the "retail" view through Spark SQL.
result = spark.sql(sqlQuery="SELECT * FROM retail")

In [24]:
# Distinct (StockCode, Description) pairs that have a description, sorted by
# stock code. The adjacent literals are concatenated into a single SQL string.
stock = spark.sql(
    "SELECT DISTINCT StockCode, Description from retail "
    "WHERE Description is not null order by StockCode"
)

In [26]:
# Display the first 100 stock code / description pairs.
stock.show(n=100)

+---------+--------------------+
|StockCode|         Description|
+---------+--------------------+
|    10002|INFLATABLE POLITI...|
|    10080|               check|
|    10080|GROOVY CACTUS INF...|
|    10120|        DOGGY RUBBER|
|   10123C|HEARTS WRAPPING T...|
|   10124A|SPOTS ON RED BOOK...|
|   10124G|ARMY CAMO BOOKCOV...|
|    10125|MINI FUNKY DESIGN...|
|    10133|             damaged|
|    10133|COLOURING PENCILS...|
|    10135|COLOURING PENCILS...|
|    11001|ASSTD DESIGN RACI...|
|    15030|    FAN BLACK FRAME |
|    15034|PAPER POCKET TRAV...|
|    15036|ASSORTED COLOURS ...|
|    15039|      SANDALWOOD FAN|
|   15044A| PINK PAPER PARASOL |
|   15044B| BLUE PAPER PARASOL |
|   15044C|PURPLE PAPER PARASOL|
|   15044D|   RED PAPER PARASOL|
|  15056BL|EDWARDIAN PARASOL...|
|   15056N|EDWARDIAN PARASOL...|
|   15056P|EDWARDIAN PARASOL...|
|  15056bl|EDWARDIAN PARASOL...|
|   15056n|EDWARDIAN PARASOL...|
|   15056p|EDWARDIAN PARASOL...|
|   15058A|           wet/rusty|
|   15058A

In [47]:
# Number of distinct (StockCode, Description) pairs with a non-null description.
stock.count()

4792

In [6]:
# Preview the raw retail rows: first 20 rows, long cell values truncated
# (these are show()'s defaults, spelled out here).
result.show(n=20, truncate=True)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/2010 8:26|     4.

In [73]:
# Build one basket per invoice: InvoiceNo is the transaction id and "Desc" is
# the set of distinct product descriptions appearing on that invoice.
from pyspark.sql import functions as F

# NOTE(review): the saved output contains an empty basket (invoice 538041),
# presumably from rows whose Description is null, and an invoice number with a
# "C" prefix (C540850) -- confirm whether such invoices should be filtered out
# before mining, since they still count as transactions.
df = (
    df2
    .groupBy("InvoiceNo")
    .agg(F.collect_set("Description").alias("Desc"))
)
df.show()

+---------+--------------------+
|InvoiceNo|                Desc|
+---------+--------------------+
|   536596|[WAKE UP COCKEREL...|
|   536938|[RED 3 PIECE RETR...|
|   537252|[SMALL POPCORN HO...|
|   537691|[3 HOOK PHOTO SHE...|
|   538041|                  []|
|   538184|[MINI JIGSAW SPAC...|
|   538517|[LARGE POPCORN HO...|
|   538879|[PARTY CONE CHRIS...|
|   539275|[RED  HARMONICA I...|
|   539630|[CHICK GREY HOT W...|
|   540499|[IVORY KITCHEN SC...|
|   540540|[HOME SWEET HOME ...|
|   540976|[60 CAKE CASES DO...|
|   541432|[RETROSPOT HEART ...|
|   541518|[60 CAKE CASES DO...|
|   541783|[PHOTO FRAME 3 CL...|
|   542026|[SMALL POPCORN HO...|
|   542375|[CHILDRENS APRON ...|
|  C540850|[SWEETHEART CERAM...|
|   543641|[DOORSTOP FOOTBAL...|
+---------+--------------------+
only showing top 20 rows



In [41]:
# Print the schema of the per-invoice basket DataFrame (InvoiceNo plus the "Desc" array).
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- Desc: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [42]:
# Number of baskets, i.e. one row per distinct InvoiceNo after the groupBy.
df.count()

25900

In [68]:
# First model: minSupport 0.1, minConfidence 0.3.
from pyspark.ml.fpm import FPGrowth

# Configure FP-Growth over the per-invoice "Desc" baskets, fit it, then list
# up to 100 frequent itemsets and report how many were found.
fpGrowth = (
    FPGrowth()
    .setItemsCol("Desc")
    .setMinSupport(0.1)
    .setMinConfidence(0.3)
)
model = fpGrowth.fit(df)
model.freqItemsets.show(n=100)
model.freqItemsets.count()

+-----+----+
|items|freq|
+-----+----+
+-----+----+



0

In [62]:
# Second model: minSupport lowered to 0.05, minConfidence kept at 0.3.
# Fit on the same baskets, then list up to 100 frequent itemsets and count them.
# NOTE(review): the saved output for this cell lists stock codes rather than
# descriptions, so it may come from an earlier run against a different `df` --
# re-run on a fresh kernel to confirm.
fpGrowth2 = (
    FPGrowth()
    .setItemsCol("Desc")
    .setMinSupport(0.05)
    .setMinConfidence(0.3)
)
model2 = fpGrowth2.fit(df)
model2.freqItemsets.show(n=100)
model2.freqItemsets.count()

+--------+----+
|   items|freq|
+--------+----+
|[85123A]|2246|
| [22423]|2172|
|[85099B]|2135|
| [47566]|1706|
| [20725]|1608|
| [84879]|1468|
| [22720]|1462|
| [22197]|1442|
| [21212]|1334|
| [22383]|1306|
| [20727]|1295|
+--------+----+



11

In [69]:
# Third model: minSupport lowered to 0.02 and minConfidence lowered to 0.2.
# Fit on the same baskets, then list up to 100 frequent itemsets and count them.
fpGrowth3 = (
    FPGrowth()
    .setItemsCol("Desc")
    .setMinSupport(0.02)
    .setMinConfidence(0.2)
)
model3 = fpGrowth3.fit(df)
model3.freqItemsets.show(n=100)
model3.freqItemsets.count()

+--------------------+----+
|               items|freq|
+--------------------+----+
|[WHITE HANGING HE...|2302|
|[REGENCY CAKESTAN...|2169|
|[JUMBO BAG RED RE...|2135|
|     [PARTY BUNTING]|1706|
|[LUNCH BAG RED RE...|1607|
|[LUNCH BAG RED RE...| 588|
|[ASSORTED COLOUR ...|1467|
|[SET OF 3 CAKE TI...|1458|
|[PACK OF 72 RETRO...|1334|
|[LUNCH BAG  BLACK...|1295|
|[LUNCH BAG  BLACK...| 648|
|[NATURAL SLATE HE...|1266|
|           [POSTAGE]|1250|
|[JUMBO BAG PINK P...|1231|
|[JUMBO BAG PINK P...| 833|
|[JAM MAKING SET W...|1220|
|[HEART OF WICKER ...|1212|
|[JUMBO STORAGE BA...|1201|
|[JUMBO STORAGE BA...| 518|
|[JUMBO STORAGE BA...| 733|
|[JUMBO SHOPPER VI...|1187|
|[JUMBO SHOPPER VI...| 683|
|[JUMBO SHOPPER VI...| 525|
|[JAM MAKING SET P...|1174|
|[LUNCH BAG CARS B...|1173|
|[LUNCH BAG CARS B...| 526|
|[LUNCH BAG CARS B...| 561|
|[PAPER CHAIN KIT ...|1170|
|[LUNCH BAG SPACEB...|1170|
|[LUNCH BAG SPACEB...| 563|
|[RECIPE BOX PANTR...|1164|
|    [SPOTTY BUNTING]|1153|
|[ROSES REGENCY TE..

215

In [70]:
# Show the association rules produced by the third model: first 20 rows,
# long cell values truncated (show()'s defaults, spelled out here).
model3.associationRules.show(n=20, truncate=True)


+--------------------+--------------------+-------------------+------------------+
|          antecedent|          consequent|         confidence|              lift|
+--------------------+--------------------+-------------------+------------------+
|[PAPER CHAIN KIT ...|[PAPER CHAIN KIT ...| 0.6670673076923077|14.766703648915188|
|[JUMBO SHOPPER VI...|[JUMBO BAG RED RE...| 0.5754001684919966| 6.980264339083237|
|[JUMBO SHOPPER VI...|[JUMBO STORAGE BA...|0.44229149115417016| 9.538176203907582|
|[PINK REGENCY TEA...|[GREEN REGENCY TE...| 0.8941368078175895|21.909312509437626|
|[JUMBO STORAGE BA...|[JUMBO BAG PINK P...|0.43130724396336384| 9.074620323843318|
|[JUMBO STORAGE BA...|[JUMBO BAG RED RE...| 0.6103247293921732| 7.403939340167346|
|[JUMBO STORAGE BA...|[JUMBO SHOPPER VI...|0.43713572023313907| 9.538176203907584|
|[JUMBO BAG WOODLA...|[JUMBO BAG RED RE...| 0.6169724770642202| 7.484584147992179|
|  [JUMBO BAG APPLES]|[JUMBO BAG RED RE...| 0.5555555555555556| 6.739526411657559|
|[LU

In [71]:
# Apply the third model to every basket and display the first 100 rows,
# including the "prediction" column that transform() appends.
model3.transform(df).show(n=100)

+---------+--------------------+--------------------+
|InvoiceNo|                Desc|          prediction|
+---------+--------------------+--------------------+
|   536596|[WAKE UP COCKEREL...|                  []|
|   536938|[RED 3 PIECE RETR...|[JUMBO BAG RED RE...|
|   537252|[SMALL POPCORN HO...|                  []|
|   537691|[3 HOOK PHOTO SHE...|                  []|
|   538041|                  []|                  []|
|   538184|[MINI JIGSAW SPAC...|                  []|
|   538517|[LARGE POPCORN HO...|                  []|
|   538879|[PARTY CONE CHRIS...|                  []|
|   539275|[RED  HARMONICA I...|[ROSES REGENCY TE...|
|   539630|[CHICK GREY HOT W...|[ROSES REGENCY TE...|
|   540499|[IVORY KITCHEN SC...|[PINK REGENCY TEA...|
|   540540|[HOME SWEET HOME ...|                  []|
|   540976|[60 CAKE CASES DO...|[CHARLOTTE BAG PI...|
|   541432|[RETROSPOT HEART ...|                  []|
|   541518|[60 CAKE CASES DO...|[JUMBO STORAGE BA...|
|   541783|[PHOTO FRAME 3 CL

In [66]:
# Print the schema of the transformed baskets (input columns plus "prediction").
model3.transform(df).printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- Desc: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- prediction: array (nullable = true)
 |    |-- element: string (containsNull = true)



## Conclusion
- minSupport = 0.1 and minConfidence = 0.3 yield **0 frequent itemsets**
- minSupport = 0.05 and minConfidence = 0.3 yield **11 frequent itemsets**
- minSupport = 0.02 and minConfidence = 0.2 yield **215 frequent itemsets**

