In [1]:
# Spark initialization

In [2]:
# Runs before the first `pyspark` import (next cell).
# NOTE(review): findspark.init() presumably locates the local Spark install
# (e.g. via SPARK_HOME) and adds it to sys.path — confirm against the findspark docs.
import findspark
findspark.init()

In [3]:
# Entry point for the DataFrame / SQL API
from pyspark.sql import SparkSession

# Build the session for this notebook; getOrCreate() hands back an already
# running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)

In [4]:
# Sanity check: print the SparkSession's repr (class name and object address)
# to confirm the session object was created.
print(spark)

<pyspark.sql.session.SparkSession object at 0x114b74908>


In [5]:
# Data Import
import os

# Location of the Online Retail CSV. Set the ONLINE_RETAIL_CSV environment
# variable to run the notebook on another machine; when it is unset the
# original local path is used, so existing behavior is unchanged.
RETAIL_CSV_PATH = os.environ.get(
    "ONLINE_RETAIL_CSV",
    "/Users/gunstringer/Downloads/OnlineRetail.csv",
)

# header=True     -> the first row supplies the column names
# inferSchema=True -> Spark derives column types from the data instead of
#                     reading every column as a string
df = spark.read.csv(RETAIL_CSV_PATH, header=True, inferSchema=True)

In [6]:
# Show the column names and the types Spark inferred for the CSV.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [7]:
# Preview the first rows of the raw retail data (long strings are truncated in the display).
df.show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|01/12/10 08.26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|01/12/10 08.26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|01/12/10 08.26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|01/12/10 08.26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|01/12/10 08.26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|01/12/10 08.26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|01/12/10 08.26|     4.

In [36]:
from pyspark.sql.functions import collect_set

# Build one "basket" per invoice: group the line items by InvoiceNo and gather
# the distinct StockCodes of each invoice into an array column.
# The aggregate is left un-aliased on purpose: later cells refer to the
# auto-generated column name "collect_set(StockCode)".
df_preprocess = (
    df
    .groupBy("InvoiceNo")
    .agg(collect_set("StockCode"))
)
df_preprocess.show()

+---------+----------------------+
|InvoiceNo|collect_set(StockCode)|
+---------+----------------------+
|   536596|  [22900, 22114, 84...|
|   536938|  [22112, 21931, 84...|
|   537252|               [22197]|
|   537691|  [22505, 46000R, 2...|
|   538041|               [22145]|
|   538184|  [22561, 22147, 21...|
|   538517|  [22749, 21212, 22...|
|   538879|  [21212, 22759, 22...|
|   539275|  [22083, 22150, 22...|
|   539630|  [22111, 22971, 22...|
|   540499|  [22697, 22796, 21...|
|   540540|  [22111, 22834, 22...|
|   540976|  [22413, 21212, 22...|
|   541432|  [22113, 22457, 21...|
|   541518|  [21212, 22432, 22...|
|   541783|  [22561, 22697, 22...|
|   542026|  [22398, 22194, 22...|
|   542375|  [22629, 21731, 22...|
|   543641|  [22645, 75131, 22...|
|   544303|  [84596L, 22931, 8...|
+---------+----------------------+
only showing top 20 rows



In [39]:
from pyspark.ml.fpm import FPGrowth

# First attempt at frequent-pattern mining over the per-invoice baskets.
fpGrowth = FPGrowth(
    itemsCol="collect_set(StockCode)",  # array column produced by the aggregation cell
    minSupport=0.5,                     # minimum support threshold for an itemset
    minConfidence=0.6,                  # minimum confidence threshold for a rule
)
model = fpGrowth.fit(df_preprocess)

In [40]:
# Frequent itemsets found by the fitted model (columns: items, freq).
# The recorded output below is empty for this fit.
model.freqItemsets.show()

+-----+----+
|items|freq|
+-----+----+
+-----+----+



In [41]:
# Association rules of the fitted model (antecedent, consequent, confidence, lift).
# The recorded output below contains no rules for this fit.
model.associationRules.show()

+----------+----------+----------+----+
|antecedent|consequent|confidence|lift|
+----------+----------+----------+----+
+----------+----------+----------+----+



In [42]:
# Score every invoice with the fitted model; transform() appends a `prediction`
# column, which is empty ([]) for the rows in the recorded output.
model.transform(df_preprocess).show()

+---------+----------------------+----------+
|InvoiceNo|collect_set(StockCode)|prediction|
+---------+----------------------+----------+
|   536596|  [22900, 22114, 84...|        []|
|   536938|  [22112, 21931, 84...|        []|
|   537252|               [22197]|        []|
|   537691|  [22505, 46000R, 2...|        []|
|   538041|               [22145]|        []|
|   538184|  [22561, 22147, 21...|        []|
|   538517|  [22749, 21212, 22...|        []|
|   538879|  [21212, 22759, 22...|        []|
|   539275|  [22083, 22150, 22...|        []|
|   539630|  [22111, 22971, 22...|        []|
|   540499|  [22697, 22796, 21...|        []|
|   540540|  [22111, 22834, 22...|        []|
|   540976|  [22413, 21212, 22...|        []|
|   541432|  [22113, 22457, 21...|        []|
|   541518|  [21212, 22432, 22...|        []|
|   541783|  [22561, 22697, 22...|        []|
|   542026|  [22398, 22194, 22...|        []|
|   542375|  [22629, 21731, 22...|        []|
|   543641|  [22645, 75131, 22...|

In [47]:
from pyspark.ml.fpm import FPGrowth

# Refit with a much lower support threshold (0.01 instead of 0.5);
# the confidence threshold is unchanged.
fpGrowth = FPGrowth(
    itemsCol="collect_set(StockCode)",  # array column of StockCodes per invoice
    minSupport=0.01,
    minConfidence=0.6,
)
model = fpGrowth.fit(df_preprocess)

In [48]:
# Frequent itemsets of the refitted model; `freq` is the count reported for
# each itemset. The recorded output is now non-empty.
model.freqItemsets.show()

+----------------+----+
|           items|freq|
+----------------+----+
|         [22633]| 487|
|         [23236]| 344|
|        [85123A]|2246|
|         [22423]|2172|
| [22423, 85123A]| 355|
|         [22667]| 486|
|         [22579]| 343|
|  [22579, 22578]| 282|
|        [85099B]|2135|
| [85099B, 22423]| 288|
|[85099B, 85123A]| 404|
|         [22620]| 486|
|        [84536A]| 342|
|         [71053]| 342|
|         [47566]|1706|
| [47566, 85099B]| 332|
|  [47566, 22423]| 398|
| [47566, 85123A]| 391|
|         [85150]| 483|
|         [20725]|1608|
+----------------+----+
only showing top 20 rows



In [None]:
# Score the invoices with the refitted model.
# NOTE(review): this cell has no execution count or recorded output — it has not been run yet.
model.transform(df_preprocess).show()