## Spark Initialization

In [1]:
# Call findspark
import findspark
findspark.init()

In [2]:
# Import required library
from pyspark.sql import SparkSession

# Create Spark Session
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .getOrCreate()

In [3]:
# Print Spark object ID
print(spark)

<pyspark.sql.session.SparkSession object at 0x0000000006E4F198>


## Loading Data using Spark

In [4]:
# Datasets can be downloaded from https://www.kaggle.com/carrie1/ecommerce-data/home
df = spark.read.csv("D:/TC/6BigData/Dataset/ecommerce-data/data.csv", header=True, inferSchema=True)

In [5]:
df.show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/2010 8:26|     4.

In [6]:
df.count()

541909

In [7]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [8]:
df_items = df.selectExpr(['InvoiceNo as id','StockCode as items', 'Description'])

In [9]:
df = df.selectExpr(['InvoiceNo as id','StockCode as items'])

In [10]:
df.show()

+------+------+
|    id| items|
+------+------+
|536365|85123A|
|536365| 71053|
|536365|84406B|
|536365|84029G|
|536365|84029E|
|536365| 22752|
|536365| 21730|
|536366| 22633|
|536366| 22632|
|536367| 84879|
|536367| 22745|
|536367| 22748|
|536367| 22749|
|536367| 22310|
|536367| 84969|
|536367| 22623|
|536367| 22622|
|536367| 21754|
|536367| 21755|
|536367| 21777|
+------+------+
only showing top 20 rows



In [11]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- items: string (nullable = true)



In [12]:
df_items.show()

+------+------+--------------------+
|    id| items|         Description|
+------+------+--------------------+
|536365|85123A|WHITE HANGING HEA...|
|536365| 71053| WHITE METAL LANTERN|
|536365|84406B|CREAM CUPID HEART...|
|536365|84029G|KNITTED UNION FLA...|
|536365|84029E|RED WOOLLY HOTTIE...|
|536365| 22752|SET 7 BABUSHKA NE...|
|536365| 21730|GLASS STAR FROSTE...|
|536366| 22633|HAND WARMER UNION...|
|536366| 22632|HAND WARMER RED P...|
|536367| 84879|ASSORTED COLOUR B...|
|536367| 22745|POPPY'S PLAYHOUSE...|
|536367| 22748|POPPY'S PLAYHOUSE...|
|536367| 22749|FELTCRAFT PRINCES...|
|536367| 22310|IVORY KNITTED MUG...|
|536367| 84969|BOX OF 6 ASSORTED...|
|536367| 22623|BOX OF VINTAGE JI...|
|536367| 22622|BOX OF VINTAGE AL...|
|536367| 21754|HOME BUILDING BLO...|
|536367| 21755|LOVE BUILDING BLO...|
|536367| 21777|RECIPE BOX WITH M...|
+------+------+--------------------+
only showing top 20 rows



In [13]:
df_items.printSchema()

root
 |-- id: string (nullable = true)
 |-- items: string (nullable = true)
 |-- Description: string (nullable = true)



## Grouping Data

In [14]:
# Group the Data according to the Id
from pyspark.sql.functions import collect_list
df_group = df.groupby("id").agg(collect_list('items').alias('items'))

In [15]:
df_group.show()

+-------+--------------------+
|     id|               items|
+-------+--------------------+
| 536596|[21624, 22900, 22...|
| 536938|[22386, 85099C, 2...|
| 537252|             [22197]|
| 537691|[22791, 22171, 82...|
| 538041|             [22145]|
| 538184|[22585, 21481, 22...|
| 538517|[22491, 21232, 21...|
| 538879|[84819, 22150, 21...|
| 539275|[22909, 22423, 22...|
| 539630|[21484, 85099B, 2...|
| 540499|[21868, 22697, 22...|
| 540540|[21877, 21868, 21...|
| 540976|[22394, 21890, 22...|
| 541432|[21485, 22457, 84...|
| 541518|[21880, 21881, 21...|
| 541783|[22423, 22854, 22...|
| 542026|[21754, 82600, 22...|
| 542375|[21731, 22367, 22...|
|C540850|             [21231]|
| 543641|[85123A, 21833, 2...|
+-------+--------------------+
only showing top 20 rows



## Removing Duplicates Data

In [16]:
# Remove duplicates data
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

distinct = udf(lambda row: list(set(row)), ArrayType(StringType()))
df_group = df_group.withColumn("distinct_items", distinct("items"))

In [17]:
df_group.show()

+-------+--------------------+--------------------+
|     id|               items|      distinct_items|
+-------+--------------------+--------------------+
| 536596|[21624, 22900, 22...|[21624, 22114, 21...|
| 536938|[22386, 85099C, 2...|[84997A, 21479, 8...|
| 537252|             [22197]|             [22197]|
| 537691|[22791, 22171, 82...|[22505, 22666, 84...|
| 538041|             [22145]|             [22145]|
| 538184|[22585, 21481, 22...|[22560, 22561, 21...|
| 538517|[22491, 21232, 21...|[22562, 22563, 22...|
| 538879|[84819, 22150, 21...|[22593, 22453, 21...|
| 539275|[22909, 22423, 22...|[21914, 21915, 22...|
| 539630|[21484, 85099B, 2...|[21484, 21485, 22...|
| 540499|[21868, 22697, 22...|[22625, 22624, 22...|
| 540540|[21877, 21868, 21...|[22443, 22442, 22...|
| 540976|[22394, 21890, 22...|[20914, 22115, 22...|
| 541432|[21485, 22457, 84...|[21485, 22113, 22...|
| 541518|[21880, 21881, 21...|[22504, 22378, 84...|
| 541783|[22423, 22854, 22...|[22777, 21669, 22...|
| 542026|[21

In [18]:
df_group.printSchema()

root
 |-- id: string (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- distinct_items: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [19]:
df_group = df_group.selectExpr(['id','distinct_items as items'])

In [20]:
df_group.show()

+-------+--------------------+
|     id|               items|
+-------+--------------------+
| 536596|[21624, 22114, 21...|
| 536938|[84997A, 21479, 8...|
| 537252|             [22197]|
| 537691|[22505, 22666, 84...|
| 538041|             [22145]|
| 538184|[22560, 22561, 21...|
| 538517|[22562, 22563, 22...|
| 538879|[22593, 22453, 21...|
| 539275|[21914, 21915, 22...|
| 539630|[21484, 21485, 22...|
| 540499|[22625, 22624, 22...|
| 540540|[22443, 22442, 22...|
| 540976|[20914, 22115, 22...|
| 541432|[21485, 22113, 22...|
| 541518|[22504, 22378, 84...|
| 541783|[22777, 21669, 22...|
| 542026|[21754, 22192, 22...|
| 542375|[22634, 22629, 21...|
|C540850|             [21231]|
| 543641|[75131, 21644, 44...|
+-------+--------------------+
only showing top 20 rows



In [21]:
df_group.printSchema()

root
 |-- id: string (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: string (containsNull = true)



## FP-Growth Algorithm

In [22]:
from pyspark.ml.fpm import FPGrowth

In [23]:
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpGrowth.fit(df_group)

In [24]:
# Display frequent itemsets
model.freqItemsets.show()

+-----+----+
|items|freq|
+-----+----+
+-----+----+



In [25]:
fpGrowth2 = FPGrowth(itemsCol="items", minSupport=0.05, minConfidence=0.06)
model2 = fpGrowth2.fit(df_group)

In [26]:
# Display frequent itemsets
model2.freqItemsets.show()

+--------+----+
|   items|freq|
+--------+----+
|[85123A]|2246|
| [22423]|2172|
|[85099B]|2135|
| [47566]|1706|
| [20725]|1608|
| [84879]|1468|
| [22720]|1462|
| [22197]|1442|
| [21212]|1334|
| [22383]|1306|
| [20727]|1295|
+--------+----+



In [27]:
# Display generated association rules.
model2.associationRules.show()

+----------+----------+----------+
|antecedent|consequent|confidence|
+----------+----------+----------+
+----------+----------+----------+



In [28]:
fpGrowth3 = FPGrowth(itemsCol="items", minSupport=0.02, minConfidence=0.03)
model3 = fpGrowth3.fit(df_group)

In [29]:
# Display frequent itemsets
model3.freqItemsets.show()

+---------------+----+
|          items|freq|
+---------------+----+
|       [85123A]|2246|
|        [22423]|2172|
|       [85099B]|2135|
|        [47566]|1706|
|        [20725]|1608|
|[20725, 85099B]| 588|
|        [84879]|1468|
|        [22720]|1462|
|        [22197]|1442|
|        [21212]|1334|
|        [22383]|1306|
| [22383, 20725]| 663|
|        [20727]|1295|
| [20727, 20725]| 648|
| [20727, 22383]| 587|
|        [22457]|1266|
|         [POST]|1254|
|        [23203]|1249|
|[23203, 85099B]| 582|
|        [22386]|1231|
+---------------+----+
only showing top 20 rows



In [30]:
# Display generated association rules.
model3.associationRules.show()

+----------+----------+-------------------+
|antecedent|consequent|         confidence|
+----------+----------+-------------------+
|   [22699]|   [22423]|0.47946428571428573|
|   [22699]|   [22697]|                0.7|
|   [22699]|   [22698]| 0.5482142857142858|
|   [22386]|  [85099B]| 0.6766856214459789|
|   [22386]|   [21931]| 0.4207961007311129|
|   [20727]|   [20725]| 0.5003861003861004|
|   [20727]|   [22383]| 0.4532818532818533|
|   [20727]|   [20728]| 0.4061776061776062|
|   [20727]|   [22384]| 0.4223938223938224|
|   [22382]|   [20725]| 0.4811965811965812|
|   [22382]|   [22383]|0.45897435897435895|
|   [20725]|  [85099B]| 0.3656716417910448|
|   [20725]|   [22383]| 0.4123134328358209|
|   [20725]|   [20727]|0.40298507462686567|
|   [20725]|   [20728]|0.34950248756218905|
|   [20725]|   [22382]|0.35012437810945274|
|   [20725]|   [22384]| 0.3812189054726368|
|   [20725]|   [20726]| 0.3308457711442786|
|   [22384]|   [20725]| 0.5522522522522523|
|   [22384]|   [20727]| 0.492792

In [31]:
# transform examines the input items against all the association rules and summarize the
# consequents as prediction
model3.transform(df_group).show()

+-------+--------------------+--------------------+
|     id|               items|          prediction|
+-------+--------------------+--------------------+
| 536596|[21624, 22114, 21...|                  []|
| 536938|[84997A, 21479, 8...|     [85099B, 22411]|
| 537252|             [22197]|                  []|
| 537691|[22505, 22666, 84...|                  []|
| 538041|             [22145]|                  []|
| 538184|[22560, 22561, 21...|                  []|
| 538517|[22562, 22563, 22...|                  []|
| 538879|[22593, 22453, 21...|                  []|
| 539275|[21914, 21915, 22...|      [22699, 22697]|
| 539630|[21484, 21485, 22...|[20725, 23203, 22...|
| 540499|[22625, 22624, 22...|      [22698, 20724]|
| 540540|[22443, 22442, 22...|                  []|
| 540976|[20914, 22115, 22...|             [22356]|
| 541432|[21485, 22113, 22...|                  []|
| 541518|[22504, 22378, 84...|[22356, 20725, 23...|
| 541783|[22777, 21669, 22...|             [22698]|
| 542026|[21

In [39]:
df2 = spark.createDataFrame([
    ('0', ['22382']),
    ('1', ['22692', '22699']),
    ('2', ['20725','20727','20729'])
], ['id', 'items'])

In [40]:
model3.transform(df2).show(truncate = False)

+---+---------------------+-------------------------------------------+
|id |items                |prediction                                 |
+---+---------------------+-------------------------------------------+
|0  |[22382]              |[20725, 22383]                             |
|1  |[22692, 22699]       |[22423, 22697, 22698]                      |
|2  |[20725, 20727, 20729]|[22383, 20728, 22384, 85099B, 22382, 20726]|
+---+---------------------+-------------------------------------------+



In [None]:
Check the Description of the item (with id = 1)

In [41]:
# Register the DataFrame as a SQL temporary view
df_items.createOrReplaceTempView("data")

In [48]:
# Find the Description of the item with StockCode of 22692 or 22699 
query = spark.sql("SELECT DISTINCT Description \
                FROM data \
                WHERE (items = '22692' OR items = '22699')")
query.show(truncate = False)

+--------------------------------+
|Description                     |
+--------------------------------+
|DOORMAT WELCOME TO OUR HOME     |
|ROSES REGENCY TEACUP AND SAUCER |
+--------------------------------+



In [49]:
# Find the Description of the item with StockCode of 22423 or 22697 or 22698
query = spark.sql("SELECT DISTINCT Description \
                FROM data \
                WHERE (items = '22423' OR items = '22697' OR items = '22698')")
query.show(truncate = False)

+-------------------------------+
|Description                    |
+-------------------------------+
|REGENCY CAKESTAND 3 TIER       |
|null                           |
|damages                        |
|GREEN REGENCY TEACUP AND SAUCER|
|PINK REGENCY TEACUP AND SAUCER |
|faulty                         |
+-------------------------------+



In [None]:
From the result above, we can take a conclusion that if people buy Doormat Welcome To Our Home and Roses Regency Teacup and Saucer, then they are most likely also to buy Regency Cakestand 3 Tier and/or Green Regency Teacup and Saucer and/or Pink Regency Teacup and Saucer.

## Reference

1. Remove duplicates from PySpark array column (https://stackoverflow.com/questions/54185710/remove-duplicates-from-pyspark-array-column)