# Market Basket Analysis
In this notebook we practice Market Basket Analysis through a sample dataset.

![Affinity Analysis](https://upload.wikimedia.org/wikipedia/commons/4/4a/AffinityAnalysis.png)

Image Source: www.wikimedia.org

In [None]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session under the application name 'myproj'.
spark = (
    SparkSession.builder
    .appName('myproj')
    .getOrCreate()
)

## Download the dataset from Moodle and upload it here:

In [None]:
# Read the uploaded sales CSV; the first row holds the column names and
# column types are inferred from the file contents.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/FileStore/tables/sales.csv')
)

# Inspect the inferred schema and the first few records.
data.printSchema()
data.show(5)

root
 |-- SalesDate: timestamp (nullable = true)
 |-- SalesValue: double (nullable = true)
 |-- SalesAmount: integer (nullable = true)
 |-- Customer: integer (nullable = true)
 |-- SalesTransactionID: integer (nullable = true)
 |-- SalesItem: integer (nullable = true)

+-------------------+----------+-----------+--------+------------------+---------+
|          SalesDate|SalesValue|SalesAmount|Customer|SalesTransactionID|SalesItem|
+-------------------+----------+-----------+--------+------------------+---------+
|2018-09-28 00:00:00|    8280.0|         10|       0|                 0|        0|
|2018-09-28 00:00:00|    7452.0|         10|       0|                 0|        0|
|2019-04-23 00:00:00|   21114.0|         30|       0|                 1|        0|
|2019-04-23 00:00:00|    7038.0|         10|       0|                 1|        1|
|2019-04-23 00:00:00|    7000.0|          2|       0|                 1|        2|
+-------------------+----------+-----------+--------+-------------

In [None]:
from pyspark.sql import functions as F
from pyspark.ml.fpm import FPGrowth
import pandas

## If the same item appears more than once in a transaction, we keep that item code only once

In [None]:
# Keep a single row per (transaction, item) pair so that every item code
# appears at most once in a basket.
data_unique = (
    data
    .dropDuplicates(['SalesTransactionID', 'SalesItem'])
    .orderBy('SalesTransactionID')
)
data_unique.show(5)

# Gather the item codes of each transaction into one list column; the
# resulting column is named "collect_list(SalesItem)".
basketdata = (
    data_unique
    .groupBy("SalesTransactionID")
    .agg(F.collect_list("SalesItem"))
    .orderBy('SalesTransactionID')
)
basketdata.show(5)

+-------------------+----------+-----------+--------+------------------+---------+
|          SalesDate|SalesValue|SalesAmount|Customer|SalesTransactionID|SalesItem|
+-------------------+----------+-----------+--------+------------------+---------+
|2018-09-28 00:00:00|    8280.0|         10|       0|                 0|        0|
|2019-04-23 00:00:00|   21114.0|         30|       0|                 1|        0|
|2019-04-23 00:00:00|    7038.0|         10|       0|                 1|        1|
|2019-04-23 00:00:00|    7000.0|          2|       0|                 1|        2|
|2019-02-08 00:00:00|     745.2|          1|       0|                 2|        1|
+-------------------+----------+-----------+--------+------------------+---------+
only showing top 5 rows

+------------------+-----------------------+
|SalesTransactionID|collect_list(SalesItem)|
+------------------+-----------------------+
|                 0|                    [0]|
|                 1|              [0, 2, 1]|
|  

In [None]:
# FP-Growth (Frequent Pattern Growth) mines the itemsets whose support is at
# least `minSupport`; `minConfidence` is the threshold used when association
# rules are generated from those itemsets.
fpGrowth = FPGrowth(
    itemsCol="collect_list(SalesItem)",  # basket column built by the groupBy/agg step
    minSupport=0.006,
    minConfidence=0.006,
)
model = fpGrowth.fit(basketdata)

# Preview the frequent itemsets and how often each one occurs.
model.freqItemsets.show()

+----------+----+
|     items|freq|
+----------+----+
|     [315]| 553|
|     [274]| 407|
|     [137]| 730|
|    [1491]| 432|
|     [295]| 471|
|     [565]| 368|
|     [363]| 512|
|     [159]| 671|
|[159, 161]| 319|
| [159, 20]| 296|
|     [131]| 599|
|[131, 132]| 364|
|      [19]| 382|
|     [135]| 690|
|     [302]| 622|
|      [22]| 449|
|     [205]| 484|
|     [152]| 761|
|     [146]| 524|
|     [128]| 929|
+----------+----+
only showing top 20 rows



In [None]:
# Keep a handle on the frequent-itemsets DataFrame (columns: items, freq).
items = model.freqItemsets
# Preview the generated association rules: antecedent -> consequent together
# with their confidence, lift and support.
model.associationRules.show()

+----------+----------+-------------------+------------------+--------------------+
|antecedent|consequent|         confidence|              lift|             support|
+----------+----------+-------------------+------------------+--------------------+
|     [132]|     [131]|  0.603648424543947| 48.80180067940764|0.007516623301532235|
| [63, 104]|      [20]| 0.7111650485436893|12.139188805349558|0.006050468756453145|
|     [285]|     [102]| 0.5182291666666666|28.713690646453088|0.008218725478048982|
|     [285]|     [514]| 0.3997395833333333|13.777785809608542|0.006339569652665924|
|     [285]|      [83]| 0.4700520833333333| 11.52543908227848| 0.00745467310948664|
|     [285]|       [8]|         0.41015625|16.860973312818334|0.006504770164787511|
|     [285]|     [120]| 0.3802083333333333|10.390501551918735|0.006029818692437...|
|     [285]|      [20]| 0.4322916666666667| 7.378976471624956|0.006855821253045884|
| [67, 103]|     [312]|0.41866330390920553|19.072614445067906|0.006855821253

In [None]:
# Keep a handle on the association-rules DataFrame for the export cells below.
rules = model.associationRules

# transform() checks each basket against all association rules and summarises
# the consequents of the matching rules in a `prediction` column.
# Build the transformed DataFrame once and reuse it for the preview.
transformed = model.transform(basketdata)
transformed.show()

+------------------+-----------------------+--------------------+
|SalesTransactionID|collect_list(SalesItem)|          prediction|
+------------------+-----------------------+--------------------+
|                 0|                    [0]|                  []|
|                 1|              [0, 2, 1]|                  []|
|                 2|                    [1]|                  []|
|                 3|                    [0]|                  []|
|                 4|                    [0]|                  []|
|                 5|                    [0]|                  []|
|                 6|                    [2]|                  []|
|                 7|                    [2]|                  []|
|                 8|                    [0]|                  []|
|                10|                 [1, 0]|                  []|
|                11|                    [0]|                  []|
|                12|                 [4, 3]|                  []|
|         

In [None]:
# Render the per-transaction predictions as a table.
# NOTE(review): `display` is not defined in this notebook; presumably it is the
# notebook environment's built-in table renderer — confirm.
display(transformed)

SalesTransactionID,collect_list(SalesItem),prediction
0,List(0),List()
1,"List(0, 2, 1)",List()
2,List(1),List()
3,List(0),List()
4,List(0),List()
5,List(0),List()
6,List(2),List()
7,List(2),List()
8,List(0),List()
10,"List(1, 0)",List()


In [None]:
# Collect the association rules (all columns) to the driver as a pandas
# DataFrame. This materialises every rule in local memory, so it is only
# appropriate while the rule set stays small.
rules_pandadf = rules.toPandas()
rules_pandadf.head()

Unnamed: 0,antecedent,consequent,confidence,lift,support
0,[132],[131],0.603648,48.801801,0.007517
1,"[63, 104]",[20],0.711165,12.139189,0.00605
2,[285],[102],0.518229,28.713691,0.008219
3,[285],[514],0.39974,13.777786,0.00634
4,[285],[83],0.470052,11.525439,0.007455


In [None]:
# Render the rules as a table so the results can be downloaded.
# NOTE(review): presumably the download is done from the rendered table's UI
# in the notebook environment — confirm.
display(rules_pandadf)

antecedent,consequent,confidence,lift,support
List(132),List(131),0.603648424543947,48.80180067940764,0.0075166233015322
"List(63, 104)",List(20),0.7111650485436893,12.139188805349558,0.0060504687564531
List(285),List(102),0.5182291666666666,28.713690646453088,0.0082187254780489
List(285),List(514),0.3997395833333333,13.777785809608542,0.0063395696526659
List(285),List(83),0.4700520833333333,11.52543908227848,0.0074546731094866
List(285),List(8),0.41015625,16.860973312818334,0.0065047701647875
List(285),List(120),0.3802083333333333,10.390501551918735,0.0060298186924379
List(285),List(20),0.4322916666666667,7.378976471624956,0.0068558212530458
"List(67, 103)",List(312),0.4186633039092055,19.072614445067902,0.0068558212530458
"List(67, 103)",List(108),0.5573770491803278,16.309088207617254,0.0091273282947177
