# mid exams no. 2

In [1]:
#install Apache spark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Import required libraries

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.fpm import FPGrowth


In [3]:
# Create (or reuse) the Spark session that the rest of the notebook runs on.
# "local[*]" runs Spark in-process with one worker thread per available core;
# plain "local" would restrict every job to a single worker thread.
# The placeholder "spark.some.config.option" setting from the documentation
# template has been dropped because it configures nothing.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Frequent Itemset")
    .getOrCreate()
)

In [4]:
# Load the market-basket transactions from CSV.
# The file is semicolon-separated and its first row holds the column names.
# NOTE(review): the path points into Google Drive, so Drive presumably has to be
# mounted at /content/drive before this cell runs - confirm for your environment.
DATA_PATH = "/content/drive/MyDrive/big-data/market-basket.csv"

df = spark.read.csv(DATA_PATH, header=True, sep=";")
df.show()

+------+--------------------+--------+----------------+-----+----------+--------------+
|BillNo|            Itemname|Quantity|            Date|Price|CustomerID|       Country|
+------+--------------------+--------+----------------+-----+----------+--------------+
|536365|WHITE HANGING HEA...|       6|01.12.2010 08:26| 2,55|     17850|United Kingdom|
|536365| WHITE METAL LANTERN|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
|536365|CREAM CUPID HEART...|       8|01.12.2010 08:26| 2,75|     17850|United Kingdom|
|536365|KNITTED UNION FLA...|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
|536365|RED WOOLLY HOTTIE...|       6|01.12.2010 08:26| 3,39|     17850|United Kingdom|
|536365|SET 7 BABUSHKA NE...|       2|01.12.2010 08:26| 7,65|     17850|United Kingdom|
|536365|GLASS STAR FROSTE...|       6|01.12.2010 08:26| 4,25|     17850|United Kingdom|
|536366|HAND WARMER UNION...|       6|01.12.2010 08:28| 1,85|     17850|United Kingdom|
|536366|HAND WARMER RED P...|   

In [5]:
# Keep just the two columns used for basket analysis: the bill number and the item name
basket_columns = ["BillNo", "Itemname"]
df_raw = df.select(*basket_columns)
df_raw.show()

+------+--------------------+
|BillNo|            Itemname|
+------+--------------------+
|536365|WHITE HANGING HEA...|
|536365| WHITE METAL LANTERN|
|536365|CREAM CUPID HEART...|
|536365|KNITTED UNION FLA...|
|536365|RED WOOLLY HOTTIE...|
|536365|SET 7 BABUSHKA NE...|
|536365|GLASS STAR FROSTE...|
|536366|HAND WARMER UNION...|
|536366|HAND WARMER RED P...|
|536367|ASSORTED COLOUR B...|
|536367|POPPY'S PLAYHOUSE...|
|536367|POPPY'S PLAYHOUSE...|
|536367|FELTCRAFT PRINCES...|
|536367|IVORY KNITTED MUG...|
|536367|BOX OF 6 ASSORTED...|
|536367|BOX OF VINTAGE JI...|
|536367|BOX OF VINTAGE AL...|
|536367|HOME BUILDING BLO...|
|536367|LOVE BUILDING BLO...|
|536367|RECIPE BOX WITH M...|
+------+--------------------+
only showing top 20 rows



In [6]:
# Row count prior to de-duplication, shown as a baseline for comparison
rows_before_dedup = df_raw.count()
rows_before_dedup

522064

In [7]:
# Drop repeated (bill, item) pairs so each item appears at most once per bill,
# then display the new row count
dedup_keys = ["BillNo", "Itemname"]
df_raw = df_raw.dropDuplicates(dedup_keys)
df_raw.count()

511280

In [8]:
# Reshape to one row per bill: ["BillNo", "items"], e.g. [1, [a, b, c]].
# Rows are grouped by bill number and the item names of each group are
# gathered into a single list column called "items".
rows_by_bill = df_raw.groupBy("BillNo")
items_of_bill = collect_list("Itemname").alias("items")

df_input = rows_by_bill.agg(items_of_bill)
df_input.show()

+------+--------------------+
|BillNo|               items|
+------+--------------------+
|536365|[KNITTED UNION FL...|
|536366|[HAND WARMER UNIO...|
|536367|[BOX OF VINTAGE J...|
|536368|[YELLOW COAT RACK...|
|536369|[BATH BUILDING BL...|
|536370|[SPACEBOY LUNCH B...|
|536371|[PAPER CHAIN KIT ...|
|536372|[HAND WARMER UNIO...|
|536373|[GLASS STAR FROST...|
|536374|[VICTORIAN SEWING...|
|536375|[SAVE THE PLANET ...|
|536376|[RED HANGING HEAR...|
|536377|[HAND WARMER RED ...|
|536378|[PACK OF 60 PINK ...|
|536380|[JAM MAKING SET P...|
|536381|[ZINC WILLIE WINK...|
|536382|[VINTAGE SNAKES &...|
|536384|[ENAMEL BREAD BIN...|
|536385|[TRADITIONAL CHRI...|
|536386|[JUMBO BAG RED RE...|
+------+--------------------+
only showing top 20 rows



In [38]:
# 1. Configure FP-Growth on the "items" column.
#    Other (minSupport, minConfidence) pairs tried earlier:
#    (0.02, 0.02), (0.05, 0.05), (0.1, 0.1).
fp_growth_params = {
    "itemsCol": "items",
    "minSupport": 0.05,
    "minConfidence": 0.5,
}
fpGrowth = FPGrowth(**fp_growth_params)

# 2. Fit the estimator on the per-bill item lists to obtain the model
model = fpGrowth.fit(df_input)

In [39]:
# Display the frequent itemsets found by the fitted model, with their frequencies
frequent_itemsets = model.freqItemsets
frequent_itemsets.show()

+--------------------+----+
|               items|freq|
+--------------------+----+
|[JAM MAKING SET P...|1116|
|[LUNCH BAG  BLACK...|1260|
|[HEART OF WICKER ...|1181|
|[JAM MAKING SET W...|1089|
|[REGENCY CAKESTAN...|1904|
|[ASSORTED COLOUR ...|1431|
|[LUNCH BAG SPACEB...|1147|
|[JUMBO SHOPPER VI...|1166|
|[LUNCH BAG RED RE...|1541|
|[PACK OF 72 RETRO...|1279|
|[LUNCH BAG CARS B...|1132|
|[JUMBO BAG PINK P...|1211|
|[JUMBO BAG RED RE...|2064|
|[WOODEN PICTURE F...|1092|
|[NATURAL SLATE HE...|1232|
|    [SPOTTY BUNTING]|1114|
|[JUMBO STORAGE BA...|1178|
|[RECIPE BOX PANTR...|1085|
|     [PARTY BUNTING]|1656|
|[PAPER CHAIN KIT ...|1143|
+--------------------+----+
only showing top 20 rows



#### Conclusion

In the FP-Growth algorithm, minSupport and minConfidence are the parameters that control which patterns and association rules are produced. minSupport sets the minimum fraction of transactions in which an itemset must appear to be considered frequent. The higher the minSupport value, the fewer itemsets are generated, and those that remain are the ones that occur most often in the data. minConfidence sets the minimum confidence an association rule must reach; the higher the minConfidence value, the stricter the condition and the fewer association rules are generated. In this case, we can use minSupport to omit items that rarely appear in transactions, such as less popular items. We can also use minConfidence to filter out weak association rules, so that only strong rules are returned.
