In [21]:
# Import findspark to read SPARK_HOME and HADOOP_HOME
import findspark
findspark.init()

# Import required library
from pyspark.sql import SparkSession

# Create Spark Session
spark = SparkSession \
    .builder \
    .appName("Tugas FP Growth") \
    .getOrCreate()


# Print Spark object ID
print(spark)

<pyspark.sql.session.SparkSession object at 0x7f65a01f8f50>


In [22]:
#import data

df = spark.read.csv("data.csv", header=True, inferSchema=True)

In [23]:
df.show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/2010 8:26|     4.

In [24]:
df.createOrReplaceTempView("dataFPG")

In [25]:
#meng query berdasarkan yang diminta
soal1 = spark.sql("select InvoiceNo as ID, StockCode as Items from dataFPG")
soal1.show()

+------+------+
|    ID| Items|
+------+------+
|536365|85123A|
|536365| 71053|
|536365|84406B|
|536365|84029G|
|536365|84029E|
|536365| 22752|
|536365| 21730|
|536366| 22633|
|536366| 22632|
|536367| 84879|
|536367| 22745|
|536367| 22748|
|536367| 22749|
|536367| 22310|
|536367| 84969|
|536367| 22623|
|536367| 22622|
|536367| 21754|
|536367| 21755|
|536367| 21777|
+------+------+
only showing top 20 rows



In [26]:
#meng query berdasarkan yang diminta
soal2 = spark.sql("select InvoiceNo as ID, StockCode as Items, Description from dataFPG")
soal2.show()

+------+------+--------------------+
|    ID| Items|         Description|
+------+------+--------------------+
|536365|85123A|WHITE HANGING HEA...|
|536365| 71053| WHITE METAL LANTERN|
|536365|84406B|CREAM CUPID HEART...|
|536365|84029G|KNITTED UNION FLA...|
|536365|84029E|RED WOOLLY HOTTIE...|
|536365| 22752|SET 7 BABUSHKA NE...|
|536365| 21730|GLASS STAR FROSTE...|
|536366| 22633|HAND WARMER UNION...|
|536366| 22632|HAND WARMER RED P...|
|536367| 84879|ASSORTED COLOUR B...|
|536367| 22745|POPPY'S PLAYHOUSE...|
|536367| 22748|POPPY'S PLAYHOUSE...|
|536367| 22749|FELTCRAFT PRINCES...|
|536367| 22310|IVORY KNITTED MUG...|
|536367| 84969|BOX OF 6 ASSORTED...|
|536367| 22623|BOX OF VINTAGE JI...|
|536367| 22622|BOX OF VINTAGE AL...|
|536367| 21754|HOME BUILDING BLO...|
|536367| 21755|LOVE BUILDING BLO...|
|536367| 21777|RECIPE BOX WITH M...|
+------+------+--------------------+
only showing top 20 rows



In [27]:
#import lib untuk group
from pyspark.sql.functions import collect_list

In [28]:
#mengelompokkan invoiceno
soal1_group = soal1.groupby("ID").agg(collect_list('Items').alias('Items'))
soal1_group.show()

+-------+--------------------+
|     ID|               Items|
+-------+--------------------+
| 536596|[21624, 22900, 22...|
| 536938|[22386, 85099C, 2...|
| 537252|             [22197]|
| 537691|[22791, 22171, 82...|
| 538041|             [22145]|
| 538184|[22585, 21481, 22...|
| 538517|[22491, 21232, 21...|
| 538879|[84819, 22150, 21...|
| 539275|[22909, 22423, 22...|
| 539630|[21484, 85099B, 2...|
| 540499|[21868, 22697, 22...|
| 540540|[21877, 21868, 21...|
| 540976|[22394, 21890, 22...|
| 541432|[21485, 22457, 84...|
| 541518|[21880, 21881, 21...|
| 541783|[22423, 22854, 22...|
| 542026|[21754, 82600, 22...|
| 542375|[21731, 22367, 22...|
|C540850|             [21231]|
| 543641|[85123A, 21833, 2...|
+-------+--------------------+
only showing top 20 rows



In [29]:
#Menghilangkan data rangkap

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

distinct_items= udf(lambda row: list(set(row)), ArrayType(StringType()))
soal1_group = soal1_group.withColumn("Distinct Items", distinct_items("Items"))

In [30]:
soal1_group.show()

+-------+--------------------+--------------------+
|     ID|               Items|      Distinct Items|
+-------+--------------------+--------------------+
| 536596|[21624, 22900, 22...|[21624, 22114, 21...|
| 536938|[22386, 85099C, 2...|[84997A, 21479, 8...|
| 537252|             [22197]|             [22197]|
| 537691|[22791, 22171, 82...|[22505, 22666, 84...|
| 538041|             [22145]|             [22145]|
| 538184|[22585, 21481, 22...|[22560, 22561, 21...|
| 538517|[22491, 21232, 21...|[22562, 22563, 22...|
| 538879|[84819, 22150, 21...|[22593, 22453, 21...|
| 539275|[22909, 22423, 22...|[21914, 21915, 22...|
| 539630|[21484, 85099B, 2...|[21484, 21485, 22...|
| 540499|[21868, 22697, 22...|[22625, 22624, 22...|
| 540540|[21877, 21868, 21...|[22443, 22442, 22...|
| 540976|[22394, 21890, 22...|[20914, 22115, 22...|
| 541432|[21485, 22457, 84...|[21485, 22113, 22...|
| 541518|[21880, 21881, 21...|[22504, 22378, 84...|
| 541783|[22423, 22854, 22...|[22777, 21669, 22...|
| 542026|[21

In [31]:
# melakukan percobaan dengan mengubah minimum support dan minimum confidence
from pyspark.ml.fpm import FPGrowth

In [37]:
fpGrowth = FPGrowth(itemsCol="Distinct Items", minSupport=0.03, minConfidence=0.2)
model1=fpGrowth.fit(soal1_group)

In [38]:
model1.freqItemsets.show()

+---------------+----+
|          items|freq|
+---------------+----+
|       [85123A]|2246|
|        [22423]|2172|
|       [85099B]|2135|
|        [47566]|1706|
|        [20725]|1608|
|        [84879]|1468|
|        [22720]|1462|
|        [22197]|1442|
|        [21212]|1334|
|        [22383]|1306|
|        [20727]|1295|
|        [22457]|1266|
|         [POST]|1254|
|        [23203]|1249|
|        [22386]|1231|
|[22386, 85099B]| 833|
|        [22960]|1220|
|        [22469]|1214|
|        [21931]|1201|
|        [22411]|1187|
+---------------+----+
only showing top 20 rows



In [39]:
model.associationRules.show()

+----------+----------+----------+
|antecedent|consequent|confidence|
+----------+----------+----------+
+----------+----------+----------+



In [40]:
fpGrowth2 = FPGrowth(itemsCol="Distinct Items", minSupport=0.01, minConfidence=0.1)
model2=fpGrowth2.fit(query1_group)

In [41]:
model2.freqItemsets.show()

+----------------+----+
|           items|freq|
+----------------+----+
|         [22633]| 487|
|         [23236]| 344|
|        [85123A]|2246|
|         [22423]|2172|
| [22423, 85123A]| 355|
|         [22667]| 486|
|         [22579]| 343|
|  [22579, 22578]| 282|
|        [85099B]|2135|
| [85099B, 22423]| 288|
|[85099B, 85123A]| 404|
|         [22620]| 486|
|        [84536A]| 342|
|         [71053]| 342|
|         [47566]|1706|
| [47566, 85099B]| 332|
|  [47566, 22423]| 398|
| [47566, 85123A]| 391|
|         [85150]| 483|
|         [20725]|1608|
+----------------+----+
only showing top 20 rows



In [42]:
model2.associationRules.show()

+--------------+----------+-------------------+
|    antecedent|consequent|         confidence|
+--------------+----------+-------------------+
|       [22554]|   [22551]| 0.4823695345557123|
|       [22554]|   [22556]| 0.3991537376586742|
|       [22960]|   [21212]|0.21885245901639344|
|       [22960]|  [85099B]|0.23688524590163934|
|       [22960]|   [22423]|0.23852459016393443|
|       [22960]|   [22720]| 0.3155737704918033|
|       [22960]|   [22961]|0.38934426229508196|
|       [22960]|   [22666]|0.28032786885245903|
|       [22960]|   [22993]| 0.2540983606557377|
|       [22960]|   [22697]|0.21475409836065573|
|       [22960]|   [22722]|0.22131147540983606|
|[20726, 22382]|   [20728]|  0.546583850931677|
|[20726, 22382]|   [20725]| 0.6356107660455487|
|[20726, 22382]|   [20727]| 0.5445134575569358|
|[20726, 22382]|   [22383]| 0.5403726708074534|
|       [21977]|   [21212]| 0.4948571428571429|
|       [21977]|   [84991]| 0.4045714285714286|
|       [22699]|   [22423]|0.47946428571

In [43]:
fpGrowth3 = FPGrowth(itemsCol="Distinct Items", minSupport=0.05, minConfidence=0.5)
model3=fpGrowth2.fit(query1_group)

In [44]:
model3.freqItemsets.show()

+----------------+----+
|           items|freq|
+----------------+----+
|         [22633]| 487|
|         [23236]| 344|
|        [85123A]|2246|
|         [22423]|2172|
| [22423, 85123A]| 355|
|         [22667]| 486|
|         [22579]| 343|
|  [22579, 22578]| 282|
|        [85099B]|2135|
| [85099B, 22423]| 288|
|[85099B, 85123A]| 404|
|         [22620]| 486|
|        [84536A]| 342|
|         [71053]| 342|
|         [47566]|1706|
| [47566, 85099B]| 332|
|  [47566, 22423]| 398|
| [47566, 85123A]| 391|
|         [85150]| 483|
|         [20725]|1608|
+----------------+----+
only showing top 20 rows



In [45]:
model3.associationRules.show()

+--------------+----------+-------------------+
|    antecedent|consequent|         confidence|
+--------------+----------+-------------------+
|       [22554]|   [22551]| 0.4823695345557123|
|       [22554]|   [22556]| 0.3991537376586742|
|       [22960]|   [21212]|0.21885245901639344|
|       [22960]|  [85099B]|0.23688524590163934|
|       [22960]|   [22423]|0.23852459016393443|
|       [22960]|   [22720]| 0.3155737704918033|
|       [22960]|   [22961]|0.38934426229508196|
|       [22960]|   [22666]|0.28032786885245903|
|       [22960]|   [22993]| 0.2540983606557377|
|       [22960]|   [22697]|0.21475409836065573|
|       [22960]|   [22722]|0.22131147540983606|
|[20726, 22382]|   [20728]|  0.546583850931677|
|[20726, 22382]|   [20725]| 0.6356107660455487|
|[20726, 22382]|   [20727]| 0.5445134575569358|
|[20726, 22382]|   [22383]| 0.5403726708074534|
|       [21977]|   [21212]| 0.4948571428571429|
|       [21977]|   [84991]| 0.4045714285714286|
|       [22699]|   [22423]|0.47946428571

In [62]:
#create dataframe 
dataframe=spark.createDataFrame([
            ('0',['85099B'])
        ],['ID','Distinct Items'])

In [63]:
dataframe.show()

+---+--------------+
| ID|Distinct Items|
+---+--------------+
|  0|      [85099B]|
+---+--------------+



In [64]:
#menampilkan prediksi dalam dataframe
model3.transform(dataframe).show()

+---+--------------+--------------------+
| ID|Distinct Items|          prediction|
+---+--------------+--------------------+
|  0|      [85099B]|[22423, 85123A, 4...|
+---+--------------+--------------------+



In [68]:
#kesimpulan
soal2.createOrReplaceTempView("dataFPG")

In [70]:
kesimpulan=spark.sql("SELECT DISTINCT Description \
                     FROM dataFPG \
                     WHERE Items='85099B'")
kesimpulan.show()

+--------------------+
|         Description|
+--------------------+
|JUMBO BAG RED RET...|
+--------------------+



In [73]:
kesimpulan2=spark.sql("SELECT DISTINCT Description \
                     FROM dataFPG \
                     WHERE Items='22423'")
kesimpulan2.show()

+--------------------+
|         Description|
+--------------------+
|REGENCY CAKESTAND...|
|             damages|
|              faulty|
+--------------------+



In [74]:
kesimpulan3=spark.sql("SELECT DISTINCT Description \
                     FROM dataFPG \
                     WHERE Items='85123A'")
kesimpulan3.show()

+--------------------+
|         Description|
+--------------------+
|CREAM HANGING HEA...|
|wrongly marked ca...|
|WHITE HANGING HEA...|
|                   ?|
+--------------------+



In [None]:
#kesimpulan dari atas
#setiap membeli jumbo bag retro, biasanya mereka membeli juga cream CREAM HANGING HEART T-LIGHT HOLDER dan REGENCY CAKESTAND 3 TIER