In [10]:
from pyspark.ml.fpm import FPGrowth

from pyspark.sql.functions import split

from pyspark.sql import SparkSession

# Obtain the SparkSession for this example (an existing session is reused
# by getOrCreate(); otherwise a new one named "FPGrowthExample" is started).
spark = (
    SparkSession.builder
    .appName("FPGrowthExample")
    .getOrCreate()
)

# Load the transactions: each line of the text file is one basket, and the
# whitespace-separated tokens on that line become the array column "items".
data = (spark.read
    .text("sample_fpgrowth.txt")
    # Raw string literal: "\s" is not a valid Python escape sequence, so the
    # non-raw form triggers an invalid-escape warning on modern Python.
    # The regex handed to Spark is unchanged.
    .select(split("value", r"\s+").alias("items")))
data.show(truncate=False)

+------------------------+
|items                   |
+------------------------+
|[r, z, h, k, p]         |
|[z, y, x, w, v, u, t, s]|
|[s, x, o, n, r]         |
|[x, z, y, m, t, s, q, e]|
|[z]                     |
|[x, z, y, r, q, t, p]   |
+------------------------+



In [11]:
from pyspark.ml.fpm import FPGrowth

In [12]:
# Configure the FP-Growth estimator: minSupport is the frequency threshold for
# itemsets, minConfidence the threshold for generated association rules.
fp = FPGrowth(
    minConfidence=0.7,
    minSupport=0.2,
)

In [17]:
# Fit the model on the tokenized transactions, then preview up to 20 of the
# frequent itemsets it found together with their occurrence counts.
fpm = fp.fit(data)
frequent_itemsets = fpm.freqItemsets
frequent_itemsets.show(n=20)

+---------------+----+
|          items|freq|
+---------------+----+
|            [s]|   3|
|         [s, x]|   3|
|      [s, x, z]|   2|
|         [s, z]|   2|
|            [r]|   3|
|         [r, x]|   2|
|         [r, z]|   2|
|            [y]|   3|
|         [y, s]|   2|
|      [y, s, x]|   2|
|   [y, s, x, z]|   2|
|      [y, s, z]|   2|
|         [y, x]|   3|
|      [y, x, z]|   3|
|         [y, t]|   3|
|      [y, t, s]|   2|
|   [y, t, s, x]|   2|
|[y, t, s, x, z]|   2|
|   [y, t, s, z]|   2|
|      [y, t, x]|   3|
+---------------+----+
only showing top 20 rows



In [14]:
# Preview the first 5 association rules (antecedent -> consequent, with confidence).
fpm.associationRules.show(n=5)

+----------+----------+----------+
|antecedent|consequent|confidence|
+----------+----------+----------+
|    [t, s]|       [y]|       1.0|
|    [t, s]|       [x]|       1.0|
|    [t, s]|       [z]|       1.0|
|       [p]|       [r]|       1.0|
|       [p]|       [z]|       1.0|
+----------+----------+----------+
only showing top 5 rows



In [16]:
# Score a single new basket containing "t" and "s", and display the items the
# fitted model predicts for it in sorted order.
new_data = spark.createDataFrame([(["t", "s"],)], schema=["items"])
predicted_row = fpm.transform(new_data).first()
sorted(predicted_row.prediction)

['x', 'y', 'z']