![](../images/FE_02.png)

# 1. Chuẩn bị dữ liệu

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
# A non-empty application name lets the job be identified in the Spark UI and logs.
spark = SparkSession.builder.appName("FPGrowthMarketBasket").getOrCreate()

In [4]:
# Product catalogue. header=True: column names are taken from the file's first
# row; inferSchema=True: column types are inferred from the values.
products = spark.read.csv("../data/75000/goods.csv", inferSchema=True, header=True)

In [5]:
# Raw transaction lines. header=False: the first row is treated as data, so
# Spark assigns positional column names (_c0, _c1, _c2); types are inferred.
data = spark.read.csv("../data/75000/75000i.csv", inferSchema=True, header=False)

In [6]:
data.show(5)

+---+---+---+
|_c0|_c1|_c2|
+---+---+---+
|  1|  1| 21|
|  1|  5| 11|
|  2|  1|  7|
|  2|  3| 11|
|  2|  4| 37|
+---+---+---+
only showing top 5 rows



In [7]:
# Keep only _c0 and _c2 (they are renamed to orderID and itemID further down).
# _c1 is dropped — presumably a per-line field not needed for basket mining; TODO confirm.
data = data.select('_c0', '_c2')

In [8]:
data.show(3)

+---+---+
|_c0|_c2|
+---+---+
|  1| 21|
|  1| 11|
|  2|  7|
+---+---+
only showing top 3 rows



In [9]:
# Replace the positional column names with meaningful ones for the steps below.
data = data.withColumnRenamed('_c0', 'orderID').withColumnRenamed('_c2', 'itemID')

In [10]:
data.show(3)

+-------+------+
|orderID|itemID|
+-------+------+
|      1|    21|
|      1|    11|
|      2|     7|
+-------+------+
only showing top 3 rows



# 2. Chuẩn hóa dữ liệu: gom sản phẩm theo đơn hàng

In [11]:
from pyspark.sql.functions import collect_list, col, count, collect_set

In [12]:
# Register the DataFrame as a temporary SQL view named 'data' so it can be
# queried through spark.sql.
data.createOrReplaceTempView('data')

In [13]:
# NOTE(review): SELECT * over the 'data' view yields the same rows and columns
# as `data` itself, so ordersDB looks like an alias rather than a new dataset —
# confirm whether the SQL round trip is needed.
ordersDB = spark.sql("SELECT * FROM data")

In [14]:
ordersDB.show(5)

+-------+------+
|orderID|itemID|
+-------+------+
|      1|    21|
|      1|    11|
|      2|     7|
|      2|    11|
|      2|    37|
+-------+------+
only showing top 5 rows



In [15]:
# Build one basket per order: collect_set gathers the distinct itemID values of
# each orderID into an array column named 'items' (repeats within an order
# collapse to a single entry).
orders = ordersDB.groupBy('orderID').agg(collect_set('itemID').alias('items'))

In [16]:
# Expose the baskets as a temporary SQL view named 'orders'.
# NOTE(review): none of the cells visible here query this view — confirm it is still needed.
orders.createOrReplaceTempView('orders')

In [17]:
orders.show(5)

+-------+--------------------+
|orderID|               items|
+-------+--------------------+
|    148|[33, 27, 9, 46, 2...|
|    463|            [17, 14]|
|    471|     [9, 37, 34, 20]|
|    496|     [15, 6, 47, 26]|
|    833|         [12, 5, 21]|
+-------+--------------------+
only showing top 5 rows



# 3. Build model

In [18]:
from pyspark.ml.fpm import FPGrowth

In [19]:
# Configure FP-Growth over the 'items' basket column.
# minSupport=0.03: an itemset must occur in at least 3% of the orders to be kept as frequent.
# minConfidence=0.03: minimum confidence required for a generated association rule.
fp_growth = FPGrowth(itemsCol='items', minSupport=0.03, minConfidence=0.03)

In [20]:
# Fit the estimator on the per-order baskets; the fitted model exposes the
# frequent itemsets and is used below to score each order.
model = fp_growth.fit(orders)

# 4. Hiển thị các mẫu phổ biến

In [21]:
model.freqItemsets.show()

+--------+----+
|   items|freq|
+--------+----+
|     [7]|8193|
|    [45]|7700|
| [45, 7]|2367|
|    [28]|7556|
|    [18]|6987|
|     [4]|6948|
|    [35]|6943|
|[35, 18]|3982|
|    [42]|6871|
|    [22]|6783|
|    [27]|6733|
|[27, 28]|3819|
|    [14]|6362|
|    [32]|6299|
|[32, 45]|2653|
|     [1]|6271|
|     [0]|6265|
|    [46]|6236|
| [46, 0]|3303|
|    [16]|6221|
+--------+----+
only showing top 20 rows



# 5. Dự đoán sản phẩm từ các luật kết hợp

In [22]:
# Apply the fitted model to every order; the result keeps orderID and items and
# adds a 'prediction' column holding an array of item IDs.
# NOTE(review): the variable name suggests a single "most popular item", but the
# frame holds per-order predictions.
most_popular_item = model.transform(orders)

In [23]:
most_popular_item.show(20)

+-------+--------------------+------------------+
|orderID|               items|        prediction|
+-------+--------------------+------------------+
|    148|[33, 27, 9, 46, 2...|        [42, 0, 2]|
|    463|            [17, 14]|          [44, 47]|
|    471|     [9, 37, 34, 20]|    [45, 7, 11, 4]|
|    496|     [15, 6, 47, 26]|           [7, 17]|
|    833|         [12, 5, 21]|              [22]|
|   1088| [27, 35, 3, 18, 40]|              [28]|
|   1238|        [19, 32, 18]|[35, 3, 1, 45, 16]|
|   1342|         [49, 17, 8]|              [47]|
|   1580|        [12, 31, 36]|                []|
|   1591|             [1, 19]|                []|
|   1645|         [15, 49, 7]|      [45, 37, 11]|
|   1829|[15, 49, 38, 6, 7...|      [45, 37, 11]|
|   1959|[9, 1, 18, 4, 22,...|    [35, 3, 5, 19]|
|   2122|             [5, 22]|                []|
|   2142|        [14, 44, 41]|                []|
|   2366|         [0, 27, 29]|       [28, 46, 2]|
|   2659|                [42]|              [33]|


# 6. Hiển thị kết quả với tên sản phẩm

In [26]:
import pandas as pd

In [44]:
# Convert the Spark result into a pandas DataFrame.
# NOTE: toPandas() brings every row to the driver, so this only suits results
# that fit in local memory.
df = most_popular_item.toPandas()

In [29]:
df.head()

Unnamed: 0,orderID,items,prediction
0,148,"[33, 27, 9, 46, 28, 4]","[42, 0, 2]"
1,463,"[17, 14]","[44, 47]"
2,471,"[9, 37, 34, 20]","[45, 7, 11, 4]"
3,496,"[15, 6, 47, 26]","[7, 17]"
4,833,"[12, 5, 21]",[22]


In [50]:
# collect() returns a plain Python list of Row objects, not a DataFrame (despite
# the _df suffix); showName relies on len() and row['...'] access on it.
products_df = products.collect()

In [53]:
import numpy as np

def showName(pProduct_df, df):
    """Translate each order's item IDs into readable "flavor - food" labels.

    Args:
        pProduct_df: Iterable of product records, each indexable by the keys
            'Id', 'Food' and 'Flavor' (for example the rows collected from the
            goods table).
        df: Mapping or DataFrame whose 'items' entry yields one sequence of
            item IDs per order.

    Returns:
        A list with one numpy object array per order, in the same order as
        df['items']. Each array holds the label of every item in that order;
        an ID that has no matching product maps to None.
    """
    # Key the labels by product Id instead of by list position. A positional
    # list sized from the row count raises IndexError as soon as an Id exceeds
    # that size (sparse or non-contiguous Ids); a mapping has no such limit.
    # When an Id occurs more than once, the last record wins.
    labels = {
        row['Id']: "{} - {}".format(row['Flavor'], row['Food'])
        for row in pProduct_df
    }

    # dtype=object keeps plain Python strings / None in every array, including
    # the empty one produced for an order without items.
    return [
        np.array([labels.get(item_id) for item_id in order_items], dtype=object)
        for order_items in df['items']
    ]


In [55]:
# Readable product labels for the items of every order: one array per row of df.
res = showName(products_df, df)

In [56]:
res[:5]

[array(["'Cheese' - 'Croissant'", "'Marzipan' - 'Cookie'",
        "'Napoleon' - 'Cake'", "'Chocolate' - 'Coffee'",
        "'Tuile' - 'Cookie'", "'Strawberry' - 'Cake'"], dtype=object),
 array(["'Chocolate' - 'Tart'", "'Berry' - 'Tart'"], dtype=object),
 array(["'Napoleon' - 'Cake'", "'Almond' - 'Twist'",
        "'Chocolate' - 'Croissant'", "'Pecan' - 'Tart'"], dtype=object),
 array(["'Blackberry' - 'Tart'", "'Chocolate' - 'Eclair'",
        "'Vanilla' - 'Frappuccino'", "'Vanilla' - 'Meringue'"],
       dtype=object),
 array(["'Apple' - 'Tart'", "'Truffle' - 'Cake'", "'Ganache' - 'Cookie'"],
       dtype=object)]