In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lag, unix_timestamp, sum as spark_sum, to_timestamp
from pyspark.sql.window import Window
from pyspark.sql.types import ArrayType, FloatType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec
from pyspark.sql.functions import udf
import logging
from datetime import datetime



In [2]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
spark = SparkSession.builder \
        .appName("MongoDBConnectionTest") \
        .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1") \
        .config("spark.jars","/usr/local/airflow/spark/jars/qdrant-spark-2.3.2.jar") \
        .config("spark.mongodb.input.uri", "mongodb+srv://pnghung2003:pnghung2003@cluster0.xiuaw.mongodb.net/recommendation_system?authSource=admin&ssl=true") \
        .config("spark.mongodb.output.uri", "mongodb+srv://pnghung2003:pnghung2003@cluster0.xiuaw.mongodb.net/recommendation_system?authSource=admin&ssl=true") \
        .config("spark.mongodb.input.sslEnabled", "true") \
        .config("spark.mongodb.output.sslEnabled", "true") \
        .config("spark.mongodb.input.ssl.invalidHostNameAllowed", "true") \
        .config("spark.mongodb.output.ssl.invalidHostNameAllowed", "true") \
        .master("local[*]") \
        .getOrCreate()

    # Test data
test_data = {
        "test_id": "connection_test",
        "timestamp": datetime.now().isoformat(),
        "status": "success"
    }


In [4]:
df_product = spark.read \
            .format("mongo") \
            .option("database", "recommendation_system") \
            .option("collection", "products") \
            .load()
df_userbehaviors = spark.read \
            .format("mongo") \
            .option("database", "recommendation_system") \
            .option("collection", "userbehaviors") \
            .load()

In [5]:
df_product.show()

+--------------------+--------------------+-----------+-----------+--------------------+--------------------+------+---------+------+-----+-------------+
|           MainImage|                 _id|      brand|   category|         description|                name| price|productID|rating|stock|         type|
+--------------------+--------------------+-----------+-----------+--------------------+--------------------+------+---------+------+-----+-------------+
|https://images.sa...|{679c814226291400...|    samsung|electronics|Our tender-inspir...|Smartphone Samsun...|168.75|  1004858|     5|  996|   smartphone|
|https://i5.walmar...|{679c814226291400...|    samsung|electronics|Stylish Bike desi...|Smartphone Samsun...|140.59|  1004872|     0|  998|   smartphone|
|https://images.th...|{679c814226291400...|         lg| appliances|Discover the geck...|Refrigerators Lg ...|858.49|  2702053|     0| 1000|refrigerators|
|https://images.un...|{679c814226291400...|         lg|electronics|The Kelli

In [6]:
from pyspark.sql.functions import concat_ws

df_product = df_product.withColumn("category_code", concat_ws(".", df_product["category"], df_product["type"]))
df_product =df_product.drop("category")
df_product =df_product.drop("type")
df_product.show()


+--------------------+--------------------+-----------+--------------------+--------------------+------+---------+------+-----+--------------------+
|           MainImage|                 _id|      brand|         description|                name| price|productID|rating|stock|       category_code|
+--------------------+--------------------+-----------+--------------------+--------------------+------+---------+------+-----+--------------------+
|https://images.sa...|{679c814226291400...|    samsung|Our tender-inspir...|Smartphone Samsun...|168.75|  1004858|     5|  996|electronics.smart...|
|https://i5.walmar...|{679c814226291400...|    samsung|Stylish Bike desi...|Smartphone Samsun...|140.59|  1004872|     0|  998|electronics.smart...|
|https://images.th...|{679c814226291400...|         lg|Discover the geck...|Refrigerators Lg ...|858.49|  2702053|     0| 1000|appliances.refrig...|
|https://images.un...|{679c814226291400...|         lg|The Kellie Shirt ...|       Tv Lg 1802037|589.27|  

In [7]:
df_product.printSchema()

df_userbehaviors.printSchema()

root
 |-- MainImage: string (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- description: string (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- productID: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- stock: integer (nullable = true)
 |-- category_code: string (nullable = false)

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- behavior: string (nullable = true)
 |-- createdAt: timestamp (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- sessionId: string (nullable = true)
 |-- updatedAt: timestamp (nullable = true)
 |-- user_id: string (nullable = true)



In [8]:
from pyspark.sql.functions import col

# Thực hiện join trên productID và product_id
df_merged = df_userbehaviors.join(
    df_product, 
    df_userbehaviors["product_id"] == df_product["productID"], 
    "inner"
).select(
    df_userbehaviors["sessionId"].alias("event_time"),
    df_userbehaviors["behavior"].alias("event_type"),
    df_product["brand"],
    df_userbehaviors["sessionId"].alias("user_session"),
    df_product["category_code"],
    df_product["price"],
    df_userbehaviors["user_id"],
    df_userbehaviors["product_name"].alias("name"),
    df_userbehaviors["product_id"]
)

# Hiển thị kết quả
df_merged.show()


+--------------------+----------+-------+--------------------+--------------------+------+--------------------+--------------------+----------+
|          event_time|event_type|  brand|        user_session|       category_code| price|             user_id|                name|product_id|
+--------------------+----------+-------+--------------------+--------------------+------+--------------------+--------------------+----------+
|b15be6f8-7d6f-499...|      view|     lg|b15be6f8-7d6f-499...|appliances.refrig...|858.49|             3135874|Refrigerators Lg ...|   2702053|
|60d37b5e-88de-467...|  checkout|     lg|60d37b5e-88de-467...|appliances.refrig...|858.49|679c790760f5656bf...|Refrigerators Lg ...|   2702053|
|60d37b5e-88de-467...|  purchase|     lg|60d37b5e-88de-467...|appliances.refrig...|858.49|            57042354|Refrigerators Lg ...|   2702053|
|0def7a55-24ea-421...|      view|     lg|0def7a55-24ea-421...|appliances.refrig...|858.49|            57042354|Refrigerators Lg ...|   2

In [9]:
df_csv = spark.read.csv("dataset_with_name.csv", header=True, inferSchema=True)

df_csv =df_csv.drop("category_id")
df_csv.show(5)



+-------------------+----------+----------+--------------------+-------+------+---------+--------------------+--------------------+
|         event_time|event_type|product_id|       category_code|  brand| price|  user_id|        user_session|                name|
+-------------------+----------+----------+--------------------+-------+------+---------+--------------------+--------------------+
|2019-11-16 11:54:03|      view|   1004858|electronics.smart...|samsung|128.42|557027114|56c4fa27-2298-400...|Smartphone Samsun...|
|2019-11-05 09:36:31|      view|   1004872|electronics.smart...|samsung|270.23|513209166|56b2bf0c-18c7-4b4...|Smartphone Samsun...|
|2019-11-16 13:21:15|      view|   2702053|appliances.kitche...|     lg|516.33|564970140|ac97a1c3-8412-4b6...|Refrigerators Lg ...|
|2019-11-17 13:40:34|      cart|   1802037|electronics.video.tv|     lg|307.55|571961869|e36e63f5-571e-442...|       Tv Lg 1802037|
|2019-11-20 08:12:31|      view|   4804056|electronics.audio...|  apple|165.

In [10]:
# Đảm bảo cả hai DataFrame có cùng schema
df_final = df_merged.unionByName(df_csv)

# Hiển thị kết quả
df_final.show(5)


+--------------------+----------+-----+--------------------+--------------------+------+--------------------+--------------------+----------+
|          event_time|event_type|brand|        user_session|       category_code| price|             user_id|                name|product_id|
+--------------------+----------+-----+--------------------+--------------------+------+--------------------+--------------------+----------+
|b15be6f8-7d6f-499...|      view|   lg|b15be6f8-7d6f-499...|appliances.refrig...|858.49|             3135874|Refrigerators Lg ...|   2702053|
|60d37b5e-88de-467...|  checkout|   lg|60d37b5e-88de-467...|appliances.refrig...|858.49|679c790760f5656bf...|Refrigerators Lg ...|   2702053|
|60d37b5e-88de-467...|  purchase|   lg|60d37b5e-88de-467...|appliances.refrig...|858.49|            57042354|Refrigerators Lg ...|   2702053|
|0def7a55-24ea-421...|      view|   lg|0def7a55-24ea-421...|appliances.refrig...|858.49|            57042354|Refrigerators Lg ...|   2702053|
|60d37

In [11]:
df_final.count()

5000068

In [12]:
df_final.coalesce(1).write.csv("user_behavior", header=True, mode="overwrite")
