In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    to_timestamp,
    when,
    sum as spark_sum,
    lag,
    unix_timestamp,
    col,
    monotonically_increasing_id,
    udf
)
from pyspark.sql.types import ArrayType, FloatType
from pyspark.sql.window import Window

import logging

from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance,
    VectorParams,
    PointStruct
)

from fastembed import TextEmbedding

# Notebook-wide logger; the root logger is configured to emit INFO and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


In [None]:
 # Build (or reuse) the Spark session, requesting 8g for driver and executors.
 spark = (
     SparkSession.builder
     .appName("test qdrant ")
     .config("spark.driver.memory", "8g")
     .config("spark.executor.memory", "8g")
     .getOrCreate()
 )

In [None]:
# Read the CSV, treating its first line as the header row. No schema is
# supplied and schema inference is not enabled here.
df = spark.read.load("data/test.csv", format="csv", header="true")

# Print the schema, then preview the first 5 rows without truncating values.
df.printSchema()
df.show(n=5, truncate=False)



In [None]:
# (Optional) Spread the rows over 200 partitions for parallelism
df = df.repartition(200)

# ------------------------------------------------------------------------
# 4. Parse the `event_time` strings into timestamps
# ------------------------------------------------------------------------
event_time_format = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"
df = df.withColumn("event_time", to_timestamp("event_time", event_time_format))

# ------------------------------------------------------------------------
# 5. Add one 0/1 indicator column per event type
#    (view_count, cart_count, purchase_count)
# ------------------------------------------------------------------------
for event_kind in ("view", "cart", "purchase"):
    df = df.withColumn(
        f"{event_kind}_count",
        when(col("event_type") == event_kind, 1).otherwise(0),
    )

# ------------------------------------------------------------------------
# 6. Session-level totals of views, carts and purchases
# ------------------------------------------------------------------------
session_count_aggs = [
    spark_sum("view_count").alias("total_views"),
    spark_sum("cart_count").alias("total_carts"),
    spark_sum("purchase_count").alias("total_purchases"),
]
df_totals = df.groupBy("user_session").agg(*session_count_aggs)

# ------------------------------------------------------------------------
# 7. Per (session, product, user) counts, joined with the session totals
# ------------------------------------------------------------------------
product_count_aggs = [
    spark_sum("view_count").alias("product_views"),
    spark_sum("cart_count").alias("product_carts"),
    spark_sum("purchase_count").alias("product_purchases"),
]
df_product = df.groupBy("user_session", "product_id", "user_id").agg(*product_count_aggs)

df_features = df_product.join(df_totals, on="user_session", how="left")

# F1..F3: the product's share of the session's views / carts / purchases.
# A zero session total yields 0 instead of dividing by zero.
for feature_name, part_col, whole_col in (
    ("F1", "product_views", "total_views"),
    ("F2", "product_carts", "total_carts"),
    ("F3", "product_purchases", "total_purchases"),
):
    df_features = df_features.withColumn(
        feature_name,
        when(col(whole_col) != 0, col(part_col) / col(whole_col)).otherwise(0),
    )

# ------------------------------------------------------------------------
# 8. Time-based features: events ordered by time within each session
# ------------------------------------------------------------------------
window_order = Window.partitionBy("user_session").orderBy("event_time")

# Seconds elapsed since the previous event of the same session.
# NOTE(review): lag() credits the gap to the product of the *current* event,
# not to the product of the previous one - confirm this attribution is intended.
seconds_since_prev = unix_timestamp("event_time") - unix_timestamp("prev_event_time")

df_time = df.withColumn("prev_event_time", lag("event_time").over(window_order))
df_time = df_time.withColumn("time_spent_seconds", seconds_since_prev)
# The first event of a session has no predecessor; its null gap becomes 0.
df_time = df_time.na.fill(0, subset=["time_spent_seconds"])
df_time = df_time.withColumn("time_spent", col("time_spent_seconds").cast("double"))

# ------------------------------------------------------------------------
# 9. Time spent per product versus per session
# ------------------------------------------------------------------------
product_keys = ["user_session", "product_id", "user_id"]

df_time_agg = df_time.groupBy(*product_keys).agg(
    spark_sum("time_spent").alias("product_time_spent")
)
df_total_time = df_time.groupBy("user_session").agg(
    spark_sum("time_spent").alias("total_time_spent")
)

df_features = df_features.join(df_time_agg, on=product_keys, how="left")
df_features = df_features.join(df_total_time, on="user_session", how="left")

# F4: the product's share of the session's total time; 0 when the total is 0.
time_share = col("product_time_spent") / col("total_time_spent")
df_features = df_features.withColumn(
    "F4",
    when(col("total_time_spent") != 0, time_share).otherwise(0),
)

# ------------------------------------------------------------------------
# 10. Weighted combination of F1..F4 into a single score
# ------------------------------------------------------------------------
w1, w2, w3, w4 = 0.1, 0.25, 0.45, 0.2

# Build w1*F1 + w2*F2 + w3*F3 + w4*F4, adding the terms left to right.
weighted_terms = [
    weight * col(feature)
    for weight, feature in ((w1, "F1"), (w2, "F2"), (w3, "F3"), (w4, "F4"))
]
score_expr = weighted_terms[0]
for term in weighted_terms[1:]:
    score_expr = score_expr + term

# A null score (any null feature) is replaced by 0.
df_features = df_features.withColumn("score", score_expr).fillna({"score": 0})

final_df = df_features.select("user_id", "product_id", "score")
logger.info("Feature engineering completed.")
final_df.show(10)


In [None]:
# Rebind `df` to the scored (user_id, product_id, score) frame. From this cell
# on, `df` no longer refers to the event-level data loaded from the CSV, so
# re-running the feature-engineering cell afterwards operates on the wrong frame.
df = final_df
df.show()

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import split, col



from pyspark.sql.functions import array_remove

# Word2Vec needs an array-of-strings input column, so cast user_id to a string
# and split it into single-character "words".
df_string = df.withColumn("user_id_str", col("user_id").cast("string"))
df_words = df_string.withColumn(
    "user_id_list",
    # Splitting on an empty pattern leaves a trailing "" element on some Spark
    # versions; remove it so the empty string is not trained on as a token.
    array_remove(split(col("user_id_str"), ""), ""),
)

# Fit Word2Vec and attach a 10-dimensional embedding to every row.
# The seed is fixed so repeated runs on the same input are comparable.
# NOTE(review): the embedding is derived only from the characters of user_id;
# confirm this is the intended notion of a user embedding.
word2Vec = Word2Vec(
    vectorSize=10,
    minCount=0,
    seed=42,
    inputCol="user_id_list",
    outputCol="user_embedding",
)
model = word2Vec.fit(df_words)
df_with_embedding = model.transform(df_words)

# Keep only the columns needed downstream
df_final = df_with_embedding.select("user_id", "product_id", "score", "user_embedding")

# Show the DataFrame together with the embeddings
df_final.show(truncate=False)




In [None]:
client = QdrantClient(host='qdrant', port=6333)
# NOTE(review): toPandas() collects the whole frame onto the driver and
# `df_pandas` is not referenced by any later visible cell - consider dropping it.
df_pandas = df_final.select("user_id", "product_id", "score", "user_embedding").toPandas()
collection_name = "user_embeddings"

# Create the collection only when it does not exist yet. create_collection is
# used instead of the deprecated recreate_collection, which would first delete
# an existing collection of the same name.
collections = client.get_collections()
existing_names = {collection.name for collection in collections.collections}
if collection_name not in existing_names:
    client.create_collection(
        collection_name=collection_name,
        # `size` must equal the embedding length (Word2Vec vectorSize=10).
        vectors_config=VectorParams(size=10, distance=Distance.COSINE)
    )
    print(f"Collection '{collection_name}' đã được tạo.")
else:
    print(f"Collection '{collection_name}' đã tồn tại.")


In [None]:
from pyspark.sql.functions import col, regexp_replace, udf
from pyspark.sql.types import IntegerType, ArrayType, FloatType
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct

# Assumes the variable `collection_name` (the Qdrant collection name) is already defined

# Step 1: prepare the data directly in Spark
# 'user_embedding' is expected to be a Vector column; convert it to a plain list
def vector_to_list(vector):
    """Return `vector` as a Python list when it exposes `toArray()`.

    Any value without a `toArray` attribute (including None) is returned
    unchanged.
    """
    if not hasattr(vector, 'toArray'):
        return vector
    return vector.toArray().tolist()

# Wrap the converter as a UDF that returns an array of floats.
vector_to_list_udf = udf(vector_to_list, ArrayType(FloatType()))

df_preprocessed = df_final.select(
    # Remove commas from user_id and cast it to a 64-bit integer. A 32-bit
    # IntegerType cast would turn any id above 2**31 - 1 into null, and a null
    # cannot be used as a Qdrant point id.
    regexp_replace(col("user_id"), ",", "").cast("long").alias("user_id"),
    "product_id",
    "score",
    vector_to_list_udf(col("user_embedding")).alias("user_embedding")
)

# Step 2: upsert routine that Spark runs once per partition
def upsert_partition(partition):
    """Upsert every row of one Spark partition into the Qdrant collection.

    Args:
        partition: iterator of rows exposing the fields `user_id`,
            `user_embedding`, `product_id` and `score`.

    Rows are sent in batches of at most 1000 points to the collection named by
    the global `collection_name`. The client is always closed, even when an
    upsert raises.
    """
    batch_size = 1000

    # One client per partition. It must target the same server as the
    # driver-side client that created the collection (host 'qdrant', port 6333);
    # a bare QdrantClient() would connect to localhost instead.
    client = QdrantClient(host='qdrant', port=6333)
    try:
        points = []
        for row in partition:
            # NOTE(review): user_id is the point id, so several rows of the
            # same user (different product_id) overwrite each other and only
            # the last payload survives - confirm this is intended.
            points.append(
                PointStruct(
                    id=row['user_id'],
                    vector=row['user_embedding'],
                    payload={
                        "product_id": row['product_id'],
                        "score": row['score']
                    }
                )
            )

            # Flush a full batch to keep requests bounded in size.
            if len(points) >= batch_size:
                client.upsert(collection_name=collection_name, points=points)
                print(f"Đã upsert {len(points)} points")
                points = []

        # Flush whatever is left after the last full batch.
        if points:
            client.upsert(collection_name=collection_name, points=points)
            print(f"Đã upsert {len(points)} points cuối cùng")
    finally:
        client.close()

# Step 3: run the upsert routine on every partition of the prepared frame
df_preprocessed.foreachPartition(upsert_partition)


In [None]:
# Print the column names and types of the final frame
# (user_id, product_id, score, user_embedding).
df_final.printSchema()