In [1]:
from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, monotonically_increasing_id, when, rank, sum as spark_sum, count as spark_count, log2
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import Row

# Create the Spark session for this notebook, or reuse one that already exists.
session_builder = SparkSession.builder.appName("MovieLensALS")
spark = session_builder.getOrCreate()

# Set the Spark log level to WARN.
spark.sparkContext.setLogLevel("WARN")



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/18 15:48:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
ratings_path = "hdfs://namenode:9000/movielens/32M/ratings.csv"
movies_path = "hdfs://namenode:9000/movielens/32M/movies.csv"

# Explicit schema for the ratings file. Supplying it avoids the extra full
# scan that inferSchema=True needs to guess the column types, and reads the
# ids as integers and the rating as float directly (the types the previous
# explicit casts produced). The timestamp is read as LONG so epoch values are
# not limited to 32 bits.
ratings_schema = "userId INT, movieId INT, rating FLOAT, timestamp LONG"

ratings_df = spark.read.csv(
    ratings_path,
    header=True,
    schema=ratings_schema,
    # Validate the CSV header against the schema instead of silently applying
    # the schema by position, so a changed column layout fails loudly.
    enforceSchema=False,
)

# Schema inference is kept for the movies file.
# NOTE(review): its columns are not visible in this notebook, so no explicit
# schema is declared here -- consider adding one once the layout is confirmed.
movies_df = spark.read.csv(
    movies_path,
    header=True,
    inferSchema=True,
)

# Keep only the columns needed downstream and drop rows with any missing value.
als_data = ratings_df.select(
    "userId",
    "movieId",
    "rating",
    F.col("timestamp").alias("timestamp_orig"),  # keep the timestamp for the train/test split
).dropna()
als_data.show(5)

                                                                                

+------+-------+------+--------------+
|userId|movieId|rating|timestamp_orig|
+------+-------+------+--------------+
|     1|     17|   4.0|     944249077|
|     1|     25|   1.0|     944250228|
|     1|     29|   2.0|     943230976|
|     1|     30|   5.0|     944249077|
|     1|     32|   5.0|     943228858|
+------+-------+------+--------------+
only showing top 5 rows



In [3]:
# Minimum number of ratings a user / a movie needs in order to be kept.
min_user_ratings = 2
min_item_ratings = 2

# Rating counts per user and per movie; both are taken on the unfiltered data.
user_counts = als_data.groupBy("userId").count()
item_counts = als_data.groupBy("movieId").count()

# Users and movies that reach their respective threshold.
active_users = user_counts.filter(F.col("count") >= min_user_ratings)
rated_items = item_counts.filter(F.col("count") >= min_item_ratings)

# Inner joins keep only ratings whose user and movie both reach the threshold;
# the final select drops the helper "count" columns brought in by the joins.
filtered_als_data = (
    als_data
    .join(active_users, "userId")
    .join(rated_items, "movieId")
    .select("userId", "movieId", "rating", "timestamp_orig")
)

In [None]:
def _build_index_mapping(ids_df, id_col, index_col):
    """Assign a dense 0-based index to every distinct value of ``id_col``.

    Args:
        ids_df: DataFrame that contains the column ``id_col``.
        id_col: Name of the column holding the original ids.
        index_col: Name of the new column that receives the 0-based index.

    Returns:
        A cached DataFrame with the columns ``id_col`` and ``index_col``.
        Indexes follow the ascending order of ``id_col``.
    """
    # The window has no partitioning, so Spark moves all distinct ids into a
    # single partition to number them (this is what the WindowExec warning
    # reports). Caching the result means that work is done once instead of
    # being repeated by every action that depends on the mapping.
    return (
        ids_df.select(id_col)
        .distinct()
        .withColumn(index_col, F.row_number().over(Window.orderBy(id_col)) - 1)
        .cache()
    )


# Build and save the user id mapping.
user_mapping = _build_index_mapping(filtered_als_data, "userId", "new_userId")
user_mapping.write.mode("overwrite").parquet("/app/src/batch/mappings/users.parquet")

# Build and save the movie id mapping.
movie_mapping = _build_index_mapping(filtered_als_data, "movieId", "new_movieId")
movie_mapping.write.mode("overwrite").parquet("/app/src/batch/mappings/movies.parquet")

# Map the training data onto the dense indexes; the new index columns take
# over the names "userId" and "movieId".
indexed_data = (
    filtered_als_data
    .join(user_mapping, "userId")
    .join(movie_mapping, "movieId")
    .select(
        F.col("new_userId").alias("userId"),
        F.col("new_movieId").alias("movieId"),
        "rating",
        "timestamp_orig",
    )
    .cache()
)

indexed_data.show(5)

25/11/18 15:49:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 15:49:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 15:49:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 15:50:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 15:50:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 15:50:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 1

+------+-------+------+--------------+
|userId|movieId|rating|timestamp_orig|
+------+-------+------+--------------+
|   147|   1061|   0.5|    1471747769|
|   147|   2275|   1.0|    1471747783|
|   147|   4413|   1.0|    1471747756|
|   495|   1523|   3.5|    1633649130|
|   832|   1523|   2.5|    1193952315|
+------+-------+------+--------------+
only showing top 5 rows



                                                                                

In [5]:
# Columns passed on to training; the timestamp column is left out.
als_columns = ["userId", "movieId", "rating"]
train_set = indexed_data.select(*als_columns)

# Number of rows in the indexed data.
total_count = indexed_data.count()

# Report both sizes (the first label is Vietnamese for "Total number of ratings").
print(f"Tổng số ratings: {total_count:,}")
print(f"Train Set : {train_set.count():,}")

25/11/18 15:55:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 15:55:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 15:55:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 15:55:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 15:55:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 15:55:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 1

Tổng số ratings: 31,981,597




Train Set : 31,981,597


                                                                                

In [6]:
# Number of partitions for the training data; the same value is used for the
# ALS user and item blocks so the three settings cannot drift apart.
num_partitions = 100

als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    nonnegative=False,
    rank=100,
    maxIter=15,
    regParam=0.05,
    numUserBlocks=num_partitions,
    numItemBlocks=num_partitions,
    intermediateStorageLevel="MEMORY_AND_DISK",
    finalStorageLevel="MEMORY_AND_DISK_SER",
    # Fixed seed so the random initialisation of the factors is the same on
    # every run and the saved model can be reproduced.
    seed=42,
)

# Repartition and cache the training data before fitting the model.
train_set_repartitioned = train_set.repartition(num_partitions).cache()
model_32m = als.fit(train_set_repartitioned)

25/11/18 15:55:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 15:55:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 15:55:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 15:55:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

In [7]:
# Destination for the fitted ALS model.
# NOTE(review): this is a Kaggle-style path, while the input data is read from
# HDFS and the id mappings are written under /app/src/batch -- confirm that
# this is the intended location.
model_path = "/kaggle/working/als_model_32m"

# Save the model, replacing any model already stored at that path.
model_writer = model_32m.write().overwrite()
model_writer.save(model_path)

                                                                                