### Khai báo thư viện (luôn để findspark ở đầu)

In [10]:
import findspark
findspark.init()
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession

# time: 0.2s

### Khởi tạo và cấu hình spark

In [11]:
# Build (or reuse) a local SparkSession: all available cores, 10g of driver
# memory, registered under the application name 'movieRecommenderPySpark'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "10g")
    .appName('movieRecommenderPySpark')
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and point its checkpoint
# directory at the relative path 'checkpoint'.
sc = spark.sparkContext
sc.setCheckpointDir('checkpoint')

# time: 10.5s

### Load data và cấu hình ALS

In [12]:
data_type = "small"

# Fixed seed shared by the train/test split and the ALS estimator so that a
# Restart & Run All reproduces the same split and the same fitted factors.
SEED = 42

# Read the ratings CSV with an explicit schema, keep only the three columns
# passed to ALS below, and cache the frame since both splits derive from it.
ratings = (
    spark.read.csv(
        path=f"../data/{data_type}/ratings.csv",
        sep=",", header=True, quote='"', schema="userId INT, movieId INT, rating DOUBLE, timestamp INT",
    ).select("userId", "movieId", "rating")
    .cache()
)

# Create training and test set (80/20 split); seeded so the split is repeatable
(training, test) = ratings.randomSplit([0.8, 0.2], seed=SEED)

# Build generic ALS model without tuned hyperparameters.
# coldStartStrategy="drop" is set so unseen users/items do not produce NaN
# predictions in the evaluation; explicit-feedback mode (implicitPrefs=False).
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", nonnegative=True,
          implicitPrefs=False, seed=SEED)

# Tell Spark how to evaluate model performance: RMSE between the "rating"
# label column and the "prediction" column.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

# time: 2.5s

### Train với tham số mặc định của ALS (rank 10, maxIter 10, regParam 0.1)

In [13]:
# Baseline: fit ALS on the training split with no hyperparameters overridden.
# The evaluation cell below reports the resulting settings as
# rank 10, maxIter 10, regParam 0.1.
normal_model = als.fit(training)

# time: 6.4s

### RMSE (default als param)

In [14]:
# Score the baseline model on the held-out test split.
predictions = normal_model.transform(test)
# Rows containing nulls are dropped before the RMSE is computed.
rmse = evaluator.evaluate(predictions.na.drop())

# Print evaluation metrics and model parameters.
# maxIter / regParam are read from the JVM-side parent estimator through the
# private _java_obj handle.
print("*Normal Model*")
print(f"RMSE: {rmse}")
print(f"Rank: {normal_model.rank}")
print(f"MaxIter: {normal_model._java_obj.parent().getMaxIter()}")
print(f"RegParam: {normal_model._java_obj.parent().getRegParam()}")

# time: 7.9s

*Noraml Model*
RMSE: 0.8708606240912293
Rank: 10
MaxIter: 10
RegParam: 0.1


### Tạo lưới tham số để tìm bộ tham số tốt nhất cho ALS

In [15]:
# Seed for the cross-validation fold assignment, so the same folds (and
# therefore the same selected hyperparameters) are produced on every run.
CV_SEED = 42

# Tell Spark what values to try for each hyperparameter:
# 2 ranks x 2 maxIter values x 2 regParam values = 8 combinations.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [5, 10])
    .addGrid(als.maxIter, [10, 20])
    .addGrid(als.regParam, [0.05, 0.1])
    .build()
)

# Build cross validation step using CrossValidator: every combination in the
# grid is scored with the RMSE evaluator over 5 folds.
cv = CrossValidator(estimator=als,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=5,
                    seed=CV_SEED)

# time: 0.4s

### Tính toán model (với từng giá trị rank, maxIter, regParam)

In [16]:
# Run the cross-validated grid search on the training split only; the test
# split is not passed in here and is used for evaluation in a later cell.
# This is the slow step of the notebook (see the recorded timing note below).
model = cv.fit(training)

# time: 5m16s

In [17]:
# Pull the model that cross-validation selected as best out of the fitted
# CrossValidator result.
best_model = model.bestModel

# time: 0.3s

In [18]:
# Score the cross-validation winner on the held-out test split; rows with
# nulls are removed before the RMSE is computed.
predictions = best_model.transform(test)
rmse = evaluator.evaluate(predictions.na.drop())

# Report the metric together with the winning hyperparameters, one per line.
# maxIter / regParam come from the JVM-side parent estimator via _java_obj.
print(
    "*Best Model*",
    f"RMSE: {rmse}",
    f"Rank: {best_model.rank}",
    f"MaxIter: {best_model._java_obj.parent().getMaxIter()}",
    f"RegParam: {best_model._java_obj.parent().getRegParam()}",
    sep="\n",
)

# time: 4.8s

*Best Model*
RMSE: 0.8726086770070626
Rank: 5
MaxIter: 20
RegParam: 0.1


### Save lại model để dùng lại

In [19]:
# Persist both fitted models so they can be reloaded later without retraining.
# Saving is off by default; set SAVE_MODELS to True to write them to disk.
SAVE_MODELS = False

if SAVE_MODELS:
    # overwrite() lets the cell be re-run when the target directories already exist.
    normal_model.write().overwrite().save("modelRecNormal")
    best_model.write().overwrite().save("modelRecBest")