In [1]:
import findspark
findspark.init('/home/gerardo-rodriguez/spark-4.0.0-bin-hadoop3')

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ALS').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/27 11:38:58 WARN Utils: Your hostname, Lanz-Lenovo, resolves to a loopback address: 127.0.1.1; using 192.168.1.145 instead (on interface wlp2s0)
25/08/27 11:38:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/27 11:38:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/27 11:39:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/08/27 11:39:00 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/08/27 11:39:00 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, LongType

In [4]:
schema = StructType([
    StructField('user_id', LongType(), True),
    StructField('name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('country', StringType(), True),
    StructField('film_id', StringType(), True),
    StructField('title', StringType(), True),
    StructField('genre', StringType(), True),
    StructField('duration', StringType(), True),
    StructField('rating', DoubleType(), True)
])

In [5]:
df = spark.read.csv('../ratings_netlfix/', schema=schema)

In [6]:
df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- country: string (nullable = true)
 |-- film_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- rating: double (nullable = true)



In [7]:
from pyspark.sql.functions import col, round, regexp_replace, when

In [8]:
df = df.withColumn('rating', round('rating'))
df = df.withColumn('film_id', regexp_replace(col('film_id'), 's', '').cast('int'))

In [9]:
df.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- country: string (nullable = true)
 |-- film_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- rating: double (nullable = true)



### Create ALS

In [14]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
import numpy as np

In [11]:
train_data, test_data = df.randomSplit([0.8,0.2])

In [46]:
als = ALS(userCol='user_id', itemCol='film_id', ratingCol='rating',seed=1, coldStartStrategy='drop', implicitPrefs=True)

## Option 1 for find the best value

In [66]:
def best_params():
    reg_params = np.arange(0.01, 0.2, 0.01)
    results = {}
    
    for reg_param in reg_params:
        als.setRegParam(reg_param)
        model = als.fit(train_data)
        predictions = model.transform(test_data)
        
        rmse_evaluator = RegressionEvaluator(labelCol='rating', predictionCol='prediction', metricName='rmse')
        mae_evaluator = RegressionEvaluator(labelCol='rating', predictionCol='prediction', metricName='mae')
        
        rmse = rmse_evaluator.evaluate(predictions)
        mae = mae_evaluator.evaluate(predictions)
        
        results[reg_param] = {'rmse': rmse, 'mae': mae}
    
    best_reg_param = min(results, key=lambda x: results[x]['rmse'])
    best_rmse = results[best_reg_param]['rmse']
    
    print(f'Best reg_param: {best_reg_param} with RMSE: {best_rmse:.4f}')
    
    return best_reg_param

In [67]:
def best_rank_params():
    rank_array = [x for x in range(10, 40, 10)]
    
    rank_dict = {}
    
    for rank in rank_array:
        als.setRank(rank)
        model = als.fit(train_data)
        pred = model.transform(test_data)
        
        evaluator = RegressionEvaluator(labelCol='rating', predictionCol='prediction')
        rmse = evaluator.evaluate(pred)
        
        rank_dict[rank] = rmse
    
    best_rank = min(rank_dict, key=rank_dict.get)
    best_rmse = rank_dict[best_rank]
    
    print(f'Best rank: {best_rank} with RMSE: {best_rmse:.4f}')
    
    return best_rank

In [28]:
reg_param = best_params()

                                                                                

Best reg_param: 0.1 with RMSE: 2.9162


In [29]:
rank = best_rank_params()

Best rank: 20 with RMSE: 2.9167


## option 2 for find the best value

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

paramGrid = ParamGridBuilder() \
    .addGrid(als.rank, [10, 20, 30]) \
    .addGrid(als.regParam, [0.01, 0.05, 0.1]) \
    .addGrid(als.maxIter, [10, 20]) \
    .build()

cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3
)

model_cv = cv.fit(train_data)
best_model = model_cv.bestModel

In [None]:

print("Best rank:", best_model.rank)
print("Best regParam:", best_model._java_obj.parent().getRegParam())
print("Best maxIter:", best_model._java_obj.parent().getMaxIter())

In [69]:
als = als.setRegParam(reg_param).setRank(rank).setMaxIter(20)

In [70]:
model = als.fit(train_data)

In [71]:
prediction = model.transform(test_data)
prediction.select('rating', 'prediction').orderBy(col('prediction').desc()).show(5)

                                                                                

+------+----------+
|rating|prediction|
+------+----------+
|   5.0|0.47612974|
|   2.0|0.37992322|
|   2.0| 0.3169993|
|   1.0|0.29991472|
|   0.0|  0.219556|
+------+----------+
only showing top 5 rows


In [72]:
eva = RegressionEvaluator(metricName='rmse', predictionCol='prediction', labelCol='rating')

print('RMSE')
eva.evaluate(prediction)

RMSE


2.915936487658402

## Test For users

In [73]:
user_data = test_data.filter(test_data['user_id'] == 100).select('user_id', 'film_id')
prediction_user = model.transform(user_data)
prediction_user.orderBy('prediction', ascending=False).show()

+-------+-------+-----------+
|user_id|film_id| prediction|
+-------+-------+-----------+
|    100|   1244|0.002501208|
+-------+-------+-----------+



## Justificación de resultados en ALS

El modelo de recomendación basado en ALS (Alternating Least Squares) presentó métricas de predicción bajas (RMSE alto, R² bajo o negativo). Esto no se debe a un error en la implementación del algoritmo, sino a las características propias del dataset utilizado:

Datos aleatorios:
Las calificaciones fueron generadas de manera completamente aleatoria, sin patrones reales de preferencia entre usuarios y películas. ALS aprende a partir de correlaciones usuario–ítem, por lo que en ausencia de éstas, no puede generalizar adecuadamente.

Escasez de interacciones:
Algunos usuarios y películas poseen pocas interacciones, lo que genera mayor dispersión en la matriz de usuario–ítem y limita el aprendizaje del modelo.

Dependencia de hiperparámetros:
Aunque ALS permite mejorar el rendimiento mediante ajuste de parámetros como rank, regParam y maxIter, en este caso la naturaleza aleatoria de los datos impide obtener mejoras significativas.

En conclusión, los valores bajos obtenidos reflejan la falta de relación lógica en los datos y no una deficiencia en el modelo. En un escenario real, con calificaciones de usuarios auténticos, se esperaría que ALS detecte patrones de consumo y logre un desempeño mucho más robusto.

## Justification of ALS Results

The recommendation model based on ALS (Alternating Least Squares) showed low prediction performance (high RMSE, low or even negative R²). This outcome is not due to an error in the algorithm implementation, but rather to the characteristics of the dataset used:

Random data:
The ratings were generated completely at random, with no real patterns of user–movie preferences. Since ALS learns from user–item correlations, the absence of such patterns prevents the model from generalizing effectively.

Sparse interactions:
Some users and movies have very few interactions, which increases the sparsity of the user–item matrix and limits the model’s learning capacity.

Hyperparameter sensitivity:
Although ALS performance can be improved through parameter tuning (rank, regParam, maxIter), in this case the random nature of the dataset prevents significant improvements.

In conclusion, the low values obtained reflect the lack of logical relationships in the dataset, rather than a weakness in the model itself. In a real-world scenario with genuine user ratings, ALS would be expected to detect consumption patterns and achieve much more reliable performance.