# Running ALS on Outfit Recommendation (PySpark)

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import sys
import pyspark
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType

from recommenders.utils.timer import Timer
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.spark_splitters import spark_random_split
from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation
from recommenders.utils.spark_utils import start_or_get_spark
from recommenders.utils.notebook_utils import store_metadata

# Dataset
from datasets import outfits

print(f"System version: {sys.version}")
print("Spark version: {}".format(pyspark.__version__))


System version: 3.9.23 | packaged by conda-forge | (main, Jun  4 2025, 18:02:02) 
[Clang 18.1.8 ]
Spark version: 4.0.0


In [2]:
# top k items to recommend
TOP_K = 1

OUTFITS_DATA_SIZE = '100'

# Column names for the dataset
COL_USER = "UserId"
COL_ITEM = "Clothing"
COL_RATING = "Rating"
COL_WEATHER = "Weather"
COL_ITEM_ID = "ClothingId"

In [3]:
# Start Spark session
spark = start_or_get_spark("ALS PySpark", memory="16g")
spark.conf.set("spark.sql.analyzer.failAmbiguousSelfJoin", "false")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/05 10:37:41 WARN Utils: Your hostname, Khangs-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.24.77.42 instead (on interface en0)
25/10/05 10:37:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/05 10:37:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### 1. Download Dataset

In [4]:
schema = StructType(
    (
        StructField(COL_USER, IntegerType()),
        StructField(COL_WEATHER, StringType()),
        StructField(COL_ITEM, StringType()),
        StructField(COL_RATING, FloatType()),
    )
)

data = outfits.load_spark_df(spark, size=None, schema=schema, filepath="./datasets/csv/example_feature1.csv")
data.show()

indexer = StringIndexer(inputCol=COL_ITEM, outputCol=COL_ITEM_ID)
indexed_data = indexer.fit(data).transform(data)
print("Data after indexing the 'Clothing' column:")
indexed_data.show()


+------+-------+-----------------+------+
|UserId|Weather|         Clothing|Rating|
+------+-------+-----------------+------+
|     1|  Humid|           Blazer|   2.2|
|     1|  Rainy|           Blazer|   2.8|
|     1|  Sunny|           Hoodie|   2.5|
|     1|  Sunny|            Jeans|   3.8|
|     2| Cloudy|           Hoodie|   4.1|
|     2| Cloudy|            Jeans|   3.8|
|     2|  Humid|         Cardigan|   4.0|
|     2|  Humid|          Joggers|   3.7|
|     2|  Rainy|Long-sleeve shirt|   3.8|
|     2|  Rainy|          T-shirt|   3.6|
|     2|  Snowy|           Hoodie|   3.7|
|     2|  Sunny|            Jeans|   3.5|
|     2|  Sunny|             Polo|   4.3|
|     2|  Windy|           Shorts|   3.0|
|     3|  Humid|           Chinos|   3.7|
|     3|  Rainy|Long-sleeve shirt|   3.2|
|     3|  Rainy|             Polo|   2.8|
|     3|  Snowy|             Coat|   4.8|
|     3|  Snowy|Long-sleeve shirt|   2.7|
|     3|  Sunny|           Shorts|   4.1|
+------+-------+-----------------+

### 2. Splitting the Data

In [5]:
train, test = spark_random_split(indexed_data, ratio=0.75, seed=123)
print ("N train", train.cache().count())
print ("N test", test.cache().count())

N train 77
N test 23


### 3. Training the Model and Getting Our Predictions

In [8]:
header = {
    "userCol": COL_USER,
    "itemCol": COL_ITEM_ID,
    "ratingCol": COL_RATING,
}


als = ALS(
    rank=10,
    maxIter=15,
    implicitPrefs=False,
    regParam=0.05,
    coldStartStrategy='drop',
    nonnegative=False,
    seed=42,
    **header
)

In [9]:
with Timer() as train_time:
    model = als.fit(train)

print("Took {} seconds for training.".format(train_time.interval))

25/10/05 10:38:00 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/10/05 10:38:00 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


Took 1.9142997500000014 seconds for training.


In [10]:
# with Timer() as test_time:

#     # Get the cross join of all user-item pairs and score them.
#     users = train.select(COL_USER).distinct()
#     items = train.select(COL_ITEM).distinct()
#     user_item = users.crossJoin(items)
#     dfs_pred = model.transform(user_item)

#     # Remove seen items.
#     dfs_pred_exclude_train = dfs_pred.alias("pred").join(
#         train.alias("train"),
#         (dfs_pred[COL_USER] == train[COL_USER]) & (dfs_pred[COL_ITEM] == train[COL_ITEM]),
#         how='outer'
#     )

#     top_all = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[f"train.{COL_RATING}"].isNull()) \
#         .select('pred.' + COL_USER, 'pred.' + COL_ITEM, 'pred.' + "prediction")

#     # In Spark, transformations are lazy evaluation
#     # Use an action to force execute and measure the test time 
#     top_all.cache().count()

# print("Took {} seconds for prediction.".format(test_time.interval))

with Timer() as test_time:
    users = train.select(COL_USER).distinct()

    items = train.select(COL_ITEM_ID).distinct()

    user_item = users.crossJoin(items)
    dfs_pred = model.transform(user_item)

    top_all = dfs_pred.join(
        indexed_data.select(COL_USER, COL_ITEM_ID),
        on=[COL_USER, COL_ITEM_ID],
        how='left_anti'
    )

    # Force execution to measure the time
    top_all.cache().count()

print("Took {} seconds for prediction.".format(test_time.interval))



Took 1.5354195419999996 seconds for prediction.


                                                                                

In [11]:
top_all.show()

+------+----------+----------+
|UserId|ClothingId|prediction|
+------+----------+----------+
|    12|         8| 0.8833362|
|     1|         8| 2.4692569|
|    13|         8| 1.5551647|
|    16|         8| 1.8367697|
|     3|         8|0.74171567|
|    20|         8| 1.3612454|
|    19|         8| 2.3161528|
|    15|         8| 1.9725528|
|     9|         8|  2.264041|
|     4|         8|  2.011328|
|     8|         8| 2.3636644|
|     7|         8| 2.2307923|
|    11|         8| 2.1407385|
|    18|         8| 2.1490989|
|    13|         0| 3.3109655|
|     6|         0| 2.5150957|
|     3|         0|  3.050418|
|     5|         0|  4.445485|
|    15|         0|  4.336204|
|    17|         0| 2.2741568|
+------+----------+----------+
only showing top 20 rows


### 5. Evaluation

In [13]:
rank_eval = SparkRankingEvaluation(test, top_all, k = TOP_K, col_user=COL_USER, col_item=COL_ITEM_ID, 
                                    col_rating=COL_RATING, col_prediction="prediction", 
                                    relevancy_method="top_k")

                                                                                

In [14]:
print("Model:\tALS",
      "Top K:\t%d" % rank_eval.k,
      "MAP:\t%f" % rank_eval.map_at_k(),
      "NDCG:\t%f" % rank_eval.ndcg_at_k(),
      "Precision@K:\t%f" % rank_eval.precision_at_k(),
      "Recall@K:\t%f" % rank_eval.recall_at_k(), sep='\n')

Model:	ALS
Top K:	1
MAP:	0.000000
NDCG:	0.000000
Precision@K:	0.000000
Recall@K:	0.000000


### 5. Evaluate Rating Prediction

In [15]:
# Generate predicted ratings.
prediction = model.transform(test)
prediction.cache().show()


+------+-------+-----------------+------+----------+----------+
|UserId|Weather|         Clothing|Rating|ClothingId|prediction|
+------+-------+-----------------+------+----------+----------+
|    12|  Snowy|          Joggers|   2.9|         7| 1.8935436|
|    12|  Windy|           Hoodie|   3.7|         0| 3.3782058|
|    12|  Windy|       Sweatshirt|   4.4|         1| 3.0695956|
|     1|  Sunny|           Hoodie|   2.5|         0| 3.9122157|
|     6|  Snowy|          T-shirt|   1.0|         8|  2.972186|
|    16|  Rainy|           Chinos|   3.8|         4|  2.062534|
|     3|  Snowy|Long-sleeve shirt|   2.7|         2| 3.2212994|
|     5|  Windy|          T-shirt|   3.7|         8| 2.2005248|
|    19|  Windy|           Shorts|   2.5|        11|  2.477874|
|     9|  Sunny|Long-sleeve shirt|   3.5|         2| 3.4481497|
|    17|  Snowy|       Sweatshirt|   5.0|         1| 2.0359073|
|     4|  Sunny|           Chinos|   3.5|         4| 1.7861462|
|     8|  Rainy|           Blazer|   2.2

In [16]:
rating_eval = SparkRatingEvaluation(test, prediction, col_user=COL_USER, col_item=COL_ITEM, 
                                    col_rating=COL_RATING, col_prediction="prediction")

print("Model:\tALS rating prediction",
      "RMSE:\t%f" % rating_eval.rmse(),
      "MAE:\t%f" % rating_eval.mae(),
      "Explained variance:\t%f" % rating_eval.exp_var(),
      "R squared:\t%f" % rating_eval.rsquared(), sep='\n')

Model:	ALS rating prediction
RMSE:	1.252579
MAE:	1.019917
Explained variance:	-0.762224
R squared:	-0.906327


In [17]:
# cleanup spark instance
spark.stop()