In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import sys
import os
sys.path.append(os.path.abspath("../.."))  # Adds the project root to sys.path
import pyspark
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType
from recommenders.datasets.spark_splitters import spark_random_split
from recommenders.utils.timer import Timer
import utils

In [2]:
# Column names of the ratings data; used when configuring ALS and when
# selecting/joining the prediction columns.
COL_USER, COL_ITEM, COL_RATING = "UserID", "ProductID", "Rating"

# JSON dataset file handed to the CSV-export helper.
DATA_FILE_PATH = "tools_recommendation_dataset.json"

In [3]:
# Build the input data with the project-local `utils` helpers.
# NOTE(review): generate_fake_data() is called without a path; presumably it
# writes the file named by DATA_FILE_PATH — confirm in utils.
utils.generate_fake_data()
# The recorded output of this cell reports users.csv, products.csv,
# userPurchases.csv and userReviews.csv being saved (presumably by this call,
# derived from the JSON at DATA_FILE_PATH — confirm in utils).
utils.generate_csv_arrays_from_json(DATA_FILE_PATH)

Dataset saved.
Saved users.csv
Saved products.csv
Saved userPurchases.csv
Saved userReviews.csv


In [4]:
# These settings work well for debugging locally on a VM; change them when
# running on a cluster.
# Start (or reuse) a Spark session through the project helper, passing a "16g"
# memory setting.
# NOTE(review): the original note says this sets up one large executor with
# many threads; that depends on utils.start_or_get_spark — confirm there.
spark = utils.start_or_get_spark("ALS PySpark", memory="16g")
# Turn off Spark's ambiguous self-join analysis check. This is only needed when
# a join condition compares columns taken from two DataFrames that share
# lineage (e.g. `dfs_pred[col] == train[col]`, where dfs_pred is derived from
# train); joins expressed on column names do not require it.
spark.conf.set("spark.sql.analyzer.failAmbiguousSelfJoin", "false")

In [5]:
def _load_csv(path):
    """Read one CSV file with a header row, inferred column types and multi-line field support."""
    reader = spark.read.option("multiLine", "true")
    return reader.csv(path, header=True, inferSchema=True)


# One DataFrame per CSV file, all read with the same options.
users_df = _load_csv("users.csv")
products_df = _load_csv("products.csv")
purchases_df = _load_csv("userPurchases.csv")
reviews_df = _load_csv("userReviews.csv")

In [6]:
# Randomly split the reviews with ratio 0.75 and a fixed seed.
train, test = spark_random_split(reviews_df, ratio=0.75, seed=123)

# Cache each split, then count it; count() is the action that triggers evaluation.
train_count = train.cache().count()
print("N train", train_count)
test_count = test.cache().count()
print("N test", test_count)

N train 2971
N test 1006


In [7]:
# Column mapping handed to the estimator (user / item / rating column names).
header = dict(userCol=COL_USER, itemCol=COL_ITEM, ratingCol=COL_RATING)

# Hyperparameters for an explicit-feedback ALS model (implicitPrefs=False).
# coldStartStrategy="drop" discards rows whose prediction would be NaN.
als_params = dict(
    rank=10,
    maxIter=15,
    implicitPrefs=False,
    regParam=0.05,
    coldStartStrategy="drop",
    nonnegative=False,
    seed=42,
)

als = ALS(**als_params, **header)

In [8]:
# Fit the ALS estimator on the training split, timing the call with the
# Timer context manager imported from recommenders.utils.timer.
with Timer() as train_time:
    model = als.fit(train)

# The elapsed time is read from train_time.interval after the block exits.
print("Took {} seconds for training.".format(train_time.interval))

Took 6.583877100027166 seconds for training.


In [9]:
# Score every (user, item) pair that can be formed from the training data and
# keep only the pairs the user has not already rated. The whole pipeline is
# timed with the Timer context manager.
with Timer() as test_time:

    # Build the cross join of all distinct users and items seen in training,
    # then score each pair with the fitted model.
    users = train.select(COL_USER).distinct()
    items = train.select(COL_ITEM).distinct()
    user_item = users.crossJoin(items)
    dfs_pred = model.transform(user_item)

    # Remove seen items with a left anti join on the key columns: a scored pair
    # is kept only when `train` has no row with the same user and item.
    # Joining by column name (instead of `dfs_pred[col] == train[col]`) avoids
    # ambiguous column references — dfs_pred is derived from train — so this
    # does not rely on Spark's ambiguous self-join check being disabled. It
    # also avoids a full outer join followed by a null filter, which would
    # treat a seen pair with a null rating as unseen.
    dfs_pred_exclude_train = dfs_pred.join(
        train.select(COL_USER, COL_ITEM),
        on=[COL_USER, COL_ITEM],
        how="left_anti",
    )

    top_all = dfs_pred_exclude_train.select(COL_USER, COL_ITEM, "prediction")

    # Spark transformations are evaluated lazily, so run an action (count)
    # inside the timed block to force execution and measure the real
    # prediction time. cache() keeps the result for the cells that follow.
    top_all.cache().count()

print("Took {} seconds for prediction.".format(test_time.interval))

Took 3.7692807000130415 seconds for prediction.


In [10]:
# Preview the unseen-item predictions. show() prints only the first 20 rows,
# and no ordering has been applied to top_all, so these are not necessarily
# the highest-scored pairs.
top_all.show()

+------+---------+-----------+
|UserID|ProductID| prediction|
+------+---------+-----------+
|     2|       80|-0.08209327|
|     3|       22|  1.0215471|
|     3|       57|  0.7545142|
|     3|       89|  1.6685169|
|     7|       55|  2.3551004|
|     8|       52| -0.1394828|
|    10|       85|  0.8987014|
|    15|       14|  2.8720644|
|    15|       26|  2.5398207|
|    18|       68|   2.354938|
|    18|       95|   2.242795|
|    22|       53|   2.260746|
|    25|       61|  1.7027037|
|    27|       65|   4.071296|
|    28|       16|-0.24412246|
|    28|       63|  0.4042694|
|    32|       26|  1.7833146|
|    32|       79|  2.3801322|
|    33|       37|  1.6468672|
|    35|       45|  1.8626484|
+------+---------+-----------+
only showing top 20 rows

