### Set Up Notebook

#### Import Required Modules

In [0]:
# Uncomment and run once on a fresh cluster to install the notebook's third-party dependencies.
# NOTE(review): later cells call `pinecone.init(...)` and `pinecone.Index(...)`; newer major releases of the
# Pinecone client reportedly dropped that module-level API, so consider pinning the version here — TODO confirm.
# %pip install pinecone-client python-dotenv

In [0]:
import os
import json

from typing import List, Dict
from dotenv import load_dotenv

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window

from pyspark.ml.recommendation import ALS

#### Get or Create SparkSession

In [0]:
# Attach to the active Spark session, creating one if none exists yet.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Set the Spark log level to "error" to keep cell output readable.
spark.sparkContext.setLogLevel("error")

# Display the session as the cell's output.
spark

#### Load Secrets as Environment Variables

In [0]:
# Load the secrets file into the process environment; a later cell reads
# PINECONE_API_KEY and PINECONE_ENVIRONMENT from os.environ.
load_dotenv(dotenv_path="/dbfs/FileStore/env/.env")

Out[6]: True

#### Set Workflow Constants

In [0]:
# Lowest rating kept as feedback; kept ratings are also shifted so that MIN_RATING becomes 1.
MIN_RATING = 3
# Minimum number of kept ratings a movie needs to stay in the model frame.
MIN_USERS = 10
# Minimum number of kept ratings a user needs to stay in the model frame.
MIN_MOVIES = 10
# ALS rank, i.e. the length of every user/movie embedding (also the index dimension).
DIMENSION = 32
# Name of the Pinecone index that receives the embeddings.
INDEX_NAME = "collaborative-embeddings"
# DBFS directory holding the clean parquet datasets read and written by this notebook.
CLN_PATH = "dbfs:/FileStore/data/clean"

### Create Model Frame 

#### Import Clean Data

In [0]:
# Movie metadata from the clean layer; preview the first rows.
movies = spark.read.format("parquet").load(os.path.join(CLN_PATH, "movies"))
movies.show(n=10)

+-------+--------------------+------------+-------+--------------------+--------------------+---------+---------+----------+------------+----------+--------------------+
|tmdb_id|               title|release_date|runtime|              genres|            overview|   budget|  revenue|popularity|vote_average|vote_count|            keywords|
+-------+--------------------+------------+-------+--------------------+--------------------+---------+---------+----------+------------+----------+--------------------+
|  10010|      Brother Bear 2|  2006-08-17|   73.0|[Adventure, Anima...|Kenai finds his c...|     null|     null| 10.861154|         6.3|       318|[grizzly bear, hu...|
|  10012|              Cursed|  2005-02-25|   97.0|    [Horror, Comedy]|A werewolf loose ...| 35000000| 19294901|  8.949722|         5.1|       168|[brother sister r...|
| 100402|Captain America: ...|  2014-03-20|  136.0|[Action, Adventur...|After the catacly...|170000000|714766572| 18.717704|         7.6|      5881|[w

In [0]:
# User/movie ratings from the clean layer; preview the first rows.
ratings = spark.read.format("parquet").load(os.path.join(CLN_PATH, "ratings"))
ratings.show(n=10)

+-------+-------+------+-------------------+
|user_id|tmdb_id|rating|          timestamp|
+-------+-------+------+-------------------+
|  42017|     14|   4.0|2015-09-10 22:15:39|
|  54926|    165|   2.5|2006-03-19 20:54:43|
|  47629|   1637|   4.0|1996-07-01 14:27:03|
|  68508|  13920|   5.0|2003-11-14 01:18:56|
|  59910|  24746|   4.0|2017-07-29 19:41:07|
|  50057|  26195|   4.0|2006-10-22 17:38:40|
|  61522|   1621|   3.0|2001-07-04 22:26:29|
|  64941|    873|   5.0|2010-07-08 17:55:35|
|  53592|  10110|   5.0|2001-07-17 15:55:23|
|  56874| 150540|   5.0|2016-07-15 16:21:38|
+-------+-------+------+-------------------+
only showing top 10 rows



#### Create an Implicit Feedback Recommender Model Frame

In [0]:
# Build the implicit-feedback training frame:
#   1. keep only ratings whose movie is present in `movies`,
#   2. keep ratings at or above MIN_RATING,
#   3. count the kept ratings per movie and per user (both counts are taken
#      before either threshold is applied), then drop rows below the thresholds,
#   4. shift ratings so that MIN_RATING maps to 1 and cast both ids to integers.
model_frame = (
    ratings
    .join(movies, on='tmdb_id', how='inner')
    .select('user_id', 'tmdb_id', 'rating', 'timestamp')
    .where(f.col('rating') >= MIN_RATING)
    .withColumn('cnt_users', f.count('*').over(Window.partitionBy('tmdb_id')))
    .withColumn('cnt_movies', f.count('*').over(Window.partitionBy('user_id')))
    .where((f.col('cnt_users') >= MIN_USERS) & (f.col('cnt_movies') >= MIN_MOVIES))
    .withColumn('rating', f.col('rating') - (MIN_RATING - 1))
    .withColumn('user_id', f.col('user_id').cast('INTEGER'))
    .withColumn('tmdb_id', f.col('tmdb_id').cast('INTEGER'))
    .cache()
)

# Summary row: distinct users, distinct movies and total rating rows.
counts = model_frame.agg(
    f.countDistinct('user_id').alias('users'),
    f.countDistinct('tmdb_id').alias('movies'),
    f.count('*').alias('ratings'),
).first()

# Sparsity is the share of the user x movie grid that has no rating.
sparsity = round(
    1 - (counts['ratings'] / (counts['users'] * counts['movies'])),
    4,
)
print(f"users={counts['users']} movies={counts['movies']} ratings={counts['ratings']} sparsity={sparsity}")
model_frame.show(10, truncate=False)

users=195899 movies=6235 ratings=15372729 sparsity=0.9874
+-------+-------+------+-------------------+---------+----------+
|user_id|tmdb_id|rating|timestamp          |cnt_users|cnt_movies|
+-------+-------+------+-------------------+---------+----------+
|100010 |11     |3.0   |1997-02-21 13:37:24|71403    |27        |
|100010 |10534  |3.0   |1997-02-21 13:41:49|4056     |27        |
|100010 |36915  |1.0   |1997-02-21 13:44:51|622      |27        |
|100010 |11860  |1.0   |1997-02-21 13:36:43|12818    |27        |
|100010 |1645   |2.0   |1997-02-21 13:37:54|12426    |27        |
|100010 |9294   |2.0   |1997-02-21 13:38:28|12200    |27        |
|100010 |63     |2.0   |1997-02-21 13:35:32|49933    |27        |
|100010 |252    |2.0   |1997-02-21 13:36:43|29169    |27        |
|100010 |954    |1.0   |1997-02-21 13:35:33|37731    |27        |
|100010 |602    |1.0   |1997-02-21 13:35:32|45885    |27        |
+-------+-------+------+-------------------+---------+----------+
only showing top 1

In [0]:
# Persist the training frame under CLN_PATH in 10 partitions, replacing any earlier copy.
(
    model_frame
    .repartition(10)
    .write
    .mode("overwrite")
    .parquet(os.path.join(CLN_PATH, "model_frame"))
)

### Train the ALS Recommender Model and Update User/Movie Embeddings

#### Train the ALS Model

In [0]:
# ALS starts from randomly initialised factors. Without a fixed seed every run
# produces different embeddings, so the vectors upserted below would not be
# comparable between runs.
ALS_SEED = 42

# Implicit-feedback ALS: `rating` is treated as a preference strength, and
# `rank` sets the length of each user/movie factor vector.
estimator = ALS(
    rank=DIMENSION,
    maxIter=15,
    regParam=0.1,
    implicitPrefs=True,
    userCol='user_id',
    itemCol='tmdb_id',
    ratingCol='rating',
    seed=ALS_SEED,
)

# Fit on the cached model frame; the fitted model exposes userFactors / itemFactors.
transformer = estimator.fit(model_frame)
transformer

Out[31]: ALSModel: uid=ALS_68021d38058b, rank=32

#### Extract the Embeddings from the Fitted Model and Format for Index Insertion

In [0]:
def _to_index_vectors(factors) -> List[Dict]:
    """Convert an ALS factor frame into the record list handed to ``index.upsert``.

    Args:
        factors: Spark DataFrame with an ``id`` column and a ``features`` array
            column (``transformer.userFactors`` or ``transformer.itemFactors``).

    Returns:
        One ``{"id": <str>, "values": <list of float>}`` dict per factor row.
    """
    factors_pdf = (
        factors
        .select(
            f.col('id').cast('string').alias('id'),
            f.col('features').alias('values'),
        )
        .toPandas()
    )
    # Depending on whether Arrow conversion is enabled, each `values` cell may
    # arrive as a NumPy array or as a plain list. Iterating works for both and
    # yields plain Python floats, whereas `.tolist()` only exists on arrays.
    return [
        {"id": vector_id, "values": [float(value) for value in values]}
        for vector_id, values in zip(factors_pdf["id"], factors_pdf["values"])
    ]


user_embeddings = _to_index_vectors(transformer.userFactors)
movie_embeddings = _to_index_vectors(transformer.itemFactors)
len(user_embeddings), len(movie_embeddings)

Out[32]: (195899, 6235)

#### Insert the User/Movie Embeddings into the Pinecone Index

In [0]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
import pinecone

# Configure the client from the environment variables loaded earlier, so no
# credential is written into the notebook itself.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"],
)

In [0]:
# One-off index setup, kept commented out so that re-running the notebook does not touch the index itself.
# NOTE(review): `delete_index` presumably removes the existing index together with its vectors — uncomment with care.
# pinecone.delete_index(INDEX_NAME)
# pinecone.create_index(name=INDEX_NAME, dimension=DIMENSION, metric='cosine', pods=1, replicas=1, pod_type="p1")

In [0]:
# Open a handle to the index named by INDEX_NAME and display its stats before inserting anything.
index = pinecone.Index(INDEX_NAME)
index.describe_index_stats()

Out[41]: {'dimension': 32,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [0]:
# Upload the user embeddings into the "users" namespace with batch_size=100.
index.upsert(
    vectors=user_embeddings,
    namespace="users",
    batch_size=100,
)

Upserted vectors:   0%|          | 0/195899 [00:00<?, ?it/s]

Out[42]: {'upserted_count': 195899}

In [0]:
# Upload the movie embeddings into the "movies" namespace with batch_size=100.
index.upsert(
    vectors=movie_embeddings,
    namespace="movies",
    batch_size=100,
)

Upserted vectors:   0%|          | 0/6235 [00:00<?, ?it/s]

Out[43]: {'upserted_count': 6235}

In [0]:
# Re-check the index stats after both upserts; the per-namespace counts should match the embedding list lengths.
index.describe_index_stats()

Out[44]: {'dimension': 32,
 'index_fullness': 0.0,
 'namespaces': {'movies': {'vector_count': 6235},
                'users': {'vector_count': 195899}},
 'total_vector_count': 202134}

#### Spot Check the Collaborative Filtering Embeddings with Sample Queries

In [0]:
# Browse candidate ids for the spot check below.
# NOTE(review): the displayed order (10003, 100042, 10005) looks lexicographic, which
# suggests `tmdb_id` is stored as a string in `movies` — confirm against the schema.
movies.orderBy('tmdb_id').show(100)

+-------+--------------------+------------+-------+--------------------+--------------------+---------+---------+----------+------------+----------+--------------------+
|tmdb_id|               title|release_date|runtime|              genres|            overview|   budget|  revenue|popularity|vote_average|vote_count|            keywords|
+-------+--------------------+------------+-------+--------------------+--------------------+---------+---------+----------+------------+----------+--------------------+
|  10003|           The Saint|  1997-04-03|  116.0|[Thriller, Action...|Ivan Tretiak, Rus...| 68000000|118063304|  10.97633|         5.9|       310|[berlin, russia, ...|
| 100042|  Dumb and Dumber To|  2014-11-12|  110.0|            [Comedy]|20 years after th...| 40000000|169837010| 15.402597|         5.4|      1140|[friendship, sequ...|
|  10005|Behind Enemy Line...|  2006-10-17|   96.0|[Action, Adventur...|Navy SEALS, heade...|     null|     null| 11.063562|         4.5|        31|[n

In [0]:
# Movie whose nearest neighbours in the "movies" namespace we want to inspect.
tmdb_id = '10096'

# Query by the stored vector's id; the response lists the closest vectors with their scores.
query_response = index.query(namespace="movies", id=tmdb_id, top_k=10)

# Build the frame with an explicit schema instead of inferring it from dicts:
# inference fails on an empty match list, and the explicit string id keeps the
# join key's type independent of what the client returns.
matches = spark.createDataFrame(
    [(item["id"], float(item["score"])) for item in query_response['matches']],
    schema="tmdb_id string, score double",
)

# Attach movie metadata to each match and show the closest first.
movies.join(matches, on='tmdb_id', how='inner').sort(f.desc('score')).show()

+-------+--------------------+------------+-------+--------------------+--------------------+--------+---------+----------+------------+----------+--------------------+-----------+
|tmdb_id|               title|release_date|runtime|              genres|            overview|  budget|  revenue|popularity|vote_average|vote_count|            keywords|      score|
+-------+--------------------+------------+-------+--------------------+--------------------+--------+---------+----------+------------+----------+--------------------+-----------+
|  10096|      13 Going on 30|  2004-04-13|   98.0|[Comedy, Fantasy,...|After total humil...|37000000| 96455697| 12.632595|         6.3|      1260|[new york, photog...|        1.0|
|   9919|How to Lose a Guy...|  2003-02-07|  116.0|   [Comedy, Romance]|An advice columni...|50000000|177371441|  5.188107|         6.3|       873|[new york, bet, j...|0.959636867|
|   9779|The Sisterhood of...|  2005-06-01|  119.0|     [Drama, Comedy]|Four best friends...|25