### Notebook Set-Up

#### Import Required Modules

In [0]:
# Uncomment to install the third-party packages behind this notebook's
# `pinecone` and `dotenv` imports into the kernel environment.
# %pip install pinecone-client python-dotenv

In [0]:
import os
import json

from typing import List, Dict
from dotenv import load_dotenv

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window

from pyspark.ml.recommendation import ALS

#### Get or Create SparkSession

In [0]:
# Reuse the active SparkSession when one exists; otherwise build a new one.
spark = SparkSession.builder.getOrCreate()
# Only log at error level so Spark messages do not flood the cell outputs.
spark.sparkContext.setLogLevel("error")
# Bare expression: lets the notebook render the session object.
spark

#### Load Secrets as Environment Variables

In [0]:
# Load the secrets stored in the DBFS-hosted .env file as environment variables;
# the Pinecone cell later reads PINECONE_API_KEY and PINECONE_ENVIRONMENT from os.environ.
load_dotenv("/dbfs/FileStore/env/.env")

Out[8]: True

#### Set Workflow Constants

In [0]:
# Thresholds and model/index settings shared by the cells below.

# Metadata rows with a release_date earlier than this are dropped.
MIN_RELEASE_DATE = '1970-01-01'
# Metadata rows with a popularity below this are dropped.
MIN_POPULARITY = 5.0
# Ratings below this are dropped; the surviving ratings are shifted down by (MIN_RATING - 1).
MIN_RATING = 3
# A movie is kept for modelling only if it has at least this many retained ratings.
MIN_USERS = 10
# A user is kept for modelling only if they have at least this many retained ratings.
MIN_MOVIES = 10
# Rank passed to ALS, i.e. the length of every user/movie factor vector.
FACTOR_DIMENSION = 20
# Name of the Pinecone index that holds the embeddings.
INDEX_NAME = "cf-embed"

### Import and Clean Raw Data

#### Movie Metadata

In [0]:
# Read the raw movie metadata CSV; without schema inference every column arrives as a string
# and is typed explicitly in the cleaning cell that follows.
# NOTE(review): no quote/escape/multiLine options are set, so quoted fields containing
# embedded newlines or doubled quotes would not be parsed as single values — confirm the
# raw file has none, or set those reader options.
metadata_path = "dbfs:/FileStore/data/raw/movies_metadata.csv"
metadata_raw = spark.read.csv(metadata_path, header=True)
# metadata_raw.show(1, truncate=False, vertical=True)

In [0]:
def _cast_keep_name(name, dtype):
    """Cast column `name` to `dtype` while keeping its original column name."""
    return f.col(name).cast(dtype).alias(name)


# One row per tmdb_id is kept; the seeded rand() ordering decides which duplicate wins.
metadata_dedupe_window = Window.partitionBy('tmdb_id').orderBy(f.rand(seed=1492))

# Row-level eligibility: released, non-adult, has an English audio track,
# and meets the release-date and popularity thresholds.
metadata_is_eligible = (
    (f.col('status') == 'Released')
    & ~f.col('adult')
    & f.array_contains(f.transform('spoken_languages', lambda language: language['name']), 'English')
    & (f.col('release_date') >= MIN_RELEASE_DATE)
    & (f.col('popularity') >= MIN_POPULARITY)
)

metadata = (
    metadata_raw
    .select(
        f.col('id').cast('STRING').alias('tmdb_id'),
        'title',
        'overview',
        'status',
        _cast_keep_name('release_date', 'DATE'),
        _cast_keep_name('popularity', 'DOUBLE'),
        # genres is parsed as an array of {id, name} structs; only the names are kept.
        f.transform(
            f.from_json('genres', "ARRAY<STRUCT<id:INTEGER,name:STRING>>"),
            lambda genre: genre['name'],
        ).alias('genres'),
        _cast_keep_name('runtime', 'DOUBLE'),
        f.from_json('spoken_languages', "ARRAY<STRUCT<iso_639_1:STRING,name:STRING>>").alias('spoken_languages'),
        _cast_keep_name('adult', 'BOOLEAN'),
        _cast_keep_name('vote_average', 'DOUBLE'),
        _cast_keep_name('vote_count', 'INTEGER'),
    )
    # De-duplicate first, then apply the eligibility rules to the surviving row.
    .withColumn('id_rank', f.row_number().over(metadata_dedupe_window))
    .where(f.col('id_rank') == 1)
    .where(metadata_is_eligible)
    .select('tmdb_id', 'title', 'release_date', 'popularity', 'genres', 'overview', 'runtime', 'vote_average', 'vote_count')
    .orderBy('tmdb_id')
)

# Sanity checks: schema, one sample record, and distinct ids vs. row count.
metadata.printSchema()
metadata.show(1, truncate=False, vertical=True)
metadata.agg(
    f.countDistinct('tmdb_id').alias('tmdb_id'),
    f.count('*').alias('records'),
).show()

root
 |-- tmdb_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- release_date: date (nullable = true)
 |-- popularity: double (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- overview: string (nullable = true)
 |-- runtime: double (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: integer (nullable = true)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 tmdb_id      | 10003                                                                                                                                                                                                  

#### Movie Keywords

In [0]:
# Read the raw keywords CSV; all columns are loaded as strings (no schema inference).
keywords_path = "dbfs:/FileStore/data/raw/keywords.csv"
keywords_raw = spark.read.csv(keywords_path, header=True)
# keywords_raw.show(1, truncate=False, vertical=True)

In [0]:
# One row per tmdb_id is kept; the seeded rand() ordering decides which duplicate wins.
keywords_dedupe_window = Window.partitionBy('tmdb_id').orderBy(f.rand(seed=1492))

# The keywords column is parsed as an array of {id, name} structs; only the names are kept.
keyword_names = f.transform(
    f.from_json('keywords', 'ARRAY<STRUCT<id:INTEGER,name:STRING>>'),
    lambda keyword: keyword['name'],
)

keywords = (
    keywords_raw
    .select(f.col('id').alias('tmdb_id'), keyword_names.alias('keywords'))
    .withColumn('id_rank', f.row_number().over(keywords_dedupe_window))
    .where(f.col('id_rank') == 1)
    .select('tmdb_id', 'keywords')
    .orderBy('tmdb_id')
)

# Sanity checks: schema, one sample record, and distinct ids vs. row count.
keywords.printSchema()
keywords.show(1, truncate=False, vertical=True)
keywords.agg(
    f.countDistinct('tmdb_id').alias('tmdb_id'),
    f.count('*').alias('records'),
).show()

root
 |-- tmdb_id: string (nullable = true)
 |-- keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 tmdb_id  | 100                                                                                                                                                                                       
 keywords | [ambush, alcohol, shotgun, tea, joint, machismo, cocktail, rifle, marijuana, cockney accent, pot smoking, hatchet, antique, cardsharp, anger, carjacking, piano, strip show, high stakes] 
only showing top 1 row

+-------+-------+
|tmdb_id|records|
+-------+-------+
|  45432|  45432|
+-------+-------+



#### Movie Identifier Crosswalk

In [0]:
links_path = "dbfs:/FileStore/data/raw/links.csv"

# camelCase header names in the raw file -> snake_case names used by the rest of the notebook.
link_column_renames = {
    'movieId': 'movie_id',
    'imdbId': 'imdb_id',
    'tmdbId': 'tmdb_id',
}

links = spark.read.csv(links_path, header=True)
for raw_name, clean_name in link_column_renames.items():
    links = links.withColumnRenamed(raw_name, clean_name)

# Sanity checks: schema, a few rows, and distinct id counts vs. row count.
links.printSchema()
links.show(5, truncate=False)
links.agg(
    f.countDistinct('tmdb_id').alias('tmdb_id'),
    f.countDistinct('movie_id').alias('movie_id'),
    f.count('*').alias('records'),
).show()

root
 |-- movie_id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- tmdb_id: string (nullable = true)

+--------+-------+-------+
|movie_id|imdb_id|tmdb_id|
+--------+-------+-------+
|1       |0114709|862    |
|2       |0113497|8844   |
|3       |0113228|15602  |
|4       |0114885|31357  |
|5       |0113041|11862  |
+--------+-------+-------+
only showing top 5 rows

+-------+--------+-------+
|tmdb_id|movie_id|records|
+-------+--------+-------+
|  45594|   45843|  45843|
+-------+--------+-------+



#### Create a Combined Movies Dataset

In [0]:
# Column order of the combined dataset.
movie_columns = [
    'movie_id', 'tmdb_id', 'imdb_id', 'title', 'release_date', 'runtime',
    'popularity', 'overview', 'genres', 'keywords', 'vote_average', 'vote_count',
]

# Keep only movies present in all three sources; cached because several later cells reuse it.
movies = (
    metadata
    .join(keywords, on='tmdb_id', how='inner')
    .join(links, on='tmdb_id', how='inner')
    .select(*movie_columns)
    .cache()
)

movies.show(10)
movies.printSchema()

# Distinct id counts vs. row count.
movies.agg(
    f.countDistinct('movie_id').alias('movie_id'),
    f.countDistinct('tmdb_id').alias('tmdb_id'),
    f.count('*').alias('records'),
).show()

# Fraction of null values in every column.
null_rates = [
    f.avg(f.col(column_name).isNull().cast('int')).alias(column_name)
    for column_name in movies.columns
]
movies.agg(*null_rates).show()

+--------+-------+-------+--------------------+------------+-------+----------+--------------------+--------------------+--------------------+------------+----------+
|movie_id|tmdb_id|imdb_id|               title|release_date|runtime|popularity|            overview|              genres|            keywords|vote_average|vote_count|
+--------+-------+-------+--------------------+------------+-------+----------+--------------------+--------------------+--------------------+------------+----------+
|    7444|  10096|0337563|      13 Going on 30|  2004-04-13|   98.0| 12.632595|After total humil...|[Comedy, Fantasy,...|[new york, photog...|         6.3|      1260|
|    1623|  10351|0120524|          Wishmaster|  1997-09-19|   90.0|  5.215107|The Djinn having ...|            [Horror]|[fire, menace, wi...|         5.6|       114|
|     412|  10436|0106226|The Age of Innocence|  1993-09-17|  139.0|  8.013617|Tale of 19th cent...|    [Drama, Romance]|[upper class, new...|         7.0|       172

#### Save the Combined Movies Dataset

In [0]:
# Persist the combined movies dataset as Parquet, repartitioned into 10 partitions;
# mode='overwrite' replaces whatever already exists at this path.
movies_path = "dbfs:/FileStore/data/clean/movies"
movies.repartition(10).write.parquet(movies_path, mode='overwrite')

#### Import and Clean the Ratings Data

In [0]:
# Read the raw ratings CSV; all columns are loaded as strings (no schema inference)
# and are typed in the next cell.
ratings_path = "dbfs:/FileStore/data/raw/ratings.csv"
ratings_raw = spark.read.csv(ratings_path, header=True)
# Preview a few raw rows.
ratings_raw.show(5, truncate=False)

+------+-------+------+----------+
|userId|movieId|rating|timestamp |
+------+-------+------+----------+
|1     |110    |1.0   |1425941529|
|1     |147    |4.5   |1425942435|
|1     |858    |5.0   |1425941523|
|1     |1221   |5.0   |1425941546|
|1     |1246   |5.0   |1425941556|
+------+-------+------+----------+
only showing top 5 rows



In [0]:
# Snake-case the id columns, make the rating numeric, and turn the epoch-seconds
# timestamp into a formatted date-time string.
rating_columns = [
    f.col('userId').alias('user_id'),
    f.col('movieId').alias('movie_id'),
    f.col('rating').cast('DOUBLE').alias('rating'),
    f.from_unixtime('timestamp').alias('timestamp'),
]
ratings = ratings_raw.select(*rating_columns)

# Sanity checks: schema, a few rows, and overall user/movie/rating counts.
ratings.printSchema()
ratings.show(5, truncate=False)
ratings.agg(
    f.countDistinct('user_id').alias('total_users'),
    f.countDistinct('movie_id').alias('total_movies'),
    f.count('*').alias('total_ratings'),
).show()

root
 |-- user_id: string (nullable = true)
 |-- movie_id: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: string (nullable = true)

+-------+--------+------+-------------------+
|user_id|movie_id|rating|timestamp          |
+-------+--------+------+-------------------+
|1      |110     |1.0   |2015-03-09 22:52:09|
|1      |147     |4.5   |2015-03-09 23:07:15|
|1      |858     |5.0   |2015-03-09 22:52:03|
|1      |1221    |5.0   |2015-03-09 22:52:26|
|1      |1246    |5.0   |2015-03-09 22:52:36|
+-------+--------+------+-------------------+
only showing top 5 rows

+-----------+------------+-------------+
|total_users|total_movies|total_ratings|
+-----------+------------+-------------+
|     270896|       45115|     26024289|
+-----------+------------+-------------+



#### Save the Ratings Dataset

In [0]:
# Persist the typed ratings as Parquet (10 partitions, replacing any existing output).
# Note: this rebinds `ratings_path`, which the read cell earlier set to the raw CSV location.
ratings_path = "dbfs:/FileStore/data/clean/ratings"
ratings.repartition(10).write.parquet(ratings_path, mode='overwrite')

### Train the Implicit Feedback Recommender Model and Extract Embeddings

#### Convert the Ratings to an Implicit Feedback Dataset

In [0]:
# Windows used to count the ratings each movie / each user still has at that point in the chain.
ratings_per_movie = Window.partitionBy('movie_id')
ratings_per_user = Window.partitionBy('user_id')

# Amount subtracted from every kept rating so the lowest kept rating (MIN_RATING) becomes 1.
rating_offset = MIN_RATING - 1

model_frame = (
    ratings
    # Restrict to movies that survived the metadata cleaning.
    .join(movies, on='movie_id', how='inner')
    .select('user_id', 'movie_id', 'rating', 'timestamp')
    .where(f.col('rating') >= MIN_RATING)
    # Movie filter runs first; the per-user count below is computed on its result.
    .withColumn('cnt_users', f.count('user_id').over(ratings_per_movie))
    .where(f.col('cnt_users') >= MIN_USERS)
    .withColumn('cnt_movies', f.count('movie_id').over(ratings_per_user))
    .where(f.col('cnt_movies') >= MIN_MOVIES)
    .withColumn('rating', f.col('rating') - rating_offset)
    # The ids are cast to integers before the frame is handed to ALS.
    .withColumn('user_id', f.col('user_id').cast('INTEGER'))
    .withColumn('movie_id', f.col('movie_id').cast('INTEGER'))
    .cache()
)

counts = model_frame.agg(
    f.countDistinct('user_id').alias('total_users'),
    f.countDistinct('movie_id').alias('total_movies'),
    f.count('*').alias('total_ratings'),
).first()

# Sparsity = share of the user x movie matrix that has no rating.
matrix_cells = counts['total_users'] * counts['total_movies']
sparsity = round(1 - (counts['total_ratings'] / matrix_cells), 4)
print(f"total_users={counts['total_users']} total_movies={counts['total_movies']} total_ratings={counts['total_ratings']} sparsity={sparsity}")

total_users=195883 total_movies=6236 total_ratings=15372570 sparsity=0.9874


In [0]:
# Persist the modelling frame as Parquet (10 partitions, replacing any existing output).
model_frame_path = "dbfs:/FileStore/data/clean/model_frame"
model_frame.repartition(10).write.parquet(model_frame_path, mode='overwrite')

### Train the ALS Recommender Model

In [0]:
# Implicit-feedback ALS: `rating` is treated as a confidence weight rather than an explicit score.
# The seed is pinned (same literal as the rand() seeds used for de-duplication above) so the
# factor initialisation, and with it the exported embeddings, can be reproduced across runs.
# NOTE(review): repeatability also assumes the input partitioning is stable — confirm if exact
# run-to-run equality matters.
estimator = ALS(
    rank=FACTOR_DIMENSION,
    maxIter=15,
    regParam=0.1,
    implicitPrefs=True,
    userCol='user_id',
    itemCol='movie_id',
    ratingCol='rating',
    coldStartStrategy='nan',
    seed=1492,
)
transformer = estimator.fit(model_frame)
transformer

Out[22]: ALSModel: uid=ALS_d05ee7c4bd21, rank=20

### Extract the User/Movie Embeddings

In [0]:
def factors_to_vectors(factors):
    """Convert an ALS factor DataFrame into a list of vector records.

    Args:
        factors: DataFrame with an `id` column and a `features` array column,
            such as `ALSModel.userFactors` or `ALSModel.itemFactors`.

    Returns:
        A list of `{"id": str, "values": list[float]}` dicts, one per input row.
    """
    factor_rows = (
        factors
        .select(f.col('id').cast('string').alias('id'), f.col('features').alias('values'))
        .toPandas()
    )
    # float() per element yields plain Python floats whether toPandas() produced
    # numpy arrays or ordinary lists for the array column.
    return [
        {"id": vector_id, "values": [float(value) for value in values]}
        for vector_id, values in zip(factor_rows["id"], factor_rows["values"])
    ]


user_embeddings = factors_to_vectors(transformer.userFactors)
movie_embeddings = factors_to_vectors(transformer.itemFactors)
len(user_embeddings), len(movie_embeddings)

Out[25]: (195883, 6236)

In [0]:
# Inspect one record to confirm the {"id", "values"} shape that is passed to index.upsert below.
user_embeddings[0]

Out[26]: {'id': '10',
 'values': [-0.0364203155040741,
  0.10358455032110214,
  0.002394407754763961,
  0.009727544151246548,
  -0.09657862037420273,
  0.1660528928041458,
  -0.17063908278942108,
  0.18315237760543823,
  0.17520791292190552,
  0.14551199972629547,
  0.2445758730173111,
  0.08945835381746292,
  -0.13605937361717224,
  0.01712501235306263,
  -0.44428420066833496,
  0.2247859686613083,
  0.1914227455854416,
  0.29872438311576843,
  -0.1750021129846573,
  0.047015074640512466]}

### Insert the User/Movie Embeddings into Pinecone

In [0]:
# Imported here rather than with the other imports at the top of the notebook.
# NOTE(review): `pinecone.init` / `pinecone.Index` are module-level calls of the older
# pinecone-client API — confirm the installed client version still provides them.
import pinecone      
# Configure the client from the environment; a missing variable raises KeyError here.
pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_ENVIRONMENT"])   

  original_result = python_builtin_import(name, globals, locals, fromlist, level)


In [0]:
# Index (re)creation, left commented out — presumably so that a routine re-run does not
# drop the existing index. Uncomment to rebuild the index from scratch.
# pinecone.delete_index(INDEX_NAME)
# pinecone.create_index(name=INDEX_NAME, dimension=FACTOR_DIMENSION, metric='cosine', pods=1, replicas=1, pod_type="p1")

In [0]:
# Open a handle to the embeddings index via the shared INDEX_NAME constant (instead of
# repeating the literal) so the index name is configured in exactly one place.
index = pinecone.Index(INDEX_NAME)
# Show the per-namespace vector counts before upserting.
index.describe_index_stats()

Out[29]: {'dimension': 20,
 'index_fullness': 0.0,
 'namespaces': {'movies': {'vector_count': 6238},
                'users': {'vector_count': 195889}},
 'total_vector_count': 202127}

In [0]:
# Write the user vectors to the "users" namespace in batches of 100.
# NOTE(review): the captured describe_index_stats() outputs report 195889 vectors in this
# namespace while 195883 are upserted here, so the namespace appears to hold ids from an
# earlier run that an upsert alone does not remove — confirm whether they should be deleted.
index.upsert(vectors=user_embeddings, namespace="users", batch_size=100)

Upserted vectors:   0%|          | 0/195883 [00:00<?, ?it/s]

Out[30]: {'upserted_count': 195883}

In [0]:
# Write the movie vectors to the "movies" namespace in batches of 100.
# NOTE(review): the captured describe_index_stats() outputs report 6238 vectors in this
# namespace while 6236 are upserted here, so the namespace appears to hold ids from an
# earlier run that an upsert alone does not remove — confirm whether they should be deleted.
index.upsert(vectors=movie_embeddings, namespace="movies", batch_size=100)

Upserted vectors:   0%|          | 0/6236 [00:00<?, ?it/s]

Out[31]: {'upserted_count': 6236}

In [0]:
# Re-check the per-namespace vector counts after both upserts.
index.describe_index_stats()

Out[32]: {'dimension': 20,
 'index_fullness': 0.0,
 'namespaces': {'movies': {'vector_count': 6238},
                'users': {'vector_count': 195889}},
 'total_vector_count': 202127}

### Spot Check the Embeddings using Similarity Queries

In [0]:
# List movies to pick an id for the spot check. movie_id is a string column here,
# so the ordering is lexicographic ('1', '10', '100010', ...), not numeric.
movies.orderBy('movie_id').show(100)

+--------+-------+-------+--------------------+------------+-------+----------+--------------------+--------------------+--------------------+------------+----------+
|movie_id|tmdb_id|imdb_id|               title|release_date|runtime|popularity|            overview|              genres|            keywords|vote_average|vote_count|
+--------+-------+-------+--------------------+------------+-------+----------+--------------------+--------------------+--------------------+------------+----------+
|       1|    862|0114709|           Toy Story|  1995-10-30|   81.0| 21.946943|Led by Woody, And...|[Animation, Comed...|[jealousy, toy, b...|         7.7|      5415|
|      10|    710|0113189|           GoldenEye|  1995-11-16|  130.0| 14.686036|James Bond must u...|[Adventure, Actio...|[cuba, falsely ac...|         6.6|      1194|
|  100010|  59197|1758570|Battle of Los Ang...|  2011-03-10|   91.0|  7.987112|In February 1942 ...|[Science Fiction,...|[android, pilot, ...|         3.8|        46

In [0]:
# Spot check: fetch the 10 nearest neighbours of one movie from the "movies" namespace.
movie_id = '101739'
query_response = index.query(namespace='movies', id=movie_id, top_k=10)

# Plain (id, score) tuples plus an explicit schema: no reliance on schema inference from
# dicts, and an empty match list produces an empty DataFrame instead of an inference error.
match_rows = [(item["id"], float(item["score"])) for item in query_response['matches']]
matches = spark \
    .createDataFrame(match_rows, schema="movie_id STRING, score DOUBLE") \
    .sort(f.desc('score'))

In [0]:
# Attach movie details to each match and list the most similar titles first.
matched_movies = movies.join(matches, on='movie_id', how='inner')
matched_movies.orderBy(f.col('score').desc()).show()

+--------+-------+-------+------------------+------------+-------+----------+--------------------+--------------------+--------------------+------------+----------+-----------+
|movie_id|tmdb_id|imdb_id|             title|release_date|runtime|popularity|            overview|              genres|            keywords|vote_average|vote_count|      score|
+--------+-------+-------+------------------+------------+-------+----------+--------------------+--------------------+--------------------+------------+----------+-----------+
|  101739| 109428|1288558|         Evil Dead|  2013-04-05|   91.0|  12.01525|Evil Dead, the fo...|            [Horror]|[remake, demon, n...|         6.4|      1754|0.999999881|
|  113780| 256274|2870612|As Above, So Below|  2014-08-14|   93.0| 10.846585|When a team of ex...|  [Horror, Thriller]|                null|         6.1|       788|0.994952619|
|  110591| 157547|2388715|            Oculus|  2013-09-08|  104.0|  8.698043|A woman tries to ...|            [Horr