### Set Up Notebook

#### Import Required Modules

In [1]:
# Uncomment to install the third-party packages imported by this notebook.
# %pip install openai python-dotenv chromadb pyspark pandas numpy

In [2]:
import os
import json

from typing import List, Dict
from dotenv import load_dotenv

import openai
import chromadb
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS

#### Get or Create SparkSession

In [3]:
# Attach to the active Spark session when one exists; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Only surface errors so Spark's WARN/INFO chatter stays out of the cell outputs.
spark_context = spark.sparkContext
spark_context.setLogLevel("ERROR")

# Display the session summary as the cell output.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/23 23:47:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Load Secrets

In [4]:
# Load secrets (e.g. OPENAI_API_KEY, read later via os.environ) from a .env file.
# Alternate location when running on Databricks:
# env_path = "/dbfs/FileStore/env/.env"
load_dotenv()

True

#### Set Workflow Constants

In [6]:
# --- Input data -------------------------------------------------------------
# Directory holding the cleaned JSON exports (movies.json, ratings.json).
# Databricks alternative:
# CLEAN_PATH = "dbfs:/FileStore/data/clean"
CLEAN_PATH = "../data/json"

# --- Embedding dimensions ---------------------------------------------------
# Presumably the output size of the text embedding model -- TODO confirm.
CONTENT_DIMENSION = 1536
# Used as the ALS `rank`, i.e. the length of each user/movie factor vector.
COLLAB_DIMENSION = 32

# --- Chroma collection names ------------------------------------------------
MOVIES_CONTENT_INDEX_NAME = "movies-content"
MOVIES_COLLAB_INDEX_NAME = "movies-collab"
USERS_COLLAB_INDEX_NAME = "users-collab"

### Create Content Embedding Vectors

#### Import Clean Movies Data

In [7]:
# Read the cleaned movie records into a Spark DataFrame.
movies_path = os.path.join(CLEAN_PATH, "movies.json")
movies = spark.read.json(movies_path)

# Preview one record vertically (the rows are wide), then report the row count.
movies.show(n=1, truncate=False, vertical=True)
movies.count()

                                                                                

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 actors        | [Tim Roth, Jennifer Beals, Antonio Banderas, Valeria Golino, David Proval]                                                                                                                                                                    
 budget        | 4000000                                                                                                                                                                                                                                       
 director      | Allison Anders                                                                                                                                                                                                         

2264

#### Create a Combined `Text` Field to Convert to Vector Embeddings

In [8]:
# Pieces of the combined description, in the order they appear in the text.
# Array columns are flattened to comma-separated strings first.
text_parts = [
    f.array_join("genres", ", "),
    f.array_join("keywords", ", "),
    f.col("overview"),
    f.col("director"),
    f.array_join("actors", ", "),
]

# Join the pieces with " | " into the single `text` column that gets embedded.
movies = movies.withColumn("text", f.concat_ws(" | ", *text_parts))

movies.show(n=3, truncate=False, vertical=True)
movies.count()

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 actors        | [Tim Roth, Jennifer Beals, Antonio Banderas, Valeria Golino, David Proval]                                                                                                                                                                     

2264

In [9]:
# Collect the Spark DataFrame to the driver as a list of per-movie dicts.
# The cell rebinds `movies` from a DataFrame to a list, so it is guarded:
# re-running it after the conversion is a no-op instead of an AttributeError
# (a list has no `.toPandas()`).
if not isinstance(movies, list):
    movies = movies.toPandas().to_dict(orient="records")

# Show the first record as a sanity check.
movies[0]

                                                                                

{'actors': ['Tim Roth',
  'Jennifer Beals',
  'Antonio Banderas',
  'Valeria Golino',
  'David Proval'],
 'budget': 4000000,
 'director': 'Allison Anders',
 'genres': ['Comedy'],
 'keywords': ['hotel',
  "new year's eve",
  'witch',
  'bet',
  'hotel room',
  'sperm',
  'anthology',
  'los angeles, california',
  'hoodlum',
  'multiple storylines',
  'woman director'],
 'language': 'en',
 'overview': "It's Ted the Bellhop's first night on the job...and the hotel's very unusual guests are about to place him in some outrageous predicaments. It seems that this evening's room service is serving up one unbelievable happening after another.",
 'popularity': 21.138,
 'release_date': '1995-12-09',
 'revenue': 4257354,
 'runtime': 98,
 'title': 'Four Rooms',
 'tmdb_homepage': 'https://www.themoviedb.org/movie/5',
 'tmdb_id': '5',
 'updated_at': '2023-10-24T21:38:29.594Z',
 'vote_average': 5.789,
 'vote_count': 2443,
 'text': "Comedy | hotel, new year's eve, witch, bet, hotel room, sperm, anthol

#### Convert the Combined Movie Text to Embedding Representations

In [11]:
# OpenAI client used for the embedding requests below; the key comes from the
# environment (raises KeyError if OPENAI_API_KEY is not set).
openai_client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [12]:
# Embed every movie's combined text with OpenAI.
# Texts are sent in batches instead of one HTTP request per movie, which cuts
# thousands of sequential round trips down to a handful. The batch size is
# kept modest to stay well inside the API's per-request input limits.
EMBEDDING_BATCH_SIZE = 256

movie_embeddings = []
for batch_start in range(0, len(movies), EMBEDDING_BATCH_SIZE):
    batch = movies[batch_start:batch_start + EMBEDDING_BATCH_SIZE]
    response = openai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=[movie["text"] for movie in batch]
    )
    # Each returned item carries the index of its input; sort on it so that
    # movie_embeddings[i] always lines up with movies[i].
    ordered_items = sorted(response.data, key=lambda item: item.index)
    movie_embeddings.extend(item.embedding for item in ordered_items)

In [14]:
# Sanity check: there should be exactly one embedding per movie.
len(movies), len(movie_embeddings)

(2264, 2264)

#### Create a Chroma Collection and Insert the Movie Embeddings

In [15]:
# Chroma client persisted on disk under ../src/chroma (relative to the
# notebook's working directory).
chroma_client = chromadb.PersistentClient(path="../src/chroma")
chroma_client

<chromadb.api.client.Client at 0x13dd83b20>

In [16]:
# Chroma-side embedding function, configured with the same model name as the
# direct OpenAI calls in this notebook. It is passed to the collections below,
# presumably so Chroma can embed raw text (e.g. `query_texts`) itself.
embedding_function = chromadb.utils.embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.environ["OPENAI_API_KEY"], 
    model_name="text-embedding-ada-002"
)

In [17]:
# Uncomment to drop the content collection and rebuild it from scratch.
# chroma_client.delete_collection(name=MOVIES_CONTENT_INDEX_NAME)

In [19]:
# Fetch the content collection, creating it on first run. The "hnsw:space"
# metadata requests cosine distance; the embedding function is attached so
# the collection can also be queried with raw text.
movies_content_collection = chroma_client.get_or_create_collection(
    name=MOVIES_CONTENT_INDEX_NAME, 
    metadata={"hnsw:space": "cosine"},
    embedding_function=embedding_function
)

In [20]:
# Key each content embedding by the movie's TMDB id. Both lists derive from
# `movies` in the same order, so ids and vectors line up.
content_ids = [movie["tmdb_id"] for movie in movies]
movies_content_collection.upsert(ids=content_ids, embeddings=movie_embeddings)

# Number of vectors now stored in the collection.
movies_content_collection.count()

2264

#### Spot Check the Content Embeddings with a Few Queries

In [22]:
query = "a gritty crime drama set in new york city starring al pacino"

# Embed the query once and search by vector. The earlier extra
# `query(query_texts=...)` call was dropped: its result was overwritten
# immediately, so it only cost an additional embedding request.
query_embedding = openai_client.embeddings.create(model="text-embedding-ada-002", input=query).data[0].embedding
matches = movies_content_collection.query(query_embeddings=query_embedding, n_results=10)

# Attach movie details to the matched ids and list them nearest-first.
# `.show()` returns None, so its result is deliberately not assigned back to
# `matches`; `matches` keeps the raw Chroma result.
matches_frame = pd.DataFrame({"tmdb_id": matches["ids"][0], "distance": matches["distances"][0]})
(
    spark.createDataFrame(matches_frame)
    .join(spark.createDataFrame(movies), on="tmdb_id", how="inner")
    .select("tmdb_id", "title", "genres", "keywords", "overview", "release_date", "director", "actors", "distance")
    .sort("distance")
    .show()
)

  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:
                                                                                

+-------+--------------------+--------------------+--------------------+--------------------+------------+----------------+--------------------+-------------------+
|tmdb_id|               title|              genres|            keywords|            overview|release_date|        director|              actors|           distance|
+-------+--------------------+--------------------+--------------------+--------------------+------------+----------------+--------------------+-------------------+
|   6075|       Carlito's Way|[Crime, Drama, Ro...|[new york city, b...|A Puerto-Rican ex...|  1993-11-10|  Brian De Palma|[Al Pacino, Sean ...|0.11422139406204224|
|   1429|           25th Hour|      [Crime, Drama]|[new york city, b...|In New York City ...|  2002-12-19|       Spike Lee|[Edward Norton, P...|0.12457805871963501|
|    311|Once Upon a Time ...|      [Drama, Crime]|[new york city, l...|A former Prohibit...|  1984-05-23|    Sergio Leone|[Robert De Niro, ...|  0.125485360622406|
|   2112| 

### Create Collaborative Filtering Embedding Vectors

#### Import Clean Ratings Data

In [23]:
# Read the cleaned user ratings into a Spark DataFrame.
ratings_path = os.path.join(CLEAN_PATH, "ratings.json")
ratings = spark.read.json(ratings_path)

# Preview a few rows, then report the total number of ratings.
ratings.show(n=5, truncate=False)
ratings.count()

+------+-------+------------------------+-------+
|rating|tmdb_id|updated_at              |user_id|
+------+-------+------------------------+-------+
|4.0   |1572   |2023-10-24T21:38:29.594Z|304    |
|5.0   |2108   |2023-10-24T21:38:29.594Z|469    |
|4.0   |98     |2023-10-24T21:38:29.594Z|247    |
|4.0   |1592   |2023-10-24T21:38:29.594Z|307    |
|3.5   |2048   |2023-10-24T21:38:29.594Z|477    |
+------+-------+------------------------+-------+
only showing top 5 rows



81116

#### Create an Implicit Feedback Recommender Model Frame

In [24]:
# Cast the id columns to integers for ALS and cache the frame, since it is
# scanned several times below and again when fitting.
model_frame = (
    ratings
    .withColumn('user_id', f.col('user_id').cast('INTEGER'))
    .withColumn('tmdb_id', f.col('tmdb_id').cast('INTEGER'))
    .cache()
)

# Sparsity = share of the user x movie grid that has no rating.
counts = model_frame.agg(
    f.countDistinct('user_id').alias('users'),
    f.countDistinct('tmdb_id').alias('movies'),
    f.count('*').alias('ratings'),
).first()
sparsity = round(1 - (counts['ratings'] / (counts['users'] * counts['movies'])), 4)
print(f"users={counts['users']} movies={counts['movies']} ratings={counts['ratings']} sparsity={sparsity}")

# Distribution of rating values, followed by a sample of the model frame.
model_frame.groupby("rating").count().sort("rating").show()
model_frame.show(5, truncate=False)

                                                                                

users=610 movies=2269 ratings=81116 sparsity=0.9414
+------+-----+
|rating|count|
+------+-----+
|   0.5|  918|
|   1.0| 1989|
|   1.5| 1121|
|   2.0| 5397|
|   2.5| 4035|
|   3.0|15973|
|   3.5|10260|
|   4.0|22460|
|   4.5| 7225|
|   5.0|11738|
+------+-----+

+------+-------+------------------------+-------+
|rating|tmdb_id|updated_at              |user_id|
+------+-------+------------------------+-------+
|4.0   |1572   |2023-10-24T21:38:29.594Z|304    |
|5.0   |2108   |2023-10-24T21:38:29.594Z|469    |
|4.0   |98     |2023-10-24T21:38:29.594Z|247    |
|4.0   |1592   |2023-10-24T21:38:29.594Z|307    |
|3.5   |2048   |2023-10-24T21:38:29.594Z|477    |
+------+-------+------------------------+-------+
only showing top 5 rows



#### Train the ALS Model

In [27]:
# Implicit-feedback ALS; `rank` sets the length of the learned user/movie
# factor vectors. An explicit seed is passed so the factor initialization
# (and therefore the embeddings written to Chroma) does not depend on the
# library's default seed for the current Python process.
estimator = ALS(
    rank=COLLAB_DIMENSION,
    maxIter=15,
    regParam=0.1,
    implicitPrefs=True,
    userCol='user_id',
    itemCol='tmdb_id',
    ratingCol='rating',
    seed=42,
)
transformer = estimator.fit(model_frame)
transformer

                                                                                

ALSModel: uid=ALS_1a320d47a959, rank=32

#### Extract the Embeddings from the Fit Model and Format for Index Insertion

In [28]:
def _factors_to_columns(factors):
    """Convert an ALS factor frame into a dict of parallel lists.

    Casts `id` to string, renames `features` to `values`, sorts by the
    (string) id and collects to the driver.

    Returns:
        dict with keys 'id' and 'values', each mapping to a list.
    """
    return (
        factors
        .withColumn('id', f.col('id').cast('string'))
        .withColumnRenamed('features', 'values')
        .sort('id')
        .toPandas()
        .to_dict(orient='list')
    )


user_embeddings = _factors_to_columns(transformer.userFactors)

# NOTE(review): this rebinds `movie_embeddings`, which earlier in the notebook
# held the list of content embeddings. From here on it is the collaborative
# filtering dict.
movie_embeddings = _factors_to_columns(transformer.itemFactors)

                                                                                

#### Create Collaborative Filtering Embeddings for the Users

In [30]:
# Collection of ALS user-factor vectors, compared with cosine distance.
# No text embedding function is attached: these vectors come from ALS, not
# from a text model, so the OpenAI function would return vectors of a
# different dimension and text queries could never match this collection.
# Upserts and queries must supply embeddings explicitly.
users_collab_collection = chroma_client.get_or_create_collection(
    name=USERS_COLLAB_INDEX_NAME,
    metadata={"hnsw:space": "cosine"},
    embedding_function=None
)
users_collab_collection

Collection(name=users-collab)

In [31]:
# Insert (or overwrite) one ALS factor vector per user, keyed by user id.
user_ids = user_embeddings["id"]
user_vectors = user_embeddings["values"]
users_collab_collection.upsert(ids=user_ids, embeddings=user_vectors)

# Number of user vectors now stored.
users_collab_collection.count()

610

#### Create Collaborative Filtering Embeddings for the Movies

In [32]:
# Collection of ALS movie-factor vectors, compared with cosine distance.
# No text embedding function is attached: these vectors come from ALS, not
# from a text model, so the OpenAI function would return vectors of a
# different dimension and text queries could never match this collection.
# Upserts and queries must supply embeddings explicitly.
movies_collab_collection = chroma_client.get_or_create_collection(
    name=MOVIES_COLLAB_INDEX_NAME,
    metadata={"hnsw:space": "cosine"},
    embedding_function=None
)
movies_collab_collection

Collection(name=movies-collab)

In [33]:
# Insert (or overwrite) one ALS factor vector per movie, keyed by TMDB id.
collab_movie_ids = movie_embeddings["id"]
collab_movie_vectors = movie_embeddings["values"]
movies_collab_collection.upsert(ids=collab_movie_ids, embeddings=collab_movie_vectors)

# Number of movie vectors now stored.
movies_collab_collection.count()

2269

#### Spot Check the Collaborative Filtering Embeddings with a Query

In [34]:
# Use one movie's own ALS factor vector as the query to find its neighbors.
tmdb_id = '769'
embedding = movie_embeddings["values"][movie_embeddings["id"].index(tmdb_id)]

matches = movies_collab_collection.query(query_embeddings=embedding, n_results=10)

# Attach movie details to the matched ids and list them nearest-first.
# `.show()` returns None, so its result is deliberately not assigned back to
# `matches`; `matches` keeps the raw Chroma result.
matches_frame = pd.DataFrame({"tmdb_id": matches["ids"][0], "distance": matches["distances"][0]})
(
    spark.createDataFrame(matches_frame)
    .join(spark.createDataFrame(movies), on="tmdb_id", how="inner")
    .select("tmdb_id", "title", "genres", "keywords", "overview", "release_date", "director", "actors", "distance")
    .sort("distance")
    .show()
)

  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:
                                                                                

+-------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+
|tmdb_id|               title|              genres|            keywords|            overview|release_date|            director|              actors|            distance|
+-------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+
|    769|          GoodFellas|      [Drama, Crime]|[new york city, p...|The true story of...|  1990-09-12|     Martin Scorsese|[Ray Liotta, Robe...|5.960464477539062...|
|    500|      Reservoir Dogs|   [Crime, Thriller]|[traitor, jewelry...|A botched robbery...|  1992-09-02|   Quentin Tarantino|[Harvey Keitel, T...| 0.13296473026275635|
|    240|The Godfather Par...|      [Drama, Crime]|[italy, italian a...|In the continuing...|  1974-12-20|Francis Ford Coppola|[Al Pacino, Rober...| 0

# START SANDBOX CODE