### Set Up Notebook

#### Import Required Modules

In [4]:
import os
import json

from typing import List, Dict
from dotenv import load_dotenv

import openai
import chromadb
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS

from llama_index import ServiceContext, StorageContext
from llama_index.schema import TextNode, MetadataMode
from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding
from llama_index.vector_stores import ChromaVectorStore
from llama_index.indices.vector_store import VectorStoreIndex

#### Get or Create SparkSession

In [5]:
# Reuse the active SparkSession if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Only surface errors so Spark log output does not drown the cell outputs.
spark_context = spark.sparkContext
spark_context.setLogLevel("ERROR")

spark

#### Load Secrets

In [6]:
# Populate os.environ from a local .env file (the OpenAI key is read from it later).
# Alternative location kept for reference: env_path = "/dbfs/FileStore/env/.env"
load_dotenv()

True

#### Set Workflow Constants

In [47]:
# Directory holding the cleaned JSON inputs (movies.json, ratings.json).
# Alternative location kept for reference: "dbfs:/FileStore/data/clean"
CLEAN_PATH = "../data/json"

# Names of the Chroma collections populated by this notebook.
MOVIES_CONTENT_INDEX_NAME = "movies-content"
MOVIES_COLLAB_INDEX_NAME = "movies-collab"
USERS_COLLAB_INDEX_NAME = "users-collab"

# Embedding widths.
COLLAB_DIMENSION = 32     # used as the ALS rank below
CONTENT_DIMENSION = 1536  # not referenced in the visible cells; presumably the ada-002 width

### Create Content Embedding Vectors

#### Import Clean Movies Data

In [48]:
# Read the cleaned movie records into a Spark DataFrame.
movies_path = os.path.join(CLEAN_PATH, "movies.json")
movies = spark.read.json(movies_path)

# Show one full record vertically, then the total number of movies.
movies.show(1, truncate=False, vertical=True)
movies.count()

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 actors        | [Tim Roth, Jennifer Beals, Antonio Banderas, Valeria Golino, David Proval]                                                                                                                                                                    
 budget        | 4000000                                                                                                                                                                                                                                       
 director      | Allison Anders                                                                                                                                                                                                         

2264

#### Convert Movies into LlamaIndex Nodes

In [49]:
# Turn the Spark DataFrame into a list of plain dicts, one per movie.
# `movies` is rebound from a DataFrame to a list here, so the conversion is
# guarded: without the check, re-running this cell fails because a list has
# no `.toPandas()` method.
if not isinstance(movies, list):
    movies = movies.toPandas().to_dict(orient="records")
movies[0]

{'actors': ['Tim Roth',
  'Jennifer Beals',
  'Antonio Banderas',
  'Valeria Golino',
  'David Proval'],
 'budget': 4000000,
 'director': 'Allison Anders',
 'genres': ['Comedy'],
 'keywords': ['hotel',
  "new year's eve",
  'witch',
  'bet',
  'hotel room',
  'sperm',
  'anthology',
  'los angeles, california',
  'hoodlum',
  'multiple storylines',
  'woman director'],
 'language': 'en',
 'overview': "It's Ted the Bellhop's first night on the job...and the hotel's very unusual guests are about to place him in some outrageous predicaments. It seems that this evening's room service is serving up one unbelievable happening after another.",
 'popularity': 21.138,
 'release_date': '1995-12-09',
 'revenue': 4257354,
 'runtime': 98,
 'title': 'Four Rooms',
 'tmdb_homepage': 'https://www.themoviedb.org/movie/5',
 'tmdb_id': '5',
 'updated_at': '2023-10-24T21:38:29.594Z',
 'vote_average': 5.789,
 'vote_count': 2443}

In [50]:
# Field names available on a movie record (iterating a dict yields its keys).
list(movies[0])

['actors',
 'budget',
 'director',
 'genres',
 'keywords',
 'language',
 'overview',
 'popularity',
 'release_date',
 'revenue',
 'runtime',
 'title',
 'tmdb_homepage',
 'tmdb_id',
 'updated_at',
 'vote_average',
 'vote_count']

In [51]:
# Metadata fields that should appear in the text sent to the embedding model.
emb_metadata_keys = ["genres", "keywords", "director", "actors", "decade"]

# Every other field of a movie record is hidden from the embedding text.
excluded_embed_metadata_keys = sorted(
    key for key in movies[0] if key not in emb_metadata_keys
)
excluded_embed_metadata_keys

['budget',
 'language',
 'overview',
 'popularity',
 'release_date',
 'revenue',
 'runtime',
 'title',
 'tmdb_homepage',
 'tmdb_id',
 'updated_at',
 'vote_average',
 'vote_count']

In [52]:
# Bookkeeping fields that are hidden from the text shown to the LLM.
excluded_llm_metadata_keys = [
    "tmdb_homepage",
    "tmdb_id",
    "updated_at",
]
excluded_llm_metadata_keys

['tmdb_homepage', 'tmdb_id', 'updated_at']

In [53]:
# Build one TextNode per movie: the overview is the node text, the remaining
# fields become metadata (list-valued fields flattened to comma-separated strings).
list_valued_fields = ("actors", "genres", "keywords")
dropped_fields = ("overview", "release_date")

nodes = []
for movie in movies:
    # Keep the original field order; it determines the order of the rendered metadata.
    metadata = {
        field: value for field, value in movie.items() if field not in dropped_fields
    }
    for field in list_valued_fields:
        metadata[field] = ", ".join(metadata[field])

    # "1995-12-09" -> "199" + "0s" -> "1990s"
    metadata["decade"] = movie["release_date"][:3] + "0s"

    node = TextNode(
        text=movie["overview"],
        metadata=metadata,
        excluded_embed_metadata_keys=excluded_embed_metadata_keys,
        excluded_llm_metadata_keys=excluded_llm_metadata_keys,
        text_template="{metadata_str}\nplot overview: {content}"
    )
    # Use the TMDB id as the node id so entries can be fetched by movie id later.
    node.id_ = metadata["tmdb_id"]
    nodes.append(node)

In [71]:
# Text of the first node as rendered for the embedding model.
embed_view = nodes[0].get_content(metadata_mode=MetadataMode.EMBED)
print(embed_view)

actors: Tim Roth, Jennifer Beals, Antonio Banderas, Valeria Golino, David Proval
director: Allison Anders
genres: Comedy
keywords: hotel, new year's eve, witch, bet, hotel room, sperm, anthology, los angeles, california, hoodlum, multiple storylines, woman director
decade: 1990s
plot overview: It's Ted the Bellhop's first night on the job...and the hotel's very unusual guests are about to place him in some outrageous predicaments. It seems that this evening's room service is serving up one unbelievable happening after another.


In [55]:
# Text of the first node as rendered for the LLM.
llm_view = nodes[0].get_content(metadata_mode=MetadataMode.LLM)
print(llm_view)

actors: Tim Roth, Jennifer Beals, Antonio Banderas, Valeria Golino, David Proval
budget: 4000000
director: Allison Anders
genres: Comedy
keywords: hotel, new year's eve, witch, bet, hotel room, sperm, anthology, los angeles, california, hoodlum, multiple storylines, woman director
language: en
popularity: 21.138
revenue: 4257354
runtime: 98
title: Four Rooms
vote_average: 5.789
vote_count: 2443
decade: 1990s
plot overview: It's Ted the Bellhop's first night on the job...and the hotel's very unusual guests are about to place him in some outrageous predicaments. It seems that this evening's room service is serving up one unbelievable happening after another.


#### Get or Create a Chroma Collection

In [56]:
# On-disk Chroma store shared by all three collections in this notebook.
chroma_path = "../src/chroma"
chroma_client = chromadb.PersistentClient(path=chroma_path)
chroma_client

<chromadb.api.client.Client at 0x29a921e40>

In [57]:
# Chroma-side embedding function backed by OpenAI's ada-002 model.
# Reading the key with os.environ[...] fails fast if the secret was not loaded.
openai_embedding_cls = chromadb.utils.embedding_functions.OpenAIEmbeddingFunction
embedding_function = openai_embedding_cls(
    model_name="text-embedding-ada-002",
    api_key=os.environ["OPENAI_API_KEY"],
)

In [58]:
# Start from an empty content collection so the index is rebuilt from scratch.
# delete_collection raises when the collection does not exist (first run or a
# fresh Chroma store), so only delete it when it is actually present.
# list_collections returns Collection objects or plain names depending on the
# chromadb version; getattr handles both.
existing_collection_names = {
    getattr(collection, "name", collection)
    for collection in chroma_client.list_collections()
}
if MOVIES_CONTENT_INDEX_NAME in existing_collection_names:
    chroma_client.delete_collection(name=MOVIES_CONTENT_INDEX_NAME)

In [59]:
# Create (or reopen) the content collection, using cosine distance for the HNSW index.
content_collection_settings = {"hnsw:space": "cosine"}
movies_content_collection = chroma_client.get_or_create_collection(
    name=MOVIES_CONTENT_INDEX_NAME,
    embedding_function=embedding_function,
    metadata=content_collection_settings,
)
movies_content_collection.count()

0

#### Create a LlamaIndex ChromaDB VectorStore and Insert Nodes

In [60]:
# Wrap the Chroma collection so LlamaIndex can use it as its vector store.
movies_content_store = ChromaVectorStore(
    chroma_collection=movies_content_collection,
)
storage_context = StorageContext.from_defaults(
    vector_store=movies_content_store,
)
storage_context

StorageContext(docstore=<llama_index.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x29c2882e0>, index_store=<llama_index.storage.index_store.simple_index_store.SimpleIndexStore object at 0x29c288850>, vector_stores={'default': ChromaVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, collection_name=None, host=None, port=None, ssl=False, headers=None, persist_dir=None, collection_kwargs={}), 'image': <llama_index.vector_stores.simple.SimpleVectorStore object at 0x29c288640>}, graph_store=<llama_index.graph_stores.simple.SimpleGraphStore object at 0x29c288790>)

In [61]:
# LLM and embedding model handed to LlamaIndex; both read the same OpenAI key.
llm = OpenAI(
    model="gpt-4-1106-preview",
    temperature=0.1,
    max_tokens=1000,
    api_key=os.environ["OPENAI_API_KEY"],
)
embed_model = OpenAIEmbedding(
    model="text-embedding-ada-002",
    api_key=os.environ["OPENAI_API_KEY"],
)
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

In [62]:
# Embed every node and write it to the Chroma-backed vector store.
# This calls the embedding model for all nodes, so it is the slow, billable step.
index_options = dict(
    service_context=service_context,
    storage_context=storage_context,
    show_progress=True,
)
movies_content_index = VectorStoreIndex(nodes=nodes, **index_options)

  from .autonotebook import tqdm as notebook_tqdm
Generating embeddings: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2048/2048 [01:01<00:00, 33.27it/s]
Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 216/216 [00:06<00:00, 32.92it/s]


In [63]:
# Number of entries now stored in the content collection.
movies_content_collection.count()

2264

In [64]:
# Fetch a single stored entry by its node id (the movie's tmdb_id) to inspect it.
movies_content_collection.get(ids=["5"])

{'ids': ['5'],
 'embeddings': None,
 'metadatas': [{'_node_content': '{"id_": "5", "embedding": null, "metadata": {"actors": "Tim Roth, Jennifer Beals, Antonio Banderas, Valeria Golino, David Proval", "budget": 4000000, "director": "Allison Anders", "genres": "Comedy", "keywords": "hotel, new year\'s eve, witch, bet, hotel room, sperm, anthology, los angeles, california, hoodlum, multiple storylines, woman director", "language": "en", "popularity": 21.138, "revenue": 4257354, "runtime": 98, "title": "Four Rooms", "tmdb_homepage": "https://www.themoviedb.org/movie/5", "tmdb_id": "5", "updated_at": "2023-10-24T21:38:29.594Z", "vote_average": 5.789, "vote_count": 2443, "decade": "1990s"}, "excluded_embed_metadata_keys": ["budget", "language", "overview", "popularity", "release_date", "revenue", "runtime", "title", "tmdb_homepage", "tmdb_id", "updated_at", "vote_average", "vote_count"], "excluded_llm_metadata_keys": ["tmdb_homepage", "tmdb_id", "updated_at"], "relationships": {}, "hash": "

#### Convert the Combined Movie Text to Embedding Representations

In [65]:
# Direct OpenAI client. In the visible cells it is only referenced by the
# commented-out manual embedding code below.
openai_client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [66]:
# NOTE(review): disabled manual embedding path; the VectorStoreIndex build above
# already embeds and stores every node. It reads movie["text"], which is not one
# of the movie record keys listed earlier, so it would fail if re-enabled as-is.
# movie_embeddings = [
#     openai_client.embeddings.create(
#         model="text-embedding-ada-002",
#         input=movie["text"]
#     ).data[0].embedding for movie in movies
# ]

In [67]:
# len(movies), len(movie_embeddings)

In [68]:
# NOTE(review): disabled manual upsert that belongs to the commented-out
# embedding code; it also depends on the nonexistent movie["text"] key.
# movies_content_collection.upsert(
#     ids=[movie["tmdb_id"] for movie in movies],
#     embeddings=movie_embeddings,
#     documents=[movie["text"] for movie in movies]
# )
# movies_content_collection.count()

#### Spot Check the Content Embeddings with a Few Queries

In [69]:
# Retriever that returns the five most similar movies for a free-text query.
retriever = movies_content_index.as_retriever(
    similarity_top_k=5,
    verbose=True,
)

In [70]:
# Run one free-text query and print each hit as the LLM would see it.
query = "a gritty crime drama set in new york city starring al pacino"
results = retriever.retrieve(query)

rendered_hits = [hit.get_content(metadata_mode=MetadataMode.LLM) for hit in results]
print("\n\n".join(rendered_hits))

actors: Al Pacino, Sean Penn, Penelope Ann Miller, John Leguizamo, Ingrid Rogers
budget: 30000000
director: Brian De Palma
genres: Crime, Drama, Romance, Thriller
keywords: new york city, based on novel or book, 1970s, go-go dancer, puerto rico, cocaine, nightclub, gangster, release from prison, criminal past, love, lawyer, drugs, disco, neo-noir
language: en
popularity: 24.27
revenue: 63848322
runtime: 144
title: Carlito's Way
vote_average: 7.834
vote_count: 2875
decade: 1990s
plot overview: A Puerto-Rican ex-con, just released from prison, pledges to stay away from drugs and violence despite the pressure around him, and lead a better life outside NYC.

actors: Al Pacino, John Cusack, Bridget Fonda, Danny Aiello, Martin Landau
budget: 0
director: Harold Becker
genres: Drama, Thriller
keywords: corruption, undercover, war on drugs, mayor, drug dealer, presidential election, undercover agent, investigation, police, drugs
language: en
popularity: 17.394
revenue: 0
runtime: 111
title: Cit

### Create Collaborative Filtering Embedding Vectors

#### Import Clean Ratings Data

In [23]:
# Read the cleaned user ratings into a Spark DataFrame.
ratings_path = os.path.join(CLEAN_PATH, "ratings.json")
ratings = spark.read.json(ratings_path)

# Preview a few rows, then report the total number of ratings.
ratings.show(5, truncate=False)
ratings.count()

+------+-------+------------------------+-------+
|rating|tmdb_id|updated_at              |user_id|
+------+-------+------------------------+-------+
|4.0   |1572   |2023-10-24T21:38:29.594Z|304    |
|5.0   |2108   |2023-10-24T21:38:29.594Z|469    |
|4.0   |98     |2023-10-24T21:38:29.594Z|247    |
|4.0   |1592   |2023-10-24T21:38:29.594Z|307    |
|3.5   |2048   |2023-10-24T21:38:29.594Z|477    |
+------+-------+------------------------+-------+
only showing top 5 rows



81116

#### Create an Implicit Feedback Recommender Model Frame

In [24]:
# Cast both id columns to integers and cache the frame, since it is reused
# for the summary statistics here and for model fitting afterwards.
model_frame = (
    ratings
    .withColumn('user_id', f.col('user_id').cast('INTEGER'))
    .withColumn('tmdb_id', f.col('tmdb_id').cast('INTEGER'))
    .cache()
)

# Distinct users, distinct movies and total ratings, computed in one pass.
counts = model_frame.agg(
    f.countDistinct('user_id').alias('users'),
    f.countDistinct('tmdb_id').alias('movies'),
    f.count('*').alias('ratings'),
).first()

# Sparsity = share of the user x movie grid that has no rating.
grid_size = counts['users'] * counts['movies']
sparsity = round(1 - (counts['ratings'] / grid_size), 4)
print(f"users={counts['users']} movies={counts['movies']} ratings={counts['ratings']} sparsity={sparsity}")

# Rating distribution, followed by a preview of the prepared frame.
model_frame.groupby("rating").count().sort("rating").show()
model_frame.show(5, truncate=False)

                                                                                

users=610 movies=2269 ratings=81116 sparsity=0.9414
+------+-----+
|rating|count|
+------+-----+
|   0.5|  918|
|   1.0| 1989|
|   1.5| 1121|
|   2.0| 5397|
|   2.5| 4035|
|   3.0|15973|
|   3.5|10260|
|   4.0|22460|
|   4.5| 7225|
|   5.0|11738|
+------+-----+

+------+-------+------------------------+-------+
|rating|tmdb_id|updated_at              |user_id|
+------+-------+------------------------+-------+
|4.0   |1572   |2023-10-24T21:38:29.594Z|304    |
|5.0   |2108   |2023-10-24T21:38:29.594Z|469    |
|4.0   |98     |2023-10-24T21:38:29.594Z|247    |
|4.0   |1592   |2023-10-24T21:38:29.594Z|307    |
|3.5   |2048   |2023-10-24T21:38:29.594Z|477    |
+------+-------+------------------------+-------+
only showing top 5 rows



#### Train the ALS Model

In [27]:
# Fit an implicit-feedback ALS model whose latent factors have COLLAB_DIMENSION entries.
# The seed is fixed explicitly: PySpark's default seed is derived from a Python
# string hash, which changes between kernel sessions, so without it the learned
# factors (and everything stored from them) differ on every Restart & Run All.
ALS_SEED = 42

estimator = ALS(
    rank=COLLAB_DIMENSION,
    maxIter=15,
    regParam=0.1,
    implicitPrefs=True,
    userCol='user_id',
    itemCol='tmdb_id',
    ratingCol='rating',
    seed=ALS_SEED,
)
transformer = estimator.fit(model_frame)
transformer

                                                                                

ALSModel: uid=ALS_1a320d47a959, rank=32

#### Extract the Embeddings from the Fit Model and Format for Index Insertion

In [28]:
def _factors_to_columns(factors):
    """Convert an ALS factor frame into {'id': [...], 'values': [...]} column lists.

    Ids are cast to strings, the `features` column is renamed to `values`,
    and rows are sorted by the (string) id before being collected.
    """
    prepared = (
        factors
        .withColumn('id', f.col('id').cast('string'))
        .withColumnRenamed('features', 'values')
        .sort('id')
    )
    return prepared.toPandas().to_dict(orient='list')


user_embeddings = _factors_to_columns(transformer.userFactors)
movie_embeddings = _factors_to_columns(transformer.itemFactors)

                                                                                

#### Create Collaborative Filtering Embeddings for the Users

In [30]:
# Collection for the ALS user factors. These vectors are supplied explicitly
# on upsert and on query, so no text embedding function is attached. The OpenAI
# function used for the content collection returns vectors of a different
# width and would make any text-based call on this collection fail on dimension.
users_collab_collection = chroma_client.get_or_create_collection(
    name=USERS_COLLAB_INDEX_NAME,
    metadata={"hnsw:space": "cosine"},
    embedding_function=None
)
users_collab_collection

Collection(name=users-collab)

In [31]:
# Store one ALS factor vector per user, keyed by the stringified user id.
user_ids = user_embeddings["id"]
user_vectors = user_embeddings["values"]
users_collab_collection.upsert(ids=user_ids, embeddings=user_vectors)
users_collab_collection.count()

610

#### Create Collaborative Filtering Embeddings for the Movies

In [32]:
# Collection for the ALS movie factors. These vectors are supplied explicitly
# on upsert and on query, so no text embedding function is attached. The OpenAI
# function used for the content collection returns vectors of a different
# width and would make any text-based call on this collection fail on dimension.
movies_collab_collection = chroma_client.get_or_create_collection(
    name=MOVIES_COLLAB_INDEX_NAME,
    metadata={"hnsw:space": "cosine"},
    embedding_function=None
)
movies_collab_collection

Collection(name=movies-collab)

In [33]:
# Store one ALS factor vector per movie, keyed by the stringified tmdb_id.
movie_ids = movie_embeddings["id"]
movie_vectors = movie_embeddings["values"]
movies_collab_collection.upsert(ids=movie_ids, embeddings=movie_vectors)
movies_collab_collection.count()

2269

#### Spot Check the Collaborative Filtering Embeddings with a Query

In [34]:
# Find the movies whose ALS factors are closest to a reference movie.
tmdb_id = '769'

# The 'id' and 'values' lists are aligned, so the id's position locates its vector.
embedding = movie_embeddings["values"][movie_embeddings["id"].index(tmdb_id)]

# Nearest neighbours by cosine distance; the reference movie itself comes back first.
matches = movies_collab_collection.query(query_embeddings=embedding, n_results=10)
neighbours = pd.DataFrame({"tmdb_id": matches["ids"][0], "distance": matches["distances"][0]})

# Attach the movie details to each neighbour. The joined frame is kept in its own
# variable: `.show()` returns None, so assigning its result (as the cell did before)
# overwrote `matches` with None.
matches_frame = (
    spark.createDataFrame(neighbours)
    .join(spark.createDataFrame(movies), on="tmdb_id", how="inner")
    .select("tmdb_id", "title", "genres", "keywords", "overview", "release_date", "director", "actors", "distance")
    .sort("distance")
)
matches_frame.show()

  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:
                                                                                

+-------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+
|tmdb_id|               title|              genres|            keywords|            overview|release_date|            director|              actors|            distance|
+-------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+--------------------+
|    769|          GoodFellas|      [Drama, Crime]|[new york city, p...|The true story of...|  1990-09-12|     Martin Scorsese|[Ray Liotta, Robe...|5.960464477539062...|
|    500|      Reservoir Dogs|   [Crime, Thriller]|[traitor, jewelry...|A botched robbery...|  1992-09-02|   Quentin Tarantino|[Harvey Keitel, T...| 0.13296473026275635|
|    240|The Godfather Par...|      [Drama, Crime]|[italy, italian a...|In the continuing...|  1974-12-20|Francis Ford Coppola|[Al Pacino, Rober...| 0

# START SANDBOX CODE