### Set Up Notebook

#### Import Required Modules

In [0]:
# %pip install openai pinecone-client python-dotenv

In [0]:
import os
import json

from typing import List, Dict
from dotenv import load_dotenv

import openai
import pinecone
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS

#### Get or Create SparkSession

In [0]:
# Attach to the already-running Spark session, or start one if none exists.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Restrict Spark's log output to error-level messages.
spark.sparkContext.setLogLevel("error")

# Leave the session as the final expression so the notebook renders it.
spark

#### Load Secrets

In [0]:
# Dotenv file holding the provider credentials read further down via os.environ.
env_path = "/dbfs/FileStore/env/.env"

# Kept as the last expression so load_dotenv's boolean result is shown as the cell output.
load_dotenv(dotenv_path=env_path)

Out[41]: True

#### Set Workflow Constants

In [0]:
# Root folder of the cleaned parquet datasets ("movies" and "ratings" are read from it).
CLEAN_PATH = "dbfs:/FileStore/data/clean"

# Pinecone index name and vector size for the text (content) embeddings.
CONTENT_INDEX_NAME, CONTENT_DIMENSION = "content-embeddings", 1536

# Pinecone index name and vector size for the ALS factors; the dimension is also used as the ALS rank.
COLLABORATIVE_INDEX_NAME, COLLABORATIVE_DIMENSION = "collaborative-embeddings", 32

#### Authenticate Providers

In [0]:
# Credentials come from the process environment; a missing variable raises KeyError here
# rather than failing later inside a provider call.
openai.api_key = os.environ["OPENAI_API_KEY"]
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"],
)

### Create Content Embedding Vectors

#### Import Clean Movies Data

In [0]:
# Read the cleaned movies dataset from the shared clean-data folder.
movies_path = os.path.join(CLEAN_PATH, "movies")
movies = spark.read.parquet(movies_path)

# Preview a single record vertically (untruncated), then report the row count.
movies.show(n=1, truncate=False, vertical=True)
movies.count()

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 budget        | 4000000                                                                                                                                                                                                                                       
 cast          | [Tim Roth, Jennifer Beals, Antonio Banderas, Valeria Golino, David Proval]                                                                                                                                                                    
 director      | Allison Anders                                                                                                                                                                                                         

#### Create a Combined `Text` Field to Convert to Vector Embeddings

In [0]:
def _labeled(label, value):
    """Return a column expression joining `label` and `value` with a single space."""
    return f.concat_ws(" ", f.lit(label), value)


# Segments appear in the final text in exactly this order, separated by " | ".
text_segments = [
    _labeled("genres:", f.array_join("genres", ", ")),
    _labeled("keywords:", f.array_join("keywords", ", ")),
    _labeled("plot:", f.col("overview")),
    _labeled("director:", f.col("director")),
    _labeled("actors:", f.array_join("cast", ", ")),
    _labeled("release_date:", f.col("release_date")),
]

# Add (or replace) the "text" column that is later sent to the embedding model.
movies = movies.withColumn("text", f.concat_ws(" | ", *text_segments))

movies.show(n=3, truncate=False, vertical=True)
movies.count()

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 budget        | 4000000                                                                                                                                                                   

#### Convert the Combined Movie Text to Embedding Representations

In [0]:
def _embed_text(text):
    """Request an ada-002 embedding for `text` and return the embedding from the response."""
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=text,
    )
    return response["data"][0]["embedding"]


# One embedding request per movie, in the row order produced by toPandas().
movie_embeddings = [
    _embed_text(record["text"])
    for record in movies.toPandas().to_dict(orient="records")
]
len(movie_embeddings)

Out[36]: 2264

#### Insert the Movie Content Embeddings into the Pinecone Index

In [0]:
# `movies` is a Spark DataFrame: iterating it directly yields Column objects (one per
# column), not rows, so the ids must be collected to the driver before pairing them with
# the embeddings.
# NOTE(review): this assumes the row order matches the toPandas() order used when the
# embeddings were generated (same DataFrame, no shuffle in between) -- confirm.
movie_ids = movies.select("tmdb_id").toPandas()["tmdb_id"].tolist()

# Guard against silently truncated pairs: zip() stops at the shorter input.
assert len(movie_ids) == len(movie_embeddings), (
    f"id/embedding count mismatch: {len(movie_ids)} ids vs {len(movie_embeddings)} embeddings"
)

movie_vectors = [
    {
        # Pinecone vector ids must be strings.
        "id": str(movie_id),
        "values": embedding,
    }
    for movie_id, embedding in zip(movie_ids, movie_embeddings)
]
len(movie_vectors)

Out[37]: 2264

In [0]:
# Create the content index only when it is missing so the notebook runs on a fresh
# environment without dropping an existing index on every re-run.
# To rebuild from scratch, call pinecone.delete_index(CONTENT_INDEX_NAME) first.
if CONTENT_INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        name=CONTENT_INDEX_NAME,
        dimension=CONTENT_DIMENSION,
        metric='cosine',
        pods=1,
        replicas=1,
        pod_type="p1",
    )

In [0]:
# Handle to the content index; `index` is rebound to the collaborative index later on.
index = pinecone.Index(CONTENT_INDEX_NAME)

# Upload the movie vectors in batches of 100; the response is shown as the cell output.
index.upsert(
    vectors=movie_vectors,
    batch_size=100,
)

Upserted vectors:   0%|          | 0/2264 [00:00<?, ?it/s]

Out[87]: {'upserted_count': 2264}

In [0]:
# Show the index statistics (dimension, per-namespace vector counts) to confirm the upsert.
index.describe_index_stats()

Out[89]: {'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2264}},
 'total_vector_count': 2264}

#### Spot Check the Content Embeddings with a Few Queries

In [0]:
# Free-text query to sanity-check the content embeddings.
query = "a gritty crime drama set in new york city starring al pacino"

# Embed the query with the same model used for the movie text so the vectors are comparable.
query_embedding = openai.Embedding.create(model="text-embedding-ada-002", input=query)["data"][0]["embedding"]

# Ten nearest movies by similarity; only ids and scores are used from each match.
matches = index.query(vector=query_embedding, top_k=10, include_values=False, include_metadata=True)["matches"]
matches = spark.createDataFrame([{"tmdb_id": item["id"], "score": item["score"]} for item in matches])

# Attach the movie details to each match. The joined frame is kept in `matches`;
# .show() returns None, so its result must not be assigned.
matches = (
    matches
    .join(movies, on="tmdb_id", how="inner")
    .select("title", "genres", "keywords", "overview", "release_date", "director", "cast", "score")
    .sort(f.desc("score"))
)
matches.show()

+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+-----------+
|               title|              genres|            keywords|            overview|release_date|            director|                cast|      score|
+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+-----------+
|       Carlito's Way|[Crime, Drama, Ro...|[new york city, b...|A Puerto-Rican ex...|  1993-11-10|      Brian De Palma|[Al Pacino, Sean ...| 0.88834095|
|Once Upon a Time ...|      [Drama, Crime]|[new york city, l...|A former Prohibit...|  1984-05-23|        Sergio Leone|[Robert De Niro, ...|0.875242472|
|           City Hall|   [Drama, Thriller]|[corruption, unde...|The accidental sh...|  1996-02-16|       Harold Becker|[Al Pacino, John ...|  0.8738904|
|             Payback|[Crime, Action, D...|[new york city, p...|With friends like.

### Create Collaborative Filtering Embedding Vectors

#### Import Clean Ratings Data

In [0]:
# Read the cleaned ratings dataset from the shared clean-data folder.
ratings_path = os.path.join(CLEAN_PATH, "ratings")
ratings = spark.read.parquet(ratings_path)

# Preview the first five rows untruncated, then report the row count.
ratings.show(n=5, truncate=False)
ratings.count()

+-------+-------+------+-------------------+
|user_id|tmdb_id|rating|timestamp          |
+-------+-------+------+-------------------+
|304    |1572   |4.0   |1998-03-29 12:12:08|
|469    |2108   |5.0   |2000-08-04 21:46:05|
|247    |98     |4.0   |2016-07-04 14:55:39|
|307    |1592   |4.0   |2007-10-29 13:08:15|
|477    |2048   |3.5   |2008-01-21 18:45:47|
+-------+-------+------+-------------------+
only showing top 5 rows

Out[70]: 81116

#### Create an Implicit Feedback Recommender Model Frame

In [0]:
# Cast both id columns to integers and cache the frame, since it is scanned several
# times below and again when the model is fit.
model_frame = (
    ratings
    .withColumn('user_id', f.col('user_id').cast('INTEGER'))
    .withColumn('tmdb_id', f.col('tmdb_id').cast('INTEGER'))
    .cache()
)

# A single aggregation pass yields the distinct user/movie counts and the total rating count.
counts = model_frame.agg(
    f.countDistinct('user_id').alias('users'),
    f.countDistinct('tmdb_id').alias('movies'),
    f.count('*').alias('ratings'),
).first()
n_users, n_movies, n_ratings = counts['users'], counts['movies'], counts['ratings']

# Sparsity = share of user/movie pairs that have no rating.
sparsity = round(1 - (n_ratings / (n_users * n_movies)), 4)
print(f"users={n_users} movies={n_movies} ratings={n_ratings} sparsity={sparsity}")

# Distribution of rating values, followed by a sample of the prepared rows.
model_frame.groupby("rating").count().sort("rating").show()
model_frame.show(n=5, truncate=False)

users=610 movies=2269 ratings=81116 sparsity=0.9414
+------+-----+
|rating|count|
+------+-----+
|   0.5|  918|
|   1.0| 1989|
|   1.5| 1121|
|   2.0| 5397|
|   2.5| 4035|
|   3.0|15973|
|   3.5|10260|
|   4.0|22460|
|   4.5| 7225|
|   5.0|11738|
+------+-----+

+-------+-------+------+-------------------+
|user_id|tmdb_id|rating|timestamp          |
+-------+-------+------+-------------------+
|304    |1572   |4.0   |1998-03-29 12:12:08|
|469    |2108   |5.0   |2000-08-04 21:46:05|
|247    |98     |4.0   |2016-07-04 14:55:39|
|307    |1592   |4.0   |2007-10-29 13:08:15|
|477    |2048   |3.5   |2008-01-21 18:45:47|
+-------+-------+------+-------------------+
only showing top 5 rows



#### Train the ALS Model

In [0]:
# ALS in implicit-preference mode; the rank equals the collaborative index dimension so
# the learned factors can be upserted into that index unchanged.
# A fixed seed makes the factor initialisation, and therefore the embeddings, repeatable
# across "Restart & Run All" executions.
estimator = ALS(
    rank=COLLABORATIVE_DIMENSION,
    maxIter=15,
    regParam=0.1,
    implicitPrefs=True,
    userCol='user_id',
    itemCol='tmdb_id',
    ratingCol='rating',
    seed=42,
)
transformer = estimator.fit(model_frame)
transformer

Out[75]: ALSModel: uid=ALS_78f7317cb457, rank=32

#### Extract the Embeddings from the Fit Model and Format for Index Insertion

In [0]:
def _factors_to_vectors(factors):
    """Convert an ALS factor DataFrame into Pinecone-style vector records.

    Parameters
    ----------
    factors : pyspark.sql.DataFrame
        A frame with an `id` column and a `features` array column, as exposed by
        `ALSModel.userFactors` / `ALSModel.itemFactors`.

    Returns
    -------
    list of dict
        One `{"id": <str>, "values": <list of float>}` record per factor row.
    """
    records = (
        factors
        .select(
            f.col('id').cast('string').alias('id'),
            f.col('features').alias('values'),
        )
        .toPandas()
        .to_dict(orient='records')
    )
    # toPandas() yields numpy arrays for array columns when Arrow is enabled and plain
    # Python lists otherwise; np.asarray(...).tolist() handles both and always produces
    # native Python floats.
    return [
        {"id": record["id"], "values": np.asarray(record["values"], dtype=float).tolist()}
        for record in records
    ]


user_embeddings = _factors_to_vectors(transformer.userFactors)
# NOTE: this rebinds `movie_embeddings`, replacing the content embeddings created earlier
# in the notebook with the collaborative item factors.
movie_embeddings = _factors_to_vectors(transformer.itemFactors)
len(user_embeddings), len(movie_embeddings)

Out[76]: (610, 2269)

#### Insert the User/Movie Embeddings into the Pinecone Index

In [0]:
# Rebuild the collaborative index from scratch. Deleting is guarded so the cell also
# works on a fresh environment where the index has never been created.
if COLLABORATIVE_INDEX_NAME in pinecone.list_indexes():
    pinecone.delete_index(COLLABORATIVE_INDEX_NAME)

pinecone.create_index(
    name=COLLABORATIVE_INDEX_NAME,
    dimension=COLLABORATIVE_DIMENSION,
    metric='cosine',
    pods=1,
    replicas=1,
    pod_type="p1",
)

In [0]:
# Rebind `index` to the collaborative index (it previously pointed at the content index).
index = pinecone.Index(COLLABORATIVE_INDEX_NAME)

# User and movie factors live in separate namespaces of the same index.
index.upsert(
    vectors=user_embeddings,
    namespace="users",
    batch_size=100,
)
# Last expression: the movie upsert response is shown as the cell output.
index.upsert(
    vectors=movie_embeddings,
    namespace="movies",
    batch_size=100,
)

Upserted vectors:   0%|          | 0/610 [00:00<?, ?it/s]

Upserted vectors:   0%|          | 0/2269 [00:00<?, ?it/s]

Out[81]: {'upserted_count': 2269}

In [0]:
# Show the index statistics (dimension, per-namespace vector counts) to confirm both upserts.
index.describe_index_stats()

Out[85]: {'dimension': 32,
 'index_fullness': 0.0,
 'namespaces': {'movies': {'vector_count': 2269},
                'users': {'vector_count': 610}},
 'total_vector_count': 2879}

#### Spot Check the Collaborative Embeddings with a Few Queries

In [0]:
# Movie whose nearest neighbours in the collaborative "movies" namespace are inspected.
tmdb_id = '769'

# Query by stored vector id rather than by an explicit vector.
neighbor_matches = index.query(namespace="movies", id=tmdb_id, top_k=10)['matches']
neighbor_rows = [
    {"tmdb_id": item["id"], "score": item["score"]}
    for item in neighbor_matches
]
matches = spark.createDataFrame(neighbor_rows)

# Attach the movie details to each neighbour and list them from most to least similar.
display_columns = ["title", "genres", "keywords", "overview", "release_date", "director", "cast", "score"]
(
    movies
    .join(matches, on='tmdb_id', how='inner')
    .select(*display_columns)
    .sort(f.desc("score"))
    .show()
)

+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+-----------+
|               title|              genres|            keywords|            overview|release_date|            director|                cast|      score|
+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+-----------+
|          GoodFellas|      [Drama, Crime]|[new york city, p...|The true story of...|  1990-09-12|     Martin Scorsese|[Ray Liotta, Robe...| 1.00000012|
|      Reservoir Dogs|   [Crime, Thriller]|[traitor, jewelry...|A botched robbery...|  1992-09-02|   Quentin Tarantino|[Harvey Keitel, T...|0.896228135|
|       The Godfather|      [Drama, Crime]|[italy, loss of l...|Spanning the year...|  1972-03-14|Francis Ford Coppola|[Marlon Brando, A...| 0.83672756|
|The Godfather Par...|      [Drama, Crime]|[italy, italian a...|In the continuing.

# START SANDBOX CODE