### Set Up Notebook

#### Import Required Modules

In [0]:
# One-time dependency install; uncomment when running in a fresh environment.
# This notebook calls `openai.Embedding.create` and `pinecone.init`, which only exist in the
# pre-1.0 `openai` and pre-3.0 `pinecone-client` releases, so pin below those major versions.
# %pip install "openai<1" "pinecone-client<3" python-dotenv

In [0]:
import os
import json

from typing import List, Dict
from dotenv import load_dotenv

import openai
import pinecone

import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window



#### Get or Create SparkSession

In [0]:
# Attach to the already-running Spark session, or start one if none exists.
spark = SparkSession.builder.getOrCreate()

# Only surface error-level Spark log messages in the notebook output.
spark_context = spark.sparkContext
spark_context.setLogLevel("error")

# Leave the session as the last expression so the notebook renders its summary.
spark

#### Load Secrets as Environment Variables

In [0]:
# Secrets are kept in a .env file and exported into os.environ for the
# OpenAI / Pinecone clients configured further down.
env_path = "/dbfs/FileStore/env/.env"

# The boolean result is left as the cell output so a failed load is visible.
load_dotenv(dotenv_path=env_path)

Out[33]: True

#### Set Workflow Constants

In [0]:
# Root folder of the cleaned parquet datasets read by this notebook.
CLN_PATH = "dbfs:/FileStore/data/clean"
# Name of the Pinecone index that receives the movie content embeddings.
INDEX_NAME = "content-embeddings"
# Vector dimension of the index; it must equal the length of each embedding
# (1536 for the text-embedding-ada-002 model used below).
DIMENSION = 1536

### Create Content Embedding Vectors

#### Import Clean Data

In [0]:
# Cleaned movie metadata (one row per tmdb_id).
movies_path = os.path.join(CLN_PATH, "movies")
movies = spark.read.parquet(movies_path)
movies.show(n=10)

+-------+--------------------+------------+-------+--------------------+--------------------+---------+---------+----------+------------+----------+--------------------+
|tmdb_id|               title|release_date|runtime|              genres|            overview|   budget|  revenue|popularity|vote_average|vote_count|            keywords|
+-------+--------------------+------------+-------+--------------------+--------------------+---------+---------+----------+------------+----------+--------------------+
|  10010|      Brother Bear 2|  2006-08-17|   73.0|[Adventure, Anima...|Kenai finds his c...|     null|     null| 10.861154|         6.3|       318|[grizzly bear, hu...|
|  10012|              Cursed|  2005-02-25|   97.0|    [Horror, Comedy]|A werewolf loose ...| 35000000| 19294901|  8.949722|         5.1|       168|[brother sister r...|
| 100402|Captain America: ...|  2014-03-20|  136.0|[Action, Adventur...|After the catacly...|170000000|714766572| 18.717704|         7.6|      5881|[w

In [0]:
# User/movie rating rows used for modelling.
model_frame_path = os.path.join(CLN_PATH, "model_frame")
model_frame = spark.read.parquet(model_frame_path)
model_frame.show(n=10)

+-------+-------+------+-------------------+---------+----------+
|user_id|tmdb_id|rating|          timestamp|cnt_users|cnt_movies|
+-------+-------+------+-------------------+---------+----------+
| 171679|   2255|   2.0|2013-04-23 14:39:05|    13641|       133|
| 217667|  10427|   2.0|1996-11-08 17:21:30|     3441|        58|
|  31793|    532|   3.0|1997-05-07 15:14:38|    12698|       107|
|  96489|    197|   3.0|2015-10-25 19:42:18|    61076|        84|
| 198045|    641|   2.5|2015-11-04 04:07:02|    19478|       689|
|  34058|   1551|   1.0|2015-08-05 20:08:07|     3358|      1113|
|  55981|    603|   2.0|2008-03-26 11:50:09|    72417|       152|
| 250847|   2164|   1.0|1996-06-11 09:51:07|    30738|        34|
| 200371|    687|   1.0|2002-04-23 09:36:49|    22338|       183|
| 145188|    154|   2.0|2013-01-21 23:45:51|    15405|        34|
+-------+-------+------+-------------------+---------+----------+
only showing top 10 rows



In [0]:
# Unique movie ids that appear in the rating data; only these movies get embedded.
model_frame_movies = (
    model_frame
    .select(f.col('tmdb_id'))
    .distinct()
)

# Number of distinct movies, shown as the cell output.
model_frame_movies.count()

Out[37]: 6235

#### Format Combined Movie Text for Embedding

In [0]:
# Typed empty array used as the fallback when genres/keywords are missing, so that
# array_join below always receives an ARRAY<STRING>.
empty_string_array = f.array().cast('ARRAY<STRING>')

# Keep only the movies present in the rating data and build one "text" column of the
# form "<genres> | <keywords> | <overview>" that is later sent to the embedding model.
movies = (
    movies
    .join(model_frame_movies, on='tmdb_id', how='inner')
    .withColumn('genres', f.coalesce(f.col('genres'), empty_string_array))
    .withColumn('keywords', f.coalesce(f.col('keywords'), empty_string_array))
    .withColumn(
        'text',
        f.concat_ws(
            " | ",
            f.array_join(f.col('genres'), ", "),
            f.array_join(f.col('keywords'), ", "),
            f.col('overview'),
        ),
    )
    .sort('tmdb_id')
)

movies.show(5, truncate=False, vertical=True)
movies.count()

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 tmdb_id      | 10003                                                                                                                                                                                                                                                                                                                                                                                                                     

In [0]:
# Pull the joined frame onto the driver and turn it into one plain dict per movie.
# NOTE: this rebinds `movies` from a Spark DataFrame to a list of dicts, so this cell
# cannot be re-run on its own without first re-running the cells that build the frame.
movies_pdf = movies.toPandas()
movies = movies_pdf.to_dict(orient="records")

# Peek at the first record to confirm the structure.
movies[0]

Out[39]: {'tmdb_id': '10003',
 'title': 'The Saint',
 'release_date': datetime.date(1997, 4, 3),
 'runtime': 116.0,
 'genres': array(['Thriller', 'Action', 'Romance', 'Science Fiction', 'Adventure'],
       dtype=object),
 'overview': 'Ivan Tretiak, Russian Mafia boss who wants to create an oil crisis in Moscow and seize power as a result sends Simon Templar, great international criminal, to England to get a secret formula for cold fusion from U.S. scientist Emma Russell. Templar falls in love with Emma and they try to outwit Tretiak and his guerrillas, hiding from them in Moscow',
 'budget': 68000000.0,
 'revenue': 118063304.0,
 'popularity': 10.97633,
 'vote_average': 5.9,
 'vote_count': 310.0,
 'keywords': array(['berlin', 'russia', 'gas', 'master thief', 'the saint'],
       dtype=object),
 'text': 'Thriller, Action, Romance, Science Fiction, Adventure | berlin, russia, gas, master thief, the saint | Ivan Tretiak, Russian Mafia boss who wants to create an oil crisis in Moscow and s

#### Convert the Combined Movie Text to Embedding Representations

In [0]:
# Configure the OpenAI client from the environment (expected to be populated by the
# load_dotenv call earlier); indexing os.environ raises KeyError if the secret is missing.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [0]:
# Embed the combined movie text in batches rather than issuing one HTTP request per
# movie: the embeddings endpoint accepts a list of inputs, which cuts thousands of
# sequential round trips down to a few dozen.
EMBEDDING_BATCH_SIZE = 100

movie_embeddings = []
for batch_start in range(0, len(movies), EMBEDDING_BATCH_SIZE):
    batch_texts = [
        movie["text"]
        for movie in movies[batch_start:batch_start + EMBEDDING_BATCH_SIZE]
    ]
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=batch_texts,
    )
    # Each returned item carries the position of its input in "index"; sort on it so
    # movie_embeddings[i] always corresponds to movies[i].
    ordered_items = sorted(response["data"], key=lambda item: item["index"])
    movie_embeddings.extend(item["embedding"] for item in ordered_items)

In [0]:
# Sanity check: there should be exactly one embedding per movie record.
tuple(len(collection) for collection in (movies, movie_embeddings))

Out[44]: (6235, 6235)

#### Format the Movie Metadata + Embeddings as Vectors and Insert into the Pinecone Index

In [0]:
def _nan_to_sentinel(value, sentinel=-1):
    """Return `sentinel` when `value` is NaN, otherwise return `value` unchanged."""
    return sentinel if np.isnan(value) else value


# Build one Pinecone vector per movie: the tmdb_id as the vector id, the embedding as
# the values, and the movie attributes as metadata. Missing numeric values (NaN) are
# replaced with -1. The metadata key "text" is listed once only; the original dict
# literal repeated it, and a repeated key silently overrides the earlier entry.
movie_vectors = [
    {
        "id": movie["tmdb_id"],
        "values": embedding,
        "metadata": {
            "title": movie["title"],
            "release_date": movie["release_date"],
            "runtime": movie["runtime"],
            "text": movie["text"],
            "budget": _nan_to_sentinel(movie["budget"]),
            "revenue": _nan_to_sentinel(movie["revenue"]),
            "popularity": movie["popularity"],
            "vote_average": _nan_to_sentinel(movie["vote_average"]),
            "vote_count": _nan_to_sentinel(movie["vote_count"]),
            # genres/keywords arrive as numpy object arrays; convert to plain lists.
            "genres": movie["genres"].tolist(),
            "keywords": movie["keywords"].tolist(),
            "overview": movie["overview"],
        },
    }
    for movie, embedding in zip(movies, movie_embeddings)
]
len(movie_vectors)

Out[68]: 6235

In [0]:
# Connect the Pinecone client using credentials from the environment. `pinecone` is
# already imported in the imports cell at the top of the notebook, so it is not
# re-imported here.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ["PINECONE_ENVIRONMENT"],
)

In [0]:
# Index (re)creation, presumably left commented out so a normal run does not drop the
# existing index and its vectors. Uncomment only to rebuild the index from scratch.
# pinecone.delete_index(INDEX_NAME)
# pinecone.create_index(name=INDEX_NAME, dimension=DIMENSION, metric='cosine', pods=1, replicas=1, pod_type="p1")

In [0]:
# Open a handle to the existing index and show its state before inserting anything.
index = pinecone.Index(INDEX_NAME)
stats_before_upsert = index.describe_index_stats()
stats_before_upsert

Out[71]: {'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 900}},
 'total_vector_count': 900}

In [0]:
# Insert (or overwrite by id) all movie vectors, sent to Pinecone 100 at a time.
upsert_response = index.upsert(vectors=movie_vectors, batch_size=100)
upsert_response

Upserted vectors:   0%|          | 0/6235 [00:00<?, ?it/s]

Out[72]: {'upserted_count': 6235}

In [0]:
# Re-read the index statistics to confirm the vector count after the upsert.
stats_after_upsert = index.describe_index_stats()
stats_after_upsert

Out[73]: {'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 6235}},
 'total_vector_count': 6235}

### Spot Check the Embeddings using Similarity Queries

In [0]:
# Free-text description to look up; it is embedded with the same model as the movies
# so that the query and movie vectors are directly comparable.
query = "a romantic comedy with a happy ending"

embedding_response = openai.Embedding.create(
    model="text-embedding-ada-002",
    input=query,
)
query_embedding = embedding_response["data"][0]["embedding"]

# Fetch the ten nearest movies, returning metadata only (no raw vectors).
query_response = index.query(
    vector=query_embedding,
    top_k=10,
    include_values=False,
    include_metadata=True,
)
matches = query_response["matches"]
matches

Out[80]: [{'id': '13476',
  'metadata': {'budget': 17000000.0,
               'genres': ['Comedy', 'Romance'],
               'keywords': ['celebrity',
                            'romantic comedy',
                            'male female relationship',
                            'movie star'],
               'overview': 'A small-town girl wins a date with a Hollywood star '
                           'through a contest. When the date goes better than '
                           'expected, a love triangle forms between the girl, '
                           "the celebrity, and the girl's best friend.",
               'popularity': 5.924226,
               'release_date': datetime.date(2004, 1, 23),
               'revenue': 16980098.0,
               'runtime': 95.0,
               'text': 'Comedy, Romance | celebrity, romantic comedy, male '
                       'female relationship, movie star | A small-town girl '
                       'wins a date with a Hollywood star throug