In [2]:
import openai
from dotenv import dotenv_values
# Read key/value pairs from the .env file located one directory above this notebook.
config = dotenv_values("./../.env")
# Subscripting (rather than .get) means a missing OPENAI_API_KEY fails here, at setup time.
openai.api_key = config["OPENAI_API_KEY"]

In [3]:
import pandas as pd
import numpy as np
from tenacity import retry, wait_random_exponential, stop_after_attempt
import pickle
import tiktoken

Dataset
Download it [here](https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots). 

Extract the archive and set `dataset_path` in the next cell to the location of the extracted CSV file.

In [4]:
# Location of the extracted Kaggle CSV, relative to this notebook; edit if the file lives elsewhere.
dataset_path = "./../wiki_movie_plots_deduped.csv"
# Load the whole CSV into a DataFrame; later cells use its "Origin/Ethnicity", "Release Year" and "Plot" columns.
df = pd.read_csv(dataset_path)

In [5]:
# Keep only rows whose origin is "Japanese", order them newest release year first,
# and retain the first 100 rows of that ordering.
movies = (
    df.loc[df["Origin/Ethnicity"] == "Japanese"]
    .sort_values("Release Year", ascending=False)
    .head(100)
)

In [6]:
# Plot texts of the selected movies, in the same order as the rows of `movies`.
movie_plots = movies["Plot"].values

Cost estimation with tiktoken

In [7]:
# Look up the tiktoken encoding associated with the embedding model used later in this notebook.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [8]:
# Sum of the encoded length of every plot; a generator avoids building a throwaway list.
total_tokens = sum(len(enc.encode(plot)) for plot in movie_plots)

In [9]:
# Price in USD per 1,000 tokens assumed by this estimate for text-embedding-ada-002.
# NOTE(review): pricing changes over time -- confirm against the current OpenAI price list.
PRICE_PER_1K_TOKENS_USD = .0004

# Convert the per-1K price to a per-token price and scale by the token count.
# (A bare `total_tokens` expression used to precede this line; it displayed nothing
# because only a cell's last expression is echoed, so it has been removed.)
cost = total_tokens * (PRICE_PER_1K_TOKENS_USD / 1000)
print(f"Estimated cost ${cost:.2f}")

Estimated cost $0.02


Generating the embeddings

In [10]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text, model="text-embedding-ada-002"):
    """Request the embedding of `text` from the OpenAI embeddings endpoint.

    Args:
        text: the string to embed; newlines are replaced with spaces first.
        model: name of the embedding model passed to the API.

    Returns:
        The "embedding" entry of the first item in the response's "data" list.

    The call is wrapped in a tenacity retry that stops after 6 attempts and
    waits between attempts using `wait_random_exponential(min=1, max=20)`.
    """
    # Flatten newlines; the original author notes they can negatively affect performance.
    flattened = text.replace("\n", " ")

    response = openai.Embedding.create(input=flattened, model=model)
    first_result = response["data"][0]
    return first_result["embedding"]

In [11]:
# Embeddings are memoised so the same text is never sent to the API twice.
# The cache is a dict keyed by (text, model) tuples whose values are embeddings,
# persisted on disk as a pickle file.

# where the pickled cache lives
embedding_cache_path = "movie_embeddings_cache2.pkl"

# reuse an existing cache file when there is one, otherwise start empty
try:
    embedding_cache = pd.read_pickle(embedding_cache_path)
except FileNotFoundError:
    embedding_cache = {}

# write the cache straight back out, so the file exists on disk from this point on
with open(embedding_cache_path, "wb") as cache_file:
    pickle.dump(embedding_cache, cache_file)

# define a function to retrieve embeddings from the cache if present, and otherwise request via the API
def embedding_from_string(
    string,
    model="text-embedding-ada-002",
    embedding_cache=embedding_cache
):
    """Look up the embedding for `string`, calling the API only on a cache miss.

    Args:
        string: the text to embed.
        model: embedding model name; part of the cache key.
        embedding_cache: dict mapping (string, model) to an embedding. It is
            mutated in place on a miss. Defaults to the notebook-level cache.

    Returns:
        The cached (or freshly fetched) embedding for (string, model).

    On a miss the embedding is fetched with `get_embedding`, a notice is
    printed, and the whole cache is re-pickled to `embedding_cache_path`.
    """
    cache_key = (string, model)

    # Fast path: this text/model pair has been embedded before.
    if cache_key in embedding_cache:
        return embedding_cache[cache_key]

    embedding_cache[cache_key] = get_embedding(string, model)
    print(f"GOT EMBEDDING FROM OPENAI FOR {string[:20]}")

    # Persist immediately so an interrupted run keeps what it already fetched.
    with open(embedding_cache_path, "wb") as cache_file:
        pickle.dump(embedding_cache, cache_file)

    return embedding_cache[cache_key]

In [12]:
# One embedding per plot, in the same order as `movie_plots`; cached entries are reused.
plot_embeddings = [
    embedding_from_string(plot_text, model="text-embedding-ada-002")
    for plot_text in movie_plots
]

GOT EMBEDDING FROM OPENAI FOR The story takes plac
GOT EMBEDDING FROM OPENAI FOR The story takes plac
GOT EMBEDDING FROM OPENAI FOR The story revolves a
GOT EMBEDDING FROM OPENAI FOR The story is set in 
GOT EMBEDDING FROM OPENAI FOR The film begins when
GOT EMBEDDING FROM OPENAI FOR The story focuses on
GOT EMBEDDING FROM OPENAI FOR Free! Iwatobi Swim C
GOT EMBEDDING FROM OPENAI FOR Rei Kiriyama is a 17
GOT EMBEDDING FROM OPENAI FOR Following the events
GOT EMBEDDING FROM OPENAI FOR Manji is a samurai o
GOT EMBEDDING FROM OPENAI FOR The Kaitei Supreme H
GOT EMBEDDING FROM OPENAI FOR Zash Caine (ザッシュ・ケイン
GOT EMBEDDING FROM OPENAI FOR Onigawara, the direc
GOT EMBEDDING FROM OPENAI FOR On an alternate Eart
GOT EMBEDDING FROM OPENAI FOR Momo Adachi is a for
GOT EMBEDDING FROM OPENAI FOR In the distant techn
GOT EMBEDDING FROM OPENAI FOR Kai Ashimoto (足元カイ, 
GOT EMBEDDING FROM OPENAI FOR The story is set in 
GOT EMBEDDING FROM OPENAI FOR The plot of Napping 
GOT EMBEDDING FROM OPENAI FOR T

Recommending movies by plot

In [13]:
from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances

In [16]:
def print_recommendations_from_strings(
    strings,
    index_of_source_string,
    k_nearest_neighbors=3,
    model="text-embedding-ada-002"
):
    """Print the nearest neighbours of one string, ranked by embedding distance.

    Args:
        strings: indexable collection of texts; every one is embedded via
            `embedding_from_string`.
        index_of_source_string: position in `strings` of the query text.
        k_nearest_neighbors: maximum number of matches to print.
        model: embedding model name, forwarded to `embedding_from_string`.

    Returns:
        None. For each match the rank, the distance and the text are printed.
    """
    # Get all of the embeddings with the requested model. Without forwarding
    # `model`, a non-default value would be silently ignored.
    embeddings = [embedding_from_string(string, model=model) for string in strings]

    # get embedding for our specific query string
    query_embedding = embeddings[index_of_source_string]

    # get distances between our embedding and all other embeddings
    # (the helper's default distance metric is used)
    distances = distances_from_embeddings(query_embedding, embeddings)

    # get indices of the nearest neighbors
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    query_string = strings[index_of_source_string]
    match_count = 0
    for i in indices_of_nearest_neighbors:
        # Skip the query itself, and any entry whose text is identical to it.
        if query_string == strings[i]:
            continue
        # Stop once the requested number of matches has been printed.
        if match_count >= k_nearest_neighbors:
            break
        match_count += 1
        print(f"Found {match_count} closest match: ")
        print(f"Distance of: {distances[i]} ")
        print(strings[i])

In [23]:
# Show the closest plots to the movie at position 1 of `movie_plots`.
print_recommendations_from_strings(strings=movie_plots, index_of_source_string=1)

Found 1 closest match: 
Distance of: 0.1421945034273181 
The story is set in Sakurada (咲良田), a town where everyone possesses special abilities. The protagonist is Kei Asai, a boy with eidetic memory. Upon Sumire Soma's urging, Kei meets Misora Haruki, a quiet, withdrawn girl who can reset the world up to three days in the past. Thanks to his ability, Kei can remember the time Misora has reset, and thus Sumire suggests that Kei becomes the companion of Haruki to help her use her abilities for good leading to them becoming part of the Service Club—a club that accomplishes tasks given to them. The Service Club then becomes involved with the Administration Bureau, an organization that monitors all special abilities in Sakurada and orchestrates events in Sakurada according to their directives, utilizing the 'Witch' at their disposal. Unknown to the Service Club, their involvement with the Bureau puts them into a larger chain of events that only the Witch knows.
Found 2 closest match: 
Dista