In [9]:
import pandas as pd
import numpy as np
import os
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the preprocessed IMDB Top 1000 data; the file name has no extension
# but its contents are parsed as CSV.
df = pd.read_csv("IMDB_Top_1000_Dataset_Preprocessed")

In [3]:
# Preview the first row to check which columns were loaded.
df.head(1)

Unnamed: 0,Series_Title,Combined_Text
0,The Shawshank Redemption,Title: The Shawshank Redemption | Genre: Drama...


In [4]:

# Shared API client, constructed with no explicit arguments.
# NOTE(review): presumably credentials are picked up from the environment
# (e.g. OPENAI_API_KEY) — confirm against the deployment setup.
client = OpenAI()


def get_embedding(data):
    """Return the embedding vector for ``data`` from ``text-embedding-3-small``.

    Sends one embeddings request through the module-level ``client`` and
    returns the ``embedding`` attribute of the first item in the response.
    """
    result = client.embeddings.create(input=data, model="text-embedding-3-small")
    first_item = result.data[0]
    return first_item.embedding

In [5]:
def parallel_get_embeddings(data):
    """Embed every item of ``data`` concurrently.

    Each item is handed to ``get_embedding`` on a default-sized thread pool.
    The returned list holds one result per item, in the same order as ``data``.
    """
    pool = ThreadPoolExecutor()
    with pool:
        # map() yields results in input order; list() drains them before the
        # pool is shut down on leaving the with-block.
        return list(pool.map(get_embedding, data))

In [6]:
# Work on a copy so the embedding column is added without touching the
# frame that was loaded from disk.
df_embed=df.copy()

In [7]:
# Embed every row's Combined_Text. parallel_get_embeddings calls get_embedding
# once per row, so this cell issues one embeddings request per row.
df_embed["Embedding"] = parallel_get_embeddings(df["Combined_Text"])

In [8]:
# Preview the first row to confirm the Embedding column was added.
df_embed.head(1)

Unnamed: 0,Series_Title,Combined_Text,Embedding
0,The Shawshank Redemption,Title: The Shawshank Redemption | Genre: Drama...,"[-0.041567977517843246, 0.005746891722083092, ..."


In [None]:
def recommend_movies(movie_title, df, top_n=5):
    """Return the ``top_n`` titles whose embeddings are closest to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact value to look up in the ``Series_Title`` column.
    df : pandas.DataFrame
        Frame with a ``Series_Title`` column and an ``Embedding`` column
        holding one equal-length numeric vector per row. It is not modified.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or None
        ``Series_Title`` and ``Similarity`` columns, sorted by descending
        cosine similarity and excluding every row whose title equals
        ``movie_title``. ``None`` (after printing "Not in Dataset") when the
        title is not present in ``df``.
    """
    title_matches = df["Series_Title"] == movie_title

    if not title_matches.any():
        print("Not in Dataset")
        return None

    # If several rows share the title, the first one is used as the query.
    movie_embedding = np.asarray(
        df.loc[title_matches, "Embedding"].iloc[0]
    ).reshape(1, -1)

    # Stack the per-row vectors into a single (n_rows, dim) matrix.
    all_embeddings = np.vstack(df["Embedding"].tolist())

    similarities = cosine_similarity(movie_embedding, all_embeddings)[0]

    # Score a copy via assign() so the caller's frame does not gain (or have
    # overwritten) a "Similarity" column as a side effect of this call.
    scored = df.assign(Similarity=similarities)
    recommendations = (
        scored[~title_matches]
        .sort_values(by="Similarity", ascending=False)
        .head(top_n)
    )

    return recommendations[["Series_Title", "Similarity"]]


In [18]:
# Example query: titles most similar to "The Shining" (top_n left at its default of 5).
recommend_movies("The Shining",df_embed)

Unnamed: 0,Series_Title,Similarity
441,The Killing,0.50829
145,Shutter Island,0.505015
844,Halloween,0.503851
49,Psycho,0.492004
271,The Thing,0.488668
