In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the combined movie table from the CSV that sits next to the notebook.
df = pd.read_csv("combined_data.csv")

# Keep only the three columns this notebook works with, then preview them.
clean = df.loc[:, ['title', 'genres', 'overview']]
clean.head()

Unnamed: 0,title,genres,overview
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...","Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",Just when George Banks has recovered from his ...


In [3]:
# Drop movies that have no overview text, then renumber the rows 0..n-1 so
# index labels equal row positions. The per-row `vectors` list built from this
# frame is positional, so any label-based lookup into it would point at the
# wrong movie once rows have been dropped and the labels contain gaps.
movies_df = clean.dropna(subset=["overview"]).reset_index(drop=True)

# Sanity check: should display False (no missing overviews remain).
movies_df['overview'].isnull().values.any()

False

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

In [5]:
def vectorize_overview(overview, model):
    """Embed an overview string as the average of its known word vectors.

    The text is lower-cased and split on whitespace. Tokens that are not keys
    of ``model.wv.key_to_index`` are ignored.

    Parameters
    ----------
    overview : str
        Free-text movie description.
    model
        Object exposing ``wv`` (supports ``wv[token]`` and ``wv.key_to_index``)
        and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all recognised tokens, or an
        all-zero vector of length ``model.vector_size`` when no token is
        recognised.
    """
    tokens = overview.lower().split()
    known_vectors = [
        model.wv[token] for token in tokens if token in model.wv.key_to_index
    ]

    # No vocabulary hit at all: fall back to a zero vector.
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Load the previously trained Word2Vec model saved next to the notebook.
# NOTE(review): gensim model files are presumably unpickled on load, so only
# load models from a trusted source -- confirm against the gensim docs.
model = Word2Vec.load('./training.model')

In [6]:
# One embedding per row of movies_df, in row order: vectors[i] belongs to the
# i-th row (by position) of movies_df.
vectors = [
    vectorize_overview(overview_text, model)
    for overview_text in movies_df['overview']
]

In [42]:
#def find_similar_movies(title, movies_df, vectors):
    #index = movies_df[movies_df['title'] == title].index[0]
    #similarity_scores = cosine_similarity([vectors[index]], vectors)
    #similar_indices = similarity_scores.argsort()[0][::-1][1:]
    #similar_movies = movies_df.iloc[similar_indices]['title']
    #print(similar_movies.to_string(index=False))

In [7]:
def find_similar_movies(title, movies_df, vectors):
    """Rank all other movies by cosine similarity to the movie named `title`.

    Parameters
    ----------
    title : str
        Exact value to look up in ``movies_df['title']``. When several rows
        share the title, the first one (in row order) is used as the query.
    movies_df : pandas.DataFrame
        Frame with a ``'title'`` column whose row order matches `vectors`.
    vectors : sequence of 1-D arrays
        ``vectors[i]`` is the embedding of the i-th row (by position) of
        `movies_df`.

    Returns
    -------
    list of (title, score) tuples
        Every row except the query row, sorted by descending similarity.

    Raises
    ------
    IndexError
        If no row of `movies_df` has the requested title.
    """
    # `vectors` is positional, so the query must be located by row *position*.
    # Index labels stop matching positions as soon as rows have been dropped
    # from the frame (e.g. by dropna), and using a label here would silently
    # pick another movie's vector or run past the end of `vectors`.
    match_positions = np.flatnonzero((movies_df['title'] == title).to_numpy())
    if match_positions.size == 0:
        raise IndexError(f"no movie titled {title!r} in movies_df")
    query_pos = int(match_positions[0])

    similarity_scores = cosine_similarity([vectors[query_pos]], vectors)[0]

    # Stable descending order. The query row is removed explicitly rather than
    # by slicing off the first hit: with duplicate overviews or all-zero
    # vectors the query is not guaranteed to rank first.
    ranked_positions = np.argsort(-similarity_scores, kind='stable')

    # Pull the titles out once instead of doing one .iloc lookup per row.
    titles = movies_df['title'].to_numpy()
    return [
        (titles[pos], similarity_scores[pos])
        for pos in ranked_positions
        if pos != query_pos
    ]

In [8]:
# Score every other movie against 'Casper', keep the ten best matches and
# print them as "<title> <score>", best match first.
similar_movies = find_similar_movies('Casper', movies_df, vectors)
similar_movies.sort(key=lambda pair: pair[1], reverse=True)
similar_movies = similar_movies[:10]
for movie, score in similar_movies:
    print(movie, score)

The Prisoner 0.6965958226159421
Time Indefinite 0.691141425202122
Beat the Drum 0.6910091417071138
Klippers 0.6845299469227315
When I Was Alive 0.6780471167608366
The Price 0.6752253527693317
The Enemy Within 0.665998547563657
Roads to the South 0.660525888563841
A Good Boy 0.6590439145990769
The Kovak Box 0.6587160318424365
