In [10]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from pathlib import Path

PROCESSED_PATH = Path('../data/processed/movies_with_plots.csv')
MODELS_DIR = Path('../models')
MODELS_DIR.mkdir(exist_ok=True)

df = pd.read_csv(PROCESSED_PATH)
df['movieId'] = pd.to_numeric(df['movieId'], errors='coerce').astype('Int64')
df['overview'] = df['overview'].fillna("No plot summary available.")
df['genres'] = df['genres'].fillna("(no genres listed)")

# Text for embedding (genres + overview)
df['embed_text'] = (
    (df['genres'].str.replace('|', ' ') + " ") * 6 +   # ← genres ×4
    df['overview']
)
print(f"Loaded {len(df)} movies")
print(df[['title', 'embed_text']].head(3))

Loaded 9734 movies
                     title                                         embed_text
0         Toy Story (1995)  Adventure Animation Children Comedy Fantasy Ad...
1           Jumanji (1995)  Adventure Children Fantasy Adventure Children ...
2  Grumpier Old Men (1995)  Comedy Romance Comedy Romance Comedy Romance C...


In [11]:
# This downloads ~80–90 MB model on first run (be patient)
model = SentenceTransformer('all-mpnet-base-v2')

print("Encoding movie descriptions...")
embeddings = model.encode(
    df['embed_text'].tolist(),
    batch_size=32,               # lower to 16 if memory error
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True    # important for cosine similarity
)

print("Embeddings shape:", embeddings.shape)  # should be (~9734, 384)
np.save(MODELS_DIR / 'content_embeddings.npy', embeddings)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mMPNetModel LOAD REPORT[0m from: sentence-transformers/all-mpnet-base-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Encoding movie descriptions...


Batches:   0%|          | 0/305 [00:00<?, ?it/s]

Embeddings shape: (9734, 768)


In [12]:
d = embeddings.shape[1]  # 384
index = faiss.IndexFlatIP(d)  # Inner Product = cosine for normalized vectors

index.add(embeddings.astype('float32'))
print("FAISS index built with", index.ntotal, "vectors")

faiss.write_index(index, str(MODELS_DIR / 'content_faiss.index'))
print("Index saved")

FAISS index built with 9734 vectors
Index saved


In [13]:
def get_similar_movies_faiss(
    movie_id: int,
    n: int = 10,
    index=index,
    df=df,
    embeddings=embeddings
):
    try:
        idx = df[df['movieId'] == movie_id].index[0]
    except IndexError:
        raise ValueError(f"Movie ID {movie_id} not found")

    query_vec = embeddings[idx].reshape(1, -1).astype('float32')
    distances, indices = index.search(query_vec, n + 1)

    distances = distances[0][1:]   # skip self
    indices = indices[0][1:]

    result = df.iloc[indices][['movieId', 'title', 'genres']].copy()
    result['similarity_score'] = distances.round(4)

    return result.reset_index(drop=True)

In [14]:
print("Toy Story (1)")
display(get_similar_movies_faiss(1, n=10))

print("\nThe Matrix (2751)")
display(get_similar_movies_faiss(2751, n=8))

print("\nInception (79132)")
display(get_similar_movies_faiss(79132, n=8))

Toy Story (1)


Unnamed: 0,movieId,title,genres,similarity_score
0,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,0.9348
1,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,0.8909
2,43869,Curious George (2006),Adventure|Animation|Children|Comedy,0.6854
3,64249,Shrek the Halls (2007),Adventure|Animation|Comedy|Fantasy,0.6701
4,60487,"It's the Great Pumpkin, Charlie Brown (1966)",Animation|Children|Comedy,0.6602
5,145935,"Peanuts Movie, The (2015)",Adventure|Animation|Children|Comedy,0.6586
6,1064,Aladdin and the King of Thieves (1996),Animation|Children|Comedy|Fantasy|Musical|Romance,0.6574
7,81847,Tangled (2010),Animation|Children|Comedy|Fantasy|Musical|Roma...,0.653
8,86298,Rio (2011),Adventure|Animation|Children|Comedy,0.6519
9,78637,Shrek Forever After (a.k.a. Shrek: The Final C...,Adventure|Animation|Children|Comedy|Fantasy|IMAX,0.6512



The Matrix (2751)


Unnamed: 0,movieId,title,genres,similarity_score
0,5008,Witness for the Prosecution (1957),Drama|Mystery|Thriller,0.7142
1,1672,"Rainmaker, The (1997)",Drama,0.7051
2,3130,Bonfire of the Vanities (1990),Comedy|Crime|Drama,0.6928
3,75,Big Bully (1996),Comedy|Drama,0.6924
4,116897,Wild Tales (2014),Comedy|Drama|Thriller,0.6895
5,5938,Deathtrap (1982),Comedy|Crime|Mystery|Thriller,0.6806
6,3733,"Paper Chase, The (1973)",Drama,0.6779
7,1594,In the Company of Men (1997),Comedy|Drama,0.6763



Inception (79132)


Unnamed: 0,movieId,title,genres,similarity_score
0,1834,"Spanish Prisoner, The (1997)",Crime|Drama|Mystery|Thriller,0.7522
1,1573,Face/Off (1997),Action|Crime|Drama|Thriller,0.7421
2,183011,The Commuter (2018),Crime|Drama|Mystery|Thriller,0.7301
3,384,Bad Company (1995),Action|Crime|Drama,0.7294
4,3576,"Hidden, The (1987)",Action|Horror|Sci-Fi,0.7257
5,2985,RoboCop (1987),Action|Crime|Drama|Sci-Fi|Thriller,0.7224
6,1396,Sneakers (1992),Action|Comedy|Crime|Drama|Sci-Fi,0.7221
7,52281,Grindhouse (2007),Action|Crime|Horror|Sci-Fi|Thriller,0.7205
