In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from pathlib import Path

PROCESSED_PATH = Path('../data/processed/movies_with_plots.csv')
MODELS_DIR = Path('../models')
MODELS_DIR.mkdir(exist_ok=True)

df = pd.read_csv(PROCESSED_PATH)
df['movieId'] = pd.to_numeric(df['movieId'], errors='coerce').astype('Int64')
df['overview'] = df['overview'].fillna("No plot summary available.")
df['genres'] = df['genres'].fillna("(no genres listed)")

# Text for embedding (genres + overview)
df['embed_text'] = df['genres'].str.replace('|', ' ') + " " + df['overview']
print(f"Loaded {len(df)} movies")
print(df[['title', 'embed_text']].head(3))

Loaded 9734 movies
                     title                                         embed_text
0         Toy Story (1995)  Adventure Animation Children Comedy Fantasy Le...
1           Jumanji (1995)  Adventure Children Fantasy When siblings Judy ...
2  Grumpier Old Men (1995)  Comedy Romance A family wedding reignites the ...


In [2]:
# This downloads ~80â€“90 MB model on first run (be patient)
model = SentenceTransformer('all-MiniLM-L6-v2')

print("Encoding movie descriptions...")
embeddings = model.encode(
    df['embed_text'].tolist(),
    batch_size=32,               # lower to 16 if memory error
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True    # important for cosine similarity
)

print("Embeddings shape:", embeddings.shape)  # should be (~9734, 384)
np.save(MODELS_DIR / 'content_embeddings.npy', embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding movie descriptions...


Batches:   0%|          | 0/305 [00:00<?, ?it/s]

Embeddings shape: (9734, 384)


In [3]:
d = embeddings.shape[1]  # 384
index = faiss.IndexFlatIP(d)  # Inner Product = cosine for normalized vectors

index.add(embeddings.astype('float32'))
print("FAISS index built with", index.ntotal, "vectors")

faiss.write_index(index, str(MODELS_DIR / 'content_faiss.index'))
print("Index saved")

FAISS index built with 9734 vectors
Index saved


In [4]:
def get_similar_movies_faiss(
    movie_id: int,
    n: int = 10,
    index=index,
    df=df,
    embeddings=embeddings
):
    try:
        idx = df[df['movieId'] == movie_id].index[0]
    except IndexError:
        raise ValueError(f"Movie ID {movie_id} not found")

    query_vec = embeddings[idx].reshape(1, -1).astype('float32')
    distances, indices = index.search(query_vec, n + 1)

    distances = distances[0][1:]   # skip self
    indices = indices[0][1:]

    result = df.iloc[indices][['movieId', 'title', 'genres']].copy()
    result['similarity_score'] = distances.round(4)

    return result.reset_index(drop=True)

In [6]:
print("Toy Story (1)")
display(get_similar_movies_faiss(1, n=10))

print("\nThe Matrix (603)")
display(get_similar_movies_faiss(2751, n=8))

print("\nInception (79132)")
display(get_similar_movies_faiss(79132, n=8))

Toy Story (1)


Unnamed: 0,movieId,title,genres,similarity_score
0,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,0.8401
1,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,0.8371
2,239,"Goofy Movie, A (1995)",Animation|Children|Comedy|Romance,0.558
3,134853,Inside Out (2015),Adventure|Animation|Children|Comedy|Drama|Fantasy,0.5568
4,6889,Brother Bear (2003),Adventure|Animation|Children,0.55
5,84637,Gnomeo & Juliet (2011),Adventure|Animation|Children|Comedy|Fantasy|Ro...,0.5491
6,43869,Curious George (2006),Adventure|Animation|Children|Comedy,0.549
7,58299,Horton Hears a Who! (2008),Adventure|Animation|Children|Comedy,0.5483
8,172793,Vovka in the Kingdom of Far Far Away (1965),Adventure|Animation|Children|Fantasy,0.5447
9,109846,Mr. Peabody & Sherman (2014),Adventure|Animation|Comedy,0.5414



The Matrix (603)


Unnamed: 0,movieId,title,genres,similarity_score
0,1672,"Rainmaker, The (1997)",Drama,0.5442
1,3068,"Verdict, The (1982)",Drama|Mystery,0.543
2,2433,"Civil Action, A (1998)",Drama,0.5112
3,628,Primal Fear (1996),Crime|Drama|Mystery|Thriller,0.5098
4,1645,The Devil's Advocate (1997),Drama|Mystery|Thriller,0.5043
5,84392,"Lincoln Lawyer, The (2011)",Crime|Drama|Thriller,0.5029
6,508,Philadelphia (1993),Drama,0.4974
7,5008,Witness for the Prosecution (1957),Drama|Mystery|Thriller,0.4948



Inception (79132)


Unnamed: 0,movieId,title,genres,similarity_score
0,38061,Kiss Kiss Bang Bang (2005),Comedy|Crime|Mystery|Thriller,0.5864
1,100498,"Good Day to Die Hard, A (2013)",Action|Crime|Thriller|IMAX,0.5678
2,4052,Antitrust (2001),Crime|Drama|Thriller,0.5673
3,109850,Need for Speed (2014),Action|Crime|Drama|IMAX,0.5658
4,7108,Crime Story (Zhong an zu) (1993),Action|Crime|Drama,0.5573
5,82461,Tron: Legacy (2010),Action|Adventure|Sci-Fi|IMAX,0.5513
6,6365,"Matrix Reloaded, The (2003)",Action|Adventure|Sci-Fi|Thriller|IMAX,0.5499
7,33794,Batman Begins (2005),Action|Crime|IMAX,0.5494
