In [5]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import csv


In [7]:

# Load movie data
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
file_path = "/Users/mohitbhoir/Git/Movie_Recommendation_Chatbot/constant/output_movies_copy.txt"
# The file is caret-delimited with every field quoted.
df = pd.read_csv(file_path, sep="^", quoting=csv.QUOTE_ALL)
# Ensure no NaN values in primaryTitle (the encoder needs plain strings)
df["primaryTitle"] = df["primaryTitle"].fillna("").astype(str)

# Initialize embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings
# Encode all titles in one call so the model can batch them instead of running
# one forward pass per title. Row i of `embeddings` corresponds to row i of `df`.
embeddings = model.encode(df["primaryTitle"].tolist(), convert_to_numpy=True)

In [8]:
# Initialize an exact (brute-force) FAISS index over the title embeddings.
# IndexFlatL2 ranks by Euclidean (L2) distance and does not normalise vectors itself.
# NOTE(review): the ranking only matches cosine similarity if the embeddings are
# unit-length; confirm the model emits normalised vectors, or normalise them first.
d = embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)
# FAISS expects C-contiguous float32 input; this is a no-op when that already holds.
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))  # Add vectors to FAISS

In [13]:
import sys
# Estimate the footprint of the raw vectors:
# (number of vectors) x (components per vector) x (bytes per component).
total_vectors = embeddings.shape[0]
vector_size_bytes = d * embeddings.dtype.itemsize  # Bytes occupied by one vector
total_index_size_bytes = total_vectors * vector_size_bytes
# Only the vector payload is counted; anything else an index type stores is not.
print(f"Index size (bytes): {total_index_size_bytes}")
print(f"Index size (MB): {total_index_size_bytes / (1024 * 1024):.2f}")
print(f"Index size (GB): {total_index_size_bytes / (1024 * 1024 * 1024):.2f}")
# NOTE(review): sys.getsizeof presumably measures only the Python-side wrapper
# object, not the vectors held by the index, so it is not comparable to the above.
print(f"\n\nIndex size (bytes) using sys.getsizeof: {sys.getsizeof(index)}")


Index size (bytes): 7680000
Index size (MB): 7.32
Index size (GB): 0.01


Index size (bytes) using sys.getsizeof: 48


In [11]:
# Define a search function
def query_movie(query_text, top_k=5):
    """Return the rows of ``df`` whose titles are nearest to ``query_text``.

    The query is embedded with the module-level ``model`` and searched against
    the module-level FAISS ``index``. Hits are mapped back to ``df`` by row
    position, which relies on the index having been built in ``df`` row order.

    Args:
        query_text: Free-text query to embed.
        top_k: Maximum number of neighbours to request from the index.

    Returns:
        A DataFrame slice of ``df`` ordered from nearest to farthest. It holds
        fewer than ``top_k`` rows when the index has fewer vectors than that.
    """
    # FAISS expects a 2-D float32 batch, so present the single query as shape (1, dim).
    query_embedding = np.asarray(model.encode(query_text), dtype=np.float32).reshape(1, -1)
    _, indices = index.search(query_embedding, top_k)
    hits = indices[0]
    # FAISS pads missing neighbours with -1; without this filter, iloc[-1]
    # would silently return the last row of df as a bogus match.
    hits = hits[hits >= 0]
    return df.iloc[hits]  # Retrieve matching movies

In [12]:
# Smoke-test the search with a sample free-text query and show the matches
query_text = "horror movies"
results = query_movie(query_text)
# A single print with a newline separator emits the header line, then the frame.
print("Query results:", results, sep="\n")

Query results:
         tconst titleType           primaryTitle           originalTitle  \
476   tt0175142     movie            Scary Movie             Scary Movie   
2554  tt0244941     movie             Horror 101              Horror 101   
3323  tt0257106     movie          Scary Movie 2           Scary Movie 2   
2165  tt0238276     movie  Home the Horror Story  Home: The Horror Story   
237   tt0154605     movie            Home Movies             Home Movies   

      isAdult  startYear  endYear runtimeMinutes             genres  \
476      True     2000.0      NaN             88             Comedy   
2554     True     2001.0      NaN             89             Horror   
3323     True     2001.0      NaN             83      Comedy,Horror   
2165     True     2000.0      NaN             86      Comedy,Horror   
237      True     2000.0      NaN             \N  Documentary,Music   

      averageRating  numVotes  \
476             6.3  300465.0   
2554            3.4     338.0   
33