# In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.preprocessing import LabelEncoder
import faiss

# Load the movie table from the project's preprocessing folder.
# The relative path uses forward slashes so that it resolves on Windows as
# well as on Linux/macOS, where a backslash is not a path separator.
df = pd.read_csv("MovieRecommender/DataPreprocess/cleaned_movies.csv")


# Pretrained uncased BERT tokenizer and encoder, both loaded from the
# 'bert-base-uncased' checkpoint; they are used below to embed text columns.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')


def get_bert_embeddings(text, tokenizer, bert_model):
    """Return the first-token hidden state of ``text`` as a NumPy vector.

    Args:
        text: Value to embed. It is converted with ``str()`` first, so
            non-string values (numbers, ``None``, NaN) are embedded as their
            string representation.
        tokenizer: Callable that turns a string into a mapping of model
            inputs. It is invoked with ``return_tensors="pt"``, truncation and
            padding enabled, and ``max_length=512``.
        bert_model: Callable that accepts the tokenizer output as keyword
            arguments and returns an object exposing ``last_hidden_state``.

    Returns:
        The ``last_hidden_state`` entry for the first sequence and the first
        token position, converted to a NumPy array. With a BERT tokenizer that
        position is presumably the ``[CLS]`` token -- confirm against the
        tokenizer in use.
    """
    encoded = tokenizer(
        str(text),
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )

    # Inference only: skip gradient tracking so the result can be converted
    # to NumPy directly.
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state

    first_token_vector = hidden_states[0, 0]
    return first_token_vector.numpy()

# Embed the two free-text columns with BERT, one row at a time.
# NOTE(review): 'Plot Kyeword' and 'Generes' are spelled exactly as this
# script reads them; presumably they mirror the CSV headers -- confirm before
# "correcting" them.
df['overview_embeddings'] = df['Overview'].apply(
    get_bert_embeddings, args=(tokenizer, bert_model)
)
df['plot_keywords_embeddings'] = df['Plot Kyeword'].apply(
    get_bert_embeddings, args=(tokenizer, bert_model)
)


# Map every distinct value of the genre column to an integer id.
label_encoder = LabelEncoder()
df['genre_encoded'] = label_encoder.fit_transform(df['Generes'])

# Look each genre id up in an embedding table with one 50-dimensional row per
# distinct genre value.
# NOTE(review): the table is freshly constructed and never trained in the
# visible code, and no random seed is set here, so the genre vectors are
# presumably different on every run -- confirm this is intended.
embedding_dim = 50
genre_embeddings = torch.nn.Embedding(
    num_embeddings=len(label_encoder.classes_),
    embedding_dim=embedding_dim,
)
df['genre_embeddings'] = df['genre_encoded'].map(
    lambda code: genre_embeddings(torch.tensor(code)).detach().numpy()
)


def combine_embeddings(row):
    """Join a row's three embedding vectors into one flat feature vector.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing the
            keys ``'overview_embeddings'``, ``'plot_keywords_embeddings'`` and
            ``'genre_embeddings'``.

    Returns:
        A NumPy array holding the overview, plot-keyword and genre vectors
        concatenated in that order.
    """
    feature_columns = (
        'overview_embeddings',
        'plot_keywords_embeddings',
        'genre_embeddings',
    )
    return np.concatenate([row[column] for column in feature_columns])

# Build one feature vector per movie by joining its overview, plot-keyword and
# genre embeddings end to end.
df['combined_embeddings'] = df.apply(combine_embeddings, axis=1)

# Stack the per-row vectors into a single float32 matrix (one row per movie),
# which is the input format handed to the FAISS index below.
embedding_matrix = np.vstack(df['combined_embeddings'].values).astype('float32')


# Locality-sensitive-hashing index over all movies, using as many hash bits
# as there are input dimensions.
d = embedding_matrix.shape[1]
nbits = d
index = faiss.IndexLSH(d, nbits)


index.add(embedding_matrix)


# The demo queries are the first rows of the indexed matrix, so every query is
# itself stored in the index and normally comes back as one of its own
# neighbours. Request one extra neighbour so the query can be dropped while
# still leaving top_k genuine recommendations.
top_k = 5
num_queries = 3
all_distances, all_neighbors = index.search(embedding_matrix[:num_queries], top_k + 1)

# D and I keep the (queries, 5) shape they have always had, in case later code
# reads them; they are the leading top_k columns of the wider search result.
D, I = all_distances[:, :top_k], all_neighbors[:, :top_k]


# The column check does not change between iterations, so do it once.
has_title = 'Title' in df.columns

for query_idx, neighbor_row in enumerate(all_neighbors):
    # Drop the query movie itself, and drop negative labels: FAISS uses -1 as
    # a placeholder when it cannot return as many neighbours as requested, and
    # df.iloc[-1] would silently report the last movie instead.
    similar_indices = [
        idx for idx in neighbor_row if idx >= 0 and idx != query_idx
    ][:top_k]

    print(f"\nTop {top_k} similar movies to index {query_idx}:")
    for rank, idx in enumerate(similar_indices):
        title = df.iloc[idx]['Title'] if has_title else f"Index {idx}"
        print(f"  {rank+1}. {title} (index {idx})")
