# Movie Recommendation System

### Dataset

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Load the movie table from the CSV in ../data. The preview below shows it
# carries (among others) the `name` and `text` columns.
movies_df = pd.read_csv('../data/movies.csv')
# Show the first rows as a quick sanity check of the loaded frame.
movies_df.head()

Unnamed: 0,index,wikipedia_id,name,text
0,0,975900,Ghosts of Mars,Ghosts of Mars. Set in the second half of the ...
1,1,18998739,The Sorcerer's Apprentice,The Sorcerer's Apprentice. Every hundred years...
2,2,21926710,White on Rice,White on Rice. Jimmy ([[Hiroshi Watanabe love...
3,3,20604092,Anbu Thozhi,Anbu Thozhi. The film opens with a simpleton f...
4,4,156558,Baby Boy,Baby Boy. A young 20-year-old named Jody live...


### Embeddings

In [3]:
from transformers import pipeline

In [4]:
# Model checkpoint handed to the feature-extraction pipeline; the trailing
# comment keeps an alternative checkpoint that was tried.
CHECKPOINT = "bert-base-uncased"  # "facebook/bart-base"
# Vector dimension used when the FAISS index is constructed.
# NOTE(review): presumably this must equal the hidden size of CHECKPOINT --
# confirm whenever the checkpoint is swapped.
EMBEDDING_SIZE = 768
# NOTE(review): not referenced by any cell visible in this notebook section;
# looks like the intended maximum token length -- confirm or remove.
MAX_INPUT_SIZE = 512
# On-disk cache (.npy) for the computed movie embeddings.
EMBEDDINGS_PATH = "../data/embeddings.npy"

In [5]:
# Build the Hugging Face feature-extraction pipeline (PyTorch backend) for
# CHECKPOINT. It is called on raw text both when embedding the movies and when
# embedding the search query.
# NOTE(review): `padding`/`truncation` are passed at construction time --
# confirm the installed transformers version forwards them to the tokenizer.
feature_extractor = pipeline("feature-extraction", framework="pt", model=CHECKPOINT, padding=True, truncation=True)

In [6]:
def get_embeddings():
    """Return one embedding vector per movie, computing them only if not cached.

    If ``EMBEDDINGS_PATH`` already exists it is loaded with ``np.load`` and
    returned as-is. Otherwise every entry of ``movies_df['text']`` is passed
    through ``feature_extractor``; the first element of the pipeline output is
    averaged over axis 0 (presumably the token axis -- confirm against the
    pipeline's output layout), and the stacked result is saved to
    ``EMBEDDINGS_PATH`` before being returned.

    Returns:
        numpy.ndarray: the stacked embeddings, one row per movie. Both the
        cached and the freshly computed branch return an array.
    """
    # Cheap path first: reuse the cached array so the model is not re-run.
    if os.path.exists(EMBEDDINGS_PATH):
        return np.load(EMBEDDINGS_PATH)

    # The frame loaded by this notebook is `movies_df`; there is no `df`.
    embeddings = np.array([
        feature_extractor(text, return_tensors="pt")[0].numpy().mean(axis=0)
        for text in movies_df['text'].values
    ])
    np.save(EMBEDDINGS_PATH, embeddings)
    return embeddings

In [7]:
# Materialise the movie embeddings (loaded from EMBEDDINGS_PATH when that file
# exists, otherwise computed and cached by get_embeddings).
embeddings = get_embeddings()

### Facebook AI Similarity Search

In [9]:
import faiss

In [10]:
# File the FAISS index is written to and read back from.
INDEX_PATH = "../data/movies.index"

In [11]:
def get_index(index_path, refresh=False, vectors=None):
    """Load the FAISS index stored at ``index_path`` or (re)build and save it.

    Args:
        index_path: File the index is read from / written to.
        refresh: When true, rebuild the index even if ``index_path`` exists,
            overwriting the file.
        vectors: Optional array-like of row vectors to index. When omitted,
            the notebook-level ``embeddings`` variable is used, which keeps
            existing ``get_index(path, refresh)`` calls working unchanged.

    Returns:
        The FAISS index, either read from disk or freshly built as an
        ``IndexFlatIP`` of dimension ``EMBEDDING_SIZE``.

    NOTE(review): vectors are added without any normalisation, so search
    scores are raw inner products rather than cosine similarities -- confirm
    that this is the intended metric.
    """
    # Reuse the persisted index unless the caller explicitly asks for a rebuild.
    if not refresh and os.path.exists(index_path):
        return faiss.read_index(index_path)

    source = embeddings if vectors is None else vectors
    # FAISS expects a float32 matrix; make the conversion (and layout) explicit.
    matrix = np.ascontiguousarray(source, dtype="float32")

    index = faiss.IndexFlatIP(EMBEDDING_SIZE)  # faiss.IndexFlatL2(EMBEDDING_SIZE)
    index.add(matrix)
    faiss.write_index(index, index_path)
    return index

In [12]:
# refresh=True forces a rebuild from the in-memory embeddings, so any index
# file already at INDEX_PATH is overwritten rather than read.
index = get_index(INDEX_PATH, refresh=True)

In [13]:
# Number of nearest neighbours requested from index.search for each query.
k = 5

In [14]:
# Free-text query to look up in the movie index.
query_text = 'Spiderman marvel superheroes'

# Embed the query the same way the movies are embedded: take the first element
# of the pipeline output and average it over axis 0.
query_vector = feature_extractor(query_text, return_tensors = "pt")[0].numpy().mean(axis=0)

# index.search takes a 2-D float32 batch, so wrap the single vector in a list.
test_sample = np.array([query_vector]).astype('float32')

# Row 0 of each result corresponds to the single query above.
distances, indices = index.search(test_sample, k)

In [15]:
# Map the positions returned for the (single) query back to movie names;
# indices[0] is the result row for that query and is used positionally.
movies_df.iloc[indices[0]]['name'].values

array(['Seeds of Arkham', 'Powers', 'Batman: Revenge',
       'Superman vs. The Elite', 'Grim Reaper'], dtype=object)