In [1]:
import pandas as pd

bunch = pd.read_csv("https://raw.githubusercontent.com/madmaxeatfax/fellini/master/imdb250.csv")

In [2]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import time

class LemmaTokenizer:
    def  __init__(self):
        self.nlp = spacy.load("en")
    def __call__(self, text):
        return [
            token.lemma_ for token in self.nlp(text)
            if not (token.is_stop or token.is_punct or token.is_digit)
        ]

start_time = time.time()

vec = TfidfVectorizer(max_features=50_000, tokenizer=LemmaTokenizer())
data = \
    bunch.Title + " " + bunch.Crew + " " + bunch.Plot + " " + \
    bunch.Tags + " " + bunch.Country + " " + bunch.Reviews

# print([i for i, x in enumerate(data.isna()) if x == True])

features = vec.fit_transform(data)
print(f'Training time {time.time() - start_time}s, features = {len(vec.get_feature_names())}')

Training time 117.49183487892151s, features = 30968


In [3]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(n_neighbors=10, metric='cosine')
knn.fit(features)

NearestNeighbors(metric='cosine', n_neighbors=10)

In [4]:
start_time = time.time()

search_query = "movie about banker that killed wife"
query_vector = vec.transform([search_query])
print(f'Query = {search_query}\n')

distances, neighbors = knn.kneighbors(query_vector, n_neighbors=5, return_distance=True)
    
for dist, neighbor_idx in zip(distances[0], neighbors[0]):
    print(*[
        bunch.Title[neighbor_idx],
        f'Distance = {dist}  Neighbor idx = {neighbor_idx}',
        bunch.Plot[neighbor_idx][:200], 
        bunch.Crew[neighbor_idx],
        "-"*200
    ], sep="\n")

print(f'\nQuery execution time - {time.time() - start_time:.5f} sec')


Query = movie about banker that killed wife

The Shawshank Redemption
Distance = 0.9119878791338835  Neighbor idx = 0
Chronicles the experiences of a formerly successful banker as a prisoner in the gloomy jailhouse of Shawshank after being found guilty of a crime he did not commit. The film portrays the man's unique 
Frank Darabont (dir.), Tim Robbins, Morgan Freeman, Stephen King, Frank Darabont
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Memento
Distance = 0.9303349687847877  Neighbor idx = 54
Memento chronicles two separate stories of Leonard, an ex-insurance investigator who can no longer build new memories, as he attempts to find the murderer of his wife, which is the last thing he remem
Christopher Nolan (dir.), Guy Pearce, Carrie-Anne Moss, Christopher Nolan, Jonathan Nolan
---------------------------------------------------