In [1]:
import pandas as pd

bunch = pd.read_csv("https://raw.githubusercontent.com/madmaxeatfax/fellini/master/imdb250.csv")

In [2]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import time

class LemmaTokenizer:
    def  __init__(self):
        self.nlp = spacy.load("en")
    def __call__(self, text):
        return [
            token.lemma_ for token in self.nlp(text)
            if not (token.is_stop or token.is_punct or token.is_digit)
        ]

start_time = time.time()

vec = TfidfVectorizer(max_features=50_000, tokenizer=LemmaTokenizer())
data = \
    bunch.Title + " " + bunch.Crew + " " + bunch.Plot + " " + \
    bunch.Tags + " " + bunch.Country + " " + bunch.Reviews

# print([i for i, x in enumerate(data.isna()) if x == True])

features = vec.fit_transform(data)
print(f'Training time {time.time() - start_time}s, features = {len(vec.get_feature_names())}')

Training time 115.94822192192078s, features = 30968


In [3]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(n_neighbors=10, metric='cosine')
knn.fit(features)

NearestNeighbors(metric='cosine', n_neighbors=10)

In [7]:
start_time = time.time()

search_query = "very funny"
query_vector = vec.transform([search_query])
print(f'Query = {search_query}\n')

distances, neighbors = knn.kneighbors(query_vector, n_neighbors=5, return_distance=True)
    
for dist, neighbor_idx in zip(distances[0], neighbors[0]):
    print(*[
        bunch.Title[neighbor_idx],
        f'Distance = {dist}  Neighbor idx = {neighbor_idx}',
        bunch.Plot[neighbor_idx][:200], 
        bunch.Crew[neighbor_idx],
        "-"*200
    ], sep="\n")

print(f'\nQuery execution time - {time.time() - start_time:.5f} sec')


Query = very funny

The Circus
Distance = 0.9167930337795042  Neighbor idx = 246
The Tramp finds himself at a circus where he is promptly chased around by the police who think he is a pickpocket. Running into the Bigtop, he is an accidental sensation with his hilarious efforts to 
Charles Chaplin (dir.), Charles Chaplin, Merna Kennedy, Charles Chaplin
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Lock, Stock and Two Smoking Barrels
Distance = 0.9326207788635541  Neighbor idx = 143
Four Jack-the-lads find themselves heavily - seriously heavily - in debt to an East End hard man and his enforcers after a crooked card game. Overhearing their neighbours in the next flat plotting to 
Guy Ritchie (dir.), Jason Flemyng, Dexter Fletcher, Guy Ritchie
----------------------------------------------------------------------------------------------