In [None]:
from enum import Enum
class Lang(Enum):
    """Languages this notebook can be switched between."""

    # Members keep their explicit numeric values: EN -> 1, RU -> 2.
    EN, RU = 1, 2

# Choose the language. This setting selects both the lemma tokenizer and the
# movie dataset loaded below (Lang.EN -> imdb250.csv, Lang.RU -> kp250.csv).
ENGINE = Lang.RU

In [None]:
# Work around the mystem dependency of pymystem3 in Google Colab: download the
# 64-bit Linux mystem 3.0 archive, unpack it and copy the binary into /bin.
# NOTE(review): the guard checks $OSTYPE rather than Colab itself, so the
# download also runs on any other shell that reports "linux-gnu" — confirm
# this is intended.
!if [[ "$OSTYPE" == "linux-gnu" ]]; then wget http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz && tar -xvf mystem-3.0-linux3.1-64bit.tar.gz && cp mystem /bin; fi

In [None]:
import spacy
import requests
import pandas as pd
from pymystem3 import Mystem

# True when the string form of the active IPython shell mentions "google.colab".
COLAB_ENV = "google.colab" in str(get_ipython())

# Base location of the dataset files: the GitHub raw URL when running in
# Colab, otherwise the local ./datasets/ directory.
if COLAB_ENV:
    ENV_PREFIX = "https://raw.githubusercontent.com/madmaxeatfax/fellini/master/datasets/"
else:
    ENV_PREFIX = "./datasets/"


def get_stopwords(lang):
    """Load the stopword list for ``lang`` as a set of strings.

    The list is read from ``ENV_PREFIX + "stopwords_<lang>.txt"``: over HTTP
    when ``COLAB_ENV`` is true, from the local file system otherwise. Each
    line becomes one entry with trailing whitespace removed.

    Args:
        lang: Language code used in the file name (the notebook passes
            ``"en"`` or ``"ru"``).

    Returns:
        set[str]: The stopwords. Blank lines are kept as ``""`` entries,
        exactly as before.

    Raises:
        requests.HTTPError: If the remote file cannot be fetched successfully.
        OSError: If the local file cannot be opened.
    """
    path = ENV_PREFIX + f"stopwords_{lang}.txt"
    if COLAB_ENV:
        # Fail loudly instead of treating an error page as a stopword list,
        # and never hang forever on a stalled connection.
        response = requests.get(path, timeout=30)
        response.raise_for_status()
        lines = response.text.split("\n")
    else:
        # Close the file deterministically and do not depend on the locale
        # encoding. Assumes the stopword files are UTF-8 — TODO confirm.
        with open(path, encoding="utf-8") as source:
            lines = source.readlines()

    return {line.rstrip() for line in lines}


class EnLemmaTokenizer:
    """Callable tokenizer that turns English text into a list of lemmas.

    Lemmas are produced by a spaCy pipeline. A lemma is kept only when it is
    not in the English stopword list and its first character is alphabetic.
    """

    def __init__(self):
        """Load the spaCy English pipeline and the English stopword list."""
        try:
            self.nlp = spacy.load("en")
        except OSError:
            # spaCy v3 removed shortcut links such as "en"; fall back to the
            # full package name of the small English pipeline.
            self.nlp = spacy.load("en_core_web_sm")
        self.stopwords = get_stopwords("en")

    def __call__(self, text):
        """Return the filtered lemmas of ``text`` in document order.

        Args:
            text: The string to tokenize.

        Returns:
            list[str]: Lemmas that start with a letter and are not stopwords.
        """
        lemmas = (token.lemma_ for token in self.nlp(text))
        # Slicing with [:1] keeps an empty lemma from raising IndexError.
        return [
            lemma for lemma in lemmas
            if lemma[:1].isalpha() and lemma not in self.stopwords
        ]

class RuLemmaTokenizer:
    """Callable tokenizer that turns Russian text into a list of lemmas.

    Lemmas are produced by ``Mystem().lemmatize``. A lemma is kept only when
    it is not in the Russian stopword list and its first character is
    alphabetic.
    """

    def __init__(self):
        """Create the Mystem lemmatizer and load the Russian stopword list."""
        self.lemma = Mystem().lemmatize
        self.stopwords = get_stopwords("ru")

    def __call__(self, text):
        """Return the filtered lemmas of ``text`` in the lemmatizer's order.

        Args:
            text: The string to tokenize.

        Returns:
            list[str]: Lemmas that start with a letter and are not stopwords.
        """
        # Slicing with [:1] keeps an empty token from raising IndexError.
        return [
            token for token in self.lemma(text)
            if token[:1].isalpha() and token not in self.stopwords
        ]

# Build the tokenizer and read the movie table that match the chosen language.
if ENGINE == Lang.EN:
    lemma_tokenizer = EnLemmaTokenizer()
    bunch = pd.read_csv(ENV_PREFIX + "imdb250.csv")
else:
    lemma_tokenizer = RuLemmaTokenizer()
    bunch = pd.read_csv(ENV_PREFIX + "kp250.csv")

# Missing cells become empty strings.
bunch = bunch.fillna("")

In [None]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over lemmas; the vocabulary is capped at 50,000 terms.
vec = TfidfVectorizer(max_features=50_000, tokenizer=lemma_tokenizer)

# One document per movie: every text column joined with single spaces.
data = \
    bunch.Title + " " + bunch.Crew + " " + bunch.Plot + " " + \
    bunch.Tags + " " + bunch.Country + " " + bunch.Reviews

features = vec.fit_transform(data)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); keep the old call only as a fallback for releases
# that predate the new method.
if hasattr(vec, "get_feature_names_out"):
    words = vec.get_feature_names_out()
else:
    words = vec.get_feature_names()
print(f'Dictionary size = {len(words)}')

In [None]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour index over the TF-IDF rows: cosine metric, 5 neighbours per query.
knn = NearestNeighbors(metric="cosine", n_neighbors=5)
knn.fit(features)

In [None]:
%%time

# write your search query
QUERY = "фильм в котором бухгалтер убил свою жену и отправился в тюрьму"
query_vector = vec.transform([QUERY])
print(f'Query = \'{QUERY}\'\nTokens =  {lemma_tokenizer(QUERY)}\n')

distances, neighbors = knn.kneighbors(query_vector, return_distance=True)

# kneighbors returns positional row indices into the fitted matrix, so rows
# are looked up with .iloc rather than label-based indexing, which is only
# equivalent while the DataFrame has its default RangeIndex.
for dist, neighbor_idx in zip(distances[0], neighbors[0]):
    movie = bunch.iloc[neighbor_idx]
    print(
        movie["Title"],
        f'Distance = {dist}  Neighbor idx = {neighbor_idx}',
        movie["Plot"][:200],
        movie["Crew"],
        "-"*200,
        sep="\n",
    )
