# Imports

In [94]:
import json
import re
from typing import List
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from parsers import *

[nltk_data] Downloading package stopwords to /home/diveev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading and saving data

In [38]:
# Kinopoisk listing page used as the starting point for scraping.
url = "https://www.kinopoisk.ru/lists/movies/country--1/?ss_subscription=ANY"

# parse_links and parse_descriptions_and_rates come from the star-imported
# local `parsers` module; their implementation is not visible in this notebook.
# NOTE(review): the recorded run of this cell ended in a NoSuchWindowException
# (browser window closed), so `descriptions` / `rates` used by later cells
# presumably come from an earlier successful run -- confirm before Run All.
movie_links = parse_links(url)
# NOTE(review): the meaning of the second argument (2) is not visible here --
# TODO confirm against `parsers` and consider naming it as a constant.
descriptions, rates = parse_descriptions_and_rates(movie_links, 2)

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=106.0.5249.91)
Stacktrace:
#0 0x56170bded2c3 <unknown>
#1 0x56170bbf683a <unknown>
#2 0x56170bbd58e3 <unknown>
#3 0x56170bc51ebd <unknown>
#4 0x56170bc65029 <unknown>
#5 0x56170bc4dd63 <unknown>
#6 0x56170bc227e3 <unknown>
#7 0x56170bc23a21 <unknown>
#8 0x56170be3b18e <unknown>
#9 0x56170be3e622 <unknown>
#10 0x56170be21aae <unknown>
#11 0x56170be3f2a3 <unknown>
#12 0x56170be15ecf <unknown>
#13 0x56170be5f588 <unknown>
#14 0x56170be5f706 <unknown>
#15 0x56170be798b2 <unknown>
#16 0x7f166af10e2d <unknown>


In [5]:
# Bundle the scraped descriptions and their ratings into one JSON-serialisable
# mapping; the keys are the field names used in the saved file.
data = dict(descriptions=descriptions, rates=rates)

In [7]:
# Persist the scraped payload next to the notebook so it can be reused without
# scraping again. Serialise first, then write the resulting text in one go.
with open("descriptions_and_rates.json", mode="w") as f:
    f.write(json.dumps(data))

# Text preprocessing

In [74]:
def normalize_text(strings: List[str]) -> List[str]:
    r"""Reduce each string to lower-cased word tokens separated by one space.

    A token is a maximal run of regex word characters (``\w+``: letters,
    digits and underscore). Everything else -- punctuation, whitespace,
    line breaks -- acts as a separator and is dropped.

    Args:
        strings: Raw texts to normalise.

    Returns:
        A new list with one normalised string per input string, in the same
        order. A string without any word characters becomes ``""``.
    """
    # Joining the tokens with a single space already yields single-spaced
    # text, so no additional whitespace-collapsing pass is needed.
    # Tokens are extracted before lower-casing, as lower-casing can change
    # which characters count as word characters for a few Unicode letters.
    return [" ".join(re.findall(r"\w+", string)).lower() for string in strings]

In [67]:
def filter_stopwords(strings: List[str]) -> List[str]:
    """Remove Russian stop words from each string.

    Each string is split on whitespace, words found in NLTK's Russian
    stop-word list are discarded, and the remaining words are re-joined with
    single spaces. Matching is exact and case-sensitive, so the input is
    expected to be lower-cased already.

    Args:
        strings: Texts to filter.

    Returns:
        A new list with one filtered string per input string, in the same
        order. A string consisting only of stop words becomes ``""``.
    """
    # Load the stop-word list once per call instead of once per word, and keep
    # it in a set so each membership test is a hash lookup rather than a scan
    # of the whole list.
    russian_stopwords = frozenset(stopwords.words("russian"))

    return [
        " ".join(word for word in string.split() if word not in russian_stopwords)
        for string in strings
    ]

In [80]:
def lemmatize_words(strings: List[str]) -> List[str]:
    """Replace every word in each string with its lemma using Mystem.

    The token list returned by ``Mystem.lemmatize`` is concatenated without
    a separator, so it is assumed to already contain the whitespace between
    words. Newlines are removed from the final token so the result does not
    end with a line break.

    Args:
        strings: Texts to lemmatise.

    Returns:
        A new list with one lemmatised string per input string, in the same
        order. An input for which the lemmatiser yields no tokens becomes
        ``""``.
    """
    lemmatized = []
    stem = Mystem()

    for string in strings:
        lemmas = stem.lemmatize(string)
        # The lemmatiser may return no tokens at all (e.g. for a description
        # that is empty after stop-word filtering); indexing [-1] on an empty
        # list would raise IndexError and abort the whole batch.
        if lemmas:
            lemmas[-1] = lemmas[-1].replace("\n", "")

        lemmatized.append("".join(lemmas))

    return lemmatized

In [81]:
def preprocessing_pipeline(strings:List[str]) -> List[str]:
    """Apply the full text-preprocessing chain to a list of strings.

    The stages run in a fixed order, each consuming the previous stage's
    output: normalisation, stop-word removal, then lemmatisation.

    Args:
        strings: Raw texts.

    Returns:
        The list produced by the last stage.
    """
    stages = (normalize_text, filter_stopwords, lemmatize_words)

    for stage in stages:
        strings = stage(strings)

    return strings

In [97]:
# Normalise, stop-word-filter and lemmatise every scraped description; the
# result is the text corpus that gets vectorised below.
corpus = preprocessing_pipeline(descriptions)

In [98]:
# Fit a TF-IDF vectoriser with default settings on the preprocessed corpus and
# transform that same corpus into a document-term matrix in one step.
vectorizer = TfidfVectorizer()
sparse_corpus = vectorizer.fit_transform(corpus)

# Building model

In [102]:
# Hold out 20% of the vectorised descriptions (and their ratings) for
# evaluation; the fixed seed makes the shuffled split reproducible.
# NOTE(review): the TF-IDF vectoriser was fitted on the whole corpus before
# this split, so its IDF weights also reflect the held-out documents --
# consider splitting first and fitting the vectoriser on the training part only.
x_train, x_test, y_train, y_test = train_test_split(
    sparse_corpus,
    rates,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)