# Imports

In [140]:
import json
import re
from typing import List
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error as mae

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daniildiveev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loadind saved data

## Скрипт для парсинга, отдельный файл создал для удобства

### P.S Жанры так и не смог спарсить, кинопоиск постоянно блочил :(

```Python
import json 
from typing import List, Union, Tuple
from time import sleep
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from fake_useragent import UserAgent

def get_headers() -> 'Options':
    user_agent = UserAgent().random
    options = Options()
    options.add_argument(f"user-agent={user_agent}")
    options.add_argument("window-size=1200,800")

    print(user_agent)

    return options


def parse_links(url:str, 
                time_to_load_url:Union[float, int]=3.,
                cooldown:Union[float, int]=4.) -> List[str]:
    options = get_headers()

    with webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options) as driver:
        driver.get(url)

        sleep(time_to_load_url)
        
        links = driver.find_elements(By.CLASS_NAME, 'base-movie-main-info_link__YwtP1')
        links = [link.get_property("href") for link in links]

        sleep(cooldown)

    return links

def parse_data(urls:List[str], 
               time_to_load_url:Union[float, int]=3.,
               cooldown:Union[float, int]=4.) -> Tuple[List[str], List[float], List[List[str]]]:
    descriptions, rates, genres_list = [], [], []
    options = get_headers()

    GENRES_XPATHS = ('//*[@id="__next"]/div[2]/div[2]/div[1]/div[2]/div/div[3]/div/div/div[2]/div[1]/div/div[3]/div[2]/div',
                    '//*[@id="__next"]/div[2]/div[2]/div[2]/div[2]/div/div[3]/div/div/div[2]/div[1]/div/div[3]/div[2]/div',
                    '//*[@id="__next"]/div[2]/div[2]/div[1]/div[2]/div/div[3]/div/div/div[2]/div[1]/div/div[4]/div[2]/div')

    with webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options) as driver:

        for url in tqdm(urls):
            driver.get(url)

            sleep(time_to_load_url)

            desc = driver.find_elements(By.CLASS_NAME, 'styles_paragraph__wEGPz')[0].text
            rate = driver.find_elements(By.CLASS_NAME, 'film-rating-value')[0].text

            genres = []

            for xpath in GENRES_XPATHS:
                genres += driver.find_elements(By.XPATH, xpath)

            genres = [el.text.repace(',', '').split() for el in genres if '«' not in el.text]
            print(genres)

            descriptions.append(desc)
            genres_list.append(genres)
            rates.append(float(rate))

            sleep(cooldown)
    
    return descriptions, rates, genres_list

if __name__ == '__main__':
    url = "https://www.kinopoisk.ru/lists/movies/country--1/?ss_subscription=ANY"

    movie_links = parse_links(url)
    descriptions, rates, genres = parse_data(movie_links, 2)

    data = {
        'descriptions' : descriptions,
        'rates' : rates, 
        'genres' : genres
    }

    with open("descriptions_rates_and_genres.json", "w") as f:
        json.dump(data, f)
```

In [141]:
with open("descriptions_and_rates.json") as f:
    data = json.loads(f.read())

In [142]:
descriptions, rates = data['descriptions'], data['rates']

# Text preprocessing

In [124]:
def normalize_text(strings:List[str]) -> List[str]:
    normalized = []
    
    for string in strings:
        string = re.findall(r"\w+", string)
        string = " ".join(string)
        string = re.sub(r"\s+", " ", string)
        string = string.lower()
        
        normalized.append(string)
        
    return normalized

In [125]:
def filter_stopwords(strings:List[str]) -> List[str]:
    filtered = []
    
    for string in strings:
        words = string.split()
        words = [w for w in words if w not in stopwords.words("russian")]
        string = " ".join(words)
        
        filtered.append(string)
        
    return filtered

In [126]:
def lemmatize_words(strings:List[str]) -> List[str]:
    lemmatized = []
    stem = Mystem()
    
    for string in strings:
        lemmas = stem.lemmatize(string)
        lemmas[-1] = lemmas[-1].replace("\n", "")
        string = ''.join(lemmas)
        
        lemmatized.append(string)
        
    return lemmatized

In [127]:
def preprocessing_pipeline(strings:List[str]) -> List[str]:
    strings = normalize_text(strings)
    strings = filter_stopwords(strings)
    strings = lemmatize_words(strings)
    
    return strings

In [128]:
corpus = preprocessing_pipeline(descriptions)

# Building model

In [129]:
vectorizer = TfidfVectorizer()
sparse_corpus = vectorizer.fit_transform(corpus)

In [130]:
mae_scorer = make_scorer(mae)

In [131]:
forest_parameters = {
    "n_estimators" : [int(x) for x in np.linspace(10, 50, 5)],
    "max_depth" : [x for x in range(4, 8)],
    "max_features" : ["log2", "sqrt"], 
    "bootstrap" : [True, False]
}

In [132]:
forest = RandomForestRegressor()
grid_search = HalvingGridSearchCV(forest, forest_parameters, scoring=mae_scorer, cv=4)

In [133]:
grid_search.fit(sparse_corpus, rates)

In [137]:
grid_search.best_params_

{'bootstrap': True, 'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 20}

In [139]:
mean_mae_on_test = np.mean(grid_search.cv_results_['mean_test_score'])
print("Mean MAE on test folds: %s" % mean_mae_on_test)

Mean MAE on test folds: 0.5935970161292785
