# Imports

In [116]:
import json
import re
from typing import List
import numpy as np
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymystem3 import Mystem
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import HalvingGridSearchCV, train_test_split
from sklearn.metrics import make_scorer, mean_absolute_error as mae
from parsers import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/daniildiveev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Parsing and saving data

In [3]:
url = "https://www.kinopoisk.ru/lists/movies/country--1/?ss_subscription=ANY"

movie_links = parse_links(url)
descriptions, rates = parse_descriptions_and_rates(movie_links, 2)



Current google-chrome version is 105.0.5195
Get LATEST driver version for 105.0.5195
There is no [mac64] chromedriver for browser 105.0.5195 in cache
Get LATEST driver version for 105.0.5195
Trying to download new driver from https://chromedriver.storage.googleapis.com/105.0.5195.52/chromedriver_mac64.zip
Driver has been saved in cache [/Users/daniildiveev/.wdm/drivers/chromedriver/mac64/105.0.5195.52]


Current google-chrome version is 105.0.5195
Get LATEST driver version for 105.0.5195
Driver [/Users/daniildiveev/.wdm/drivers/chromedriver/mac64/105.0.5195.52/chromedriver] found in cache
100%|██████████| 50/50 [02:26<00:00,  2.92s/it]


In [4]:
data = {
    'descriptions' : descriptions,
    'rates' : rates
}

In [5]:
with open("descriptions_and_rates.json", "w") as f:
    json.dump(data, f)

# Text preprocessing

In [6]:
def normalize_text(strings:List[str]) -> List[str]:
    normalized = []
    
    for string in strings:
        string = re.findall(r"\w+", string)
        string = " ".join(string)
        string = re.sub(r"\s+", " ", string)
        string = string.lower()
        
        normalized.append(string)
        
    return normalized

In [7]:
def filter_stopwords(strings:List[str]) -> List[str]:
    filtered = []
    
    for string in strings:
        words = string.split()
        words = [w for w in words if w not in stopwords.words("russian")]
        string = " ".join(words)
        
        filtered.append(string)
        
    return filtered

In [8]:
def lemmatize_words(strings:List[str]) -> List[str]:
    lemmatized = []
    stem = Mystem()
    
    for string in strings:
        lemmas = stem.lemmatize(string)
        lemmas[-1] = lemmas[-1].replace("\n", "")
        string = ''.join(lemmas)
        
        lemmatized.append(string)
        
    return lemmatized

In [9]:
def preprocessing_pipeline(strings:List[str]) -> List[str]:
    strings = normalize_text(strings)
    strings = filter_stopwords(strings)
    strings = lemmatize_words(strings)
    
    return strings

In [10]:
corpus = preprocessing_pipeline(descriptions)

Installing mystem to /Users/daniildiveev/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-macosx.tar.gz


# Building model

In [53]:
vectorizer = TfidfVectorizer()
sparse_corpus = vectorizer.fit_transform(corpus)

In [89]:
mae_scorer = make_scorer(mae)

In [109]:
forest_parameters = {
    "n_estimators" : [int(x) for x in np.linspace(10, 50, 5)],
    "max_depth" : [x for x in range(4, 8)],
    "max_features" : ["log2", "sqrt"], 
    "bootstrap" : [True, False]
}

In [110]:
forest = RandomForestRegressor()
grid_search = HalvingGridSearchCV(forest, forest_parameters, scoring=mae_scorer, cv=4)

In [111]:
grid_search.fit(sparse_corpus, rates)

In [112]:
grid_search.best_params_

{'bootstrap': True, 'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 20}

In [115]:
grid_search.cv_results_

{'iter': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'n_resources': array([ 8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
         8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
         8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
         8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
         8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, 24, 24, 24, 24, 24,
        24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
        24, 24, 24, 24, 24]),
 'mean_fit_time': array([0.01544428, 0.01821148, 0.02582842, 0.03319567, 0.04145461,
        0.00931245, 0.0175637 , 0.02449173,