In [16]:
import joblib
from navec import Navec
import numpy as np
import os
import pandas as pd
import re

In [None]:
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)
# Natasha components created once at module level; normalizer() reads
# segmenter, morph_tagger and morph_vocab from here.
segmenter = Segmenter()
morph_vocab = MorphVocab()
# The morphology tagger is constructed from the news embedding instance.
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

In [18]:
# Load the Navec embedding archive; get_sentence_vecror() looks tokens up in `nav`.
nav = Navec.load('models/emb_navec.tar')

In [19]:
def get_sentence_vecror(sentence_list):
    """Embed every sentence as the mean of its token vectors.

    Each sentence is split on whitespace. A token found in the module-level
    ``nav`` embedding contributes its own vector; any other token contributes
    ``nav['<unk>']``. A sentence without tokens (empty or whitespace-only)
    is represented by the ``'<unk>'`` vector alone.

    Args:
        sentence_list: iterable of strings.

    Returns:
        The per-sentence mean vectors stacked with ``np.vstack``, one row
        per input sentence, in input order.
    """
    def _lookup(token):
        # Out-of-vocabulary tokens share the '<unk>' vector.
        return nav[token] if token in nav else nav['<unk>']

    rows = []
    for sentence in sentence_list:
        tokens = sentence.split()
        if tokens:
            token_vecs = [_lookup(token) for token in tokens]
        else:
            # Nothing to average: fall back to a single '<unk>' vector.
            token_vecs = [nav['<unk>']]
        rows.append(np.mean(token_vecs, axis=0))
    return np.vstack(rows)

In [20]:
def voting(estimators,data,func=np.max):
    """Combine the estimators' column-1 probabilities with a reducer.

    Args:
        estimators: iterable of objects exposing ``predict_proba(data)``
            that returns a 2-D array; column index 1 is used.
        data: input passed unchanged to every estimator.
        func: reducer called by ``np.apply_over_axes`` as ``func(array, 0)``
            on the (estimator, sample) probability matrix. Defaults to
            ``np.max``.

    Returns:
        The first row of the ``np.apply_over_axes`` result; for a reducer
        such as ``np.max`` this is one combined value per sample.
    """
    # One row per estimator, one column per sample.
    stacked = np.array([est.predict_proba(data)[:,1] for est in estimators])
    reduced = np.apply_over_axes(func, stacked, axes=0)
    return reduced[0]

In [None]:
def normalizer(text):
    """Lower-case, strip non-Cyrillic characters and lemmatize a text.

    The text is lower-cased, every run of characters outside the ``А-я``
    range is replaced by a single space, and the result is segmented,
    morph-tagged and lemmatized with the module-level Natasha components
    (``segmenter``, ``morph_tagger``, ``morph_vocab``).

    Args:
        text: input string.

    Returns:
        The kept lemmas joined by single spaces. A lemma is kept only if it
        contains more than one distinct character.
    """
    # NOTE(review): the range А-я (U+0410–U+044F) does not contain 'ё'
    # (U+0451), so 'ё' is replaced by a space like any other excluded
    # character. Left unchanged here to preserve existing behavior.
    cyrillic_only = re.sub('[^А-я]+',' ',text.lower())

    doc = Doc(cyrillic_only)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    # Lemmatize every token first, then filter on the resulting lemmas.
    for token in doc.tokens:
        token.lemmatize(morph_vocab)

    return ' '.join(
        token.lemma for token in doc.tokens if len(set(token.lemma)) > 1
    )

In [21]:
# Load every serialized estimator in models/ whose file name contains '2'.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the list order (and therefore models[0]) reproducible across runs.
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
models = [
    joblib.load(f'models/{model}')
    for model in sorted(os.listdir('models'))
    if '2' in model
]

In [23]:
# Smoke check of the ensemble on two hand-written phrases (an insult and a compliment).
# With the default func=np.max, each value is the highest column-1 probability across the models.
voting(estimators=models,data=['ну ты и гнида','ты просто большой умница'])

array([0.9999971 , 0.21108247])

In [7]:
# Read only column 3 from each semicolon-separated, header-less CSV.
# NOTE(review): column 3 is presumably the tweet text — confirm against the dataset.
twitter_neg = pd.read_csv('data/negative.csv',sep=';',header=None,usecols=[3])
twitter_pos = pd.read_csv('data/positive.csv',sep=';',header=None,usecols=[3])

In [8]:
# Predictions of the first loaded model on column 3 of negative.csv.
ttl_neg = models[0].predict(twitter_neg[3])

In [9]:
# Predictions of the first loaded model on column 3 of positive.csv.
ttl_pos = models[0].predict(twitter_pos[3])

In [10]:
# Mean of the predictions on negative.csv (the share of 1s if labels are 0/1 — TODO confirm).
ttl_neg.mean()

0.013187637929648062

In [11]:
# Mean of the predictions on positive.csv (the share of 1s if labels are 0/1 — TODO confirm).
ttl_pos.mean()

0.009389875642888845