In [1]:
import joblib
from navec import Navec
import numpy as np
import pandas as pd
import re

In [2]:
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

In [4]:
nav = Navec.load('../models/emb_navec.tar')

In [14]:
m1 = joblib.load('../prod_models/m1.joblib')
m2 = joblib.load('../prod_models/m2.joblib')
m3 = joblib.load('../prod_models/m3.joblib')

In [15]:

def normalizer (text):
    words_only = re.sub('[^А-я]+',' ',text.lower())
    doc = Doc(words_only)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    clean_text = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        if (len(set(token.lemma))>1):
            clean_text.append(token.lemma)
            
    return ' '.join(clean_text)


def get_sentence_vector(sentence_list):
    vectors = []
    for sentence in sentence_list:
        sent_vec = []
        for i in sentence.split():
            if i in nav:
                sent_vec.append(nav[i])
            else:
                sent_vec.append(nav['<unk>'])
        if sentence.strip() == '':
            sent_vec = [nav['<unk>']]
        vectors.append(np.mean(sent_vec,axis=0))
    return np.vstack(vectors)

In [16]:

def voting(sentences,func=np.max):
    sentences = [normalizer(txt) for txt in sentences]
    probs = []
    probs.append(m1.predict_proba(sentences)[:,1])
    probs.append(m2.predict_proba(get_sentence_vector(sentences))[:,1])
    probs.append(m3.predict_proba(get_sentence_vector(sentences))[:,1])
    return np.apply_over_axes(func,np.array(probs),axes=0)[0]



In [17]:
voting(['ну ты и гнида','ты просто большой умница'])

array([0.99998319, 0.42920822])

In [18]:
twitter_neg = pd.read_csv('../data/negative.csv',sep=';',header=None,usecols=[3])
twitter_pos = pd.read_csv('../data/positive.csv',sep=';',header=None,usecols=[3])

In [20]:
ttl_neg = voting(twitter_neg[3])

In [9]:
ttl_pos = voting(twitter_pos[3])

In [10]:
ttl_neg.mean()

0.013187637929648062

In [11]:
ttl_pos.mean()

0.009389875642888845