In [2]:
from sklearn import *
import lightgbm as lgbm
import pandas as pd
import numpy as np
import re
import nltk
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import defaultdict
nltk.download("stopwords")
plt.style.use('ggplot')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/charubaiel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Only column 3 of the header-less, ';'-separated twitter dumps is read.
twitter_neg = pd.read_csv('data/negative.csv',sep=';',header=None,usecols=[3])
twitter_pos = pd.read_csv('data/positive.csv',sep=';',header=None,usecols=[3])
vk_all = pd.read_csv('data/labeled.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call
# (row labels of both frames are kept as-is, exactly like append did).
# random_state pins the 5000-tweet sample so a re-run sees the same rows.
# The tweet rows only carry a 'comment' column, so every other column is NaN
# for them after concatenation and is filled with 0 here.
ttl_toxic = pd.concat(
    [vk_all, twitter_pos.rename(columns={3:'comment'}).sample(5000, random_state=42)]
).fillna(0)

In [4]:
# Model input is the comment text; the label is the 'toxic' column.
data = ttl_toxic['comment']
target = ttl_toxic['toxic']

In [5]:
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)

# Natasha components are built once at module level and shared by normalizer().
segmenter = Segmenter()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
morph_vocab = MorphVocab()
# NLTK's Russian stop-word list; lemmas found in it are dropped by normalizer().
stopwords = nltk.corpus.stopwords.words('russian')

In [6]:
def normalizer(text):
    """Reduce a raw text to a space-joined string of Russian lemmas.

    Steps: lower-case the text, fold 'ё' into 'е', replace every run of
    non-Cyrillic characters with a single space, segment and morph-tag the
    result with the module-level Natasha components, lemmatize each token,
    and keep only the lemmas that are not in ``stopwords`` and consist of
    more than one distinct character.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces ('' when nothing is kept).
    """
    # 'ё' (U+0451) lies outside the А-я range (U+0410..U+044F). Without this
    # fold the regex below would turn it into a space and cut every word
    # containing 'ё' into two fragments.
    lowered = text.lower().replace('ё', 'е')
    words_only = re.sub('[^А-я]+',' ',lowered)
    doc = Doc(words_only)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    clean_text = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        lemma = token.lemma
        # len(set(...)) > 1 discards lemmas built from a single repeated
        # character (one-letter words, 'ааа', ...).
        if lemma not in stopwords and len(set(lemma)) > 1:
            clean_text.append(lemma)

    return ' '.join(clean_text)

In [8]:
# Accumulator for cross-validation results, keyed by experiment name.
scores = dict()

In [7]:
# Normalize every document on its own. The previous approach glued the whole
# corpus together with the sentinel word 'жожо' and split on it afterwards,
# which silently misaligns `data` and `target` if any comment contains that
# word (or if lemmatization alters it), and left padding spaces around each
# document. Building the Series from a list gives a fresh 0..n-1 index.
data = pd.Series([normalizer(text) for text in tqdm(data)])
# One normalized document per label, otherwise cross-validation is meaningless.
assert len(data) == len(target)

In [9]:
from navec import Navec
# Load the Navec embeddings from a local archive. `nav` is used below for
# membership tests (`word in nav`) and vector look-ups (`nav[word]`),
# including the special '<unk>' entry.
nav = Navec.load('models/emb_navec.tar')
def get_sentence_vector(sentence_list):
    """Embed each sentence as the mean of its word vectors.

    Every whitespace-separated word is looked up in the module-level ``nav``;
    words missing from it are represented by ``nav['<unk>']``. A sentence
    that is empty or whitespace-only is represented by ``nav['<unk>']`` alone.

    Parameters
    ----------
    sentence_list : iterable of str
        Sentences to embed.

    Returns
    -------
    numpy.ndarray
        One row per sentence, stacked with ``np.vstack``.
    """
    rows = []
    for text in sentence_list:
        word_vecs = [
            nav[word] if word in nav else nav['<unk>']
            for word in text.split()
        ]
        # Nothing to average for a blank sentence: fall back to the unknown vector.
        if not text.strip():
            word_vecs = [nav['<unk>']]
        rows.append(np.mean(word_vecs, axis=0))
    return np.vstack(rows)
    
# class_weight used to be the class *frequencies*, which gives the majority
# class the larger weight -- the opposite of compensating for imbalance.
# 'balanced' weights each class inversely to its frequency.
clf_vec = linear_model.LogisticRegression(max_iter=1000,C=6,class_weight='balanced')

# 5-fold cross-validation on the averaged word vectors; .mean() averages every
# column returned by cross_validate (timings and test scores) over the folds.
scores['vec'] = pd.DataFrame(model_selection.cross_validate(clf_vec,get_sentence_vector(data),target,scoring=['f1','precision','recall'],cv=5)).mean()

In [10]:
# Word-level TF-IDF with default settings.
word_vectorizer = feature_extraction.text.TfidfVectorizer()
word_vectorizer.fit(data)
# Character 2- to 4-gram TF-IDF with sublinear tf, ignoring n-grams seen in
# fewer than 3 documents.
char_vectorizer = feature_extraction.text.TfidfVectorizer(
    min_df=3,
    sublinear_tf=True,
    analyzer='char',
    ngram_range=(2,4))
char_vectorizer.fit(data)
# NOTE: the two .fit(data) calls above are not needed by the cross-validation
# in this cell (the pipeline is cloned and refitted on each training fold);
# they are kept because the fitted vectorizers are reused through
# idf_fu.transform(data) in a later cell.

idf_fu = pipeline.FeatureUnion([('idf_w',word_vectorizer),('idf_c',char_vectorizer)])
# class_weight used to be the class *frequencies*, which gives the majority
# class the larger weight -- the opposite of compensating for imbalance.
# 'balanced' weights each class inversely to its frequency.
clf_2idf = linear_model.LogisticRegression(max_iter=1000,C=6,class_weight='balanced')
pipe_idf_fe = pipeline.Pipeline([('idf',idf_fu),('clf',clf_2idf)])
# .mean() averages every column returned by cross_validate over the 5 folds.
scores['idf_features'] = pd.DataFrame(model_selection.cross_validate(pipe_idf_fe,data,target,scoring=['f1','precision','recall'],cv=5)).mean()


In [11]:
# Class frequencies in reversed value_counts order (not used in this cell).
class_prior=target.value_counts(normalize=True).values[::-1]
# class_weight used to be the class *frequencies*, which gives the majority
# class the larger weight -- the opposite of compensating for imbalance.
# 'balanced' weights each class inversely to its frequency.
# random_state is pinned on the stochastic estimators so re-runs are comparable.
clf_lr_vec = linear_model.LogisticRegression(max_iter=1000,C=6,class_weight='balanced')
clf_svc_vec = linear_model.SGDClassifier(loss='modified_huber',class_weight='balanced',random_state=42)
clf_rf_vec = lgbm.LGBMClassifier(n_estimators=1500,learning_rate=0.07,num_leaves=15,class_weight='balanced',random_state=42)
# NOTE(review): (300,1) means a second hidden layer with a single unit, which
# is a severe bottleneck -- presumably (300,) was intended; confirm before changing.
clf_mlp_vec = neural_network.MLPClassifier(hidden_layer_sizes=(300,1),max_iter=1000,learning_rate='adaptive',random_state=42)


# Soft voting averages predict_proba of the four estimators.
clf_vote = ensemble.VotingClassifier(estimators=[('lr',clf_lr_vec),('svc',clf_svc_vec),('gbm',clf_rf_vec),('mlp',clf_mlp_vec)],voting='soft')

scores['vote_models_vec'] = pd.DataFrame(model_selection.cross_validate(clf_vote,get_sentence_vector(data),target,scoring=['f1','precision','recall'],cv=5)).mean()
# The TF-IDF union goes inside the cross-validated pipeline so that it is
# refitted on each training fold. Transforming `data` up front with
# vectorizers fitted on the whole corpus leaks vocabulary and IDF statistics
# from the validation folds into training.
clf_vote_idf = pipeline.Pipeline([('idf',idf_fu),('vote',clf_vote)])
scores['vote_models_idf'] = pd.DataFrame(model_selection.cross_validate(clf_vote_idf,data,target,scoring=['f1','precision','recall'],cv=5)).mean()





In [None]:
def voting(estimators,x,func=np.mean):
    """Combine column 1 of ``predict_proba`` across several estimators.

    Parameters
    ----------
    estimators : iterable
        Objects exposing ``predict_proba(x)``; column 1 of each result is used.
    x : array-like
        Samples passed unchanged to every estimator.
    func : callable, default ``np.mean``
        Reducer called as ``func(array, axis)`` along the estimator axis.

    Returns
    -------
    numpy.ndarray
        The reduced values, one per sample.
    """
    # Shape (n_estimators, n_samples): one row of probabilities per estimator.
    stacked = np.array([model.predict_proba(x)[:, 1] for model in estimators])
    reduced = np.apply_over_axes(func, stacked, axes=0)
    # apply_over_axes keeps the reduced axis with length 1; drop it.
    return reduced[0]

In [None]:
# Show the collected cross-validation averages side by side: one column per
# entry of `scores`, one row per averaged cross_validate field.
pd.DataFrame(scores)