In [128]:
from sklearn import *
import lightgbm as lgbm
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
nltk.download("stopwords")
plt.style.use('ggplot')
import joblib

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/charubaiel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
twitter_neg = pd.read_csv('data/negative.csv',sep=';',header=None,usecols=[3])
twitter_pos = pd.read_csv('data/positive.csv',sep=';',header=None,usecols=[3])
vk_all = pd.read_csv('data/labeled.csv')
ttl_toxic = vk_all.append(twitter_pos.rename(columns={3:'comment'}).sample(5000)).fillna(0)

In [3]:
data,target = ttl_toxic['comment'],ttl_toxic['toxic']

In [4]:
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)

segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
stopwords = nltk.corpus.stopwords.words('russian')

In [5]:
def normalizer (text):
    words_only = re.sub('[^А-я]+',' ',text.lower())
    doc = Doc(words_only)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    clean_text = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        if (token.lemma not in stopwords) & (len(set(token.lemma))>1):
            clean_text.append(token.lemma)
            
    return ' '.join(clean_text)

In [6]:
scores = {}

In [7]:
data = pd.Series(normalizer(' жожо '.join(data)).split('жожо'))

In [105]:
from navec import Navec

nav = Navec.load('models/emb_navec.tar')
def get_sentence_vector(sentence_list):
    vectors = []
    for sentence in sentence_list:
        sent_vec = []
        for i in sentence.split():
            if i in nav:
                sent_vec.append(nav[i])
            else:
                sent_vec.append(nav['<unk>'])
        if sentence.strip() == '':
            sent_vec = [nav['<unk>']]
        vectors.append(np.mean(sent_vec,axis=0))
    return np.vstack(vectors)
    
clf_vec = neural_network.MLPClassifier(hidden_layer_sizes=(100,3))

scores['vec'] = pd.DataFrame(model_selection.cross_validate(clf_vec,get_sentence_vector(data),target,scoring=['f1','precision','recall'],cv=5,verbose=10)).mean()

clf_vec.fit(get_sentence_vector(data),target)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] START .....................................................................
[CV] END  f1: (test=0.662) precision: (test=0.625) recall: (test=0.704) total time=  15.5s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.5s remaining:    0.0s


[CV] END  f1: (test=0.759) precision: (test=0.825) recall: (test=0.703) total time=  15.0s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   30.4s remaining:    0.0s


[CV] END  f1: (test=0.713) precision: (test=0.802) recall: (test=0.641) total time=  12.9s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   43.4s remaining:    0.0s


[CV] END  f1: (test=0.566) precision: (test=0.609) recall: (test=0.528) total time=  11.1s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   54.4s remaining:    0.0s


[CV] END  f1: (test=0.628) precision: (test=0.585) recall: (test=0.678) total time=  16.2s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished


MLPClassifier(hidden_layer_sizes=(100, 3))

In [90]:
word_vectorizer = feature_extraction.text.TfidfVectorizer()
word_vectorizer.fit(data)
char_vectorizer = feature_extraction.text.TfidfVectorizer(
    min_df=3,
    sublinear_tf=True,
    analyzer='char',
    ngram_range=(2,4))
char_vectorizer.fit(data)

idf_fu = pipeline.FeatureUnion([('idf_w',word_vectorizer),('idf_c',char_vectorizer)])
clf_2idf = lgbm.LGBMClassifier(n_estimators=500,class_weight = target.value_counts(normalize=True).to_dict())
pipe_idf_fe = pipeline.Pipeline([('idf',idf_fu),('clf',clf_2idf)])
scores['idf_features'] = pd.DataFrame(model_selection.cross_validate(pipe_idf_fe,data,target,scoring=['f1','precision','recall'],cv=5,verbose=10)).mean()
pipe_idf_fe.fit(data,target)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] START .....................................................................
[CV] END  f1: (test=0.670) precision: (test=0.764) recall: (test=0.596) total time= 1.2min
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


[CV] END  f1: (test=0.793) precision: (test=0.939) recall: (test=0.686) total time= 1.2min
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.4min remaining:    0.0s


[CV] END  f1: (test=0.752) precision: (test=0.967) recall: (test=0.616) total time= 1.3min
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.7min remaining:    0.0s


[CV] END  f1: (test=0.511) precision: (test=0.784) recall: (test=0.379) total time= 1.5min
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.2min remaining:    0.0s


[CV] END  f1: (test=0.703) precision: (test=0.797) recall: (test=0.629) total time= 1.5min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.8min finished


Pipeline(steps=[('idf',
                 FeatureUnion(transformer_list=[('idf_w', TfidfVectorizer()),
                                                ('idf_c',
                                                 TfidfVectorizer(analyzer='char',
                                                                 min_df=3,
                                                                 ngram_range=(2,
                                                                              4),
                                                                 sublinear_tf=True))])),
                ('clf',
                 LGBMClassifier(class_weight={0.0: 0.7513908922316093,
                                              1.0: 0.24860910776839068},
                                n_estimators=500))])

In [106]:
class_prior=target.value_counts(normalize=True).values[::-1]

clf_svc_vec = linear_model.SGDClassifier(loss='modified_huber',penalty = 'l1',class_weight= target.value_counts(normalize=True).to_dict())
clf_rf_vec = lgbm.LGBMClassifier(n_estimators=1000,num_leaves=15,class_weight= target.value_counts(normalize=True).to_dict())
clf_mlp_vec = neural_network.MLPClassifier(hidden_layer_sizes=(300,1),max_iter=1000,learning_rate='adaptive')

clf_vote = ensemble.VotingClassifier(estimators=[('svc',clf_svc_vec),('gbm',clf_rf_vec),('mlp',clf_mlp_vec)],voting='soft')

scores['vote_models_vec'] = pd.DataFrame(model_selection.cross_validate(clf_vote,get_sentence_vector(data),target,scoring=['f1','precision','recall'],cv=5,verbose=10)).mean()

clf_vote.fit(get_sentence_vector(data),target)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] START .....................................................................
[CV] END  f1: (test=0.664) precision: (test=0.728) recall: (test=0.610) total time=  24.6s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.6s remaining:    0.0s


[CV] END  f1: (test=0.737) precision: (test=0.948) recall: (test=0.603) total time=  27.2s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   51.8s remaining:    0.0s


[CV] END  f1: (test=0.715) precision: (test=0.968) recall: (test=0.567) total time=  30.6s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.4min remaining:    0.0s


[CV] END  f1: (test=0.575) precision: (test=0.671) recall: (test=0.503) total time=  25.7s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.8min remaining:    0.0s


[CV] END  f1: (test=0.655) precision: (test=0.684) recall: (test=0.628) total time=  26.7s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished


VotingClassifier(estimators=[('svc',
                              SGDClassifier(class_weight={0.0: 0.7513908922316093,
                                                          1.0: 0.24860910776839068},
                                            loss='modified_huber',
                                            penalty='l1')),
                             ('gbm',
                              LGBMClassifier(class_weight={0.0: 0.7513908922316093,
                                                           1.0: 0.24860910776839068},
                                             n_estimators=1000,
                                             num_leaves=15)),
                             ('mlp',
                              MLPClassifier(hidden_layer_sizes=(300, 1),
                                            learning_rate='adaptive',
                                            max_iter=1000))],
                 voting='soft')

In [113]:
def voting(sentences,func=np.max):
    sentences = [normalizer(txt) for txt in sentences]
    probs = []
    probs.append(pipe_idf_fe.predict_proba(sentences)[:,1])
    probs.append(clf_vote.predict_proba(get_sentence_vector(sentences))[:,1])
    probs.append(clf_vec.predict_proba(get_sentence_vector(sentences))[:,1])
    return np.apply_over_axes(func,np.array(probs),axes=0)[0]

In [114]:
%%timeit
voting(['какая же ты тварь'])

1.46 ms ± 126 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [115]:
zz3 = vk_all.assign(vote_max = voting(vk_all['comment']),
                    idf = pipe_idf_fe.predict_proba(vk_all['comment'])[:,1],
                    vote = clf_vote.predict_proba(get_sentence_vector(vk_all['comment']))[:,1],
                    mlp = clf_vec.predict_proba(get_sentence_vector(vk_all['comment']))[:,1])

In [122]:
zz3.loc[:,['toxic','mlp']].round().value_counts(normalize=True).unstack()

mlp,0.0,1.0
toxic,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.46149,0.20365
1.0,0.079656,0.255204


In [123]:
zz3.loc[:,['toxic','vote']].round().value_counts(normalize=True).unstack()

vote,0.0,1.0
toxic,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.606647,0.058493
1.0,0.134055,0.200805


In [124]:
zz3.loc[:,['toxic','idf']].round().value_counts(normalize=True).unstack()

idf,0.0,1.0
toxic,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.658965,0.006175
1.0,0.107272,0.227588


In [125]:
zz3.loc[:,['toxic','vote_max']].round().value_counts(normalize=True).unstack()

vote_max,0.0,1.0
toxic,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.661879,0.003261
1.0,0.004163,0.330697


In [129]:
joblib.dump(pipe_idf_fe,'m1.joblib')

['m1.joblib']

In [130]:
joblib.dump(clf_vote,'m2.joblib')

['m2.joblib']

In [131]:
joblib.dump(clf_vec,'m3.joblib')

['m3.joblib']