In [13]:
from sklearn import *
import lightgbm as lgbm
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
nltk.download("stopwords")
plt.style.use('ggplot')
import joblib
from navec import Navec

nav = Navec.load('../models/emb_navec.tar')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/charubaiel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
twitter_neg = pd.read_csv('../data/negative.csv',sep=';',header=None,usecols=[3])
twitter_pos = pd.read_csv('../data/positive.csv',sep=';',header=None,usecols=[3])
vk_all = pd.read_csv('../data/labeled.csv')
ttl_toxic = vk_all.append(twitter_pos.rename(columns={3:'comment'}).sample(5000)).fillna(0)

In [3]:
data,target = ttl_toxic['comment'],ttl_toxic['toxic']

In [4]:
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)

segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
stopwords = nltk.corpus.stopwords.words('russian')

In [15]:
def normalizer (text):
    words_only = re.sub('[^А-я]+',' ',text.lower())
    doc = Doc(words_only)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    clean_text = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        if (token.lemma not in stopwords) & (len(set(token.lemma))>1):
            clean_text.append(token.lemma)
            
    return ' '.join(clean_text)
    
def get_sentence_vector(sentence_list):
    vectors = []
    for sentence in sentence_list:
        sent_vec = []
        for i in sentence.split():
            if i in nav:
                sent_vec.append(nav[i])
            else:
                sent_vec.append(nav['<unk>'])
        if sentence.strip() == '':
            sent_vec = [nav['<unk>']]
        vectors.append(np.mean(sent_vec,axis=0))
    return np.vstack(vectors)

In [6]:
scores = {}

In [7]:
data = pd.Series(normalizer(' жожо '.join(data)).split('жожо'))

In [69]:
clf_vec = neural_network.MLPClassifier(hidden_layer_sizes=(100,3),learning_rate='adaptive')
prms_vc = {'hidden_layer_sizes':[(100,1),(100,3),(100,5),(300,1),(300,3),(300,5)],
            'learning_rate':['constant', 'invscaling', 'adaptive'],
            'alpha':np.arange(0.001,0.1,0.05)}

rs_vc = model_selection.RandomizedSearchCV(clf_vec,prms_vc,cv=3,scoring='f1',n_iter=10,n_jobs=4,verbose=5)

rs_vc.fit(get_sentence_vector(data),target)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END alpha=0.01, hidden_layer_sizes=(100, 3);, score=0.707 total time=  22.1s
[CV 2/3] END alpha=0.01, hidden_layer_sizes=(100, 3);, score=0.704 total time=   7.5s
[CV 2/3] END alpha=0.060000000000000005, hidden_layer_sizes=(100, 1);, score=0.000 total time=  33.1s
[CV 3/3] END alpha=0.01, hidden_layer_sizes=(100, 3);, score=0.551 total time=  11.6s
[CV 1/3] END alpha=0.060000000000000005, hidden_layer_sizes=(300, 5);, score=0.722 total time=  40.2s
[CV 2/3] END alpha=0.060000000000000005, hidden_layer_sizes=(300, 5);, score=0.737 total time=  58.1s
[CV 1/3] END alpha=0.01, hidden_layer_sizes=(300, 5);, score=0.725 total time=  10.1s
[CV 2/3] END alpha=0.01, hidden_layer_sizes=(300, 5);, score=0.722 total time=   9.4s
[CV 1/3] END alpha=0.060000000000000005, hidden_layer_sizes=(100, 1);, score=0.710 total time= 2.1min
[CV 3/3] END alpha=0.060000000000000005, hidden_layer_sizes=(300, 5);, score=0.557 total time=  54.2s

RandomizedSearchCV(cv=3,
                   estimator=MLPClassifier(hidden_layer_sizes=(100, 3),
                                           learning_rate='adaptive',
                                           max_iter=500),
                   n_jobs=4,
                   param_distributions={'alpha': array([0.01, 0.06]),
                                        'hidden_layer_sizes': [(100, 1),
                                                               (100, 3),
                                                               (100, 5),
                                                               (300, 1),
                                                               (300, 3),
                                                               (300, 5)]},
                   scoring='f1', verbose=5)

In [68]:
top_params_idx = pd.DataFrame(rs_vc.cv_results_).sort_values(by='mean_test_score',ascending=False).apply(lambda x: x['mean_test_score'] / x['mean_score_time'],axis=1).sort_values().idxmax()

In [67]:
opt_params_mlp = pd.DataFrame(rs_vc.cv_results_).loc[top_params_idx]['params']

{'learning_rate': 'adaptive', 'hidden_layer_sizes': (100, 1), 'alpha': 0.001}

In [31]:
 
clf_vec = neural_network.MLPClassifier(**opt_params_mlp)

scores['vec'] = pd.DataFrame(model_selection.cross_validate(clf_vec,get_sentence_vector(data),target,scoring=['f1','precision','recall'],cv=5,verbose=10)).mean()

clf_vec.fit(get_sentence_vector(data),target)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] START .....................................................................
[CV] END  f1: (test=0.669) precision: (test=0.637) recall: (test=0.704) total time=  16.9s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.9s remaining:    0.0s


[CV] END  f1: (test=0.763) precision: (test=0.788) recall: (test=0.740) total time=  14.0s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   30.9s remaining:    0.0s


[CV] END  f1: (test=0.745) precision: (test=0.834) recall: (test=0.674) total time=  10.6s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   41.5s remaining:    0.0s


[CV] END  f1: (test=0.558) precision: (test=0.637) recall: (test=0.497) total time=  12.5s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   54.0s remaining:    0.0s


[CV] END  f1: (test=0.628) precision: (test=0.617) recall: (test=0.638) total time=  13.2s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min finished


MLPClassifier(alpha=0.001, hidden_layer_sizes=(300, 5))

In [32]:
word_vectorizer = feature_extraction.text.TfidfVectorizer()
word_vectorizer.fit(data)
char_vectorizer = feature_extraction.text.TfidfVectorizer(
    min_df=3,
    sublinear_tf=True,
    analyzer='char',
    ngram_range=(2,4))
char_vectorizer.fit(data)

idf_fu = pipeline.FeatureUnion([('idf_w',word_vectorizer),('idf_c',char_vectorizer)])
clf_2idf = lgbm.LGBMClassifier(n_estimators=500,class_weight = target.value_counts(normalize=True).to_dict())
pipe_idf_fe = pipeline.Pipeline([('idf',idf_fu),('clf',clf_2idf)])

In [33]:
pipe_idf_fe

Pipeline(steps=[('idf',
                 FeatureUnion(transformer_list=[('idf_w', TfidfVectorizer()),
                                                ('idf_c',
                                                 TfidfVectorizer(analyzer='char',
                                                                 min_df=3,
                                                                 ngram_range=(2,
                                                                              4),
                                                                 sublinear_tf=True))])),
                ('clf',
                 LGBMClassifier(class_weight={0.0: 0.7513908922316093,
                                              1.0: 0.24860910776839068},
                                n_estimators=500))])

In [35]:
prms_vc_idf = {'clf__n_estimators':np.arange(100,2000,200),
            'clf__learning_rate':np.arange(0.01,0.2,.05),
            'clf__subsample' : [1,.9,.8],
            'clf__subsample_freq':[1,2],
            'clf__max_depth': [-1,2,4,6,9,12,15],
            'clf__num_leaves': [5,10,20,30,50,80]}

In [37]:
rs_vc_idf = model_selection.RandomizedSearchCV(pipe_idf_fe,prms_vc_idf,cv=3,scoring='roc_auc',n_iter=10,n_jobs=4,verbose=5)

rs_vc_idf.fit(data,target)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END clf__learning_rate=0.060000000000000005, clf__max_depth=6, clf__n_estimators=500, clf__num_leaves=10, clf__subsample=1, clf__subsample_freq=1;, score=0.911 total time= 1.0min
[CV 2/3] END clf__learning_rate=0.060000000000000005, clf__max_depth=6, clf__n_estimators=500, clf__num_leaves=10, clf__subsample=1, clf__subsample_freq=1;, score=0.933 total time= 1.1min
[CV 3/3] END clf__learning_rate=0.060000000000000005, clf__max_depth=6, clf__n_estimators=500, clf__num_leaves=10, clf__subsample=1, clf__subsample_freq=1;, score=0.818 total time= 1.3min
[CV 1/3] END clf__learning_rate=0.060000000000000005, clf__max_depth=-1, clf__n_estimators=1300, clf__num_leaves=5, clf__subsample=1, clf__subsample_freq=2;, score=0.915 total time= 1.5min
[CV 1/3] END clf__learning_rate=0.11, clf__max_depth=6, clf__n_estimators=1100, clf__num_leaves=30, clf__subsample=0.8, clf__subsample_freq=2;, score=0.912 total time= 2.8min
[CV 2/3] EN

RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('idf',
                                              FeatureUnion(transformer_list=[('idf_w',
                                                                              TfidfVectorizer()),
                                                                             ('idf_c',
                                                                              TfidfVectorizer(analyzer='char',
                                                                                              min_df=3,
                                                                                              ngram_range=(2,
                                                                                                           4),
                                                                                              sublinear_tf=True))])),
                                             ('clf',
                                           

In [41]:
rs_vc_idf.best_params_

{'clf__subsample_freq': 2,
 'clf__subsample': 1,
 'clf__num_leaves': 5,
 'clf__n_estimators': 1300,
 'clf__max_depth': -1,
 'clf__learning_rate': 0.060000000000000005}

In [46]:
idf_fu = pipeline.FeatureUnion([('idf_w',word_vectorizer),('idf_c',char_vectorizer)])
clf_2idf = lgbm.LGBMClassifier(subsample_freq=2,num_leaves=5,n_estimators=1300,learning_rate=0.06,subsample=1,class_weight = target.value_counts(normalize=True).to_dict())
pipe_idf_fe = pipeline.Pipeline([('idf',idf_fu),('clf',clf_2idf)])

In [48]:
scores['idf_features'] = pd.DataFrame(model_selection.cross_validate(pipe_idf_fe,data,target,scoring=['f1','precision','recall'],cv=5,verbose=10)).mean()
pipe_idf_fe.fit(data,target)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  56.7s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   56.7s remaining:    0.0s


[CV] END .................................................... total time= 1.0min
[CV] END .................................................... total time= 1.0min
[CV] END .................................................... total time= 1.1min
[CV] END .................................................... total time= 1.2min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.2min finished


Pipeline(steps=[('idf',
                 FeatureUnion(transformer_list=[('idf_w', TfidfVectorizer()),
                                                ('idf_c',
                                                 TfidfVectorizer(analyzer='char',
                                                                 min_df=3,
                                                                 ngram_range=(2,
                                                                              4),
                                                                 sublinear_tf=True))])),
                ('clf',
                 LGBMClassifier(class_weight={0.0: 0.7513908922316093,
                                              1.0: 0.24860910776839068},
                                learning_rate=0.06, n_estimators=1300,
                                num_leaves=5, subsample=1, subsample_freq=2))])

In [50]:
class_prior=target.value_counts(normalize=True).values[::-1]

clf_svc_vec = linear_model.SGDClassifier(loss='modified_huber',penalty = 'l1',class_weight= target.value_counts(normalize=True).to_dict())
clf_rf_vec = lgbm.LGBMClassifier(subsample_freq=2,num_leaves=5,n_estimators=1300,learning_rate=0.06,subsample=1,class_weight= target.value_counts(normalize=True).to_dict())
clf_mlp_vec = neural_network.MLPClassifier(hidden_layer_sizes=(300,1),max_iter=1000,learning_rate='adaptive')

clf_vote = ensemble.VotingClassifier(estimators=[('svc',clf_svc_vec),('gbm',clf_rf_vec),('mlp',clf_mlp_vec)],voting='soft')

scores['vote_models_vec'] = pd.DataFrame(model_selection.cross_validate(clf_vote,get_sentence_vector(data),target,scoring=['f1','precision','recall'],cv=5,verbose=10)).mean()

clf_vote.fit(get_sentence_vector(data),target)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] START .....................................................................
[CV] END  f1: (test=0.658) precision: (test=0.714) recall: (test=0.610) total time=  19.6s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   19.6s remaining:    0.0s


[CV] END  f1: (test=0.751) precision: (test=0.942) recall: (test=0.624) total time=  22.4s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   42.0s remaining:    0.0s


[CV] END  f1: (test=0.718) precision: (test=0.963) recall: (test=0.572) total time=  20.4s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.0min remaining:    0.0s


[CV] END  f1: (test=0.381) precision: (test=0.858) recall: (test=0.245) total time=  14.4s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.3min remaining:    0.0s


[CV] END  f1: (test=0.652) precision: (test=0.697) recall: (test=0.612) total time=  19.4s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.6min finished


VotingClassifier(estimators=[('svc',
                              SGDClassifier(class_weight={0.0: 0.7513908922316093,
                                                          1.0: 0.24860910776839068},
                                            loss='modified_huber',
                                            penalty='l1')),
                             ('gbm',
                              LGBMClassifier(class_weight={0.0: 0.7513908922316093,
                                                           1.0: 0.24860910776839068},
                                             learning_rate=0.06,
                                             n_estimators=1300, num_leaves=5,
                                             subsample=1, subsample_freq=2)),
                             ('mlp',
                              MLPClassifier(hidden_layer_sizes=(300, 1),
                                            learning_rate='adaptive',
                                            max_iter=1000))]

In [51]:
def voting(sentences,func=np.max):
    sentences = [normalizer(txt) for txt in sentences]
    probs = []
    probs.append(pipe_idf_fe.predict_proba(sentences)[:,1])
    probs.append(clf_vote.predict_proba(get_sentence_vector(sentences))[:,1])
    probs.append(clf_vec.predict_proba(get_sentence_vector(sentences))[:,1])
    return np.apply_over_axes(func,np.array(probs),axes=0)[0]

In [52]:
%%timeit
voting(['какая же ты тварь'])

26.4 ms ± 2.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [56]:
pd.DataFrame(scores)

Unnamed: 0,vec,idf_features,vote_models_vec
fit_time,13.424084,62.230302,19.141512
score_time,0.016642,0.599446,0.110458
test_f1,0.672736,0.633992,0.631755
test_precision,0.702725,0.909813,0.834901
test_recall,0.650631,0.495643,0.532516


In [115]:
zz3 = vk_all.assign(vote_max = voting(vk_all['comment']),
                    idf = pipe_idf_fe.predict_proba(vk_all['comment'])[:,1],
                    vote = clf_vote.predict_proba(get_sentence_vector(vk_all['comment']))[:,1],
                    mlp = clf_vec.predict_proba(get_sentence_vector(vk_all['comment']))[:,1])

In [122]:
zz3.loc[:,['toxic','mlp']].round().value_counts(normalize=True).unstack()

mlp,0.0,1.0
toxic,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.46149,0.20365
1.0,0.079656,0.255204


In [123]:
zz3.loc[:,['toxic','vote']].round().value_counts(normalize=True).unstack()

vote,0.0,1.0
toxic,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.606647,0.058493
1.0,0.134055,0.200805


In [124]:
zz3.loc[:,['toxic','idf']].round().value_counts(normalize=True).unstack()

idf,0.0,1.0
toxic,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.658965,0.006175
1.0,0.107272,0.227588


In [125]:
zz3.loc[:,['toxic','vote_max']].round().value_counts(normalize=True).unstack()

vote_max,0.0,1.0
toxic,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.661879,0.003261
1.0,0.004163,0.330697


In [53]:
joblib.dump(pipe_idf_fe,'m1.joblib')

['m1.joblib']

In [54]:
joblib.dump(clf_vote,'m2.joblib')

['m2.joblib']

In [55]:
joblib.dump(clf_vec,'m3.joblib')

['m3.joblib']