In [52]:
from sklearn import *
import lightgbm as lgbm
import pandas as pd
import numpy as np
import re
import nltk
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import defaultdict
nltk.download("stopwords")
plt.style.use('ggplot')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/charubaiel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

# pos \ neg - комменты с твиттера http://study.mokoron.com/ 
# labeled каггловский датасет по токсикам https://www.kaggle.com/blackmoon/russian-language-toxic-comments

In [3]:
twitter_neg = pd.read_csv('data/negative.csv',sep=';',header=None,usecols=[3])
twitter_pos = pd.read_csv('data/positive.csv',sep=';',header=None,usecols=[3])
vk_all = pd.read_csv('data/labeled.csv')

In [4]:
ttl_toxic = vk_all.append(twitter_neg.rename(columns={3:'comment'})).fillna(1)
df = ttl_toxic.append(twitter_pos.rename(columns={3:'comment'})).fillna(0)


In [5]:
df['toxic'].value_counts(normalize=True)

0.0    0.516058
1.0    0.483942
Name: toxic, dtype: float64

In [6]:
scores = {}

In [7]:
data,val_data,target,val_target = model_selection.train_test_split(vk_all['comment'],vk_all['toxic'],train_size=.75)

### stupid baseline

In [8]:
tf = feature_extraction.text.TfidfVectorizer(min_df=10)

In [9]:
tf.fit(data)

TfidfVectorizer(min_df=10)

In [10]:
clf_base = linear_model.LogisticRegression(max_iter=1000)
pipe_base = pipeline.make_pipeline(tf,clf_base)

In [11]:
scores['baseline'] = pd.DataFrame(model_selection.cross_validate(pipe_base,data,target,scoring=['f1','precision','recall'],cv=5)).mean()

In [12]:
pipe_base.fit(data,target)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(min_df=10)),
                ('logisticregression', LogisticRegression(max_iter=1000))])

In [13]:
pd.DataFrame(scores)

Unnamed: 0,baseline
fit_time,0.180401
score_time,0.029296
test_f1,0.678159
test_precision,0.839016
test_recall,0.569149


### work with text

In [14]:
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)

segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
stopwords = nltk.corpus.stopwords.words('russian')

In [15]:
def normalizer (text):
    words_only = re.sub('[^А-я]+',' ',text.lower())
    doc = Doc(words_only)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    clean_text = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        if (token.lemma not in stopwords) & (len(set(token.lemma))>1):
            clean_text.append(token.lemma)
            
    return ' '.join(clean_text)

In [16]:
raw_data = data.copy()

In [17]:
old_text = ' жожо '.join(data)

In [18]:
new_text = normalizer(old_text)

In [19]:
data = new_text.split('жожо')

In [20]:
raw_val_data = val_data.copy()
val_data = pd.Series(normalizer(' жожо '.join(val_data)).split('жожо'))

In [21]:
clf_norm = linear_model.LogisticRegression(max_iter=1000)
pipe_norm = pipeline.make_pipeline(tf,clf_norm)
scores['normalize'] = pd.DataFrame(model_selection.cross_validate(pipe_norm,data,target,scoring=['f1','precision','recall'],cv=5)).mean()

In [22]:
pipe_norm.fit(data,target)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(min_df=10)),
                ('logisticregression', LogisticRegression(max_iter=1000))])

In [23]:
pd.DataFrame(scores)

Unnamed: 0,baseline,normalize
fit_time,0.180401,0.224762
score_time,0.029296,0.02786
test_f1,0.678159,0.715096
test_precision,0.839016,0.892805
test_recall,0.569149,0.596822


### micro EDA

In [24]:
eda = vk_all.copy(deep=True)

In [25]:
eda['txt_len'] = eda['comment'].str.len()
eda['txt_len_avg'] = eda['comment'].str.split().apply(lambda x: np.mean([len(word) for word in x]))
eda['txt_words'] = eda['comment'].str.count(' ')
eda['txt_puncts'] = eda['comment'].str.count('[^\w^ ]')
eda['txt_upper_cnt'] = eda['comment'].apply(lambda x: len([i for i in x if i.isupper()]))
eda['txt_pct_upper'] = eda['txt_upper_cnt'] / eda['txt_len']
eda['txt_pos_punc'] = eda['comment'].apply(lambda text: len(re.findall(r'\)|D',text)))
eda['txt_neg_punc'] = eda['comment'].apply(lambda text: len(re.findall(r'\(|C|c|С|c',text)))

In [26]:
eda.groupby('toxic')[eda.filter(regex='txt').columns].agg(['mean','std'])

Unnamed: 0_level_0,txt_len,txt_len,txt_len_avg,txt_len_avg,txt_words,txt_words,txt_puncts,txt_puncts,txt_upper_cnt,txt_upper_cnt,txt_pct_upper,txt_pct_upper,txt_pos_punc,txt_pos_punc,txt_neg_punc,txt_neg_punc
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
toxic,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0.0,194.213332,274.750067,5.312308,1.063207,29.713436,40.811415,7.481849,9.135183,3.984978,7.81298,0.023631,0.027502,0.279261,0.709825,0.499478,1.353467
1.0,141.392665,261.776417,5.467866,1.466984,21.449233,42.106635,5.972234,9.188326,6.595939,26.932055,0.052774,0.130923,0.094488,0.434023,0.454414,1.689128


In [27]:
eda.iloc[:,1:].corr()

Unnamed: 0,toxic,txt_len,txt_len_avg,txt_words,txt_puncts,txt_upper_cnt,txt_pct_upper,txt_pos_punc,txt_neg_punc
toxic,1.0,-0.091782,0.060394,-0.094138,-0.077608,0.072997,0.171514,-0.136895,-0.014424
txt_len,-0.091782,1.0,0.024703,0.991756,0.92417,0.370052,-0.047513,0.382291,0.529863
txt_len_avg,0.060394,0.024703,1.0,-0.02515,0.016616,0.022158,0.07945,-0.001665,0.037176
txt_words,-0.094138,0.991756,-0.02515,1.0,0.929034,0.368631,-0.049015,0.378389,0.514993
txt_puncts,-0.077608,0.92417,0.016616,0.929034,1.0,0.346096,-0.033393,0.418016,0.497885
txt_upper_cnt,0.072997,0.370052,0.022158,0.368631,0.346096,1.0,0.565105,0.115176,0.692265
txt_pct_upper,0.171514,-0.047513,0.07945,-0.049015,-0.033393,0.565105,1.0,-0.040094,0.312005
txt_pos_punc,-0.136895,0.382291,-0.001665,0.378389,0.418016,0.115176,-0.040094,1.0,0.39542
txt_neg_punc,-0.014424,0.529863,0.037176,0.514993,0.497885,0.692265,0.312005,0.39542,1.0


### features

In [28]:
def features_from_text(df):
    df = pd.DataFrame(df)
    df.columns=['text']
    df['txt_len'] = df['text'].str.len()
    df['txt_len_avg'] = df['text'].str.split().apply(lambda x: np.mean([len(word) for word in x]))
    df['txt_words'] = df['text'].str.count(' ')
    df['txt_puncts'] = df['text'].str.count('[^\w^ ]')
    df['txt_upper_cnt'] = df['text'].apply(lambda x: len([i for i in x if i.isupper()]))
    df['txt_pos_punc'] = df['text'].apply(lambda text: len(re.findall(r'\)|D',text)))
    df['txt_neg_punc'] = df['text'].apply(lambda text: len(re.findall(r'\(|C|c|С|c',text)))
    
    return df.drop('text',axis=1).fillna(0)

In [29]:
fe_tf = feature_extraction.text.TfidfVectorizer(min_df=10)
fe_tf.fit(data,target)
new_fe=preprocessing.FunctionTransformer(features_from_text)
text_preproc = pipeline.FeatureUnion([('idf',fe_tf),('fe',new_fe)])

In [30]:
clf_fe = linear_model.LogisticRegression(max_iter=1000)
pipe_fe = pipeline.Pipeline([('preproc',text_preproc),('clf',clf_fe)])

In [31]:
scores['features'] = pd.DataFrame(model_selection.cross_validate(pipe_fe,data,target,scoring=['f1','precision','recall'],cv=5)).mean()

  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,


In [32]:
pipe_fe.fit(data,target)

  return _methods._mean(a, axis=axis, dtype=dtype,


Pipeline(steps=[('preproc',
                 FeatureUnion(transformer_list=[('idf',
                                                 TfidfVectorizer(min_df=10)),
                                                ('fe',
                                                 FunctionTransformer(func=<function features_from_text at 0x7ff39901fee0>))])),
                ('clf', LogisticRegression(max_iter=1000))])

In [33]:
pd.DataFrame(scores)

Unnamed: 0,baseline,normalize,features
fit_time,0.180401,0.224762,0.587189
score_time,0.029296,0.02786,0.066969
test_f1,0.678159,0.715096,0.717004
test_precision,0.839016,0.892805,0.888669
test_recall,0.569149,0.596822,0.601478


### vectors

In [34]:
from navec import Navec

In [35]:
nav = Navec.load('emb_navec.tar')

In [36]:
def get_sentence_vecror(sentence_list):
    vectors = []
    for sentence in sentence_list:
        sent_vec = []
        for i in sentence.split():
            if i in nav:
                sent_vec.append(nav[i])
            else:
                sent_vec.append(nav['<unk>'])
        if sentence.strip() == '':
            sent_vec = [nav['<unk>']]
        vectors.append(np.mean(sent_vec,axis=0))
    return np.vstack(vectors)

In [37]:
vec_func = preprocessing.FunctionTransformer(get_sentence_vecror)

In [38]:
clf_vec = linear_model.LogisticRegression(max_iter=1000)
pipe_vec = pipeline.make_pipeline(vec_func,clf_vec)

In [39]:
scores['sample_vec'] = pd.DataFrame(model_selection.cross_validate(pipe_vec,data,target,scoring=['f1','precision','recall'],cv=5)).mean()

In [40]:
pipe_vec.fit(data,target)

Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function get_sentence_vecror at 0x7ff38b283280>)),
                ('logisticregression', LogisticRegression(max_iter=1000))])

In [41]:
pd.DataFrame(scores)

Unnamed: 0,baseline,normalize,features,sample_vec
fit_time,0.180401,0.224762,0.587189,1.051636
score_time,0.029296,0.02786,0.066969,0.233051
test_f1,0.678159,0.715096,0.717004,0.807755
test_precision,0.839016,0.892805,0.888669,0.843106
test_recall,0.569149,0.596822,0.601478,0.775403


### combo

In [42]:
word_vectorizer = feature_extraction.text.TfidfVectorizer(min_df=10)
word_vectorizer.fit(data)

TfidfVectorizer(min_df=10)

In [43]:
char_vectorizer = feature_extraction.text.TfidfVectorizer(
    sublinear_tf=True,
    analyzer='char',
    ngram_range=(2,5))
char_vectorizer.fit(data)

TfidfVectorizer(analyzer='char', ngram_range=(2, 5), sublinear_tf=True)

In [44]:
idf_fu = pipeline.FeatureUnion([('idf_w',word_vectorizer),('idf_c',char_vectorizer)])

In [45]:
clf_2idf = linear_model.LogisticRegression(max_iter=1000)
pipe_idf_fe = pipeline.Pipeline([('idf',idf_fu),('clf',clf_2idf)])

In [46]:
scores['idf_features'] = pd.DataFrame(model_selection.cross_validate(pipe_idf_fe,data,target,scoring=['f1','precision','recall'],cv=5)).mean()

In [47]:
pipe_idf_fe.fit(data,target)

Pipeline(steps=[('idf',
                 FeatureUnion(transformer_list=[('idf_w',
                                                 TfidfVectorizer(min_df=10)),
                                                ('idf_c',
                                                 TfidfVectorizer(analyzer='char',
                                                                 ngram_range=(2,
                                                                              5),
                                                                 sublinear_tf=True))])),
                ('clf', LogisticRegression(max_iter=1000))])

In [48]:
pd.DataFrame(scores)

Unnamed: 0,baseline,normalize,features,sample_vec,idf_features
fit_time,0.180401,0.224762,0.587189,1.051636,6.028868
score_time,0.029296,0.02786,0.066969,0.233051,0.507511
test_f1,0.678159,0.715096,0.717004,0.807755,0.794277
test_precision,0.839016,0.892805,0.888669,0.843106,0.901471
test_recall,0.569149,0.596822,0.601478,0.775403,0.710209


### blend models

In [49]:
vec_func = preprocessing.FunctionTransformer(get_sentence_vecror)

In [121]:
clf_lr_vec = linear_model.LogisticRegression(max_iter=1000,C=6,penalty='l1',solver='liblinear')
clf_nb_vec = naive_bayes.BernoulliNB()
clf_knn_vec = neighbors.KNeighborsClassifier(30)
clf_svc_vec = svm.SVC(probability=True)
clf_rf_vec = lgbm.LGBMClassifier(n_estimators=1500,learning_rate=0.07,num_leaves=15)
clf_mlp_vec = neural_network.MLPClassifier(hidden_layer_sizes=(300,1),max_iter=1000,learning_rate='adaptive')


In [102]:
models = {}
for n,model in tqdm(enumerate([clf_lr_vec,clf_nb_vec,clf_knn_vec,clf_svc_vec,clf_rf_vec,clf_mlp_vec])):
    tmp_pipe = pipeline.make_pipeline(vec_func,model)
    models[n] = pd.DataFrame(model_selection.cross_validate(tmp_pipe,data,target,scoring=['f1','precision','recall'],cv=5)).mean()
    print(models[n])

1it [00:09,  9.18s/it]

fit_time          1.636022
score_time        0.198677
test_f1           0.807069
test_precision    0.838823
test_recall       0.777865
dtype: float64


2it [00:14,  6.83s/it]

fit_time          0.806764
score_time        0.230192
test_f1           0.706233
test_precision    0.707869
test_recall       0.704732
dtype: float64


3it [00:20,  6.69s/it]

fit_time          0.758724
score_time        0.542445
test_f1           0.769747
test_precision    0.773026
test_recall       0.766910
dtype: float64


4it [00:50, 15.60s/it]

fit_time          4.472831
score_time        1.376537
test_f1           0.819420
test_precision    0.849801
test_recall       0.791290
dtype: float64


5it [01:25, 22.58s/it]

fit_time          6.770268
score_time        0.217524
test_f1           0.804337
test_precision    0.842790
test_recall       0.769376
dtype: float64


6it [02:23, 23.84s/it]

fit_time          11.366353
score_time         0.222684
test_f1            0.806354
test_precision     0.812877
test_recall        0.800879
dtype: float64





In [112]:
pd.DataFrame(models).rename(columns = {0:'lr',1:'nb',2:'knn',3:'svc',4:'gbm',5:'mlp'}).T.sort_values(by='test_f1',ascending=False)

Unnamed: 0,fit_time,score_time,test_f1,test_precision,test_recall
svc,4.472831,1.376537,0.81942,0.849801,0.79129
lr,1.636022,0.198677,0.807069,0.838823,0.777865
mlp,11.366353,0.222684,0.806354,0.812877,0.800879
gbm,6.770268,0.217524,0.804337,0.84279,0.769376
knn,0.758724,0.542445,0.769747,0.773026,0.76691
nb,0.806764,0.230192,0.706233,0.707869,0.704732


In [122]:

vote_vec = ensemble.VotingClassifier(estimators=[('lr',clf_lr_vec),('nb',clf_nb_vec),('knn',clf_knn_vec),('svc',clf_svc_vec),('gbm',clf_rf_vec),('mlp',clf_mlp_vec)],voting='soft')

In [123]:
pipe_vote_vec = pipeline.make_pipeline(vec_func,vote_vec)

In [124]:
scores['vote_models'] = pd.DataFrame(model_selection.cross_validate(pipe_vote_vec,data,target,scoring=['f1','precision','recall'],cv=5)).mean()

In [125]:
pipe_vote_vec.fit(data,target)

Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function get_sentence_vecror at 0x7ff38b283280>)),
                ('votingclassifier',
                 VotingClassifier(estimators=[('lr',
                                               LogisticRegression(C=6,
                                                                  max_iter=1000,
                                                                  penalty='l1',
                                                                  solver='liblinear')),
                                              ('nb', BernoulliNB()),
                                              ('knn',
                                               KNeighborsClassifier(n_neighbors=30)),
                                              ('svc', SVC(probability=True)),
                                              ('gbm',
                                               LGBMClassifier(learning_rate=0.07,
                                 

In [126]:
pd.DataFrame(scores)

Unnamed: 0,baseline,normalize,features,sample_vec,idf_features,vote_models
fit_time,0.180401,0.224762,0.587189,1.051636,6.028868,48.953206
score_time,0.029296,0.02786,0.066969,0.233051,0.507511,1.718827
test_f1,0.678159,0.715096,0.717004,0.807755,0.794277,0.81448
test_precision,0.839016,0.892805,0.888669,0.843106,0.901471,0.836196
test_recall,0.569149,0.596822,0.601478,0.775403,0.710209,0.794025


### blend results

In [127]:
def voting(estimators,x,func=np.mean):
    probs = []
    for i in estimators:
        probs.append(i.predict_proba(x)[:,1])
    return np.apply_over_axes(func,np.array(probs),axes=0)[0]

In [138]:
eval_data = defaultdict(list)
for model in [pipe_norm,pipe_fe,pipe_idf_fe,pipe_vec,pipe_vote_vec]:
    eval_data['f1'].append(metrics.f1_score(val_target,model.predict(val_data)))
    eval_data['recall'].append(metrics.recall_score(val_target,model.predict(val_data)))
    eval_data['precision'].append(metrics.precision_score(val_target,model.predict(val_data)))
    eval_data['roc_auc'].append(metrics.roc_auc_score(val_target,model.predict(val_data)))

  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,


In [139]:
eval_data['f1'].append(metrics.f1_score(val_target,voting([pipe_vec,pipe_idf_fe],val_data).round()))
eval_data['recall'].append(metrics.recall_score(val_target,voting([pipe_vec,pipe_idf_fe],val_data).round()))
eval_data['precision'].append(metrics.precision_score(val_target,voting([pipe_vec,pipe_idf_fe],val_data).round()))
eval_data['roc_auc'].append(metrics.roc_auc_score(val_target,voting([pipe_vec,pipe_idf_fe],val_data).round()))

In [140]:
eval_data['f1'].append(metrics.f1_score(val_target,voting([pipe_vec,pipe_idf_fe],val_data,func=np.max).round()))
eval_data['recall'].append(metrics.recall_score(val_target,voting([pipe_vec,pipe_idf_fe],val_data,func=np.max).round()))
eval_data['precision'].append(metrics.precision_score(val_target,voting([pipe_vec,pipe_idf_fe],val_data,func=np.max).round()))

In [142]:
eval_data['roc_auc'].append(metrics.roc_auc_score(val_target,voting([pipe_vec,pipe_idf_fe],val_data,func=np.max).round()))

In [143]:
pd.DataFrame(eval_data,index=['norm','fe','idf_df','vec','vote_vec','blend_mean','blend_max']).sort_values(by='f1',ascending=False)

Unnamed: 0,f1,recall,precision,roc_auc
blend_max,0.841494,0.862979,0.821053,0.885979
blend_mean,0.814215,0.770213,0.86355,0.855658
idf_df,0.808789,0.73617,0.897303,0.847698
vote_vec,0.806563,0.794894,0.81858,0.854819
vec,0.801754,0.777872,0.827149,0.849603
fe,0.734491,0.629787,0.880952,0.794301
norm,0.733433,0.626383,0.884615,0.793422


### testing

In [209]:
def check_text (text):
    return pd.Series({'pure_idf_model' : pipe_norm.predict_proba([text])[:,1][0],
    'some_features_model' : pipe_fe.predict_proba([text])[:,1][0],
    'double_idf_model' :pipe_idf_fe.predict_proba([text])[:,1][0],
    'pure_vectors_model' :pipe_vec.predict_proba([text])[:,1][0],
    'blending_models' : pipe_vote_vec.predict_proba([text])[:,1][0],
    'blending_ttl_mean' : voting([pipe_vec,pipe_idf_fe],[text],func=np.mean)[0],
    'blending_ttl_max' : voting([pipe_vec,pipe_idf_fe],[text],func=np.max)[0]},name='Степень токсичности')

In [227]:

def check_text (text):
    return pd.Series({'pure_idf_model' : pipe_norm.predict_proba([text])[:,1][0],
    'some_features_model' : pipe_fe.predict_proba([text])[:,1][0],
    'double_idf_model' :pipe_idf_fe.predict_proba([text])[:,1][0],
    'pure_vectors_model' :pipe_vec.predict_proba([text])[:,1][0],
    # 'blending_models' : pipe_vote_vec.predict_proba([text])[:,1][0],
    'blending_ttl_mean' : voting([pipe_vec,pipe_idf_fe],[text],func=np.mean)[0],
    'blending_ttl_max' : voting([pipe_vec,pipe_idf_fe],[text],func=np.max)[0]},name='Степень токсичности')

In [219]:
def time_predict(model,text):
    return model.predict_proba([text])[:,1][0]

In [220]:
%%timeit
time_predict(pipe_norm,'Ну в целом я хочу сказать что товар не плохой, а просто каличный - я рот его ебал')

301 µs ± 3.29 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [221]:
%%timeit
time_predict(pipe_fe,'Ну в целом я хочу сказать что товар не плохой, а просто каличный - я рот его ебал')

3.68 ms ± 82.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [222]:
%%timeit
time_predict(pipe_idf_fe,'Ну в целом я хочу сказать что товар не плохой, а просто каличный - я рот его ебал')

2.18 ms ± 253 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [223]:
%%timeit
time_predict(pipe_vec,'Ну в целом я хочу сказать что товар не плохой, а просто каличный - я рот его ебал')

162 µs ± 6.55 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [224]:
%%timeit
time_predict(pipe_vote_vec,'Ну в целом я хочу сказать что товар не плохой, а просто каличный - я рот его ебал')

76.6 ms ± 5.41 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [228]:
%%timeit
check_text('Ну в целом я хочу сказать что товар не плохой, а просто каличный - я рот его ебал')

11 ms ± 28.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [241]:
check_text('Уроды у нас в семье 3 поколения там родились')

pure_idf_model         0.425171
some_features_model    0.494172
double_idf_model       0.457508
pure_vectors_model     0.207928
blending_ttl_mean      0.332718
blending_ttl_max       0.457508
Name: Степень токсичности, dtype: float64

In [254]:
check_text('Ну в целом я хочу сказать что товар не плохой, а просто каличный - я рот его ебал').to_frame().style.format('{:.2%}')

Unnamed: 0,Степень токсичности
pure_idf_model,68.93%
some_features_model,76.55%
double_idf_model,69.47%
pure_vectors_model,53.04%
blending_ttl_mean,61.26%
blending_ttl_max,69.47%
