## Подготовка данных

### Импорт библиотек

In [46]:

import numpy as np
import pandas as pd

!pip install -U nltk

from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords as nltk_stopwords

nltk.download('stopwords')
stopwords = set(nltk_stopwords.words('english'))
#!pip install pymorphy2
# nltk.download('wordnet')
# nltk.download('omw-1.4')
import pymorphy2
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostClassifier, Pool
from sklearn.pipeline import make_pipeline

import warnings

warnings.filterwarnings('ignore')
#!pip install nltk

import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Artyom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
try:
    df = pd.read_csv('toxic_comments.csv')
except:
    df = pd.read_csv('https://code.s3.yandex.net/datasets/toxic_comments.csv')
display(df.head(5))

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


### Проверка на дубликаты и пропуски

In [4]:
print(f'Пропусков:{df.isna().sum()}')
print(f'Дубликатов:{df.duplicated().sum()}')

Пропусков:text     0
toxic    0
dtype: int64
Дубликатов:0


### Лемматизация
#### C помощью `WorldNetLemmatizer`

In [5]:
# напишем функцию для лемматизации и очистки


m = WordNetLemmatizer()
def lemmatize(text):
    text = text.lower()

    lemm_list = m.lemmatize(text)
    lemm_text = "".join(lemm_list)

    cleared_text = re.sub('[^a-zA-Z]', ' ', lemm_text)

    return " ".join(cleared_text.split())

In [6]:
df.sample(5)

Unnamed: 0,text,toxic
95648,"I found a credible source, A New History of Ko...",0
70899,"""\nThank you for pointing out my mistake, I've...",0
26313,", 5 April 2006 (UTC)\n\nlol at not acceptable....",0
128600,Fair enough; the category does include 'Restau...,0
105987,You are NOT an administrator \n\nYou don't hav...,0


In [7]:
%%time

df['lemm_text'] = df['text'].apply(lemmatize)
df.sample(5)

Wall time: 10.1 s


Unnamed: 0,text,toxic,lemm_text
93615,"""\nI don't really need to give sources because...",0,i don t really need to give sources because th...
36767,"""\nBUT I DIDNT """"DISCLOSED PERSONAL INFORMATIO...",0,but i didnt disclosed personal information any...
30298,I SAID GO FUCK YOURSELF. WHY ARE YOU CHAANGING IT,1,i said go fuck yourself why are you chaanging it
155388,"""\n @TheRedPenOfDoom: I suggest you take your ...",0,theredpenofdoom i suggest you take your preten...
46271,I took out your Frank Knight bit because it wa...,0,i took out your frank knight bit because it wa...


## Обучение

In [11]:
# Размер разбивок для CV
cv_counts = 3

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df['lemm_text'], df['toxic'], random_state=42)

### Векторизация

In [13]:
count_vect = CountVectorizer(stop_words=stopwords)
n_gramm_train = count_vect.fit_transform(X_train)
n_gramm_test = count_vect.transform(X_test)

print("Размер train'a:", n_gramm_train.shape)
print("Размер test'a:", n_gramm_test.shape)

Размер train'a: (119678, 142897)
Размер test'a: (39893, 142897)


In [58]:
Tf_Idf_count = TfidfVectorizer(stop_words=stopwords, ngram_range=(1, 2))
n_gramm_train = Tf_Idf_count.fit_transform(X_train)
n_gramm_test = Tf_Idf_count.transform(X_test)

print("Размер train'a:", n_gramm_train.shape)
print("Размер test'a:", n_gramm_test.shape)

Размер train'a: (119678, 2273105)
Размер test'a: (39893, 2273105)


### LogisticRegression

Будем перебирать параметры GridSearch'ем. `solver` будем использовать `lbfgs`, `sag`

In [64]:
# Инциализация пайплайна:
model_lr = make_pipeline(LogisticRegression())
# Параметры
grid_values = {'logisticregression__class_weight':['balanced'] ,'logisticregression__penalty': ['l1', 'l2'], 'logisticregression__C': [0.001, 0.01, 0.1, 1, 7, 10, 100], 'logisticregression__solver': ['sag', 'lbfgs']}

grid_lr = GridSearchCV(model_lr, param_grid=grid_values, verbose=10, cv=cv_counts)

In [16]:
#grid_lr.estimator.get_params().keys()

Обучение

In [65]:
%%time
grid_lr.fit(n_gramm_train, y_train)

Fitting 3 folds for each of 28 candidates, totalling 84 fits
[CV 1/3; 1/28] START logisticregression__C=0.001, logisticregression__class_weight=balanced, logisticregression__penalty=l1, logisticregression__solver=sag
[CV 1/3; 1/28] END logisticregression__C=0.001, logisticregression__class_weight=balanced, logisticregression__penalty=l1, logisticregression__solver=sag; total time=   0.0s
[CV 2/3; 1/28] START logisticregression__C=0.001, logisticregression__class_weight=balanced, logisticregression__penalty=l1, logisticregression__solver=sag
[CV 2/3; 1/28] END logisticregression__C=0.001, logisticregression__class_weight=balanced, logisticregression__penalty=l1, logisticregression__solver=sag; total time=   0.0s
[CV 3/3; 1/28] START logisticregression__C=0.001, logisticregression__class_weight=balanced, logisticregression__penalty=l1, logisticregression__solver=sag
[CV 3/3; 1/28] END logisticregression__C=0.001, logisticregression__class_weight=balanced, logisticregression__penalty=l1, 

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('logisticregression',
                                        LogisticRegression())]),
             param_grid={'logisticregression__C': [0.001, 0.01, 0.1, 1, 7, 10,
                                                   100],
                         'logisticregression__class_weight': ['balanced'],
                         'logisticregression__penalty': ['l1', 'l2'],
                         'logisticregression__solver': ['sag', 'lbfgs']},
             verbose=10)

In [67]:
%%time

print("F1 для train:", f1_score(y_train, grid_lr.best_estimator_.predict(n_gramm_train)))

lr_score_test = f1_score(y_test, grid_lr.best_estimator_.predict(n_gramm_test))
print("\nF1 для test:", lr_score_test)

F1 для train: 0.9969258515391236

F1 для test: 0.7959132189707366
Wall time: 123 ms


После подбора параметров логистической регрессией, мы видим, что `f1` для тестовоый выборки равно $0.77$, что уже неплохо, выше требуемого минимума

### RandomForestClassifier

In [60]:
# Инциализация пайплайна:
model_rf = make_pipeline(RandomForestClassifier())
# Параметры
grid_values_rf = {'randomforestclassifier__max_features':['auto', 'log2'] ,'randomforestclassifier__criterion': ['gini'], 'randomforestclassifier__max_depth': range(1, 30, 5), 'randomforestclassifier__n_estimators': range(10, 100, 20)}

grid_rf = GridSearchCV(model_rf, param_grid=grid_values_rf, verbose=10, cv=cv_counts)

Wall time: 0 ns


In [32]:
#grid_rf.estimator.get_params().keys()

In [61]:
%%time

grid_rf.fit(n_gramm_train, y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV 1/3; 1/60] START randomforestclassifier__criterion=gini, randomforestclassifier__max_depth=1, randomforestclassifier__max_features=auto, randomforestclassifier__n_estimators=10
[CV 1/3; 1/60] END randomforestclassifier__criterion=gini, randomforestclassifier__max_depth=1, randomforestclassifier__max_features=auto, randomforestclassifier__n_estimators=10; total time=   1.7s
[CV 2/3; 1/60] START randomforestclassifier__criterion=gini, randomforestclassifier__max_depth=1, randomforestclassifier__max_features=auto, randomforestclassifier__n_estimators=10
[CV 2/3; 1/60] END randomforestclassifier__criterion=gini, randomforestclassifier__max_depth=1, randomforestclassifier__max_features=auto, randomforestclassifier__n_estimators=10; total time=   1.7s
[CV 3/3; 1/60] START randomforestclassifier__criterion=gini, randomforestclassifier__max_depth=1, randomforestclassifier__max_features=auto, randomforestclassifier__n_estimators=

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('randomforestclassifier',
                                        RandomForestClassifier())]),
             param_grid={'randomforestclassifier__criterion': ['gini'],
                         'randomforestclassifier__max_depth': range(1, 30, 5),
                         'randomforestclassifier__max_features': ['auto',
                                                                  'log2'],
                         'randomforestclassifier__n_estimators': range(10, 100, 20)},
             verbose=10)

In [66]:
# Запишем лучшие параметры, на случай перезапуска ноутбука, подбор шел 1час 40 минут
#grid_rf.best_params_

best_params_rf = {'randomforestclassifier__criterion': 'gini',
                  'randomforestclassifier__max_depth': 26,
                  'randomforestclassifier__max_features': 'auto',
                  'randomforestclassifier__n_estimators': 10}

In [63]:
# Результаты f1-score для RFC с лучшими параметрами
print("F1 для train:", f1_score(y_train, grid_rf.best_estimator_.predict(n_gramm_train)))

rf_score_test = f1_score(y_test, grid_rf.best_estimator_.predict(n_gramm_test))
print("\nF1 для test:", rf_score_test)

F1 для train: 0.006880733944954128

F1 для test: 0.00980632507967639


### Catboost

In [49]:
%%time
catboost = CatBoostClassifier(random_state=42, iterations=100)
parameters_cat = {'depth': [5, 7], 'learning_rate': np.arange(0.1, 1, 0.2)}
catboost_grid = catboost.grid_search(parameters_cat,
                                     Pool(n_gramm_train, y_train), cv=cv_counts, verbose=True, plot=False)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.5969841	test: 0.5973054	best: 0.5973054 (0)	total: 364ms	remaining: 36s
1:	learn: 0.5205256	test: 0.5209901	best: 0.5209901 (1)	total: 731ms	remaining: 35.8s
2:	learn: 0.4623295	test: 0.4630190	best: 0.4630190 (2)	total: 1.09s	remaining: 35.4s
3:	learn: 0.4138575	test: 0.4148315	best: 0.4148315 (3)	total: 1.47s	remaining: 35.3s
4:	learn: 0.3767393	test: 0.3781133	best: 0.3781133 (4)	total: 1.81s	remaining: 34.4s
5:	learn: 0.3485094	test: 0.3499569	best: 0.3499569 (5)	total: 2.16s	remaining: 33.9s
6:	learn: 0.3265725	test: 0.3281877	best: 0.3281877 (6)	total: 2.53s	remaining: 33.7s
7:	learn: 0.3094546	test: 0.3111125	best: 0.3111125 (7)	total: 2.89s	remaining: 33.3s
8:	learn: 0.2957979	test: 0.2976699	best: 0.2976699 (8)	total: 3.23s	remaining: 32.7s
9:	learn: 0.2828034	test: 0.2848115	best: 0.2848115 (9)	total: 3.59s	remaining: 32.3s
10:	learn: 0.2734813	test: 0.2757741	best: 0.2757741 (10)	total: 3.92s	remaining: 31.7s
11:	learn: 0.2656154	test: 0.2680728	best: 0.2680728 (

Лучшие параметры

In [50]:
catboost_grid['params'] # {'depth': 7, 'learning_rate': 0.7000000000000001}

{'depth': 7, 'learning_rate': 0.7000000000000001}

In [52]:
%%time
final_cat = CatBoostClassifier(depth= 7, learning_rate=0.7000000000000001)
final_cat.fit(Pool(n_gramm_train, y_train))

0:	learn: 0.3011154	total: 752ms	remaining: 12m 31s
1:	learn: 0.2424668	total: 1.46s	remaining: 12m 10s
2:	learn: 0.2224175	total: 2.1s	remaining: 11m 38s
3:	learn: 0.2117888	total: 2.77s	remaining: 11m 29s
4:	learn: 0.2010902	total: 3.44s	remaining: 11m 25s
5:	learn: 0.1944210	total: 4.05s	remaining: 11m 11s
6:	learn: 0.1889714	total: 4.64s	remaining: 10m 58s
7:	learn: 0.1850215	total: 5.24s	remaining: 10m 49s
8:	learn: 0.1813019	total: 5.81s	remaining: 10m 40s
9:	learn: 0.1780079	total: 6.4s	remaining: 10m 33s
10:	learn: 0.1736922	total: 7.04s	remaining: 10m 32s
11:	learn: 0.1714059	total: 7.6s	remaining: 10m 25s
12:	learn: 0.1691225	total: 8.16s	remaining: 10m 19s
13:	learn: 0.1670614	total: 8.73s	remaining: 10m 14s
14:	learn: 0.1645490	total: 9.34s	remaining: 10m 13s
15:	learn: 0.1623211	total: 9.95s	remaining: 10m 12s
16:	learn: 0.1604110	total: 10.6s	remaining: 10m 12s
17:	learn: 0.1587869	total: 11.2s	remaining: 10m 12s
18:	learn: 0.1568676	total: 11.9s	remaining: 10m 12s
19:	le

<catboost.core.CatBoostClassifier at 0x2c4542c8fd0>

In [57]:

# Результаты f1-score для RFC с лучшими параметрами
print("F1 для train:", f1_score(y_train, final_cat.predict(n_gramm_train)))

cat_score_test = f1_score(y_test, final_cat.predict(n_gramm_test))
print("\nF1 для test:", cat_score_test)

F1 для train: 0.9327851114477386

F1 для test: 0.771342300301949


## Вывод

In [68]:
models_data = [[lr_score_test],[rf_score_test], [cat_score_test]]

model = ['LogisticRegression','RandomForestClassifier', 'CatBoost']
values = ['F1-score']

table = pd.DataFrame(data = models_data, index = model, columns=values)
table['Выполнение требования'] = table['F1-score'] > 0.75

table.style.highlight_max(color = 'green', axis = 0)

Unnamed: 0,F1-score,Выполнение требования
LogisticRegression,0.795913,True
RandomForestClassifier,0.009806,False
CatBoost,0.771342,True


Наилучшая модель, очевидно, `LogisticRegression`, но сам подбор параметров занял больше времени, и оубчается она больше. `CatBoost` немного хуже. `RandomForestClassifier` не справилась, нужно больше и лучше подбирать параметры, но по времени это очень затратно.