# Токсичные комментарии

## Подготовка

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

In [2]:
data = pd.read_csv('...')

Посмотрим на выборку 10 случайных строк:

In [3]:
data.sample(10)

Unnamed: 0,text,toxic
126428,What is best for the article is probably best ...,0
121506,That official BBC link refers to it as series ...,0
51931,"""\n Hello, , and Welcome to Wikipedia!\nPlease...",0
31704,Oppose The current redirects and intro do the ...,0
78408,I promise you the first thing I will do is hav...,0
143750,"""*:*:*Or we can do a search on """"the Wilhelmst...",0
42176,DYK peer review?,0
94850,Edit request on 26 November 2012 \n\nhttp://ww...,0
90473,"""\nYour hairsplitting distinction between """"ru...",0
147279,This page was nominated for deletion at Wikipe...,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
text     159571 non-null object
toxic    159571 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


Всего в файле 159571 записей, 2 столбца.

In [5]:
data.describe()

Unnamed: 0,toxic
count,159571.0
mean,0.101679
std,0.302226
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


"Токсичных" записей всего около 10%.

In [6]:
data.duplicated().sum()

0

Дублированных строк нет.

Создадим функцию и удалим лишние символы из данных:

In [7]:
def clear_text(text):
    """Return only the Latin-letter words of ``text``, separated by single spaces.

    Every character outside ``a-z`` / ``A-Z`` acts as a separator, so digits,
    punctuation, whitespace and non-ASCII letters are all dropped.
    """
    # Each match is a maximal run of ASCII letters, i.e. one cleaned word.
    words = re.findall(r'[a-zA-Z]+', text)
    return " ".join(words)

In [8]:
%%time
data['text'] = data['text'].apply(clear_text)

Wall time: 4.2 s


Выполним лемматизацию с помощью wordnet NLTK:

In [9]:
#pip install nltk
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('punkt')

In [10]:
wnl = WordNetLemmatizer()

In [11]:
def lemm(text):
    """Lemmatize every token of ``text`` and return them joined by spaces.

    The text is tokenized with ``nltk.word_tokenize`` and each token is passed
    to the module-level ``wnl`` lemmatizer with its default arguments.
    """
    # Named ``tokens`` rather than ``list`` so the builtin is not shadowed.
    tokens = nltk.word_tokenize(text)
    return ' '.join(wnl.lemmatize(token) for token in tokens)

In [12]:
%%time

data['lemm_text'] = data['text'].apply(lemm)

Wall time: 1min 25s


Разделим данные на тренировочную и тестовую выборки в соотношении 80/20%:

In [13]:
data_train, data_test = train_test_split(data, test_size=0.2, random_state=123)

In [14]:
features_train = data_train['text']

In [15]:
features_test = data_test['text']

In [16]:
target_train = data_train['toxic']

In [17]:
target_test = data_test['toxic']

In [18]:
corpus_train = features_train.values.astype('U')

In [19]:
corpus_test = features_test.values.astype('U')

Проведем кодирование текста с помощью TF-IDF:

In [20]:
#nltk.download('stopwords')

In [21]:
stopwords = set(nltk_stopwords.words('english'))

In [22]:
# Newer scikit-learn releases validate stop_words as 'english', a list or None,
# so the set is converted to a list (sorted to keep the order reproducible).
count_tf_idf = TfidfVectorizer(stop_words=sorted(stopwords))

In [23]:
%%time

tf_idf_train = count_tf_idf.fit_transform(corpus_train)

tf_idf_test = count_tf_idf.transform(corpus_test)

Wall time: 8.91 s


### Вывод

На данном этапе был загружен файл с данными "toxic_comments.csv". В файле имеется 159571 запись и 2 столбца: текст комментариев и целевой признак "toxic". Записей с целевым признаком "1" около 10% всех данных. Дублированные строки отсутствуют. Из текста были удалены символы, отличные от букв латинского алфавита. Далее текст комментариев был лемматизирован, данные разделены на тренировочную и тестовую выборки и преобразованы с помощью TF-IDF Vectorizer.

## Обучение

Найдем оптимальные параметры для моделей логистической регрессии, решающего дерева, случайного леса, LightGBM при помощи GridSearchCV и найдем значение F1-score на тестовой выборке.

Создадим таблицу для результатов f1 моделей:

In [24]:
# Score table: one row per model, one column per F1 measurement.
# No data is supplied, so every cell starts out empty (NaN).
score_columns = ['f1_train', 'f1_test']
model_names = ['logistic_regression', 'decision_tree', 'random_forest', 'LightGBM']
model_results = pd.DataFrame(index=model_names, columns=score_columns)

- Логистическая регрессия

Ищем оптимальные параметры:

In [25]:
model_regression = LogisticRegression(solver='lbfgs')

In [26]:
params_regression = {
    'solver': ['lbfgs'],
    'C': [17.5]
}

In [27]:
grid_regression = GridSearchCV(model_regression, params_regression, cv=3, scoring='f1', n_jobs=-1)

In [28]:
%%time

grid_regression.fit(tf_idf_train, target_train)

Wall time: 9.54 s




GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [17.5], 'solver': ['lbfgs']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1', verbose=0)

In [29]:
grid_regression.best_score_

0.7638917143677643

In [30]:
grid_regression.best_params_

{'C': 17.5, 'solver': 'lbfgs'}

In [31]:
model_results.loc['logistic_regression', 'f1_train'] = round(grid_regression.best_score_, 3)

Обучаем модель и проверяем на тестовых данных:

In [32]:
# The search grid contains exactly 'solver' and 'C', so unpacking the winning
# combination passes the same two keyword arguments as naming them one by one.
best_regression_params = grid_regression.best_params_
model_regression = LogisticRegression(**best_regression_params)

In [33]:
%%time
model_regression.fit(tf_idf_train, target_train)

Wall time: 3.19 s




LogisticRegression(C=17.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
%%time
predictions_regression = model_regression.predict(tf_idf_test)

Wall time: 4.01 ms


In [35]:
f1_regression = f1_score(target_test, predictions_regression)
f1_regression

0.7795123662515347

In [36]:
model_results.loc['logistic_regression', 'f1_test'] = round(f1_regression, 3)

- Дерево решений

Находим оптимальные параметры:

In [37]:
model_tree = DecisionTreeClassifier(random_state=1234)

In [38]:
params_tree = {
    'max_depth': [100],
    'min_samples_split': [2],
    'min_samples_leaf': [1] 
}

In [39]:
grid_tree = GridSearchCV(model_tree, params_tree, cv=3, scoring='f1', n_jobs=-1)

In [40]:
%%time

grid_tree.fit(tf_idf_train, target_train)

Wall time: 1min 32s


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=1234,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [100], 'min_samples_leaf': [1],
                         'min_samples_split': [2]},
             pre

In [41]:
grid_tree.best_score_

0.7161854050762234

In [42]:
grid_tree.best_params_

{'max_depth': 100, 'min_samples_leaf': 1, 'min_samples_split': 2}

In [43]:
model_results.loc['decision_tree', 'f1_train'] = round(grid_tree.best_score_, 3)

Обучаем модель и проверяем на тестовых данных:

In [44]:
# The search grid contains exactly 'max_depth', 'min_samples_split' and
# 'min_samples_leaf'; unpack the winning values next to the fixed seed.
best_tree_params = grid_tree.best_params_
model_tree = DecisionTreeClassifier(random_state=1234, **best_tree_params)

In [45]:
%%time
model_tree.fit(tf_idf_train, target_train)

Wall time: 51.7 s


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=100,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=1234, splitter='best')

In [46]:
predictions_tree = model_tree.predict(tf_idf_test)

In [47]:
f1_tree = f1_score(target_test, predictions_tree)
f1_tree

0.7308693388859968

In [48]:
model_results.loc['decision_tree', 'f1_test'] = round(f1_tree, 3)

- Случайный лес

Находим оптимальные параметры:

In [49]:
model_forest = RandomForestClassifier(random_state=1234, n_jobs=-1)

In [50]:
params_forest = {
    'max_depth': [500],
    'n_estimators': [10]
}

In [51]:
grid_forest = GridSearchCV(model_forest, params_forest, cv=3, scoring='f1')

In [52]:
%%time

grid_forest.fit(tf_idf_train, target_train)

Wall time: 1min 2s


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False,
                                              random_state=1234, verbose=0,
                                              warm_start=False),
             iid='

In [53]:
grid_forest.best_score_

0.660115030896057

In [54]:
grid_forest.best_params_

{'max_depth': 500, 'n_estimators': 10}

In [55]:
model_results.loc['random_forest', 'f1_train'] = round(grid_forest.best_score_, 3)

Обучаем модель и проверяем на тестовых данных:

In [56]:
# Rebuild the forest with the grid-search winners ('max_depth', 'n_estimators').
# n_jobs=-1 matches the estimator used during the search, so the final fit is
# parallelised as well; the fixed random_state is kept unchanged.
model_forest = RandomForestClassifier(
    random_state=1234,
    n_jobs=-1,
    **grid_forest.best_params_
)

In [57]:
%%time
model_forest.fit(tf_idf_train, target_train)

Wall time: 44.2 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=500, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=1234,
                       verbose=0, warm_start=False)

In [58]:
predictions_forest = model_forest.predict(tf_idf_test)

In [59]:
f1_forest = f1_score(target_test, predictions_forest)
f1_forest

0.6847632256786209

In [60]:
model_results.loc['random_forest', 'f1_test'] = round(f1_forest, 3)

- LightGBM

Подготовим признаки для обучения модели:

In [61]:
lgb_train = lgb.Dataset(tf_idf_train, target_train)
lgb_test = lgb.Dataset(tf_idf_test, target_test)

Запускаем LightGBM в GridSearch для поиска оптимальных параметров:

In [62]:
gridParams = {
    
    'n_estimators': [10], 
    'max_depth': [29], 
    'learning_rate': [0.35], 
    'reg_alpha': [0.1],
    'reg_lambda': [0.02]
    
}

In [63]:
gbm = lgb.LGBMClassifier()

In [64]:
grid_GBM = GridSearchCV(gbm, gridParams, cv=3, scoring='f1', n_jobs=-1)

In [65]:
%%time
grid_GBM.fit(tf_idf_train, target_train)

Wall time: 24.3 s


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambda=0.0, silent=True,
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='warn', n_jobs=-1,
             param_grid={'learning_rate': [0.35], 'max_depth': [29],
                 

In [66]:
grid_GBM.best_params_

{'learning_rate': 0.35,
 'max_depth': 29,
 'n_estimators': 10,
 'reg_alpha': 0.1,
 'reg_lambda': 0.02}

In [67]:
grid_GBM.best_score_

0.6958488745984188

In [68]:
model_results.loc['LightGBM', 'f1_train'] = round(grid_GBM.best_score_, 3)

- Обучаем модель и проверяем на тестовых данных:

In [69]:
# Parameters for the native lgb.train API, reusing the grid-search winners.
params = {
    'objective': 'binary',
    # 'f1' is not one of LightGBM's built-in metric names; F1 is computed
    # afterwards with sklearn's f1_score, so track binary log loss here.
    'metric': 'binary_logloss',
    'learning_rate': grid_GBM.best_params_['learning_rate'],
    'n_estimators': grid_GBM.best_params_['n_estimators'],
    'max_depth': grid_GBM.best_params_['max_depth'],
    'reg_alpha': grid_GBM.best_params_['reg_alpha'],
    'reg_lambda': grid_GBM.best_params_['reg_lambda'],
    'verbose': -1,
}

In [70]:
%%time

gbm = lgb.train(
    
    params, 
    lgb_train, 
    valid_sets=lgb_test, 
    verbose_eval=False
    
)



Wall time: 8.16 s


In [71]:
%%time
predictions_lgb = gbm.predict(tf_idf_test, num_iteration=gbm.best_iteration)

Wall time: 81.9 ms


In [72]:
predictions_lgb = (predictions_lgb.round(0)).astype('int')

In [73]:
f1_lgb = f1_score(target_test, predictions_lgb)
f1_lgb

0.711017112093828

In [74]:
model_results.loc['LightGBM', 'f1_test'] = round(f1_lgb, 3)

### Вывод

На данном этапе был проведён поиск параметров по сетке с помощью GridSearchCV для моделей логистической регрессии, решающего дерева, случайного леса и LightGBM с количеством фолдов, равным 3, и метрикой f1-score. По найденным оптимальным гиперпараметрам были обучены соответствующие модели и получены предсказания на тестовых данных.

## Выводы

Посмотрим на значения f1 моделей:

In [75]:
model_results.sort_values(by='f1_test', ascending=False)

Unnamed: 0,f1_train,f1_test
logistic_regression,0.764,0.78
decision_tree,0.716,0.731
LightGBM,0.696,0.711
random_forest,0.66,0.685


Наилучшие результаты показали модели логистической регрессии и решающего дерева — значение f1 на тестовых данных составляет 0.780 и 0.731 соответственно, однако порог в 0.75 преодолевает только логистическая регрессия. При этом время обучения и предсказания решающего дерева намного больше времени, которое необходимо логистической регрессии. Худшие результаты показали модели LightGBM и случайный лес — значения их f1 (0.711 и 0.685) также ниже порога в 0.75. Таким образом, в сервисе "Викишоп" для оптимального соотношения качества предсказаний и затраченного времени рекомендуется использовать логистическую регрессию.

## Чек-лист проверки

- [x]  Весь код выполняется без ошибок
- [x]  Ячейки с кодом расположены в порядке исполнения
- [x]  Данные загружены и подготовлены
- [x]  Модели обучены
- [x]  Значение метрики *F1* не меньше 0.75
- [x]  Выводы написаны