## Подготовка

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from tqdm import notebook 
from sklearn.model_selection import cross_val_score

import catboost
from catboost import cv
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

In [2]:
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

In [3]:
df = pd.read_csv('/datasets/toxic_comments.csv')

In [4]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

df['text_lemmatized'] = df.text.apply(lemmatize_text)

In [5]:
df

Unnamed: 0.1,Unnamed: 0,text,toxic,text_lemmatized
0,0,Explanation\nWhy the edits made under my usern...,0,"[Explanation, Why, the, edits, made, under, my..."
1,1,D'aww! He matches this background colour I'm s...,0,"[D'aww!, He, match, this, background, colour, ..."
2,2,"Hey man, I'm really not trying to edit war. It...",0,"[Hey, man,, I'm, really, not, trying, to, edit..."
3,3,"""\nMore\nI can't make any real suggestions on ...",0,"["", More, I, can't, make, any, real, suggestio..."
4,4,"You, sir, are my hero. Any chance you remember...",0,"[You,, sir,, are, my, hero., Any, chance, you,..."
...,...,...,...,...
159287,159446,""":::::And for the second time of asking, when ...",0,"["":::::And, for, the, second, time, of, asking..."
159288,159447,You should be ashamed of yourself \n\nThat is ...,0,"[You, should, be, ashamed, of, yourself, That,..."
159289,159448,"Spitzer \n\nUmm, theres no actual article for ...",0,"[Spitzer, Umm,, there, no, actual, article, fo..."
159290,159449,And it looks like it was actually you who put ...,0,"[And, it, look, like, it, wa, actually, you, w..."


## Обучение

In [6]:
features_train, features_test, target_train, target_test = train_test_split(df['text_lemmatized'], df['toxic'], test_size=0.2, random_state=42)

In [7]:
nltk.download('stopwords')
stopwords = set(nltk_stopwords.words('english'))

features_train = features_train.astype('U')
features_test = features_test.astype('U')

count_tf_idf = TfidfVectorizer(stop_words=stopwords)
tf_idf_train = count_tf_idf.fit_transform(features_train)
tf_idf_test = count_tf_idf.transform(features_test)

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
param = { 'C': range(1, 11, 2), 'class_weight': [None, 'balanced'] }

model_log = LogisticRegression()

cv_log = GridSearchCV(estimator = model_log, 
                           param_grid = param, 
                           cv = 5,
                           n_jobs = -1, 
                           verbose = 0, 
                           scoring = 'f1',
                          )
cv_log.fit(tf_idf_train, target_train)    
print('ЛУчшая модель получила результат метрики f1 равный',cv_log.best_score_ ,'при параметрах',cv_log.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

ЛУчшая модель получила результат метрики f1 равный 0.7669421010820224 при параметрах {'C': 9, 'class_weight': 'balanced'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
param = { 'max_depth': range(280, 350, 10), 'n_estimators': range(1,5,1),'class_weight': [None, 'balanced'] }

model_forest = RandomForestClassifier(random_state=12345)

cv_forest = GridSearchCV(estimator = model_forest, 
                           param_grid = param, 
                           cv = 5,
                           n_jobs = -1, 
                           verbose = 0, 
                           scoring = 'f1',
                          )
#scores = cross_val_score(cv_forest,tf_idf_train,target_train,cv=5)
cv_forest.fit(tf_idf_train, target_train)    
print('Лучшая модель получила результат метрики f1 равный',cv_forest.best_score_ ,'при параметрах',cv_forest.best_params_)

Лучшая модель получила результат метрики f1 равный 0.5954379653011159 при параметрах {'class_weight': None, 'max_depth': 320, 'n_estimators': 4}


In [11]:
param = { 'max_depth': range(100, 170, 10),'class_weight': [None, 'balanced']}

model_tree = DecisionTreeClassifier(random_state=12345)

cv_tree = GridSearchCV(estimator = model_tree, 
                           param_grid = param, 
                           cv = 5,
                           n_jobs = -1, 
                           verbose = 0, 
                           scoring = 'f1',
                          )
cv_tree.fit(tf_idf_train, target_train)    
print('Лeчшая модель получила результат метрики f1 равный',cv_tree.best_score_ ,'при параметрах',cv_tree.best_params_)

Лeчшая модель получила результат метрики f1 равный 0.724747403425168 при параметрах {'class_weight': None, 'max_depth': 140}


In [12]:
predictions = cv_log.predict(tf_idf_test)
f1_score(target_test,predictions)

0.7679175864606327

## Выводы

Лучшей моделью оказалась модель логистической регрессии с результатом 0.762