## Подготовка

In [1]:
!pip install pymystem3



In [2]:
import pandas as pd
import re
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
import numpy as np
from pymystem3 import Mystem
m = Mystem()
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package wordnet to /Users/DMITRYD/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/DMITRYD/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('toxic_comments.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,text,toxic
0,0,Explanation\nWhy the edits made under my usern...,0
1,1,D'aww! He matches this background colour I'm s...,0
2,2,"Hey man, I'm really not trying to edit war. It...",0
3,3,"""\nMore\nI can't make any real suggestions on ...",0
4,4,"You, sir, are my hero. Any chance you remember...",0


In [5]:
df.text

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159287    ":::::And for the second time of asking, when ...
159288    You should be ashamed of yourself \n\nThat is ...
159289    Spitzer \n\nUmm, theres no actual article for ...
159290    And it looks like it was actually you who put ...
159291    "\nAnd ... I really don't think you understand...
Name: text, Length: 159292, dtype: object

Очистим текст от ненужных символов. 

In [6]:
def cleaning(text):
    text = re.sub(r"(?:\n|\r)", " ", text)
    text = re.sub(r"[^a-zA-Z ]+", "", text)
    text = text.lower()
    return text.split()

df['text'] = df['text'].apply(cleaning)
df['text'].head()

0    [explanation, why, the, edits, made, under, my...
1    [daww, he, matches, this, background, colour, ...
2    [hey, man, im, really, not, trying, to, edit, ...
3    [more, i, cant, make, any, real, suggestions, ...
4    [you, sir, are, my, hero, any, chance, you, re...
Name: text, dtype: object

Проведем лемматизацию. 

In [7]:
wnl = WordNetLemmatizer()
corpus = df['text']

def lemmatizer(text):
    lem = [wnl.lemmatize(word) for word in text]
    return ' '.join(lem)


corpus = corpus.apply(lemmatizer)
corpus.head()

0    explanation why the edits made under my userna...
1    daww he match this background colour im seemin...
2    hey man im really not trying to edit war it ju...
3    more i cant make any real suggestion on improv...
4    you sir are my hero any chance you remember wh...
Name: text, dtype: object

##### Вывод

Текст обработан. Можем переходить к обучению. 

## Обучение

In [8]:
stopwords = set(nltk_stopwords.words('english'))

In [9]:
target = df['toxic'][0 : 160000].values
features = corpus

Векторизируем признаки. 

In [10]:
count_tf_idf = TfidfVectorizer(stop_words=stopwords)
tf_idf = count_tf_idf.fit_transform(features)

Обучим модель логистической регрессии и подберем параметры. 

In [11]:
%%time

pipe_lr = Pipeline([
 ('tfidf',  TfidfTransformer()),
 ('model', LogisticRegression())])

param_grid = [
        {

            'model': [LogisticRegression(random_state=12345, solver='liblinear')],
            'model__penalty' : ['l1', 'l2'],
            'model__C': list(range(1,15,3))
        }
]
grid = GridSearchCV(pipe_lr, param_grid=param_grid, scoring='f1', cv=3, verbose=True, n_jobs=-1)
best_grid = grid.fit(tf_idf, target)
print('Best parameters is:', grid.best_params_)
print('Best score is:', grid.best_score_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters is: {'model': LogisticRegression(C=4, penalty='l1', random_state=12345, solver='liblinear'), 'model__C': 4, 'model__penalty': 'l1'}
Best score is: 0.7580108185706296
CPU times: user 2.83 s, sys: 356 ms, total: 3.19 s
Wall time: 48.6 s


Проделаем то же самое для случайного леса. 

In [12]:
%%time
params_forest = {
    'n_estimators': list(range(50, 300, 50)),
    'max_depth':[5, 30],
    'max_features' : list(range(10, 100, 10))
}


model_forest = RandomForestClassifier(random_state=12345)
                                 
grid = GridSearchCV(model_forest, param_grid=params_forest, scoring='f1', cv=3, verbose=True, n_jobs=-1)
best_grid = grid.fit(tf_idf, target)
print('Best parameters is:', grid.best_params_)
print('Best score is:', grid.best_score_)

"\n%%time\nparams_forest = {\n    'n_estimators': list(range(50, 300, 50)),\n    'max_depth':[5, 30],\n    'max_features' : list(range(10, 100, 10))\n}\n\n\nmodel_forest = RandomForestClassifier(random_state=12345)\n                                 \ngrid = GridSearchCV(model_forest, param_grid=params_forest, scoring='f1', cv=3, verbose=True, n_jobs=-1)\nbest_grid = grid.fit(tf_idf, target)\nprint('Best parameters is:', grid.best_params_)\nprint('Best score is:', grid.best_score_)\n"

In [13]:
model = LogisticRegression(random_state=12345, C = 4, penalty = 'l1', solver='liblinear')
model.fit(tf_idf, target)
valid_pred = model.predict(tf_idf)
f1 = cross_val_score(
        model, tf_idf, target, scoring="f1", cv=3
    ).mean()
f1

0.7707162323098605

In [14]:
model_forest = RandomForestClassifier(max_depth=5, max_features=10, n_estimators = 50)
model_forest.fit(tf_idf, target)
valid_pred_f = model_forest.predict(tf_idf)
f1 = cross_val_score(
        model_forest, tf_idf, target, scoring="f1", cv=3
    ).mean()
f1

0.0

## Выводы

Обработали и подготовили данные, обучили разные модели с подбором гиперпараметров и на тестовой выборке получили значение f1 метрики **0.77**, использовав логистическую регрессию.  