## Этап 1: Загрузка и подготовка данных

In [1]:
import pandas as pd
import numpy as np
from pymystem3 import Mystem
import re
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
pd.options.mode.chained_assignment = None

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score




[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alekseyfedko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alekseyfedko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/alekseyfedko/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alekseyfedko/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#data = pd.read_csv('/Users/alekseyfedko/Desktop/Projects YP/ml_for_texts/toxic_comments.csv')
data = pd.read_csv('/datasets/toxic_comments.csv')
#data = pd.read_csv('/Users/alekseyfedko/Desktop/Projects YP/ml_for_texts/lemmatized_data.csv') это я загружал обработанный датасет

In [3]:
data

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
...,...,...
159566,""":::::And for the second time of asking, when ...",0
159567,You should be ashamed of yourself \n\nThat is ...,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0
159569,And it looks like it was actually you who put ...,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [5]:
data.duplicated().sum()

0

1) Загрузил данные.  
2) Посмотрел пропуски - отсутствуют.  
3) Посмотрел дубликаты - отсутствуют.  
4) Проверил тип данных - определены верно.  



In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize(string):
    lemmatizer = WordNetLemmatizer()
    lem = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(string)]
    result = " ".join(lem)
    return result


### Лемматизация

Для лемматизации пришлось написать две функции. Образец из тренажёра не подходил, поскольку в проекте приходилось работать с английским языком. Освоил библиотеку WordNetLemmatizer.    
Функция get_wordnet_pos тегирует слова для их правильной лемматизации посредством функции lemmatize.


In [7]:
data['lemmatized_text'] = data['text'].apply(lemmatize)

In [12]:
data

Unnamed: 0,text,toxic,lemmatized_text
0,Explanation\nWhy the edits made under my usern...,0,Explanation Why the edits make under my userna...
1,D'aww! He matches this background colour I'm s...,0,D'aww ! He match this background colour I 'm s...
2,"Hey man, I'm really not trying to edit war. It...",0,"Hey man , I 'm really not try to edit war . It..."
3,"""\nMore\nI can't make any real suggestions on ...",0,`` More I ca n't make any real suggestion on i...
4,"You, sir, are my hero. Any chance you remember...",0,"You , sir , be my hero . Any chance you rememb..."
...,...,...,...
159566,""":::::And for the second time of asking, when ...",0,"`` : : : : : And for the second time of ask , ..."
159567,You should be ashamed of yourself \n\nThat is ...,0,You should be ashamed of yourself That be a ho...
159568,"Spitzer \n\nUmm, theres no actual article for ...",0,"Spitzer Umm , there no actual article for pros..."
159569,And it looks like it was actually you who put ...,0,And it look like it be actually you who put on...


In [13]:
def clear_text(text):
    upd_text = " ".join((re.sub(r'[^a-zA-Z ]', ' ', text)).split())
    return upd_text

In [14]:
clear_text(data['lemmatized_text'][0])

'Explanation Why the edits make under my username Hardcore Metallica Fan be revert They be n t vandalism just closure on some GAs after I vote at New York Dolls FAC And please do n t remove the template from the talk page since I m retire now'

Проверил функцию очистки - работает корректно.  

In [15]:
data['lem_clear_text'] = data['lemmatized_text'].apply(clear_text)

#### В результате получил датафрейм с исходным текстом, а также очищеным и лемматизированным. Сделал его выгрузку на локальную машину, так как на обработку данных тратится огромное количество времени.  

In [16]:
corpus = data['lem_clear_text'].values.astype('U')

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
n_gramm = count_vect.fit_transform(corpus)
n_gramm.shape

(159571, 161430)

In [18]:
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = set(nltk_stopwords.words('english'))

count_tf_idf = TfidfVectorizer(stop_words=stopwords)
tf_idf = count_tf_idf.fit_transform(corpus)

tf_idf.shape

(159571, 161287)

Исключил стоп слова. Их оказалось немного.

In [19]:
target = data['toxic']
features = tf_idf

In [20]:

features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.25, random_state=12345)


In [21]:
target_mini = target[:1000]
features_mini = features[:1000]

In [22]:
mini_features_train, mini_features_valid, mini_target_train, mini_target_valid = train_test_split(
    features_mini, target_mini, test_size=0.25, random_state=12345)

## Этап 2: Работа с моделями, гиперпараметрами

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

def f1 (answers, predictions):
    result = f1_score(answers, predictions)
    return result

score_f1 = make_scorer(f1)

### Модель LogisticRegression

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
log_reg_model = LogisticRegression(max_iter=10000, class_weight= 'balanced')
log_reg_model.fit(features_train, target_train)
log_reg_predictions = log_reg_model.predict(features_valid)

In [26]:
log_reg_f1 = f1_score(target_valid, log_reg_predictions)
log_reg_f1

0.7528138528138529

In [27]:
mini_log_reg_model = LogisticRegression(max_iter=10000, class_weight= 'balanced')
mini_log_reg_model.fit(mini_features_train, mini_target_train)
mini_log_reg_predictions = log_reg_model.predict(mini_features_valid)
mini_log_reg_f1 = f1_score(mini_target_valid, mini_log_reg_predictions)
mini_log_reg_f1

0.9180327868852459

### Модель CatBoost

In [28]:
from catboost import CatBoostClassifier

In [29]:
def cat_boost_testing (features_train, features_valid, target_train, target_valid):
    cat_model = CatBoostClassifier(iterations=10, depth=16,
                          learning_rate=0.1)         
    cat_model.fit(features_train, target_train)
    cat_predictions = cat_model.predict(features_valid)
    cat_f1 = f1_score(target_valid, cat_predictions)
    return cat_f1

In [30]:
cat_boost_testing(mini_features_train, mini_features_valid, mini_target_train, mini_target_valid)

0:	learn: 0.6133099	total: 20.5s	remaining: 3m 4s
1:	learn: 0.5519494	total: 37.6s	remaining: 2m 30s
2:	learn: 0.4929178	total: 54.4s	remaining: 2m 7s
3:	learn: 0.4533608	total: 1m 11s	remaining: 1m 47s
4:	learn: 0.4189407	total: 1m 28s	remaining: 1m 28s
5:	learn: 0.3945184	total: 1m 45s	remaining: 1m 10s
6:	learn: 0.3715920	total: 1m 45s	remaining: 45.3s
7:	learn: 0.3563831	total: 2m 2s	remaining: 30.6s
8:	learn: 0.3381503	total: 2m 19s	remaining: 15.5s
9:	learn: 0.3233835	total: 2m 37s	remaining: 0us


0.23529411764705882

In [31]:
cat_model = CatBoostClassifier(iterations=10, depth=5,
                          learning_rate=0.1)         
cat_model.fit(features_train, target_train)
cat_predictions = cat_model.predict(features_valid)
cat_f1 = f1_score(target_valid, cat_predictions)



0:	learn: 0.5900621	total: 1.38s	remaining: 12.5s
1:	learn: 0.5138156	total: 2.69s	remaining: 10.8s
2:	learn: 0.4541468	total: 3.96s	remaining: 9.25s
3:	learn: 0.4074771	total: 5.24s	remaining: 7.86s
4:	learn: 0.3716028	total: 6.53s	remaining: 6.53s
5:	learn: 0.3420935	total: 7.8s	remaining: 5.2s
6:	learn: 0.3196981	total: 9.05s	remaining: 3.88s
7:	learn: 0.3019999	total: 10.3s	remaining: 2.58s
8:	learn: 0.2881625	total: 11.6s	remaining: 1.28s
9:	learn: 0.2773064	total: 12.8s	remaining: 0us


In [32]:
cat_f1

0.46250927988121754

### Модель lightgbm


In [33]:
import lightgbm as lgb
lgb_model = lgb.LGBMClassifier(max_depth = 400, num_leaves = 400)
lgb_model.fit(features_train, target_train)
lgb_predictions = lgb_model.predict(features_valid)
lgb_f1 = f1_score(target_valid, lgb_predictions)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### BERT

В работе использовалась готовая модель https://huggingface.co/models

In [34]:
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

model = AutoModelWithLMHead.from_pretrained("bert-base-uncased")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertFor

In [35]:
bert_data = data.sample(50000, random_state = 12345)

In [36]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
encoded_input = bert_data['lem_clear_text'].apply(lambda x: tokenizer(x[:512], return_tensors='pt'))


In [37]:
encoded_input

146790    [input_ids, token_type_ids, attention_mask]
2941      [input_ids, token_type_ids, attention_mask]
115087    [input_ids, token_type_ids, attention_mask]
48830     [input_ids, token_type_ids, attention_mask]
136034    [input_ids, token_type_ids, attention_mask]
                             ...                     
105341    [input_ids, token_type_ids, attention_mask]
112974    [input_ids, token_type_ids, attention_mask]
58630     [input_ids, token_type_ids, attention_mask]
111281    [input_ids, token_type_ids, attention_mask]
110088    [input_ids, token_type_ids, attention_mask]
Name: lem_clear_text, Length: 50000, dtype: object

In [38]:
res = []
for x in range(len(bert_data)):
    output = model(**encoded_input.iloc[x])
    for y in output:
        out_y = y.detach().numpy()
    res.append(out_y)

In [39]:
len(res)

50000

In [40]:
bert_features = np.array(res)
bert_target = bert_data['toxic']

In [41]:
nsamples, nx, ny = bert_features.shape
bert_features = bert_features.reshape((nsamples,nx*ny))

In [42]:
bert_features_train, bert_features_valid, bert_target_train, bert_target_valid = train_test_split(
    bert_features, bert_target, test_size=0.5, random_state=12345)

In [43]:
bert_log_model = LogisticRegression(max_iter = 10000)
bert_log_model.fit(bert_features_train, bert_target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [48]:
bert_predictions = bert_log_model.predict(bert_features_valid)

In [49]:
bert_f1 = f1_score(bert_predictions, bert_target_valid)

In [53]:
final_info = pd.DataFrame({'model': ['LogisticRegression', 'CatBoost', 'LGBMClassifier', 'BERT/LogisticRegression'],
             'f1_score': [log_reg_f1, cat_f1, lgb_f1, bert_f1]})

final_info

Unnamed: 0,model,f1_score
0,LogisticRegression,0.752814
1,CatBoost,0.462509
2,LGBMClassifier,0.774743
3,BERT/LogisticRegression,0.685202


### Выводы  
Работа с текстами - просто восхитительно сложный процесс.  
Удивил тот факт, что логистическая регрессия показала себя с лучше стороны - быстро обучалась и давала хорошие результаты.  
Работа с большими данными требуем колоссальных ресурсов. Я думал, что мой мак не переживёт этот проект.  
Познакомился с разными библиотеками машинного обучения: tensorflor, keras, torch, nltk. Понял, что их иногда сложно подружить.  
Важно уметь минимизировать продолжительность проекта и сохранять обработанные данные в отдельные файлы. Это сохранит время.  
Ну а результаты работы моделей в таблице выше. Интересными показались результаты эксперимента теста моделей на небольших выборках. Логистическая регрессия обошла catboost и здесь.  

