# Проект для «Викишоп» BERT

## Подготовка

In [1]:
#!pip install transformers

In [2]:
#!pip3 install https://download.pytorch.org/whl/cpu/torch-1.0.1-cp37-cp37m-win_amd64.whl

In [3]:
#!pip3 install pytorch

In [4]:
# Загрузим необходимыe библиотеки
import numpy as np
import pandas as pd
import gc
import torch
import transformers as ppb
import warnings
from tqdm import notebook
from sklearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline
import os
from os import listdir
from os.path import isfile, join
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV

warnings.filterwarnings('ignore')
np.random.seed(seed=42)

In [5]:
df = pd.read_csv('./datasets/toxic_comments.csv')
display(df.head())

Unnamed: 0.1,Unnamed: 0,text,toxic
0,0,Explanation\nWhy the edits made under my usern...,0
1,1,D'aww! He matches this background colour I'm s...,0
2,2,"Hey man, I'm really not trying to edit war. It...",0
3,3,"""\nMore\nI can't make any real suggestions on ...",0
4,4,"You, sir, are my hero. Any chance you remember...",0


In [6]:
display(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159292 entries, 0 to 159291
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  159292 non-null  int64 
 1   text        159292 non-null  object
 2   toxic       159292 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.6+ MB


None

In [7]:
# Оценим сбалансированность df
df['toxic'].value_counts()

0    143106
1     16186
Name: toxic, dtype: int64

In [9]:
# Для BERT
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Загрузим предварительно обученную модель/токенизатор
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
tokenized = df["text"].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=512))
tokenized

0         [101, 7526, 2339, 1996, 10086, 2015, 2081, 210...
1         [101, 1040, 1005, 22091, 2860, 999, 2002, 3503...
2         [101, 4931, 2158, 1010, 1045, 1005, 1049, 2428...
3         [101, 1000, 2062, 1045, 2064, 1005, 1056, 2191...
4         [101, 2017, 1010, 2909, 1010, 2024, 2026, 5394...
                                ...                        
159287    [101, 1000, 1024, 1024, 1024, 1024, 1024, 1998...
159288    [101, 2017, 2323, 2022, 14984, 1997, 4426, 200...
159289    [101, 13183, 6290, 26114, 1010, 2045, 2015, 20...
159290    [101, 1998, 2009, 3504, 2066, 2009, 2001, 2941...
159291    [101, 1000, 1998, 1012, 1012, 1012, 1045, 2428...
Name: text, Length: 159292, dtype: object

In [13]:
# Искала индексы длинных предложений, чтобы потом удалить их из выдачи BERT
# при дальнейшем обучении модели. Само удаление по too_long_rows_indexes ниже.

# too_long_rows_indexes = []
#
# for i in notebook.tqdm(range(df.shape[0])):
#     res = tokenizer.encode(df.loc[i, "text"], add_special_tokens=True)
#     if (len(res) > 512):
#         too_long_rows_indexes.append(i)
#
# too_long_rows_indexes

In [14]:
# len(too_long_rows_indexes)

In [15]:
#  Приведём векторы к одному размеру путем прибавления к более коротким векторам 0
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
padded

array([[  101,  7526,  2339, ...,     0,     0,     0],
       [  101,  1040,  1005, ...,     0,     0,     0],
       [  101,  4931,  2158, ...,     0,     0,     0],
       ...,
       [  101, 13183,  6290, ...,     0,     0,     0],
       [  101,  1998,  2009, ...,     0,     0,     0],
       [  101,  1000,  1998, ...,     0,     0,     0]])

In [16]:
# Выведем размер, полученной матрицы
np.array(padded).shape

(159292, 512)

In [17]:
# Создадим переменную, чтобы указать ей игнорировать (маскировать) заполнение, которое мы добавили
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(159292, 512)

In [18]:
results_location = "./output/"

In [19]:
# Здесь запускала BERT батчами по 500 строк и сохраняла результаты на диск.
# Размер батча выбирался исходя из ресурсов локального компьютера.
# Обработка работала больше суток.
# Оставшийся хвост не кратный 500 дочитывали отдельно, здесь это код не сохранился,
# но это не принципиально.

# batch_size = 500
#
# for i in notebook.tqdm(range(53, padded.shape[0] // batch_size)):
#     batch = torch.tensor(padded[batch_size*i:batch_size*(i+1)])
#     attention_mask_batch = torch.tensor(attention_mask[batch_size*i:batch_size*(i+1)])
#
#     with torch.no_grad():
#         batch_embeddings = model(batch, attention_mask=attention_mask_batch)
#
#     batch_features = batch_embeddings[0][:,0,:].numpy()
#     np.savetxt(results_location + "batch_" + str(i) + ".csv", batch_features, delimiter=",")

In [20]:
# После того как все бачти были записаны на диск, вычитывала их в память
# для дальнейшей обработки / составления набора фич для них.

# embeddings = []
# batch_files_names = [f for f in listdir(results_location) if isfile(join(results_location, f))]
# batch_files_names.sort(key=lambda x: os.path.getmtime(results_location + x))
# for i in notebook.tqdm(range(len(batch_files_names))):
#     batch_features = np.genfromtxt(results_location + batch_files_names[i], delimiter=",")
#     embeddings.append(batch_features)

  0%|          | 0/319 [00:00<?, ?it/s]

In [21]:
concatenated_embeddings = np.concatenate(embeddings)
model_data_df = pd.DataFrame(concatenated_embeddings)
model_data_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.220490,-0.097744,-0.073250,-0.071392,-0.085661,-0.183036,0.322567,0.242754,-0.038898,-0.271066,...,0.177155,-0.026148,0.086816,-0.171984,0.303477,0.019842,-0.199227,0.153798,0.416238,0.403389
1,-0.118798,-0.156563,0.238368,-0.128910,-0.122928,-0.096002,0.643840,0.153690,-0.207519,-0.335488,...,0.098510,-0.217268,0.038986,-0.440684,0.285151,-0.148381,0.210517,-0.048998,0.543574,0.514805
2,0.075954,0.061317,-0.122162,-0.135580,-0.120700,-0.396361,0.039420,0.519339,-0.130742,-0.324620,...,0.017994,-0.276312,0.167717,-0.168578,0.155621,0.323910,-0.154194,0.106488,0.530345,0.335555
3,-0.027679,-0.098320,0.177464,-0.105311,-0.114202,-0.387548,0.093583,0.325143,-0.159144,-0.106888,...,0.364155,-0.030819,-0.106879,-0.193697,0.161622,-0.022145,-0.361364,-0.032484,0.349627,0.503448
4,-0.116651,-0.038441,-0.080841,-0.028249,-0.014768,-0.308648,0.141258,0.533530,-0.274986,-0.334551,...,-0.012091,-0.161019,0.241566,-0.178456,0.210459,0.240435,-0.285982,0.031116,0.412562,0.304650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159287,-0.042763,-0.055712,-0.183988,-0.089951,-0.025618,-0.113750,0.119546,0.317383,-0.088454,-0.085701,...,0.129792,-0.099137,-0.013240,-0.144907,-0.119820,-0.015859,-0.037695,-0.002855,0.467618,0.370806
159288,0.024698,0.066385,-0.063861,-0.197261,-0.151639,-0.129282,0.311044,0.398141,-0.090190,-0.230817,...,0.058604,-0.056400,-0.059040,-0.317960,0.187389,0.001698,-0.063733,-0.020872,0.291844,0.413866
159289,-0.038442,-0.066573,0.042136,-0.080004,-0.034432,-0.230124,0.108003,0.437829,-0.065173,-0.100695,...,-0.146132,-0.289016,-0.140271,-0.234595,-0.146086,-0.073088,-0.129928,-0.212899,0.692907,0.256324
159290,0.139081,-0.061743,0.048407,-0.073013,-0.020383,-0.087851,0.151391,0.380969,-0.256102,-0.171124,...,-0.025120,-0.236260,0.061853,-0.068619,0.191564,0.153432,-0.067029,0.034036,0.354455,0.277578


## Обучение

In [23]:
# Выделим целевой признак
features = model_data_df
target = df['toxic']

display(features)
display(target)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.220490,-0.097744,-0.073250,-0.071392,-0.085661,-0.183036,0.322567,0.242754,-0.038898,-0.271066,...,0.177155,-0.026148,0.086816,-0.171984,0.303477,0.019842,-0.199227,0.153798,0.416238,0.403389
1,-0.118798,-0.156563,0.238368,-0.128910,-0.122928,-0.096002,0.643840,0.153690,-0.207519,-0.335488,...,0.098510,-0.217268,0.038986,-0.440684,0.285151,-0.148381,0.210517,-0.048998,0.543574,0.514805
2,0.075954,0.061317,-0.122162,-0.135580,-0.120700,-0.396361,0.039420,0.519339,-0.130742,-0.324620,...,0.017994,-0.276312,0.167717,-0.168578,0.155621,0.323910,-0.154194,0.106488,0.530345,0.335555
3,-0.027679,-0.098320,0.177464,-0.105311,-0.114202,-0.387548,0.093583,0.325143,-0.159144,-0.106888,...,0.364155,-0.030819,-0.106879,-0.193697,0.161622,-0.022145,-0.361364,-0.032484,0.349627,0.503448
4,-0.116651,-0.038441,-0.080841,-0.028249,-0.014768,-0.308648,0.141258,0.533530,-0.274986,-0.334551,...,-0.012091,-0.161019,0.241566,-0.178456,0.210459,0.240435,-0.285982,0.031116,0.412562,0.304650
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159287,-0.042763,-0.055712,-0.183988,-0.089951,-0.025618,-0.113750,0.119546,0.317383,-0.088454,-0.085701,...,0.129792,-0.099137,-0.013240,-0.144907,-0.119820,-0.015859,-0.037695,-0.002855,0.467618,0.370806
159288,0.024698,0.066385,-0.063861,-0.197261,-0.151639,-0.129282,0.311044,0.398141,-0.090190,-0.230817,...,0.058604,-0.056400,-0.059040,-0.317960,0.187389,0.001698,-0.063733,-0.020872,0.291844,0.413866
159289,-0.038442,-0.066573,0.042136,-0.080004,-0.034432,-0.230124,0.108003,0.437829,-0.065173,-0.100695,...,-0.146132,-0.289016,-0.140271,-0.234595,-0.146086,-0.073088,-0.129928,-0.212899,0.692907,0.256324
159290,0.139081,-0.061743,0.048407,-0.073013,-0.020383,-0.087851,0.151391,0.380969,-0.256102,-0.171124,...,-0.025120,-0.236260,0.061853,-0.068619,0.191564,0.153432,-0.067029,0.034036,0.354455,0.277578


0         0
1         0
2         0
3         0
4         0
         ..
159287    0
159288    0
159289    0
159290    0
159291    0
Name: toxic, Length: 159292, dtype: int64

In [24]:
# Пробовала удалять по индексам предложения, которые при токенизации выходили за лимиты
# предобученной модели BERT (512), но это дало незначительный прирост метрики приблизительно ~0.01.
# Скорее всего из-за того, что таких предложений было мало всего около 2%.

# features = features.drop(too_long_rows_indexes) # удаление длинных предложений
# target = target.drop(too_long_rows_indexes)

features.reset_index(drop=True, inplace=True)
target.reset_index(drop=True, inplace=True)

print(features.shape)
print(target.shape)

(159292, 768)
(159292,)


In [25]:
# Разделим данные на 2 выборки: обучающуюся, тестовую 75:25
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.25)

features_train.reset_index(drop=True, inplace=True)
features_test.reset_index(drop=True, inplace=True)
target_train.reset_index(drop=True, inplace=True)
target_test.reset_index(drop=True, inplace=True)

In [26]:
# Создадим pipeline
pipeline = make_pipeline()
pipeline.steps.append(('classification', DummyClassifier()))
pipeline

In [40]:
def create_randomized_search_cv(pipeline, iterations_count, parameters=None):
    if parameters is None:
        params = [
            {
                'classification': [SGDClassifier()],
                'classification__alpha': [1, 1e-01, 1e-02, 1e-03, 1e-04, 1e-05, 1e-06],
                 'classification__penalty': ['l1','l2'],

            },

            {
                'classification': [LinearSVC()],
                'classification__max_iter': [1000, 2000, 100],
                'classification__C': [0.1, 1, 10, 100],
            },
            {
                'classification': [LogisticRegression()],
                'classification__C': [0.001, 0.01, 0.1, 1, 10, 50, 100, 200],
            }
        ]
    else:
        params = parameters

    grid = RandomizedSearchCV(pipeline,
                              params,
                              n_iter = iterations_count,
                              cv = 3,
                              scoring = 'f1',
                              n_jobs=-1)

    return grid

In [42]:
grid = create_randomized_search_cv(pipeline, 100)
grid

In [43]:
%%time

grid.fit(features_train, target_train)

CPU times: total: 2min 48s
Wall time: 20min 35s


In [44]:
# Посмотрим лучшие параметры
grid.best_params_

{'classification__max_iter': 2000,
 'classification__C': 1,
 'classification': LinearSVC(C=1, max_iter=2000)}

In [45]:
# Посмотрим получившуюся лучшую метрику на обучающейся выборке
grid.best_score_

0.7416498753059999

In [47]:
# Посчитаем предсказания на тестовой выборке
predictions = grid.predict(features_test)
predictions

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [48]:
f1_test = f1_score(target_test, predictions)
f1_test

0.7357395612172682

## Выводы

## Чек-лист проверки

- [x]  Jupyter Notebook открыт
- [ ]  Весь код выполняется без ошибок
- [ ]  Ячейки с кодом расположены в порядке исполнения
- [ ]  Данные загружены и подготовлены
- [ ]  Модели обучены
- [ ]  Значение метрики *F1* не меньше 0.75
- [ ]  Выводы написаны