## Все то же самое, что и в основном файле, но с нейросетями

Делалось в Google Colab с использованием GPU

# 1. Общая подготовка

Установка библиотеки transformers (ее нет по умолчанию)

In [None]:
pip install transformers



In [None]:
import pandas as pd
import numpy as np
import torch
import nltk
import re
import random
import time
import datetime
import transformers
import lightgbm as lgb
from tqdm import notebook
from transformers import BertTokenizer, AdamW, get_linear_schedule_with_warmup, AutoConfig, AutoModelForSequenceClassification
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score, classification_report

Подключаем Google Drive для импорта данных

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the comments dataset from the mounted Google Drive.
# The path is absolute (rooted at the mount point) so that the read does not
# depend on the notebook's current working directory.
df = pd.read_csv('/content/drive/My Drive/toxic_comments.csv')

Основная идея нижеследующих блоков в том, чтобы обучить модель без нейронок, посмотреть, сможем ли выжать приемлемый результат без них (хорошо бы, если так), а потом повторить обучение на PyTorch+BERT

In [None]:
# Hold out part of the data for the final evaluation. The fixed random_state
# makes the split reproducible; no test_size is passed, so scikit-learn's
# default split proportion applies.
train, test = train_test_split(df, random_state=12345)

Вспомогательная функция для перевода времени из секунд в Ч:ММ:СС

In [None]:
def format_time(elapsed):
    '''
    Render a duration given in seconds as an `h:mm:ss` string.

    The duration is rounded to a whole number of seconds first; the text is
    whatever `str()` produces for the matching `datetime.timedelta`.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

Вспомогательная функция, которая подбирает интервал вывода сообщений о прогрессе исходя из того, сколько обновлений мы хотим увидеть на экране за время работы цикла

In [None]:
def good_update_interval(total_iters, num_desired_updates):
    '''
    Pick a "round" progress-report interval for a loop of `total_iters` steps.

    Parameters:
      `total_iters` - The number of iterations in the for-loop.
      `num_desired_updates` - Roughly how many progress updates we want to see
                              over the course of the for-loop.

    Returns:
      An integer interval (in iterations), never smaller than... zero is
      replaced by 1 so the result is always usable as a modulus.
    '''
    # Number of decimal digits in the total, e.g. 5 for 39893.
    num_digits = len(str(total_iters))

    # Round to one order of magnitude below the total's own magnitude. The
    # magnitude is `num_digits - 1`, so the `ndigits` argument for `round`
    # is -((num_digits - 1) - 1) == 2 - num_digits.
    ndigits = 2 - num_digits

    # `round` rounds to the nearest multiple; `int` then drops any fraction
    # that is left when `ndigits` is positive (very small totals).
    interval = int(round(total_iters / num_desired_updates, ndigits))

    # Don't allow the interval to be zero!
    return interval or 1

# 3. Нейросеть с умными батчами

## Подготовка

Загрузка токенизатора BERT (нечувствительный к регистру вариант)

In [None]:
# Load the BERT tokenizer.
# 'bert-base-uncased' is the case-insensitive checkpoint; do_lower_case=True
# asks the tokenizer to lower-case the input text accordingly.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


Установка максимальной длины последовательности в токенах (более длинные тексты обрезаются при токенизации)

In [None]:
# Maximum number of tokens kept per comment; it is passed to the tokenizer as
# `max_length` with truncation enabled, so longer comments are cut to this size.
max_len = 400

Самая главная функция. Делаем умные батчи

In [None]:
def make_smart_batches(text_samples, labels, batch_size):
    '''
    Tokenize text samples and pack them into padded "smart" batches.

    The samples are tokenized (truncated to the module-level `max_len`, not
    padded), sorted by token count, and then cut into contiguous runs of
    `batch_size` samples taken at random positions of the sorted list. Each
    batch is padded only up to the length of its own longest sequence, so
    batches of short texts stay short.

    Relies on the module-level `tokenizer`, `max_len` and
    `good_update_interval`. Batch positions are drawn with the `random`
    module, so the resulting batch order depends on the global random state.

    Parameters:
      `text_samples` - Sequence of raw text strings.
      `labels` - Sequence of labels, one per entry of `text_samples`.
      `batch_size` - Number of samples per batch. The last batch selected may
                     be smaller when the sample count is not a multiple of it.

    Returns:
      A tuple `(py_inputs, py_attn_masks, py_labels)` of three lists with one
      tensor per batch: padded token ids, attention masks (1 for real tokens,
      0 for padding) and the labels of the samples in that batch.

    Raises:
      ValueError - if `batch_size` is smaller than 1, or if `text_samples`
                   and `labels` differ in length.
    '''
    # A non-positive batch size would make the selection loop below remove
    # nothing from `samples` and therefore never terminate.
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1, got {!r}'.format(batch_size))

    # `zip` would silently drop the surplus entries of the longer sequence.
    if len(text_samples) != len(labels):
        raise ValueError('text_samples and labels differ in length: {:,} vs {:,}'.format(
            len(text_samples), len(labels)))

    print('Creating Smart Batches from {:,} examples with batch size {:,}...\n'.format(len(text_samples), batch_size))

    # =========================
    #   Tokenize & Truncate
    # =========================

    full_input_ids = []

    # Tokenize all examples.
    print('Tokenizing {:,} samples...'.format(len(text_samples)))

    # Choose an interval on which to print progress updates.
    update_interval = good_update_interval(total_iters=len(text_samples), num_desired_updates=10)

    # For each example...
    for text in text_samples:

        # Report progress.
        if len(full_input_ids) % update_interval == 0:
            print('  Tokenized {:,} samples.'.format(len(full_input_ids)))

        # Tokenize the sample.
        input_ids = tokenizer.encode(text=text,               # Text to encode.
                                     add_special_tokens=True, # Do add specials.
                                     max_length=max_len,      # Do Truncate!
                                     truncation=True,         # Do Truncate!
                                     padding=False)           # DO NOT pad.

        # Add the tokenized result to our list.
        full_input_ids.append(input_ids)

    print('DONE.')
    print('{:>10,} samples\n'.format(len(full_input_ids)))

    # =========================
    #      Select Batches
    # =========================

    # Sort the two lists together by the length of the input sequence.
    samples = sorted(zip(full_input_ids, labels), key=lambda x: len(x[0]))

    print('{:>10,} samples after sorting\n'.format(len(samples)))

    # List of batches that we'll construct.
    batch_ordered_sentences = []
    batch_ordered_labels = []

    print('Creating batches of size {:}...'.format(batch_size))

    # Progress is reported per *batch*, so the interval has to be derived from
    # the number of batches (ceiling division), not from the number of samples.
    num_batches = (len(samples) + batch_size - 1) // batch_size
    update_interval = good_update_interval(total_iters=num_batches, num_desired_updates=10)

    # Loop until every sample has been assigned to a batch...
    while len(samples) > 0:

        # Report progress.
        if batch_ordered_sentences and len(batch_ordered_sentences) % update_interval == 0:
            print('  Selected {:,} batches.'.format(len(batch_ordered_sentences)))

        # `to_take` is our actual batch size. It will be `batch_size` until
        # we get to the last batch, which may be smaller.
        to_take = min(batch_size, len(samples))

        # Pick a random index in the list of remaining samples to start
        # our batch at.
        select = random.randint(0, len(samples) - to_take)

        # Select a contiguous batch of samples starting at `select`. Because
        # the list is sorted by length, neighbours have similar lengths.
        batch = samples[select:(select + to_take)]

        # Each sample is a tuple--split them apart to create a separate list of
        # sequences and a list of labels for this batch.
        batch_ordered_sentences.append([s[0] for s in batch])
        batch_ordered_labels.append([s[1] for s in batch])

        # Remove these samples from the list.
        del samples[select:select + to_take]

    print('\n  DONE - Selected {:,} batches.\n'.format(len(batch_ordered_sentences)))

    # =========================
    #        Add Padding
    # =========================

    print('Padding out sequences within each batch...')

    py_inputs = []
    py_attn_masks = []
    py_labels = []

    # For each batch...
    for (batch_inputs, batch_labels) in zip(batch_ordered_sentences, batch_ordered_labels):

        # New version of the batch, this time with padded sequences and now with
        # attention masks defined.
        batch_padded_inputs = []
        batch_attn_masks = []

        # First, find the longest sample in the batch.
        # Note that the sequences do currently include the special tokens!
        max_size = max(len(sen) for sen in batch_inputs)

        # For each input in this batch...
        for sen in batch_inputs:

            # How many pad tokens do we need to add?
            num_pads = max_size - len(sen)

            # Add `num_pads` padding tokens to the end of the sequence.
            batch_padded_inputs.append(sen + [tokenizer.pad_token_id] * num_pads)

            # The attention mask is a `1` for every real token and a `0` for
            # every padding token.
            batch_attn_masks.append([1] * len(sen) + [0] * num_pads)

        # Save the padded batch as PyTorch tensors. Token ids and masks are
        # integer data, so their dtype is stated explicitly; the label dtype
        # is left to be inferred from the label values.
        py_inputs.append(torch.tensor(batch_padded_inputs, dtype=torch.long))
        py_attn_masks.append(torch.tensor(batch_attn_masks, dtype=torch.long))
        py_labels.append(torch.tensor(batch_labels))

    print('  DONE.')

    # Return the smart-batched dataset!
    return (py_inputs, py_attn_masks, py_labels)

Переводим тексты и метки классов в формат списков Python, т.к. функция выше ожидает такой формат

In [None]:
# Pull the comment texts and their `toxic` labels out of the training frame
# as plain Python lists.
text_train = train['text'].tolist()
labels_train = train['toxic'].tolist()

Теперь получаем батчи, маски внимания и соответствующие им метки классов при помощи этой функции

In [None]:
# Tokenize the training texts and group them into padded batches of up to 10
# samples, with the matching attention masks and labels.
(py_inputs, py_attn_masks, py_labels) = make_smart_batches(text_train, labels_train, 10)

Creating Smart Batches from 119,678 examples with batch size 10...

Tokenizing 119,678 samples...
  Tokenized 0 samples.
  Tokenized 10,000 samples.
  Tokenized 20,000 samples.
  Tokenized 30,000 samples.
  Tokenized 40,000 samples.
  Tokenized 50,000 samples.
  Tokenized 60,000 samples.
  Tokenized 70,000 samples.
  Tokenized 80,000 samples.
  Tokenized 90,000 samples.
  Tokenized 100,000 samples.
  Tokenized 110,000 samples.
DONE.
   119,678 samples

   119,678 samples after sorting

Creating batches of size 10...
  Selected 10,000 batches.

  DONE - Selected 11,968 batches.

Padding out sequences within each batch...
  DONE.


То же самое делаем для тестовой выборки

In [None]:
# Pull the comment texts and their `toxic` labels out of the test frame as
# plain Python lists.
text_test = test['text'].tolist()
labels_test = test['toxic'].tolist()

In [None]:
# Build the test batches the same way as the training ones. The batches come
# out in random order, but each carries its own labels, so predictions and
# true labels stay aligned.
(py_inputs_test, py_attn_masks_test, py_labels_test) = make_smart_batches(text_test, labels_test, 10)

Creating Smart Batches from 39,893 examples with batch size 10...

Tokenizing 39,893 samples...
  Tokenized 0 samples.
  Tokenized 4,000 samples.
  Tokenized 8,000 samples.
  Tokenized 12,000 samples.
  Tokenized 16,000 samples.
  Tokenized 20,000 samples.
  Tokenized 24,000 samples.
  Tokenized 28,000 samples.
  Tokenized 32,000 samples.
  Tokenized 36,000 samples.
DONE.
    39,893 samples

    39,893 samples after sorting

Creating batches of size 10...

  DONE - Selected 3,990 batches.

Padding out sequences within each batch...
  DONE.


Загрузка конфигурации и самой модели (автоподбор)

In [None]:
# Build the configuration of the 'bert-base-uncased' checkpoint with a
# two-label classification output.
config = AutoConfig.from_pretrained('bert-base-uncased', num_labels=2)

print('Config type:', str(type(config)), '\n')


Config type: <class 'transformers.configuration_bert.BertConfig'> 



In [None]:
# Instantiate the pre-trained sequence-classification model from the same
# checkpoint, using the `config` created above.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

print('\nModel type:', str(type(model)))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at


Model type: <class 'transformers.modeling_bert.BertForSequenceClassification'>


Оптимизатор AdamW (вариант Adam с исправленным затуханием весов)

In [None]:
# Note: this AdamW is the class imported from the Hugging Face `transformers`
# library (as opposed to the one that ships with PyTorch).
# The 'W' presumably stands for the "weight decay fix".
# NOTE(review): newer `transformers` releases are said to deprecate this class
# in favour of `torch.optim.AdamW` -- confirm against the installed version.
optimizer = AdamW(model.parameters(),
                  lr = 5e-5, # This is the value Michael used.
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )


Задать количество эпох для обучения

In [None]:
# Number of training epochs.
epochs = 3

# Total number of training steps is [number of batches] x [number of epochs].
# Note that it's the number of *batches*, not *samples*!
# The training data is re-batched every epoch, but with the same sample count
# and batch size, so the number of batches per epoch does not change.
total_steps = len(py_inputs) * epochs

# Create the learning rate scheduler (no warm-up steps; the schedule spans
# `total_steps` optimizer steps).
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

Перекидываем вычисления на GPU (если блокнот открыт в режиме CPU-only, здесь вас ждет фиаско)

In [None]:
print('\nLoading model to GPU...')

# Use the GPU when one is available. Without this check the cell fails on a
# CPU-only runtime with an opaque CUDA error; falling back to the CPU keeps
# the notebook runnable, although training will be far slower there.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('  GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print('  WARNING: no CUDA device is available, falling back to CPU (training will be very slow).')

# `model.to` moves the model's parameters to `device`; the return value is
# captured only so the notebook does not echo the model's repr.
desc = model.to(device)

print('    DONE.')


Loading model to GPU...
  GPU: Tesla T4
    DONE.


## Обучение

In [None]:
seed_val = 321

# Seed every random number generator involved so that the run is repeatable.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Per-epoch statistics: average training loss and the time the epoch took.
training_stats = []

# Update every `update_interval` batches.
update_interval = good_update_interval(total_iters=len(py_inputs), num_desired_updates=10)

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

    # At the start of each epoch (except for the first) we need to re-randomize
    # our training data.
    if epoch_i > 0:
        # Re-shuffle the dataset into new batches with `make_smart_batches`.
        # The batch size must stay the same as in the initial batching call,
        # because the scheduler's step count was derived from that batch count.
        (py_inputs, py_attn_masks, py_labels) = make_smart_batches(text_train, labels_train, 10)

    print('Training on {:,} batches...'.format(len(py_inputs)))

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be misled--the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step in range(0, len(py_inputs)):

        # Periodic progress update.
        if step % update_interval == 0 and not step == 0:
            # Read the clock once so that "elapsed" and "remaining" are
            # based on the same measurement.
            elapsed_sec = time.time() - t0
            elapsed = format_time(elapsed_sec)

            # Estimate the time remaining from the average time per batch so far.
            sec_per_step = elapsed_sec / step
            remaining_sec = sec_per_step * (len(py_inputs) - step)
            remaining = format_time(remaining_sec)

            # Report progress.
            print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}.  Remaining: {:}'.format(step, len(py_inputs), elapsed, remaining))

        # Copy the current training batch to the device using the `to` method.
        b_input_ids = py_inputs[step].to(device)
        b_input_mask = py_attn_masks[step].to(device)
        b_labels = py_labels[step].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass.
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # Because labels are provided, the first element of the output is the
        # loss. It is read by index, exactly as the evaluation cell reads the
        # logits: indexing works whether the model returns a plain tuple or a
        # dict-like output object, whereas tuple-unpacking does not.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(py_inputs)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Training Time': training_time,
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training on 11,968 batches...
  Batch   1,000  of   11,968.    Elapsed: 0:02:59.  Remaining: 0:32:45
  Batch   2,000  of   11,968.    Elapsed: 0:06:38.  Remaining: 0:33:02
  Batch   3,000  of   11,968.    Elapsed: 0:10:10.  Remaining: 0:30:24
  Batch   4,000  of   11,968.    Elapsed: 0:13:45.  Remaining: 0:27:24
  Batch   5,000  of   11,968.    Elapsed: 0:17:21.  Remaining: 0:24:10
  Batch   6,000  of   11,968.    Elapsed: 0:20:58.  Remaining: 0:20:52
  Batch   7,000  of   11,968.    Elapsed: 0:24:30.  Remaining: 0:17:23
  Batch   8,000  of   11,968.    Elapsed: 0:27:56.  Remaining: 0:13:51
  Batch   9,000  of   11,968.    Elapsed: 0:31:17.  Remaining: 0:10:19
  Batch  10,000  of   11,968.    Elapsed: 0:34:45.  Remaining: 0:06:50
  Batch  11,000  of   11,968.    Elapsed: 0:38:11.  Remaining: 0:03:22

  Average training loss: 0.25
  Training epcoh took: 0:41:41

Creating Smart Batches from 119,678 examples with batch size 10...

Tokenizing 119,678 samples...
  Tokenized 0 samples.
  To

## Проверка на тестовой выборке

In [None]:
# Prediction on test set

# `py_labels_test` holds one tensor per *batch*, so its length is the number
# of batches. Sum the batch sizes to report the real number of test samples.
num_test_samples = sum(len(batch_labels) for batch_labels in py_labels_test)
print('Predicting labels for {:,} test sentences...'.format(num_test_samples))

# Put model in evaluation mode
model.eval()

# Tracking variables: one array of logits / true labels per batch.
predictions, true_labels = [], []

# Choose an interval on which to print progress updates.
update_interval = good_update_interval(total_iters=len(py_inputs_test), num_desired_updates=10)

# Measure elapsed time.
t0 = time.time()

# For each batch of test data...
for step in range(0, len(py_inputs_test)):

    # Periodic progress update.
    if step % update_interval == 0 and not step == 0:
        # Read the clock once so that "elapsed" and "remaining" are based on
        # the same measurement.
        elapsed_sec = time.time() - t0
        elapsed = format_time(elapsed_sec)

        # Estimate the time remaining from the average time per batch so far.
        sec_per_step = elapsed_sec / step
        remaining_sec = sec_per_step * (len(py_inputs_test) - step)
        remaining = format_time(remaining_sec)

        # Report progress.
        print('  Batch {:>7,}  of  {:>7,}.    Elapsed: {:}.  Remaining: {:}'.format(step, len(py_inputs_test), elapsed, remaining))

    # Copy the model inputs to the device. The labels are not passed to the
    # model here, so they can stay on the CPU where they were created.
    b_input_ids = py_inputs_test[step].to(device)
    b_input_mask = py_attn_masks_test[step].to(device)

    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    # No labels were passed, so the first output element is the logits.
    logits = outputs[0]

    # Move logits to CPU and convert both logits and labels to NumPy arrays.
    logits = logits.detach().cpu().numpy()
    label_ids = py_labels_test[step].numpy()

    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 3,990 test sentences...
  Batch     400  of    3,990.    Elapsed: 0:00:22.  Remaining: 0:03:21
  Batch     800  of    3,990.    Elapsed: 0:00:45.  Remaining: 0:02:58
  Batch   1,200  of    3,990.    Elapsed: 0:01:08.  Remaining: 0:02:38
  Batch   1,600  of    3,990.    Elapsed: 0:01:31.  Remaining: 0:02:16
  Batch   2,000  of    3,990.    Elapsed: 0:01:56.  Remaining: 0:01:56
  Batch   2,400  of    3,990.    Elapsed: 0:02:18.  Remaining: 0:01:31
  Batch   2,800  of    3,990.    Elapsed: 0:02:41.  Remaining: 0:01:08
  Batch   3,200  of    3,990.    Elapsed: 0:03:06.  Remaining: 0:00:46
  Batch   3,600  of    3,990.    Elapsed: 0:03:30.  Remaining: 0:00:23
    DONE.


Склеиваем результаты батчей:

In [None]:
# Stack the per-batch arrays into single arrays covering the whole test set.
predictions = np.concatenate(predictions)
true_labels = np.concatenate(true_labels)

# The predicted class is the index of the largest logit in each row.
preds = predictions.argmax(axis=1).ravel()

Результат:

In [None]:
# F1 score on the test set; with default arguments it is computed for the
# positive class (label 1).
f1_score(true_labels, preds)

0.7714521452145214

Отчет по классам

In [None]:
# Per-class precision, recall and F1 on the test set.
print(classification_report(true_labels, preds))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     35806
           1       0.88      0.69      0.77      4087

    accuracy                           0.96     39893
   macro avg       0.92      0.84      0.87     39893
weighted avg       0.96      0.96      0.96     39893



In [None]:
# Confusion matrix on the test set: rows are true labels, columns are
# predicted labels.
print(confusion_matrix(true_labels, preds))

[[35426   380]
 [ 1282  2805]]
