# Обучение модели

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from khajiit_common.intermediate_results_caching import Vault
from tqdm.auto import tqdm

In [2]:
import sys
import re
import torch
from torch import nn as nn
import random
import pytorch_transformers
import logging
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
os.environ["CUDA_VISIBLE_DEVICES"] = "1, 2"
from path import Path
from pytorch_transformers import BertTokenizer, BertPreTrainedModel, BertModel
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
from sklearn_pytorch.nlp import BertPreprocessor
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler, DataLoader
from torch.nn import BCEWithLogitsLoss, BCELoss
from ignite.engine import Engine, Events
from ignite.metrics import Accuracy, Loss, RunningAverage, TopKCategoricalAccuracy
from ignite.contrib.handlers import ProgressBar, TensorboardLogger
from ignite.contrib.handlers.tensorboard_logger import OutputHandler, OptimizerParamsHandler
from ignite.handlers import ModelCheckpoint, EarlyStopping

I1027 05:30:31.581629 140488245839680 modeling_bert.py:139] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
I1027 05:30:31.588151 140488245839680 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [3]:
# Column layouts of the tab-separated input files (the test layout has no 'tags' column).
HEADER_TEST = ['id', 'title', 'text']
HEADER_TRAIN = ['id', 'title', 'text', 'tags']
# Seed handed to set_random_seed and to the train/validation split and subsampling.
RANDOM_SEED = 442
# Fraction of rows held out for validation by train_test_split.
TEST_SIZE = .1
# Names the log file, the checkpoint directory and the TensorBoard run directory.
EXPERIMENT_NAME = 'ya_champ_multiclass_full_data_preprocessed'

In [4]:
def set_random_seed(seed, cudnn_deterministic=False):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator plus all CUDA devices).

    Args:
        seed: Integer seed applied to all generators.
        cudnn_deterministic: When truthy, request deterministic cuDNN
            algorithms and switch off the cuDNN autotuner. When falsy, the
            deterministic flag is reset to ``False`` and the autotuner
            setting is left as it is.
    """
    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    # Seeds every CUDA device (the current one included); silently ignored
    # when CUDA is not available.
    torch.cuda.manual_seed_all(seed)

    deterministic = bool(cudnn_deterministic)
    torch.backends.cudnn.deterministic = deterministic
    if deterministic:
        # The autotuner may pick different kernels from run to run, which
        # defeats the deterministic flag, so it has to be disabled as well.
        torch.backends.cudnn.benchmark = False

In [5]:
def get_file_iterator(file):
    """Generator that yields the items of ``file`` one by one, in iteration order."""
    yield from file

In [6]:
# Seed all generators once, before any data splitting or model initialisation.
set_random_seed(RANDOM_SEED)

In [7]:
def get_data(path, col_no=4, header=HEADER_TRAIN):
    """Parse a tab-separated file into a DataFrame and collect malformed rows.

    Each line is split on tabs after removing the newline character.

    * Rows with exactly ``col_no`` fields are kept; every field is stripped
      and stored under the matching name in ``header``.
    * When ``col_no == 3``, any other row with at least two fields is kept
      as: first field -> ``header[0]``, a single space -> ``header[1]``,
      stripped last field -> ``header[2]``.
    * Every remaining row is recorded as junk, keyed by its field count.
      This includes rows that have too many fields or exactly
      ``col_no - 1`` fields, which were previously discarded without trace.

    Args:
        path: Path of the UTF-8 encoded TSV file.
        col_no: Expected number of fields per row.
        header: Column names, in file order. Must contain ``'title'`` and
            ``'text'`` (and ``'tags'`` when ``col_no > 3``), because those
            columns are read by name below.

    Returns:
        Tuple ``(frame, mlb, junk_rows, junk_row_numbers)``:
        ``frame`` has the ``header`` columns plus ``'concatted_text'``
        (title and text joined with ``'. '``); when ``col_no > 3`` its
        ``'tags'`` column holds one binarized row per sample. ``mlb`` is the
        ``MultiLabelBinarizer`` (fitted only when ``col_no > 3``).
        ``junk_rows`` / ``junk_row_numbers`` map a field count to the
        rejected rows and to their zero-based line numbers.
    """
    junk_rows = {k: [] for k in np.arange(col_no)}
    junk_row_numbers = {k: [] for k in np.arange(col_no)}
    columns = {k: [] for k in header}

    with open(path, encoding='utf-8') as src_file:
        for i, line in enumerate(src_file):
            row = line.replace('\n', '').split('\t')
            if len(row) == col_no:
                for j, k in enumerate(header):
                    columns[k].append(row[j].strip())
            elif col_no == 3 and len(row) >= col_no - 1:
                # Three-column layout: tolerate a missing or extra field by
                # keeping the id and the last field, with a blank title.
                columns[header[0]].append(row[0])
                columns[header[1]].append(' ')
                columns[header[2]].append(row[-1].strip())
            else:
                # setdefault: field counts above col_no - 1 have no
                # pre-created bucket.
                junk_rows.setdefault(len(row), []).append(row)
                junk_row_numbers.setdefault(len(row), []).append(i)

    train_data_clean = pd.DataFrame.from_dict(columns)

    # Vectorised equivalent of '. '.join([title, text]).strip() per row.
    concatted = train_data_clean['title'].str.cat(train_data_clean['text'], sep='. ').str.strip()
    # An empty/blank title leaves a dangling '. ' prefix; cut it off.
    has_dangling_prefix = concatted.str.startswith('. ').astype(bool)
    train_data_clean['concatted_text'] = concatted.mask(has_dangling_prefix, concatted.str[2:])

    mlb = MultiLabelBinarizer()
    if col_no > 3:
        train_data_clean['tags'] = train_data_clean['tags'].apply(
            lambda x: [int(nmbr) for nmbr in x.split(',')])
        mlb.fit(train_data_clean['tags'])
        # Store one indicator row per sample.
        train_data_clean['tags'] = list(mlb.transform(train_data_clean['tags']))
    return train_data_clean, mlb, junk_rows, junk_row_numbers

In [8]:
# Parse the raw training file with the four-column training layout.
# NOTE(review): `train_clean` is reassigned from the vault a few cells below;
# `mlb`, `junk_rows` and `junk_row_numbers` from this call stay in use.
train_clean, mlb, junk_rows, junk_row_numbers = get_data('data/train.tsv', 4, HEADER_TRAIN)

In [11]:
# Vault comes from khajiit_common.intermediate_results_caching; presumably a
# persistent store for intermediate results addressed by attribute name — TODO confirm.
vault = Vault('processed', 'intermediate_results')

In [12]:
# vault.train_clean = train_clean

In [13]:
# Replaces the frame parsed by get_data with the copy held in the vault.
# NOTE(review): the statement that would store it is commented out in the
# previous cell, so this presumably relies on an earlier run having saved it —
# confirm it works on a fresh kernel / empty vault.
train_clean = vault.train_clean

In [14]:
logger = logging.getLogger(__name__)

# FileHandler does not create missing directories, so make sure the target
# directory exists before the handler opens the log file.
os.makedirs('logs', exist_ok=True)

# we'll be writing to stdout and to log file for reliability
# NOTE(review): basicConfig does nothing when the root logger already has
# handlers. The captured outputs in this notebook use a different line format
# than the one configured here, which suggests that was the case — confirm
# that the log file is actually written.
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%Y-%m-%d %H:%M:%S',
                    level = logging.INFO,
                    handlers = [logging.StreamHandler(sys.stdout),
                                logging.FileHandler(f'logs/{EXPERIMENT_NAME}.log', encoding='utf-8')])

In [15]:
# Prefer CUDA when PyTorch can see it, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Number of CUDA devices visible to this process.
n_gpu = torch.cuda.device_count()

In [16]:
# Record which device and how many GPUs this run uses (lazy %-style formatting).
logger.info("device: %s n_gpu: %s", device, n_gpu)

I1027 05:30:47.838646 140488245839680 <ipython-input-16-aacf97904110>:1] device: cuda n_gpu: 2


Для классификации будем использовать BERT в реализации https://github.com/huggingface/transformers/. Веса возьмем у DeepPavlov: http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_v2.tar.gz
Это checkpoint для TensorFlow. Он был сконвертирован в формат PyTorch скриптом https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py (когда этот скрипт еще был в репозитории).
Ссылка на сконвертированную модель:


In [17]:
# Directory with the converted PyTorch weights and tokenizer files.
# NOTE(review): this is an absolute, machine-specific path, and its name says
# "v1" while the markdown above links the "v2" archive — confirm which weights are used.
pretrained_bert_dir = Path('/work/work/word_embeddings/for_torch/rubert_cased_L-12_H-768_A-12_v1/dump_path/') 

In [18]:
# Load the tokenizer files from the same directory as the weights;
# do_lower_case=False keeps the original casing of the input text.
tokenizer = BertTokenizer.from_pretrained(pretrained_bert_dir,
                                          do_lower_case=False)

I1027 05:30:48.030088 140488245839680 tokenization_utils.py:301] Model name '/work/work/word_embeddings/for_torch/rubert_cased_L-12_H-768_A-12_v1/dump_path/' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc). Assuming '/work/work/word_embeddings/for_torch/rubert_cased_L-12_H-768_A-12_v1/dump_path/' is a path or url to a directory containing tokenizer files.
I1027 05:30:48.032034 140488245839680 tokenization_utils.py:330] Didn't find file /work/work/word_embeddings/for_torch/rubert_cased_L-12_H-768_A-12_v1/dump_path/added_tokens.json. We won't load it.
I1027 05:30:48.033525 140488245839680 tokenization_utils.p

In [19]:
# BertPreprocessor comes from sklearn_pytorch.nlp; below, its transform() output is
# indexed by 'input_ids', 'token_type_ids' and 'attention_mask'.
bert_preprocessor = BertPreprocessor(tokenizer)

Разобьем на трейн и валидацию

In [20]:
def get_train_val_data(data, return_subsample=False, subsample_size=100):
    """Split ``data`` into train/validation parts and stack their label rows.

    The split uses ``TEST_SIZE`` as the validation fraction and
    ``RANDOM_SEED`` for reproducibility.

    Args:
        data: DataFrame with a ``'tags'`` column holding one label row per sample.
        return_subsample: When true, shrink both parts to a small random
            sample (useful for quick smoke runs).
        subsample_size: Upper bound on the rows kept per part when
            ``return_subsample`` is true. It is clamped to the size of the
            part, so small inputs no longer make ``sample`` raise.

    Returns:
        Tuple ``(train, val, y_train, y_val)`` where ``y_*`` are the
        ``'tags'`` rows of the respective part stacked with ``np.vstack``.
    """
    train, val = train_test_split(data, test_size=TEST_SIZE, random_state=RANDOM_SEED)
    if return_subsample:
        # sample() raises when asked for more rows than exist (without replacement).
        train = train.sample(min(subsample_size, len(train)), random_state=RANDOM_SEED).copy()
        val = val.sample(min(subsample_size, len(val)), random_state=RANDOM_SEED).copy()
    y_train = np.vstack(train['tags'].values)
    y_val = np.vstack(val['tags'].values)
    return train, val, y_train, y_val

In [22]:
# Quick smoke-run variant on a small subsample (kept for reference):
# train, val, y_train, y_val = get_train_val_data(train_clean, return_subsample=True)

# Full-data split used for the actual training run.
train, val, y_train, y_val = get_train_val_data(train_clean)

In [None]:
# Convert the concatenated title+text strings into BERT input features.
train_features = bert_preprocessor.transform(train['concatted_text'])

val_features = bert_preprocessor.transform(val['concatted_text'])

In [None]:
def get_tensor_dataset(features, target_column):
    """Pack BERT input features and targets into a ``TensorDataset``.

    Args:
        features: Mapping providing ``'input_ids'``, ``'token_type_ids'`` and
            ``'attention_mask'``; each is converted to a ``torch.long`` tensor.
        target_column: Targets, converted to a ``torch.float32`` tensor.

    Returns:
        ``TensorDataset`` whose items are
        ``(input_ids, token_type_ids, attention_mask, labels)``.
    """
    feature_keys = ('input_ids', 'token_type_ids', 'attention_mask')
    tensors = [torch.tensor(features[key], dtype=torch.long) for key in feature_keys]
    tensors.append(torch.tensor(target_column, dtype=torch.float32))
    return TensorDataset(*tensors)

In [None]:
# Training tensors: BERT features plus the stacked label rows.
train_dataset = get_tensor_dataset(train_features, y_train)

In [None]:
# Validation tensors, built the same way as the training ones.
val_dataset = get_tensor_dataset(val_features, y_val)

Самая похожая по назначению вариация BERT у huggingface — BertForSequenceClassification — классификация текстов с одной меткой на текст (single-label). Чтобы превратить её в многометочную (multi-label, у текста может быть несколько тегов сразу), будем использовать другой лосс — бинарную кросс-энтропию.

In [None]:
class BertForMCSequenceClassification(BertPreTrainedModel):
    r"""BERT encoder with a linear head trained with binary cross-entropy on logits.

    The architecture is pooled output -> dropout -> linear layer with
    ``config.num_labels`` outputs. The loss is ``BCEWithLogitsLoss`` applied
    to the raw logits, so every label is scored independently of the others.

    Inputs of ``forward``:
        **input_ids**, **attention_mask**, **token_type_ids**,
        **position_ids**, **head_mask**: forwarded unchanged to ``BertModel``.
        **labels**: (`optional`) tensor of per-label targets with the same
            shape as the logits. It is cast to the logits' dtype before the
            loss is computed.

    Outputs: `Tuple` comprising, in order:
        **loss**: (only when ``labels`` is provided) the binary cross-entropy loss.
        **logits**: output of the classification layer (scores before any sigmoid).
        any further elements returned by ``BertModel`` after its pooled
        output (hidden states and attentions if they are present).
    """
    def __init__(self, config):
        super(BertForMCSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, labels=None):
        """Compute logits (and the loss when ``labels`` is given); see the class docstring."""
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask)

        # Second element of the encoder output is the pooled representation.
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

        if labels is not None:
            loss_fct = BCEWithLogitsLoss()
            # BCEWithLogitsLoss needs floating-point targets matching the
            # logits' dtype; integer 0/1 labels would otherwise raise.
            loss = loss_fct(logits, labels.to(logits.dtype))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

In [None]:
# Make sure the name `mlb` exists even when the parsing cell was not run.
if 'mlb' not in globals():
    mlb = None

# Number of output labels: taken from the binarizer when it is available,
# otherwise a fallback of 100.
if mlb:
    num_labels = len(mlb.classes_)
else:
    num_labels = 100

In [None]:
# Load the pretrained encoder weights from the local directory; the classification
# layer is sized by num_labels.
model = BertForMCSequenceClassification.from_pretrained(pretrained_bert_dir,
                                                      num_labels=num_labels,
                                                      output_attentions=False)

In [None]:
# Move the model to the selected device.
model = model.to(device)

# With several GPUs, wrap the model in DataParallel; the original module then
# lives under `model.module` (the checkpoint cell below relies on that).
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

Обучение модели

In [None]:
# Number of passes over the training data.
num_train_epochs = 5

# Peak learning rate and the share of optimisation steps spent on linear warm-up.
learning_rate = 0.00003
warmup_proportion = 0.1

# The batch size scales with the number of visible GPUs. max(1, ...) keeps it
# positive on a CPU-only machine, where n_gpu is 0 and the product would
# otherwise be an invalid batch size of 0.
train_batch_size = 7 * max(1, n_gpu)
# Number of batches whose gradients are summed before one optimizer step.
gradient_accumulation_steps = 1

test_batch_size = train_batch_size

In [None]:
# Shuffle the training samples; keep the validation order fixed.
train_sampler = RandomSampler(train_dataset)
val_sampler = SequentialSampler(val_dataset)

In [None]:
# Batch iterators over the two datasets, using the samplers defined above.
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=test_batch_size)

In [None]:
# Total optimizer steps for the run: batches per epoch, integer-divided by the
# accumulation factor, times the number of epochs. Displayed as the cell output.
num_train_optimization_steps = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs
num_train_optimization_steps

In [None]:
# Number of warm-up steps: the configured proportion of all optimizer steps, truncated to int.
num_warmup_steps = int(num_train_optimization_steps * warmup_proportion)
num_warmup_steps

In [None]:
def param_needs_decay(name):
    """Return False for names containing ``bias`` or ``LayerNorm.weight``, True otherwise."""
    return re.search(r'bias|LayerNorm\.weight', name) is None


# Two optimizer groups: parameters that receive weight decay and those
# (biases, LayerNorm weights) that do not.
optimizer_grouped_parameters = [
    dict(weight_decay=0.01,
         params=[param for pname, param in model.named_parameters() if param_needs_decay(pname)]),
    dict(weight_decay=0.0,
         params=[param for pname, param in model.named_parameters() if not param_needs_decay(pname)]),
]

In [None]:
# AdamW over the two parameter groups defined above (each group carries its own weight_decay).
optimizer = AdamW(optimizer_grouped_parameters,
                  lr=learning_rate,
                  correct_bias=False, # to reproduce BertAdam specific behavior
)

In [None]:
# Learning-rate schedule configured with the warm-up length and the total number
# of optimizer steps; it is advanced once per optimizer step in process_function.
scheduler = WarmupLinearSchedule(optimizer,
                                 warmup_steps=num_warmup_steps,
                                 t_total=num_train_optimization_steps
)

In [None]:
# Binary cross-entropy on raw logits; used for both the training loss and the validation metric.
criterion = BCEWithLogitsLoss()

In [None]:
# Start from clean gradients before the first training iteration.
optimizer.zero_grad()

In [None]:
def process_function(engine, batch):
    """Run one training iteration for the ignite ``Engine``.

    Moves the batch to ``device``, computes the logits and the loss (scaled
    by ``1 / gradient_accumulation_steps``), back-propagates, and performs an
    optimizer + scheduler step once every ``gradient_accumulation_steps``
    iterations.

    Args:
        engine: The ignite engine driving training; its ``state.iteration``
            counter decides when to step the optimizer.
        batch: Tuple ``(input_ids, token_type_ids, attention_mask, label_ids)``
            as produced by ``get_tensor_dataset``.

    Returns:
        Tuple ``(loss_value, logits, label_ids)`` where ``loss_value`` is the
        scaled loss as a Python float.
    """
    model.train()

    input_ids, token_type_ids, attention_mask, label_ids = (t.to(device) for t in batch)

    logits = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]

    loss = criterion(logits, label_ids)
    # Scale so that the accumulated gradient matches one large batch.
    loss /= gradient_accumulation_steps

    loss.backward()

    # engine.state.iteration is already 1-based when this function is called,
    # so no "+ 1" is needed. With the offset, the first step fired after a
    # single batch instead of a full accumulation window.
    if engine.state.iteration % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()

    return loss.item(), logits, label_ids

In [None]:
def eval_function(engine, batch):
    """Run one evaluation iteration: forward pass only, no gradient tracking.

    Args:
        engine: The ignite engine driving evaluation (unused here).
        batch: Tuple ``(input_ids, token_type_ids, attention_mask, label_ids)``.

    Returns:
        Tuple ``(logits, label_ids)`` for the attached metrics.
    """
    model.eval()
    input_ids, token_type_ids, attention_mask, label_ids = [tensor.to(device) for tensor in batch]
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    return outputs[0], label_ids

In [None]:
# One ignite engine per loop: `trainer` runs process_function, `evaluator` runs eval_function.
trainer = Engine(process_function)
evaluator = Engine(eval_function)

In [None]:
# Running average of the first element returned by process_function (the loss value),
# exposed on the trainer under the metric name 'loss'.
RunningAverage(output_transform=lambda output: output[0]).attach(trainer, 'loss')

In [None]:
# Validation loss computed with the same criterion, exposed on the evaluator as 'val_loss'.
Loss(criterion).attach(evaluator, 'val_loss')

In [None]:
# Progress bar for the training engine, displaying the running 'loss' metric.
pbar = ProgressBar(persist=True, bar_format='')
pbar.attach(trainer, ['loss'])

In [None]:
def log_results(engine):
    """Evaluate on the validation set at the end of an epoch and log both losses.

    Runs ``evaluator`` over ``val_dataloader``, stores its metrics on
    ``engine.state.val_metrics`` (read later by the checkpoint score function)
    and logs the training and validation losses for the current epoch.
    """
    metrics = engine.state.metrics
    
    evaluator.run(val_dataloader)
    val_metrics = evaluator.state.metrics
    
    # Expose the validation metrics on the trainer's state for other handlers.
    engine.state.val_metrics = val_metrics
    
    logger.info(f'Training Results - Epoch: {engine.state.epoch:2} loss={metrics["loss"]:.3f}')
    
    logger.info(f'Validation Results - Epoch: {engine.state.epoch:2} loss={val_metrics["val_loss"]:.3f}')
    
    # NOTE(review): presumably meant to reset the progress-bar counters for the next
    # epoch; `pbar` is the ignite ProgressBar handler — confirm it reads these attributes.
    pbar.n = pbar.last_print_n = 0


# Registered before the checkpoint handler so that val_metrics is set when the
# checkpoint score function runs.
trainer.add_event_handler(Events.EPOCH_COMPLETED, log_results) 

In [None]:
# Keep the two best checkpoints. The score is the negated validation loss, so a
# lower loss ranks higher. require_empty=True is passed, so the target directory
# is expected to be empty — NOTE(review): re-running with the same EXPERIMENT_NAME
# presumably fails until old checkpoints are removed; confirm.
checkpointer = ModelCheckpoint(f'saved_models/{EXPERIMENT_NAME}',
                               'bert_classifier',
                               score_function=lambda engine: -engine.state.val_metrics['val_loss'],
                               score_name='val_loss',
                               n_saved=2,
                               create_dir=True,
                               save_as_state_dict=True,
                               require_empty=True)

# Save the underlying module when the model is wrapped (e.g. by DataParallel),
# otherwise the model itself.
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {'model_state_dict': model.module if hasattr(model, 'module') else model})

In [None]:
# TensorBoard logger writing to a run directory named after the experiment.
tb_logger = TensorboardLogger(f'runs/{EXPERIMENT_NAME}')

# Attach the logger to the trainer to log training loss at each iteration
tb_logger.attach(trainer,
                 log_handler=OutputHandler(tag="training", output_transform=lambda output: {'loss': output[0]}),
                 event_name=Events.ITERATION_COMPLETED)

# Attach the logger to the trainer to log optimizer's parameters, e.g. learning rate at each iteration
tb_logger.attach(trainer,
                 log_handler=OptimizerParamsHandler(optimizer),
                 event_name=Events.ITERATION_STARTED)

# Attach the logger to the evaluator and metrics after each epoch
# We setup `another_engine=trainer` to take the epoch of the `trainer` instead of `evaluator`.
tb_logger.attach(evaluator,
                 log_handler=OutputHandler(tag="validation",
                                           metric_names=["val_loss"],
                                           another_engine=trainer),
                 event_name=Events.EPOCH_COMPLETED)

In [None]:
# Display the configured number of epochs as a sanity check before training.
num_train_epochs

In [None]:
# vault.train_data_sigmoid = train_dataset
# vault.val_data_sigmoid = val_dataset

In [None]:
# Run the training loop. The TensorBoard logger is closed in `finally`, so it is
# also released when training raises or is interrupted from the notebook.
try:
    trainer.run(train_dataloader, max_epochs=num_train_epochs)
finally:
    tb_logger.close()