In [595]:
import numpy as np 
import pandas as pd
import os
import re

import wandb

import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize, StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, f1_score

from datasets import Dataset, list_metrics, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

import torch
from torch import nn

np.random.seed(0)

In [596]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wan_db_key = user_secrets.get_secret("wandb-key")
os.environ['WANDB_API_KEY'] = wan_db_key

wandb.login(key = wan_db_key)



True

In [597]:
# clear html tags
def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)

def stripdates(data):
    p = re.compile(r'[0-9]{2}[\/,:][0-9]{2}[\/,:][0-9]{2,4}')
    return p.sub('', data)

def stripdigits(data):
    p = re.compile(r'\d+')
    return p.sub('', data)

def strip_urls(data):
    p = re.compile(r'[A-Za-z0-9]+://[A-Za-z0-9%-_]+(/[A-Za-z0-9%-_])*(#|\\?)[A-Za-z0-9%-_&=]*')
    return p.sub('', data)

https://huggingface.co/docs/transformers/training

In [598]:
train = pd.read_csv('../input/classification-of-citizens-appeals/train_dataset_train.csv', index_col=0)
test = pd.read_csv('../input/classification-of-citizens-appeals/test_dataset_test.csv', index_col=0)
sample_subm = pd.read_csv('../input/classification-of-citizens-appeals/Kursk/sample_solution.csv')

In [599]:
def rnd_perm_text_by_parts(train_example, n_splits=5):
    if len(train_example) < n_splits:
        n_splits = len(train_example)
    text_parts = np.array_split(train_example.split(' '), n_splits)
    rnd_idxs = np.random.choice(len(text_parts), size=len(text_parts), replace=False)
    return ' '.join([' '.join(text_parts[idx]) for idx in rnd_idxs])

def rnd_perm_text_by_seq(train_example, n_splits=3):
    import nltk.data
    tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')
    sent_text = nltk.sent_tokenize(train_example)
    sent_count = len(sent_text) 
    if sent_count < n_splits:
        n_splits = sent_count
    step = round(sent_count / n_splits)
    text_parts = []
    for i in range(0, n_splits):
        text_parts.append(sent_text[step-3:step])
        step += step
    
    rnd_idxs = np.random.choice(n_splits, size=n_splits, replace=False)
    return ' '.join([' '.join(text_parts[idx]) for idx in rnd_idxs])

Drop

In [600]:
test = test.drop(columns=['Тематика', 'Ответственное лицо'])
train = train.drop(columns=['Тематика', 'Ответственное лицо'])

train = train.drop(train[train['Категория'] == 12].index)
train['Категория'].value_counts()

3     954
0     478
16    149
8     139
4     108
10     48
7      27
1      25
11     19
5      12
13     11
6      10
15      7
9       5
14      4
2       3
Name: Категория, dtype: int64

In [601]:
train['Категория'] = train['Категория'].apply(lambda x: x-1 if x >=12 else x)

In [602]:
train['Категория'].value_counts()

3     954
0     478
15    149
8     139
4     108
10     48
7      27
1      25
11     19
5      12
12     11
6      10
14      7
9       5
13      4
2       3
Name: Категория, dtype: int64

Clean

In [603]:
train['Текст Сообщения'] = train['Текст Сообщения'].apply(striphtml)
test['Текст Сообщения'] = test['Текст Сообщения'].apply(striphtml)

train['Текст Сообщения'] = train['Текст Сообщения'].str.replace('&nbsp;', ' ')
test['Текст Сообщения'] = test['Текст Сообщения'].str.replace('&nbsp;', ' ')

train['Текст Сообщения'] = train['Текст Сообщения'].apply(stripdates)
test['Текст Сообщения'] = test['Текст Сообщения'].apply(stripdates)

train['Текст Сообщения'] = train['Текст Сообщения'].apply(stripdigits)
test['Текст Сообщения'] = test['Текст Сообщения'].apply(stripdigits)

train['Текст Сообщения'] = train['Текст Сообщения'].apply(strip_urls)
test['Текст Сообщения'] = test['Текст Сообщения'].apply(strip_urls)

In [604]:
train['Категория'].value_counts(), len(train)

(3     954
 0     478
 15    149
 8     139
 4     108
 10     48
 7      27
 1      25
 11     19
 5      12
 12     11
 6      10
 14      7
 9       5
 13      4
 2       3
 Name: Категория, dtype: int64,
 1999)

Split

In [605]:
test_val_split = True
test_size = 0.33

if test_val_split:
    X_train, X_test = train_test_split(train, test_size=test_size, random_state=43, stratify=train['Категория']) #
else:
    X_train, X_test = train, train

Text augmentations

In [606]:
cats_for_augs = [2, 13, 9, 14, 6, 12, 5]

nums_of_perm = 5

def text_augment(train, func, nums_of_perm=nums_of_perm):
    for category in cats_for_augs:
        rows_for_augs = train[train['Категория'] == category]
        for i, row in rows_for_augs.iterrows():
            rows = []
            for i in range(0, nums_of_perm):
                msg_text = row['Текст Сообщения']
                new_msg_text = func(msg_text)
                row = row.copy()
                row['Текст Сообщения'] = new_msg_text
                rows.append(row)
            train = pd.concat([train, pd.DataFrame(rows)]).sort_index()
    return train

X_train = text_augment(X_train, rnd_perm_text_by_seq)  # rnd_perm_text_by_parts rnd_perm_text_by_seq
X_test = text_augment(X_test, rnd_perm_text_by_seq, nums_of_perm=3)

In [607]:
X_train['Категория'].value_counts()

3     639
0     320
15    100
8      93
4      72
5      48
12     42
6      42
10     32
14     30
7      18
13     18
9      18
1      17
11     13
2      12
Name: Категория, dtype: int64

In [608]:
X_test['Категория'].value_counts()

3     315
0     158
15     49
8      46
4      36
10     16
12     16
5      16
6      12
7       9
1       8
14      8
9       8
11      6
2       4
13      4
Name: Категория, dtype: int64

In [609]:
# train_dict = train.rename(columns={'Текст Сообщения':'text', 'Категория': 'label'}).to_dict(orient='records')

# test_size = 0.33
# train_dict_train, train_dict_val = train_test_split(train_dict, test_size=test_size, stratify = train['Категория'], random_state=42)

# dataset = {
#     'train': train_dict_train,
#     'val': train_dict_val
# }

In [610]:
train_dataset = Dataset.from_pandas(X_train.reset_index(drop=True).rename(columns={'Текст Сообщения':'text', 'Категория': 'label'}))
eval_dataset = Dataset.from_pandas(X_test.reset_index(drop=True).rename(columns={'Текст Сообщения':'text', 'Категория': 'label'}))
test_dataset = Dataset.from_pandas(test.reset_index(drop=True).rename(columns={'Текст Сообщения':'text'}))
train_dataset[0]

{'text': 'Три неделю пытаюсь достучаться до Водоканала.  Необходимо в связи со сменой владельца изменить лицевой счет на мое имя.  письма со сканами документов - ответа нет. Телефон или занят или не отвечает. Неужели по такой обстановке необходимо лично ехать в Водоканал и мерзнуть на улице в очереди? ',
 'label': 3}

Model

DeepPavlov/rubert-base-cased

DeepPavlov/rubert-base-cased-sentence

sberbank-ai/ruBert-base

DeepPavlov/rubert-base-cased-conversational

cointegrated/rubert-tiny2

In [611]:
model_name = "sberbank-ai/ruBert-base" #   

In [612]:
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512) 
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/sberbank-ai/ruBert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3ff2b30ffd2e83991ada1f23ca4d7adad284baa741ea21704f02d83b72405c79.b7ac951e56a7d9c2e7e295337ac13c91834fc4cd1578bc46e5ebc1fb8ac81fb5
Model config BertConfig {
  "_name_or_path": "sberbank-ai/ruBert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head

In [613]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [614]:
# small_train_dataset = tokenized_datasets.shuffle(seed=42).select(range(1000))
# small_eval_dataset = eval_dataset.shuffle(seed=42).select(range(1000))

In [615]:
# del model
# # del pytorch_model
# del trainer
# torch.cuda.empty_cache()

In [616]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=16) 

loading configuration file https://huggingface.co/sberbank-ai/ruBert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3ff2b30ffd2e83991ada1f23ca4d7adad284baa741ea21704f02d83b72405c79.b7ac951e56a7d9c2e7e295337ac13c91834fc4cd1578bc46e5ebc1fb8ac81fb5
Model config BertConfig {
  "_name_or_path": "sberbank-ai/ruBert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {


In [617]:
# model = AutoModel.from_pretrained("./model")

In [618]:
# metrics_list = list_metrics()
# print(metrics_list)

epochs = 5

training_args = TrainingArguments(output_dir="test_trainer", per_device_train_batch_size = 12, num_train_epochs = epochs, evaluation_strategy="epoch", \
                                 seed=0, per_device_eval_batch_size=12)
# load_best_model_at_end=True

accuracy_metric = load_metric("accuracy")
f1_metric = load_metric("f1")
roc_auc_metric = load_metric("roc_auc", "multiclass")

def softmax(z):
    return np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    predictions_score = softmax(logits)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)['accuracy']
    f1_score = f1_metric.compute(predictions=predictions, references=labels, average="macro")['f1']
    
    roc_auc_score = roc_auc_metric.compute(references=labels, prediction_scores=predictions_score, average="macro", multi_class="ovo")['roc_auc']
    return {"accuracy": accuracy, "f1": f1_score, "roc_auc": roc_auc_score}

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [619]:
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=IntervalStrategy.EPOCH,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=-1,
log_level=-1,
log_level_replica=-1,
log_on_each_node=True,
lo

In [620]:
class_sample_count = np.unique(X_train['Категория'], return_counts=True)[1]
weight = 1. / class_sample_count
samples_weight = weight[X_train['Категория']]
samples_weight = torch.from_numpy(samples_weight)

In [621]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(weight, dtype=torch.float32).to('cuda'))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [622]:
# Define model
# def model_init():
#   return AutoModelForSequenceClassification.from_pretrained(model_name, 
#                                                            num_labels=3,
#                                                            output_attentions = False, # Whether the model returns attentions weights.
#                                                            output_hidden_states = False,
#                                                            return_dict=True 
#                                                            )


In [623]:
trainer = CustomTrainer(
    model=model, # instead model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1514
  Num Epochs = 5
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 635
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc
1,No log,1.305354,0.603376,0.475481,0.924534
2,No log,1.095274,0.748242,0.50585,0.895956
3,No log,1.114874,0.753868,0.486382,0.88996
4,0.780800,1.217955,0.745429,0.505344,0.89302


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 711
  Batch size = 12
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 711
  Batch size = 12
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 711
  Batch size = 12
Saving model 

Bert cased (no augs, random , sentence):

roc auc 0.866100 epoch 3

roc auc 0.915015 epoch 1

roc auc 0.920037 epoch 1

Bert cased sentence (no augs, random , sentence):

roc auc 0.885978 epoch 2 

roc auc 0.920589 epoch 2

roc auc 0.923592 epoch 1

save model

In [None]:
# os.makedirs("./model")
# model.save_pretrained('./model')

Predict

In [None]:
test_preds = trainer.predict(test_dataset)
print(test_preds.predictions.shape)

test_preds_classes = np.argmax(test_preds.predictions, axis=-1)
test_preds_classes = np.where(test_preds_classes >=12, test_preds_classes + 1, test_preds_classes)

In [None]:
np.unique(test_preds_classes, return_counts=True)

In [None]:
sort_idxs = np.unique(test_preds_classes, return_counts=True)[1].argsort()
np.unique(test_preds_classes, return_counts=True)[1][sort_idxs[::-1]]

In [None]:
np.unique(test_preds_classes, return_counts=True)[0][sort_idxs[::-1]]

Submit

In [None]:
submit_csv_file_name = 'sberrubert_epoch_5_0_33_augs_sent.csv'
sample_subm['Категория'] = test_preds_classes
sample_subm.to_csv(submit_csv_file_name, index=False)
sample_subm.head()

In [None]:
from IPython.display import FileLink
FileLink(submit_csv_file_name)

Try augmentations

https://towardsdatascience.com/nlp-data-augmentation-using-transformers-89a44a993bab

In [None]:
val_preds = trainer.predict(eval_dataset)
print(val_preds.predictions.shape)

val_preds_classes = np.argmax(val_preds.predictions, axis=-1)
val_preds_classes = np.where(val_preds_classes >=12, val_preds_classes + 1, val_preds_classes)

In [None]:
roc_auc_metric.compute(references=X_test['Категория'], prediction_scores=softmax(val_preds.predictions), average="macro", multi_class="ovo")['roc_auc']

In [None]:
# roc_auc_score(label_binarize(X_test['Категория'], classes=[0,1,2,3,4,5,6,7,8,9,10,11,13,14,15,16]), label_binarize(np.argmax(val_preds.predictions, axis=1), classes=[0,1,2,3,4,5,6,7,8,9,10,11,13,14,15,16]), average="macro", multi_class="ovo")

In [None]:
print(classification_report(X_test['Категория'], val_preds_classes))

matrix = confusion_matrix(X_test['Категория'], val_preds_classes)
matrix.diagonal()/matrix.sum(axis=0)

plt.figure(figsize=(8, 6), dpi=80)
sns.heatmap(matrix, annot=True, fmt='d')