In [None]:
import os

#os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

import itertools
import pandas as pd
import numpy as np
from datasets import Dataset
from torch import nn
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer, BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader
from transformers.modeling_outputs import SequenceClassifierOutput

from tqdm import tqdm_notebook, trange

from sklearn.metrics import confusion_matrix, classification_report

from transformers import DataCollatorForTokenClassification
import torch
from torch.autograd import Variable
from pathlib import Path
import re
from datasets import load_dataset

from sklearn.model_selection import train_test_split

import time
import wandb

from sklearn import metrics,utils

from matplotlib import pyplot as plt

In [None]:
# Load the raw table and take a first look at it: summary statistics of the
# numeric columns followed by a histogram of the raw `label` column.
data = pd.read_csv('data.csv')
summary_stats = data.describe()
print(summary_stats)

raw_label_fig, raw_label_ax = plt.subplots()
raw_label_ax.hist(data['label'])
plt.show()

In [None]:
# Re-read the CSV and map every distinct raw label (keyed by its string form)
# to its position in the sorted list of distinct labels.
data = pd.read_csv('data.csv')
distinct_labels = data['label'].drop_duplicates().values
classes = np.sort(distinct_labels)
classes_dict = {str(raw_label): position for position, raw_label in enumerate(classes)}
classes_dict

In [None]:

# Number of distinct raw labels, reported before the labels are collapsed.
num_labels = len(classes_dict)
print(num_labels)

# Two-step relabelling: raw label -> index in the sorted class list ->
# binary target (1 when that index is above 4, otherwise 0).
class_index = data['label'].map(lambda raw: classes_dict[str(raw)])
data['label'] = class_index.map(lambda idx: 1 if idx > 4 else 0)

# Space-separated word count of every text. Despite its name, `max_len` holds
# the count for every row; only the printed value is the maximum.
max_len = [len(text.split(' ')) for text in data['text']]
print(max(max_len))
data.head(5)

In [None]:
# Histogram of the current `label` column (20 bins).
label_fig, label_ax = plt.subplots()
label_ax.hist(data['label'], bins=20)
plt.show()

In [None]:
# Histogram of the per-text word counts collected in `max_len` (20 bins).
length_fig, length_ax = plt.subplots()
length_ax.hist(max_len, bins=20)
plt.show()

In [None]:
# The targets in `data['label']` were collapsed to two classes (sorted class
# index <= 4 -> 0, index > 4 -> 1), so the label maps must describe those two
# targets rather than every raw class. Each target is named after the raw
# classes it merges. Keys/values are plain Python int/str (id -> name and
# name -> id) so the maps stay JSON-serialisable when stored on a model config.
FIRST_POSITIVE_INDEX = 5  # first sorted-class index that maps to target 1

id2label = {
    # `or` supplies a fallback name when one side of the split has no classes.
    0: "|".join(str(c) for c in classes[:FIRST_POSITIVE_INDEX]) or "0",
    1: "|".join(str(c) for c in classes[FIRST_POSITIVE_INDEX:]) or "1",
}
label2id = {name: idx for idx, name in id2label.items()}

dataset = Dataset.from_pandas(data, preserve_index=False)
# Fixed seed so the train/test partition is identical after a kernel restart.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
model_name = 'sberbank-ai/ruBert-base'
# model_name = 'distilbert-base-multilingual-cased'
# model_name='distilbert-base-uncased'

# The targets are binary, so the classification head has two outputs.
NUM_TARGETS = 2

tokenizer = BertTokenizer.from_pretrained(model_name, model_max_length=32)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=NUM_TARGETS)

# Attach human-readable label names only when they describe exactly the
# outputs of the head. `config.num_labels` is derived from `id2label`, so
# installing a map with a different number of entries (or overwriting
# `num_labels` / the private `_num_labels`) would make the saved config
# disagree with the shape of the classifier weights.
if len(id2label) == NUM_TARGETS:
    model.config.id2label = {int(idx): str(name) for idx, name in id2label.items()}
    model.config.label2id = {name: idx for idx, name in model.config.id2label.items()}

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook's tokenizer.

    Sequences are padded to the tokenizer's maximum length and truncated when
    longer (``padding="max_length"``, ``truncation=True``).
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Encode the dataset in batched mode.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
batch_size = 32
# Reshuffle the training split at every epoch so successive epochs do not see
# the batches in the same order; the evaluation order is left untouched.
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=batch_size)

In [None]:
# Balanced class weights for the two binary targets, computed on the training
# split. scikit-learn documents `classes` as an ndarray (and recent releases
# validate the type), so pass np.arange(2) instead of a range object.
train_labels = np.asarray(dataset['train']['label'])
class_weights = utils.class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.arange(2),
    y=train_labels,
)
# float32 so the weights can be handed to a float32 loss unchanged.
class_weights = np.asarray(class_weights, dtype=np.float32)
class_weights

In [None]:
def _batch_to_device(batch, device):
    """Turn one collated batch into ``(input_ids, attention_mask, labels)`` on ``device``.

    ``batch['input_ids']`` and ``batch['attention_mask']`` arrive as sequences
    of equally sized 1-D tensors; stacking them along ``dim=1`` yields the same
    2-D layout as stacking along dim 0 and then swapping the two axes
    (presumably ``(batch, seq_len)`` -- the layout the model call expects).
    """
    input_ids = torch.stack(batch['input_ids'], dim=1).to(device)
    attention_mask = torch.stack(batch['attention_mask'], dim=1).to(device)
    labels = torch.as_tensor(batch['label']).to(device)
    return input_ids, attention_mask, labels


# Fall back to the CPU when no GPU is visible instead of failing outright.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-3)
# Class-weighted loss; it replaces the model's built-in loss, so no `labels`
# are passed to the forward call below.
criterion = nn.CrossEntropyLoss(weight=torch.from_numpy(class_weights).to(device))
epochs = 5

for i in trange(epochs, desc="Epoch"):
    # ---- training pass ----
    model.train()  # re-enable dropout after the previous epoch's evaluation
    tr_loss = 0
    for step, batch in enumerate(tqdm_notebook(train_dataloader, desc="Iteration")):
        input_ids, attention_mask, labels = _batch_to_device(batch, device)

        output = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(output['logits'], labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()

        # Running mean of the training loss over the steps seen so far.
        print(f'Epoch: {i + 1}/{epochs}\tStep: {step + 1}/{len(train_dataloader)}\tLoss: {tr_loss / (step + 1):.3f}')

    # ---- validation pass ----
    model.eval()  # disable dropout so the validation numbers are deterministic
    tr_loss_test = 0
    true_predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm_notebook(test_dataloader, desc="Iteration"):
            input_ids, attention_mask, labels = _batch_to_device(batch, device)

            output = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = output['logits']

            tr_loss_test += criterion(logits, labels).item()
            true_predictions.extend(logits.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # Predicted class = index of the largest logit in each row.
    preds = np.argmax(true_predictions, axis=1)

    print(f'Epoch: {i + 1}/{epochs}\tValidation Loss: {tr_loss_test / len(test_dataloader):.3f}')
    print(classification_report(true_labels, preds))


# Fine-tune with the Trainer API (not completed)

In [None]:

def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True):
    """Element-wise accuracy of thresholded scores against boolean targets.

    Args:
        y_pred: NumPy array of scores.
        y_true: NumPy array of targets; converted to booleans for comparison.
        thresh: a score strictly above this value counts as a positive.
        sigmoid: when true, scores are passed through a sigmoid before
            thresholding.

    Returns:
        The fraction of matching elements as a Python float.
    """
    scores = torch.from_numpy(y_pred)
    targets = torch.from_numpy(y_true).bool()
    if sigmoid:
        scores = torch.sigmoid(scores)
    matches = (scores > thresh) == targets
    return matches.float().mean().item()


In [None]:
def compute_metrics(eval_pred):
    """Compute single-label classification accuracy for the Trainer.

    The model emits one score per class while the references are integer
    class ids, so the predicted class is the arg-max over the last axis. (A
    multi-label threshold metric would compare arrays of different shapes.)
    The result is reported under the key ``'accuracy'``, the name the training
    arguments use for ``metric_for_best_model``.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores with the classes on the last axis (or a tuple
            whose first element does); ``labels`` holds the integer class ids.

    Returns:
        ``{'accuracy': float}``; ``0.0`` when there are no examples.
    """
    import numpy as np  # local import keeps the function self-contained

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs; the class scores come first.
        predictions = predictions[0]

    predicted_ids = np.argmax(np.asarray(predictions), axis=-1).reshape(-1)
    reference_ids = np.asarray(labels).reshape(-1)
    if reference_ids.size == 0:
        return {'accuracy': 0.0}
    return {'accuracy': float(np.mean(predicted_ids == reference_ids))}

In [None]:
batch_size = 16
epochs = 4

# The output directory name carries the current Unix timestamp.
run_dir = f"test-classify-{time.time()}"

args = TrainingArguments(
    output_dir=run_dir,
    # optimisation
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=1e-3,
    weight_decay=1e-5,
    # fp16=True,
    # evaluation and checkpointing, both once per epoch
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    metric_for_best_model='accuracy',
    # logging every 5 steps (plus the first step), reported to Weights & Biases
    logging_strategy='steps',
    logging_steps=5,
    logging_first_step=True,
    report_to='wandb',
    # data loading
    dataloader_num_workers=4,
)

In [None]:
# No custom data collator is passed (the DataCollatorWithPadding variant is
# left disabled), so the Trainer uses its default one.
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='max_length')

train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]

trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=eval_split,
    # data_collator=data_collator,
)

trainer.train()