# Fine-tuning on pair classification task

## Imports

In [1]:
!pip3 install gdown stanza spacy nltk torch==1.13.1 accelerate transformers evaluate --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.5/802.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m102.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive

from ast import literal_eval
from collections import Counter, defaultdict
import locale
import logging
import re
import random
import os
import pandas as pd
import pickle
import numpy as np
import sklearn
from sklearn.metrics import classification_report
import stanza
import spacy
import torch

import transformers
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from tqdm import tqdm

import evaluate

stanza.download('ru')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: ru (Russian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.5.0/models/default.zip:   0%|          | 0…

INFO:stanza:Finished downloading models and saved to /root/stanza_resources.


In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def getpreferredencoding(do_setlocale=True):
    '''
    Stand-in for `locale.getpreferredencoding` that always answers "UTF-8".

    `do_setlocale` is accepted only for signature compatibility and is ignored.
    '''
    return "UTF-8"


# Install the stand-in on the `locale` module so every later lookup gets UTF-8.
# NOTE(review): presumably a workaround for shell commands failing under a
# non-UTF-8 locale in Colab - confirm.
setattr(locale, 'getpreferredencoding', getpreferredencoding)

In [5]:
!gdown https://drive.google.com/drive/folders/1bloHcXb5tiBaHjNuX6ZTVPBrm4lXWsNN -O automobiles --folder

Retrieving folder list
Retrieving folder 1--gufzIJ_sFCQAG41l6gwPCQl3PXwjsv .ipynb_checkpoints
Processing file 1-PyXsFOOJiTFv_u7qi7W9tl4W3LiGJvL eval_bert_dataset.csv
Processing file 1Y2E6471hlLBGlh2TKNXRBy6_8V2zt9TG eval_model_dataset.csv
Processing file 1-HuGW0DaY9tRuZCYKW6a9u4w9G4KRTrx eval_xlmroberta_dataset.csv
Processing file 1-Oa6L8yfQhriXTuOFxm835-dJli9vf-5 test_bert_dataset.csv
Processing file 1GmAL_4MfntMBmFqfUbhoCMoP4Zbd0Fq1 test_model_dataset.csv
Processing file 1-FbxwKGr8GV27qmOv5DTgRp0NeweqHHp test_xlmroberta_dataset.csv
Processing file 1-QXsb8atrpV948xH0Zq-IUB0EmM6Baqd train_bert_dataset.csv
Processing file 1aZOiztlKWmMPqhjcBbTel_9gTls-gYZz train_model_dataset.csv
Processing file 1-N9fBOka5zHcnQm0uU_9FvtGKTafP_ZE train_xlmroberta_dataset.csv
Retrieving folder list completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1-PyXsFOOJiTFv_u7qi7W9tl4W3LiGJvL
To: /content/automobiles/eval_bert_dataset.csv


In [6]:
def seed_everything(seed=42) -> None:
    '''
    Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy and PyTorch (the CPU generator and, when
    CUDA is available, every GPU generator), exports PYTHONHASHSEED and
    switches cuDNN to deterministic kernel selection.

    seed: integer seed shared by all generators.
    '''
    random.seed(seed)
    # Hash randomisation of the running interpreter is fixed at start-up, so
    # this only takes effect in processes spawned afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # Without this the CPU generator stays unseeded, so weight initialisation
    # and shuffling differ between runs.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        # Autotuning (benchmark=True) may pick different kernels on each run;
        # disable it and request deterministic algorithms instead.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

In [7]:
seed_everything()

In [9]:
# Sentiment polarities and aspect categories used as classification targets.
SENTIMENT = ['neutral', 'positive', 'negative']
CATEGORIES = ['Comfort', 'Appearance', 'Reliability', 'Safety', 'Driveability', 'Whole', 'Costs']
# Joint "<category> <sentiment>" labels, grouped by category.
CATEGORIES_SENTIMENT = [
    ' '.join((category, sentiment))
    for category in CATEGORIES
    for sentiment in SENTIMENT
]

In [10]:
# Lookup tables in both directions (integer id -> label name, label name -> id)
# for each of the three label sets; ids are the positions in the source lists.
id2label_category = dict(enumerate(CATEGORIES))
label2id_category = {name: idx for idx, name in id2label_category.items()}

id2label_sentiment = dict(enumerate(SENTIMENT))
label2id_sentiment = {name: idx for idx, name in id2label_sentiment.items()}

id2label_category_sentiment = dict(enumerate(CATEGORIES_SENTIMENT))
label2id_category_sentiment = {name: idx for idx, name in id2label_category_sentiment.items()}

In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## ABSA as NLI


A loose adaptation of [this](https://aclanthology.org/W19-6120.pdf) paper. In short: aspect-based sentiment analysis can be recast as the task of classifying the relation between a premise and a hypothesis. In our case, the sentence containing an aspect is the premise, the aspect itself is the hypothesis, and the pair carries a label (for example, a sentiment or a category). The steps are:
* Prepare the data. We need triples of the form premise-hypothesis-category.
* Fine-tune models
* Find aspects in the test data. This can again be done with the help of a thesaurus and the top-ranked collocations under some association metric.

## Data

In [12]:
class ReviewsDataset(torch.utils.data.Dataset):
    '''
    Map-style dataset that pairs tokenizer output with one label per example.

    encodings: mapping from field name to a per-example sequence of values.
    labels: per-example labels, indexed in step with `encodings`.
    '''

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        '''
        Return example `idx` as a dict of tensors: one entry per encoding
        field plus the label under the key 'labels'.
        '''
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [13]:
def get_dataset(path: str, tokenizer: AutoTokenizer) -> tuple:
    '''
    Read a tab-separated file and build one dataset per label set.

    The file must provide the columns 'text', 'category' and 'sentiment';
    the last two hold integer class ids. All three returned datasets share
    the same tokenized inputs and differ only in their labels.

    Returns a tuple (category_ds, sentiment_ds, category_sentiment_ds).
    '''
    frame = pd.read_csv(path, delimiter='\t')
    texts = frame['text'].values.tolist()
    category_ids = frame['category'].values.tolist()
    sentiment_ids = frame['sentiment'].values.tolist()

    # Combine each (category, sentiment) id pair into the id of the joint
    # "<category> <sentiment>" label.
    category_sentiment_ids = []
    for cat_id, sent_id in zip(category_ids, sentiment_ids):
        joint_label = f'{id2label_category[cat_id]} {id2label_sentiment[sent_id]}'
        category_sentiment_ids.append(label2id_category_sentiment[joint_label])

    # add_special_tokens=False: the sample text shown in this notebook already
    # spells out its markers ('[CLS] ... [SEP] ... [SEP]').
    encodings = tokenizer(
        texts,
        truncation=True,
        max_length=100,
        padding=True,
        add_special_tokens=False,
    )

    return tuple(
        ReviewsDataset(encodings, ids)
        for ids in (category_ids, sentiment_ids, category_sentiment_ids)
    )

In [14]:
train_bert = '/content/automobiles/train_bert_dataset.csv'
eval_bert = '/content/automobiles/eval_bert_dataset.csv'
test_bert = '/content/automobiles/test_bert_dataset.csv'

train_xlmroberta = '/content/automobiles/train_xlmroberta_dataset.csv'
eval_xlmroberta = '/content/automobiles/eval_xlmroberta_dataset.csv'
test_xlmroberta = '/content/automobiles/test_xlmroberta_dataset.csv'

In [15]:
df = pd.read_csv(train_bert, delimiter='\t')

In [16]:
df['text'].values.tolist()[0]

'[CLS] Используем данный автомобиль в целях всей семьи . [SEP] автомобиль [SEP]'

## Model

In [17]:
class NLITrainPipeline:
    '''
    Fine-tunes a pretrained checkpoint as a sequence classifier and reports
    test metrics for the saved result.
    '''

    def __init__(self, train_ds, val_ds, model_checkpoint, new_model_checkpoint, labels):
        '''
        Load the base model and tokenizer and prepare the training arguments.

        train_ds, val_ds: datasets passed to the Trainer as train / eval data.
        model_checkpoint: name or path of the pretrained checkpoint to start from.
        new_model_checkpoint: location the fine-tuned model is saved to and
            later reloaded from.
        labels: label names; the position of a name is its class id.
        '''
        self.train_ds, self.val_ds = train_ds, val_ds

        self.model_checkpoint = model_checkpoint
        self.new_model_checkpoint = new_model_checkpoint

        self.num_labels = len(labels)
        self.id2label = dict(enumerate(labels))
        self.label2id = {name: idx for idx, name in self.id2label.items()}

        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_checkpoint,
            num_labels=self.num_labels,
            id2label=self.id2label,
            label2id=self.label2id,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

        # NOTE(review): no evaluation schedule is configured here, so whether
        # `val_ds` is ever scored depends on the library defaults - confirm.
        self.training_args = TrainingArguments(
            output_dir='./results',
            logging_dir='./logs',
            logging_steps=100,
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=64,
            warmup_steps=500,
            weight_decay=0.01,
        )

        # Filled in by `inference`.
        self.clf_report = None

    def train(self):
        '''
        Fine-tune the model on the downstream task and save it to
        `new_model_checkpoint`.
        '''
        self.model.to(device)

        trainer_kwargs = {
            'model': self.model,
            'args': self.training_args,
            'train_dataset': self.train_ds,
            'eval_dataset': self.val_ds,
        }
        trainer = Trainer(**trainer_kwargs)

        trainer.train()
        trainer.save_model(self.new_model_checkpoint)

    def inference(self, path, label):
        '''
        Score the saved fine-tuned model on a tab-separated test file.

        path: file with a 'text' column and a column named `label`.
        label: name of the column holding the integer id of the true class.

        Returns the text classification report, also kept in `self.clf_report`.
        '''
        finetuned = AutoModelForSequenceClassification.from_pretrained(
            self.new_model_checkpoint,
            num_labels=self.num_labels,
            id2label=self.id2label,
            label2id=self.label2id,
        )
        finetuned.to(device)

        gold, predicted = [], []

        test_frame = pd.read_csv(path, delimiter='\t')

        # One forward pass per row.
        for _, example in test_frame.iterrows():
            batch = self.tokenizer(
                example['text'],
                return_tensors='pt',
                truncation=True,
                max_length=100,
                padding=True,
                add_special_tokens=False,
            ).to(device)
            with torch.no_grad():
                logits = finetuned(**batch).logits
            best_class_id = logits.to('cpu').argmax().item()
            gold.append(finetuned.config.id2label[example[label]])
            predicted.append(finetuned.config.id2label[best_class_id])

        report = classification_report(gold, predicted)
        self.clf_report = report

        return report

In [18]:
class NLITrainPipelineCatSent(NLITrainPipeline):
    '''
    Pipeline variant for the joint "<category> <sentiment>" label set.

    The constructor is inherited unchanged. `inference` takes only the test
    file path (no label column) because the true label is assembled from the
    'category' and 'sentiment' columns.
    '''

    def inference(self, path):
        '''
        Score the saved fine-tuned model on a tab-separated test file.

        path: file with the columns 'text', 'category' and 'sentiment'; the
            last two hold integer class ids.

        Stores the report as a dict in `self.clf_report` (the input of
        `sentiment_inference`) and returns the same report as text.
        '''
        new_model = AutoModelForSequenceClassification.from_pretrained(
            self.new_model_checkpoint,
            num_labels=self.num_labels,
            id2label=self.id2label,
            label2id=self.label2id
            )
        new_model.to(device)

        true_labels = []
        predicted_labels = []

        test_ds = pd.read_csv(path, delimiter='\t')

        for idx, row in test_ds.iterrows():
            tokenized = self.tokenizer(
                row['text'], return_tensors='pt',
                truncation=True, max_length=100,
                padding=True, add_special_tokens=False
                )
            tokenized = tokenized.to(device)
            with torch.no_grad():
                classification_logits = new_model(**tokenized).logits
            predicted_class_id = classification_logits.to('cpu').argmax().item()
            true_labels.append(f"{id2label_category[row['category']]} {id2label_sentiment[row['sentiment']]}")
            predicted_labels.append(new_model.config.id2label[predicted_class_id])

        # zero_division=0 yields the same scores as the default but without an
        # UndefinedMetricWarning for every class that is never predicted.
        self.clf_report = classification_report(
            true_labels, predicted_labels, output_dict=True, zero_division=0
        )

        return classification_report(true_labels, predicted_labels, zero_division=0)

    def sentiment_inference(self):
        '''
        Average the per-class scores of the joint model by sentiment.

        For each sentiment, precision, recall and f1-score are averaged over
        the "<category> <sentiment>" classes present in the stored report.
        Classes absent from the report (no true and no predicted examples) do
        not count towards the average.

        Returns a DataFrame with one column per sentiment and one row per metric.
        Raises ValueError if `inference` has not produced a report yet.
        '''
        if not self.clf_report:
            raise ValueError('Classification report is empty!')

        metric_names = ('precision', 'recall', 'f1-score')
        sentiment_names = ('neutral', 'positive', 'negative')
        summary_rows = ('accuracy', 'micro avg', 'macro avg', 'weighted avg')

        totals = {sent: dict.fromkeys(metric_names, 0.0) for sent in sentiment_names}
        class_counts = dict.fromkeys(sentiment_names, 0)

        for label, metrics in self.clf_report.items():
            if label in summary_rows:
                continue
            # The sentiment is the last word of "<category> <sentiment>".
            sent_label = label.split()[-1]
            if sent_label not in totals:
                continue
            class_counts[sent_label] += 1
            for metric in metric_names:
                totals[sent_label][metric] += metrics[metric]

        # Divide by the number of classes actually seen, not by a fixed
        # category count, so missing classes do not pull the averages down.
        sentiment_report = {
            sent: {
                metric: (total / class_counts[sent] if class_counts[sent] else 0)
                for metric, total in totals[sent].items()
            }
            for sent in sentiment_names
        }

        return pd.DataFrame(sentiment_report)

In [19]:
# usage
# tokenized = tokenizer(text, return_tensors="pt")
# classification_logits = model(**tokenized).logits
# results = torch.softmax(classification_logits, dim=1).tolist()[0]
# for i in range(len(CATEGORIES)):
#     print(f"{CATEGORIES[i]}: {int(results[i] * 100)}%")

## ruBERT

In [20]:
model_checkpoint = 'ai-forever/ruBert-base'

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/590 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

In [22]:
train_category_ds, train_sentiment_ds, train_category_sentiment_ds = get_dataset(train_bert, tokenizer)
eval_category_ds, eval_sentiment_ds, eval_category_sentiment_ds = get_dataset(eval_bert, tokenizer)

In [23]:
train_category_ds[0]

{'input_ids': tensor([  101, 53515,  1260,   667,   378,  6938,   113,  9617,   780,   378,
          5017,   126,   102,  6938,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 

### Sentiment model

In [24]:
nli_trainer_rubert = NLITrainPipeline(
    train_sentiment_ds, eval_sentiment_ds,
    model_checkpoint, 'rubert-sentiment-nli_automobiles', SENTIMENT
)
nli_trainer_rubert.train()

Downloading pytorch_model.bin:   0%|          | 0.00/716M [00:00<?, ?B/s]

Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not ini

Step,Training Loss
100,0.9952
200,0.6455
300,0.5875
400,0.5055
500,0.4281
600,0.3732
700,0.4078
800,0.3655
900,0.3158
1000,0.1958


In [25]:
print(nli_trainer_rubert.inference(test_bert, 'sentiment'))
del nli_trainer_rubert

              precision    recall  f1-score   support

    negative       0.82      0.84      0.83       343
     neutral       0.71      0.59      0.64       256
    positive       0.89      0.93      0.91       759

    accuracy                           0.84      1358
   macro avg       0.81      0.79      0.79      1358
weighted avg       0.84      0.84      0.84      1358



### Category model

In [26]:
nli_trainer_rubert = NLITrainPipeline(
    train_category_ds, eval_category_ds,
    model_checkpoint, 'rubert-category-nli_automobiles', CATEGORIES
)
nli_trainer_rubert.train()

Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not ini

Step,Training Loss
100,1.7911
200,1.0917
300,0.6599
400,0.5439
500,0.4369
600,0.4085
700,0.3842
800,0.3808
900,0.3523
1000,0.213


In [27]:
print(nli_trainer_rubert.inference(test_bert, 'category'))
del nli_trainer_rubert

              precision    recall  f1-score   support

  Appearance       0.86      0.68      0.76       103
     Comfort       0.87      0.89      0.88       349
       Costs       0.89      0.85      0.87       125
Driveability       0.79      0.83      0.81       338
 Reliability       0.74      0.77      0.75       178
      Safety       0.70      0.66      0.68        29
       Whole       0.83      0.83      0.83       236

    accuracy                           0.82      1358
   macro avg       0.81      0.79      0.80      1358
weighted avg       0.82      0.82      0.82      1358



### Category and sentiment model

In [28]:
nli_trainer_rubert = NLITrainPipelineCatSent(
    train_category_sentiment_ds, eval_category_sentiment_ds,
    model_checkpoint, 'rubert-category-sentiment-nli_automobiles', CATEGORIES_SENTIMENT
)
nli_trainer_rubert.train()

Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not ini

Step,Training Loss
100,2.8575
200,2.3202
300,1.6708
400,1.2577
500,1.0663
600,0.9595
700,0.8862
800,0.8281
900,0.7503
1000,0.5047


In [29]:
print(nli_trainer_rubert.inference(test_bert))
print(nli_trainer_rubert.sentiment_inference())
del nli_trainer_rubert

                       precision    recall  f1-score   support

  Appearance negative       0.75      0.30      0.43        30
   Appearance neutral       0.39      0.47      0.43        19
  Appearance positive       0.83      0.74      0.78        54
     Comfort negative       0.61      0.86      0.71        76
      Comfort neutral       0.06      0.06      0.06        18
     Comfort positive       0.92      0.89      0.90       255
       Costs negative       0.60      0.63      0.62        41
        Costs neutral       1.00      0.05      0.09        21
       Costs positive       0.68      0.78      0.73        63
Driveability negative       0.59      0.57      0.58        58
 Driveability neutral       0.56      0.43      0.49        76
Driveability positive       0.84      0.81      0.83       204
 Reliability negative       0.75      0.76      0.76        92
  Reliability neutral       0.00      0.00      0.00         8
 Reliability positive       0.55      0.79      0.65  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## mBERT

In [30]:
model_checkpoint = 'bert-base-multilingual-cased'

In [31]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [32]:
train_category_ds, train_sentiment_ds, train_category_sentiment_ds = get_dataset(train_bert, tokenizer)
eval_category_ds, eval_sentiment_ds, eval_category_sentiment_ds = get_dataset(eval_bert, tokenizer)

### Sentiment model

In [33]:
nli_trainer_mbert = NLITrainPipeline(
    train_sentiment_ds, eval_sentiment_ds,
    model_checkpoint, 'mbert-sentiment-nli_automobiles', SENTIMENT
)
nli_trainer_mbert.train()

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Step,Training Loss
100,0.997
200,0.8365
300,0.7241
400,0.6652
500,0.5971
600,0.5742
700,0.5712
800,0.4937
900,0.433
1000,0.3171


In [34]:
print(nli_trainer_mbert.inference(test_bert, 'sentiment'))
del nli_trainer_mbert

              precision    recall  f1-score   support

    negative       0.66      0.67      0.66       343
     neutral       0.58      0.54      0.56       256
    positive       0.81      0.82      0.81       759

    accuracy                           0.73      1358
   macro avg       0.68      0.67      0.68      1358
weighted avg       0.73      0.73      0.73      1358



### Category model

In [35]:
nli_trainer_mbert = NLITrainPipeline(
    train_category_ds, eval_category_ds,
    model_checkpoint, 'mbert-category-nli_automobiles', CATEGORIES
)
nli_trainer_mbert.train()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Step,Training Loss
100,1.7994
200,1.2992
300,0.9443
400,0.8197
500,0.7362
600,0.6926
700,0.551
800,0.5933
900,0.5067
1000,0.3674


In [36]:
print(nli_trainer_mbert.inference(test_bert, 'category'))
del nli_trainer_mbert

              precision    recall  f1-score   support

  Appearance       0.83      0.66      0.74       103
     Comfort       0.81      0.85      0.83       349
       Costs       0.79      0.82      0.81       125
Driveability       0.80      0.79      0.80       338
 Reliability       0.66      0.74      0.70       178
      Safety       0.60      0.62      0.61        29
       Whole       0.82      0.75      0.78       236

    accuracy                           0.78      1358
   macro avg       0.76      0.75      0.75      1358
weighted avg       0.78      0.78      0.78      1358



### Category and sentiment model

In [37]:
nli_trainer_mbert = NLITrainPipelineCatSent(
    train_category_sentiment_ds, eval_category_sentiment_ds,
    model_checkpoint, 'mbert-category-sentiment-nli_automobiles', CATEGORIES_SENTIMENT
)
nli_trainer_mbert.train()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Step,Training Loss
100,2.8926
200,2.4874
300,2.0185
400,1.7586
500,1.6013
600,1.4603
700,1.3061
800,1.2206
900,1.1749
1000,0.8755


In [38]:
print(nli_trainer_mbert.inference(test_bert))
print(nli_trainer_mbert.sentiment_inference())
del nli_trainer_mbert

                       precision    recall  f1-score   support

  Appearance negative       0.09      0.03      0.05        30
   Appearance neutral       0.20      0.21      0.21        19
  Appearance positive       0.64      0.70      0.67        54
     Comfort negative       0.49      0.64      0.55        76
      Comfort neutral       0.00      0.00      0.00        18
     Comfort positive       0.80      0.85      0.82       255
       Costs negative       0.33      0.32      0.32        41
        Costs neutral       0.00      0.00      0.00        21
       Costs positive       0.59      0.75      0.66        63
Driveability negative       0.32      0.40      0.36        58
 Driveability neutral       0.55      0.39      0.46        76
Driveability positive       0.69      0.70      0.69       204
 Reliability negative       0.65      0.62      0.63        92
  Reliability neutral       0.00      0.00      0.00         8
 Reliability positive       0.55      0.65      0.60  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## XLM-RoBERTa

In [39]:
model_checkpoint = 'xlm-roberta-base'

In [40]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [41]:
train_category_ds, train_sentiment_ds, train_category_sentiment_ds = get_dataset(train_xlmroberta, tokenizer)
eval_category_ds, eval_sentiment_ds, eval_category_sentiment_ds = get_dataset(eval_xlmroberta, tokenizer)

### Sentiment model

In [42]:
nli_trainer_xlmroberta = NLITrainPipeline(
    train_sentiment_ds, eval_sentiment_ds,
    model_checkpoint, 'xlmroberta-sentiment-nli_automobiles', SENTIMENT
)
nli_trainer_xlmroberta.train()

Downloading pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

Step,Training Loss
100,1.0302
200,0.9125
300,0.7213
400,0.6812
500,0.6593
600,0.6293
700,0.6069
800,0.5042
900,0.474
1000,0.3935


In [43]:
print(nli_trainer_xlmroberta.inference(test_xlmroberta, 'sentiment'))
del nli_trainer_xlmroberta

              precision    recall  f1-score   support

    negative       0.80      0.80      0.80       343
     neutral       0.68      0.59      0.63       256
    positive       0.87      0.91      0.89       759

    accuracy                           0.82      1358
   macro avg       0.78      0.77      0.77      1358
weighted avg       0.82      0.82      0.82      1358



### Category model

In [44]:
nli_trainer_xlmroberta = NLITrainPipeline(
    train_category_ds, eval_category_ds,
    model_checkpoint, 'xlmroberta-category-nli_automobiles', CATEGORIES
)
nli_trainer_xlmroberta.train()

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

Step,Training Loss
100,1.8493
200,1.3911
300,0.8977
400,0.7527
500,0.6941
600,0.6574
700,0.5919
800,0.543
900,0.4963
1000,0.3785


In [45]:
# Print the test-set classification report for the category task, then drop the
# notebook's reference to the pipeline (presumably to release memory before the
# next model is built — confirm).
print(nli_trainer_xlmroberta.inference(test_xlmroberta, 'category'))
del nli_trainer_xlmroberta

              precision    recall  f1-score   support

  Appearance       0.82      0.73      0.77       103
     Comfort       0.84      0.91      0.87       349
       Costs       0.88      0.86      0.87       125
Driveability       0.86      0.81      0.83       338
 Reliability       0.71      0.80      0.76       178
      Safety       0.70      0.55      0.62        29
       Whole       0.83      0.80      0.81       236

    accuracy                           0.82      1358
   macro avg       0.80      0.78      0.79      1358
weighted avg       0.83      0.82      0.82      1358



### Category and sentiment model

In [46]:
# Build the joint category+sentiment pipeline from the combined train/eval
# datasets and the CATEGORIES_SENTIMENT label list, then run training.
# NOTE(review): the string argument looks like the run/output directory name —
# confirm against NLITrainPipelineCatSent.
nli_trainer_xlmroberta = NLITrainPipelineCatSent(
    train_category_sentiment_ds, eval_category_sentiment_ds,
    model_checkpoint, 'xlmroberta-category-sentiment-nli_automobiles', CATEGORIES_SENTIMENT
)
nli_trainer_xlmroberta.train()

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

Step,Training Loss
100,2.985
200,2.4855
300,1.9072
400,1.5518
500,1.5112
600,1.3575
700,1.2376
800,1.1035
900,1.0747
1000,0.8678


In [47]:
# Print the joint "<category> <sentiment>" report on the test set, then the
# result of sentiment_inference() (presumably a sentiment-only report derived
# from the joint predictions — confirm), and finally drop the reference to the
# pipeline.
print(nli_trainer_xlmroberta.inference(test_xlmroberta))
print(nli_trainer_xlmroberta.sentiment_inference())
del nli_trainer_xlmroberta

                       precision    recall  f1-score   support

  Appearance negative       0.44      0.40      0.42        30
   Appearance neutral       0.50      0.26      0.34        19
  Appearance positive       0.70      0.72      0.71        54
     Comfort negative       0.61      0.83      0.70        76
      Comfort neutral       0.00      0.00      0.00        18
     Comfort positive       0.82      0.90      0.86       255
       Costs negative       0.54      0.68      0.60        41
        Costs neutral       0.00      0.00      0.00        21
       Costs positive       0.74      0.83      0.78        63
Driveability negative       0.56      0.53      0.55        58
 Driveability neutral       0.61      0.51      0.56        76
Driveability positive       0.77      0.74      0.75       204
 Reliability negative       0.67      0.72      0.69        92
  Reliability neutral       0.00      0.00      0.00         8
 Reliability positive       0.58      0.79      0.67  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [48]:
!mv -f /content/rubert-sentiment-nli_automobiles /content/drive/MyDrive/models
!mv -f /content/mbert-sentiment-nli_automobiles /content/drive/MyDrive/models
!mv -f /content/xlmroberta-sentiment-nli_automobiles /content/drive/MyDrive/models

# Evaluation on laptops

In [None]:
# Aspect annotations for the laptop reviews (tab-separated file on Google Drive).
laptop_aspects = pd.read_csv(
    '/content/drive/MyDrive/Summarization/laptop data/aspects-surface.tsv',
    sep='\t',
)

laptop_aspects.head()

In [None]:
# Raw laptop review texts (tab-separated file on Google Drive).
laptop_reviews = pd.read_csv(
    '/content/drive/MyDrive/Summarization/laptop data/reviews.tsv',
    sep='\t',
)

laptop_reviews.head()

In [None]:
# Label inventories for the laptop domain: categories are taken from the
# annotation file in order of first appearance; the sentiment labels are the
# same list object used elsewhere in the notebook.
LAPTOP_CATEGORIES = list(laptop_aspects['category'].unique())
LAPTOP_SENTIMENT = SENTIMENT

# Every "<category> <sentiment>" combination, category-major.
LAPTOP_CATEGORIES_SENTIMENT = [
    f'{category} {sentiment}'
    for category in LAPTOP_CATEGORIES
    for sentiment in LAPTOP_SENTIMENT
]

In [None]:
# id <-> label lookup tables for the laptop label sets.
# Each label2id map must invert the matching *laptop* id2label map; inverting
# the automobile maps (id2label_category, ...) would yield ids for a different
# label inventory.
laptop_id2label_category = {i: label for i, label in enumerate(LAPTOP_CATEGORIES)}
laptop_label2id_category = {v: k for k, v in laptop_id2label_category.items()}

laptop_id2label_sentiment = {i: label for i, label in enumerate(LAPTOP_SENTIMENT)}
laptop_label2id_sentiment = {v: k for k, v in laptop_id2label_sentiment.items()}

laptop_id2label_category_sentiment = {i: label for i, label in enumerate(LAPTOP_CATEGORIES_SENTIMENT)}
laptop_label2id_category_sentiment = {v: k for k, v in laptop_id2label_category_sentiment.items()}

In [None]:
# Russian stanza pipeline restricted to the tokenizer processor.
nlp = stanza.Pipeline(lang='ru', processors='tokenize')

In [None]:
def get_laptop_data(reviews: pd.DataFrame, aspects: pd.DataFrame) -> tuple:
    """Build sentence / aspect-term pairs for the laptop reviews.

    Every review that has at least one annotated aspect is split into sentences
    with the module-level stanza pipeline ``nlp``. Each aspect is then paired
    with the first sentence whose last token ends at or after the aspect's
    ``term_to`` character offset.

    Args:
        reviews: frame with at least the columns ``id`` and ``text``.
        aspects: frame with at least the columns ``text_id``, ``term``,
            ``term_to``, ``category`` and ``sentiment``.

    Returns:
        A 5-tuple ``(review_ids, data, category_ids, sentiment_ids,
        reviews_sentences)``. The first four are parallel lists with one entry
        per matched aspect: the review id, the string
        ``'[CLS] <sentence> [SEP] <term> [SEP]'``, the index of the category in
        ``LAPTOP_CATEGORIES`` and the index of the sentiment in ``SENTIMENT``.
        ``reviews_sentences`` is a ``defaultdict(list)`` mapping each review id
        to its space-joined tokenized sentences.

    Raises:
        ValueError: if an aspect's category is not in ``LAPTOP_CATEGORIES`` or
            its sentiment is not in ``SENTIMENT`` (raised by ``list.index``).
    """
    review_ids = []
    data = []
    category_ids = []
    sentiment_ids = []

    reviews_sentences = defaultdict(list)

    logging.warning('Start getting data...')

    # Only reviews that actually have annotated aspects are processed.
    reviews = reviews[reviews['id'].isin(aspects['text_id'].unique())]

    for _, rev in reviews.iterrows():
        text_id = rev['id']
        text = rev['text']

        logging.warning('Text ID: %s' % text_id)

        # stanza processing to parse sentences and start and end characters
        doc = nlp(text)
        logging.warning('Processed by stanza')

        # Maps the end character offset of each sentence's last token to the
        # sentence text; insertion order follows the order of doc.sentences.
        sents_with_end = {}
        for sent in doc.sentences:
            if not sent.tokens:
                # No last token to take the offset from; do not fall back on a
                # token left over from a previous sentence.
                continue

            sentence = ' '.join(token.text for token in sent.tokens)
            end_idx = sent.tokens[-1].end_char  # last token in the current sentence

            sents_with_end[end_idx] = sentence
            # Key by the actual review id (not the literal string 'text_id') so
            # sentences from different reviews are kept apart.
            reviews_sentences[text_id].append(sentence)

        logging.warning('Got sentences and their ends')

        # catch needed sentence
        rev_aspects = aspects[aspects['text_id'] == text_id]
        logging.warning('Got aspects for the current review')

        for _, asp in rev_aspects.iterrows():
            mention_end_char = int(asp['term_to'])

            for end_char, sentence in sents_with_end.items():
                if end_char >= mention_end_char:
                    mention = asp['term']
                    data.append(f'[CLS] {sentence} [SEP] {mention} [SEP]')

                    category_ids.append(LAPTOP_CATEGORIES.index(asp['category']))
                    sentiment_ids.append(SENTIMENT.index(asp['sentiment']))

                    review_ids.append(text_id)

                    break  # just one sentence

        logging.warning('Got data for the current review')

    return review_ids, data, category_ids, sentiment_ids, reviews_sentences

In [None]:
# Build the laptop evaluation set: parallel lists of review ids, model input
# strings, category ids and sentiment ids, plus the per-review sentences.
(
    laptop_review_ids,
    laptop_data,
    laptop_category_ids,
    laptop_sentiment_ids,
    laptop_reviews_sentences,
) = get_laptop_data(laptop_reviews, laptop_aspects)

In [None]:
# Tokenizer of the base ruBert checkpoint, and the sentiment classifier loaded
# from the model directory stored on Google Drive.
new_sent_tokenizer = AutoTokenizer.from_pretrained('sberbank-ai/ruBert-base')
new_sent_model = AutoModelForSequenceClassification.from_pretrained(
    '/content/drive/MyDrive/models/rubert-sentiment-nli_automobiles',
    label2id=label2id_sentiment,
    id2label=id2label_sentiment,
    num_labels=3,
)

In [None]:
# Move the classifier to `device` (defined elsewhere in the notebook).
# As the last expression of the cell, the returned module is echoed as output.
new_sent_model.to(device)

In [None]:
# Classify every laptop sentence/term pair one at a time and collect the gold
# and predicted sentiment label names for the report.
true_sentiment, predicted_sentiment = [], []

for text, sent_id in zip(laptop_data, laptop_sentiment_ids):
    # The input strings already contain [CLS]/[SEP], so the tokenizer must not
    # add its own special tokens.
    encoded = new_sent_tokenizer(
        text,
        add_special_tokens=False,
        truncation=True,
        max_length=100,
        padding=True,
        return_tensors='pt',
    ).to(device)

    with torch.no_grad():
        logits = new_sent_model(**encoded).logits

    # One input per call, so the argmax over all logits is the class index.
    predicted_id = logits.cpu().argmax().item()

    true_sentiment.append(new_sent_model.config.id2label[sent_id])
    predicted_sentiment.append(new_sent_model.config.id2label[predicted_id])

In [None]:
# Per-class precision/recall/F1 of the sentiment classifier on the laptop pairs
# (gold labels first, predictions second).
print(classification_report(true_sentiment, predicted_sentiment))