# Fine-tuning on pair classification task

## Imports

In [1]:
!pip3 install gdown stanza spacy nltk torch==1.13.1 accelerate transformers evaluate --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.5/802.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m887.5/887.5 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m93.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive

from ast import literal_eval
from collections import Counter, defaultdict
import locale
import logging
import re
import random
import os
import pandas as pd
import pickle
import numpy as np
import sklearn
from sklearn.metrics import classification_report
import stanza
import spacy
import torch

import transformers
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from tqdm import tqdm

import evaluate

stanza.download('ru')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: ru (Russian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.5.0/models/default.zip:   0%|          | 0…

INFO:stanza:Finished downloading models and saved to /root/stanza_resources.


In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [5]:
!gdown https://drive.google.com/drive/folders/1HEqeIrbSkopfQd0JrHLxjq_jCS31tkaf -O restaurants --folder

Retrieving folder list
Processing file 1r4_x5yAzsEBB80zGA8R-xL1v8xhhPqIY eval_bert_dataset.csv
Processing file 1Q8ur86LbQHVLAtdOtSFeql4ju1WZM8bP eval_model_dataset.csv
Processing file 1-3XCoUmG1lNMUC0tNrGBB3Ehbb7nl1kn eval_xlmroberta_dataset.csv
Processing file 1VKPX4BN7YiJ25kQXAkvvSxaIUAT11tii test_bert_dataset.csv
Processing file 1j8bsfdNa6lF3XRFaQTCe4LgjX4kZ2X_O test_model_dataset.csv
Processing file 1--9malKd5FB7eTe_PW1sD_Yr0ZyBL8vl test_xlmroberta_dataset.csv
Processing file 150DdpYTzZBYknSflp5KW0qGtR-qGXma8 train_bert_dataset.csv
Processing file 1seaBHNEyDMfM4WxfIEIff_DIg2b7yxn8 train_model_dataset.csv
Processing file 1-9OuzuNy1MrObe9vpZL3q1km7n9ATPHc train_xlmroberta_dataset.csv
Retrieving folder list completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1r4_x5yAzsEBB80zGA8R-xL1v8xhhPqIY
To: /content/restaurants/eval_bert_dataset.csv
100% 355k/355k [00:00<00:00, 130MB/s]
Downloading...
From: https://driv

In [6]:
def seed_everything(seed=42) -> None:
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False

In [7]:
seed_everything()

In [8]:
SENTIMENT = ['neutral', 'positive', 'negative']
CATEGORIES = ['Whole', 'Interior', 'Service', 'Food', 'Price']
CATEGORIES_SENTIMENT = [f'{cat} {sent}' for cat in CATEGORIES for sent in SENTIMENT]

In [9]:
id2label_category = {i: label for i, label in enumerate(CATEGORIES)}
label2id_category = {v: k for k, v in id2label_category.items()}

id2label_sentiment = {i: label for i, label in enumerate(SENTIMENT)}
label2id_sentiment = {v: k for k, v in id2label_sentiment.items()}

id2label_category_sentiment = {i: label for i, label in enumerate(CATEGORIES_SENTIMENT)}
label2id_category_sentiment = {v: k for k, v in id2label_category_sentiment.items()}

In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## ABSA as NLI


A free interpretation of [this](https://aclanthology.org/W19-6120.pdf) article. In short: aspect-oriented sentiment analysis can be turned into the task of finding a connection between a premise and a hypothesis. In our case, a sentence containing an aspect is a premise, the aspect itself is a hypothesis, and they have some label (for example, tonality or category). How to do?
* Prepare the data. We need triples of the form premise-hypothesis-category.
* Fine-tune models
* Find aspects in the test data. It is possible again with the help of a thesaurus and the best collocations for some metric.

## Data

In [11]:
class ReviewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [12]:
def get_dataset(path: str, tokenizer: AutoTokenizer) -> tuple:
    '''
    Get dataset from files.
    '''
    dataset = pd.read_csv(path, delimiter='\t')
    data = dataset['text'].values.tolist()
    category_ids = dataset['category'].values.tolist()
    sentiment_ids = dataset['sentiment'].values.tolist()

    category_sentiment_ids = [label2id_category_sentiment[f'{id2label_category[cat]} {id2label_sentiment[sent]}'] for cat, sent in zip(category_ids, sentiment_ids)]

    encodings = tokenizer(data, truncation=True, max_length=100, padding=True, add_special_tokens=False)

    category_ds = ReviewsDataset(encodings, category_ids)
    sentiment_ds = ReviewsDataset(encodings, sentiment_ids)
    category_sentiment_ds = ReviewsDataset(encodings, category_sentiment_ids)

    return category_ds, sentiment_ds, category_sentiment_ds

In [13]:
train_bert = '/content/restaurants/train_bert_dataset.csv'
eval_bert = '/content/restaurants/eval_bert_dataset.csv'
test_bert = '/content/restaurants/test_bert_dataset.csv'

train_xlmroberta = '/content/restaurants/train_xlmroberta_dataset.csv'
eval_xlmroberta = '/content/restaurants/eval_xlmroberta_dataset.csv'
test_xlmroberta = '/content/restaurants/test_xlmroberta_dataset.csv'

In [14]:
df = pd.read_csv(train_bert, delimiter='\t')

In [15]:
df['text'].values.tolist()[0]

'[CLS] Я несколько раз была в этом заведении , о кухне могу сказать только самое хорошее . [SEP] заведении [SEP]'

## Model

In [16]:
class NLITrainPipeline:

    def __init__(self, train_ds, val_ds, model_checkpoint, new_model_checkpoint, labels):
        self.train_ds = train_ds
        self.val_ds = val_ds

        self.model_checkpoint = model_checkpoint
        self.new_model_checkpoint = new_model_checkpoint

        self.num_labels = len(labels)
        self.label2id = {label: i for i, label in enumerate(labels)}
        self.id2label = {i: label for i, label in enumerate(labels)}

        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_checkpoint,
            num_labels=self.num_labels,
            id2label=self.id2label,
            label2id=self.label2id
            )
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

        self.training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=64,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=100,
        )

        self.clf_report = None

    def train(self):
        '''
        Fine-tune model on the downstream task.
        '''
        self.model.to(device)
        trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_ds,
            eval_dataset=self.val_ds
        )

        trainer.train()
        trainer.save_model(self.new_model_checkpoint)

    def inference(self, path, label):
        '''
        Evaluate model.
        '''
        new_model = AutoModelForSequenceClassification.from_pretrained(
            self.new_model_checkpoint,
            num_labels=self.num_labels,
            id2label=self.id2label,
            label2id=self.label2id
            )
        new_model.to(device)

        true_labels = []
        predicted_labels = []

        test_ds = pd.read_csv(path, delimiter='\t')

        for idx, row in test_ds.iterrows():
            tokenized = self.tokenizer(
                row['text'], return_tensors='pt',
                truncation=True, max_length=100,
                padding=True, add_special_tokens=False
                )
            tokenized = tokenized.to(device)
            with torch.no_grad():
                classification_logits = new_model(**tokenized).logits
            predicted_class_id = classification_logits.to('cpu').argmax().item()
            true_labels.append(new_model.config.id2label[row[label]])
            predicted_labels.append(new_model.config.id2label[predicted_class_id])

        self.clf_report = classification_report(true_labels, predicted_labels)

        return self.clf_report

In [17]:
class NLITrainPipelineCatSent(NLITrainPipeline):

    def __init__(self, *args, **kwargs):
        super(NLITrainPipelineCatSent, self).__init__(*args, **kwargs)

    def inference(self, path):
        '''
        Evaluate model.
        '''
        new_model = AutoModelForSequenceClassification.from_pretrained(
            self.new_model_checkpoint,
            num_labels=self.num_labels,
            id2label=self.id2label,
            label2id=self.label2id
            )
        new_model.to(device)

        true_labels = []
        predicted_labels = []

        test_ds = pd.read_csv(path, delimiter='\t')

        for idx, row in test_ds.iterrows():
            tokenized = self.tokenizer(
                row['text'], return_tensors='pt',
                truncation=True, max_length=100,
                padding=True, add_special_tokens=False
                )
            tokenized = tokenized.to(device)
            with torch.no_grad():
                classification_logits = new_model(**tokenized).logits
            predicted_class_id = classification_logits.to('cpu').argmax().item()
            true_labels.append(f"{id2label_category[row['category']]} {id2label_sentiment[row['sentiment']]}")
            predicted_labels.append(new_model.config.id2label[predicted_class_id])

        self.clf_report = classification_report(true_labels, predicted_labels, output_dict=True)

        return classification_report(true_labels, predicted_labels)

    def sentiment_inference(self):
        '''
        Inference on sentiment to the category-sentiment model.
        '''
        if not self.clf_report:
            raise ValueError('Classification report is empty!')

        sentiment_report = {
            'neutral': {'precision': 0, 'recall': 0, 'f1-score': 0},
            'positive': {'precision': 0, 'recall': 0, 'f1-score': 0},
            'negative': {'precision': 0, 'recall': 0, 'f1-score': 0}
        }

        for label, metrics in self.clf_report.items():
            if label not in ['accuracy', 'macro avg', 'weighted avg']:
                sent_label = label.split()[1]
                if sent_label in sentiment_report:
                    sent_dict = sentiment_report[sent_label]
                    for metric, score in metrics.items():
                        if metric in sent_dict:
                            sent_dict[metric] += (score / len(CATEGORIES))

        return pd.DataFrame(sentiment_report)

In [18]:
# usage
# tokenized = tokenizer(text, return_tensors="pt")
# classification_logits = model(**tokenized).logits
# results = torch.softmax(classification_logits, dim=1).tolist()[0]
# for i in range(len(CATEGORIES)):
#     print(f"{CATEGORIES[i]}: {int(results[i] * 100)}%")

## ruBERT

In [19]:
model_checkpoint = 'ai-forever/ruBert-base'

In [20]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/590 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

In [21]:
train_category_ds, train_sentiment_ds, train_category_sentiment_ds = get_dataset(train_bert, tokenizer)
eval_category_ds, eval_sentiment_ds, eval_category_sentiment_ds = get_dataset(eval_bert, tokenizer)

In [22]:
train_category_ds[0]

{'input_ids': tensor([  101,   119,  1456,   768,  1073,   113,  1019, 46789,   121,   104,
         14726,  1385,  2312,   966,  3627, 13093,   126,   102, 46789,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 

### Sentiment model

In [23]:
nli_trainer_rubert = NLITrainPipeline(
    train_sentiment_ds, eval_sentiment_ds,
    model_checkpoint, 'rubert-sentiment-nli_restaurants', SENTIMENT
)
nli_trainer_rubert.train()

Downloading pytorch_model.bin:   0%|          | 0.00/716M [00:00<?, ?B/s]

Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not ini

Step,Training Loss
100,0.9189
200,0.5484
300,0.4684
400,0.4204
500,0.3651
600,0.3451
700,0.3173
800,0.3108
900,0.2487
1000,0.1486


In [24]:
print(nli_trainer_rubert.inference(test_bert, 'sentiment'))
del nli_trainer_rubert

              precision    recall  f1-score   support

    negative       0.74      0.81      0.78       129
     neutral       0.69      0.65      0.67       189
    positive       0.93      0.92      0.92       872

    accuracy                           0.87      1190
   macro avg       0.79      0.80      0.79      1190
weighted avg       0.87      0.87      0.87      1190



### Category model

In [25]:
nli_trainer_rubert = NLITrainPipeline(
    train_category_ds, eval_category_ds,
    model_checkpoint, 'rubert-category-nli_restaurants', CATEGORIES
)
nli_trainer_rubert.train()

Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not ini

Step,Training Loss
100,1.3827
200,0.4785
300,0.274
400,0.1983
500,0.144
600,0.1862
700,0.1661
800,0.144
900,0.096
1000,0.0787


In [26]:
print(nli_trainer_rubert.inference(test_bert, 'category'))
del nli_trainer_rubert

              precision    recall  f1-score   support

        Food       0.98      0.97      0.98       447
    Interior       0.94      0.95      0.94       228
       Price       1.00      0.97      0.99        36
     Service       0.92      0.96      0.94       328
       Whole       0.99      0.93      0.96       151

    accuracy                           0.96      1190
   macro avg       0.97      0.96      0.96      1190
weighted avg       0.96      0.96      0.96      1190



### Category and sentiment model

In [27]:
nli_trainer_rubert = NLITrainPipelineCatSent(
    train_category_sentiment_ds, eval_category_sentiment_ds,
    model_checkpoint, 'rubert-category-sentiment-nli_restaurants', CATEGORIES_SENTIMENT
)
nli_trainer_rubert.train()

Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not ini

Step,Training Loss
100,2.4967
200,1.5795
300,0.9803
400,0.7405
500,0.6494
600,0.5708
700,0.4885
800,0.5031
900,0.4212
1000,0.3058


In [28]:
print(nli_trainer_rubert.inference(test_bert))
print(nli_trainer_rubert.sentiment_inference())
del nli_trainer_rubert

                   precision    recall  f1-score   support

    Food negative       0.53      0.63      0.58        30
     Food neutral       0.64      0.61      0.62       117
    Food positive       0.85      0.85      0.85       300
Interior negative       0.71      0.74      0.73        27
 Interior neutral       0.36      0.36      0.36        11
Interior positive       0.90      0.91      0.90       190
   Price negative       0.80      0.50      0.62         8
    Price neutral       0.00      0.00      0.00         6
   Price positive       0.72      0.95      0.82        22
 Service negative       0.75      0.88      0.81        56
  Service neutral       0.70      0.35      0.47        20
 Service positive       0.93      0.96      0.95       252
   Whole negative       0.50      0.38      0.43         8
    Whole neutral       0.70      0.80      0.75        35
   Whole positive       0.93      0.83      0.88       108

         accuracy                           0.83      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## mBERT

In [29]:
model_checkpoint = 'bert-base-multilingual-cased'

In [30]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [31]:
train_category_ds, train_sentiment_ds, train_category_sentiment_ds = get_dataset(train_bert, tokenizer)
eval_category_ds, eval_sentiment_ds, eval_category_sentiment_ds = get_dataset(eval_bert, tokenizer)

### Sentiment model

In [32]:
nli_trainer_mbert = NLITrainPipeline(
    train_sentiment_ds, eval_sentiment_ds,
    model_checkpoint, 'mbert-sentiment-nli_restaurants', SENTIMENT
)
nli_trainer_mbert.train()

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Step,Training Loss
100,0.9093
200,0.6817
300,0.6206
400,0.6306
500,0.5872
600,0.5427
700,0.5354
800,0.4977
900,0.4324
1000,0.3067


In [33]:
print(nli_trainer_mbert.inference(test_bert, 'sentiment'))
del nli_trainer_mbert

              precision    recall  f1-score   support

    negative       0.57      0.75      0.65       129
     neutral       0.56      0.50      0.53       189
    positive       0.89      0.87      0.88       872

    accuracy                           0.80      1190
   macro avg       0.67      0.71      0.68      1190
weighted avg       0.80      0.80      0.80      1190



### Category model

In [34]:
nli_trainer_mbert = NLITrainPipeline(
    train_category_ds, eval_category_ds,
    model_checkpoint, 'mbert-category-nli_restaurants', CATEGORIES
)
nli_trainer_mbert.train()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Step,Training Loss
100,1.4264
200,0.6774
300,0.5019
400,0.3933
500,0.3529
600,0.4028
700,0.3465
800,0.3181
900,0.2377
1000,0.1957


In [35]:
print(nli_trainer_mbert.inference(test_bert, 'category'))
del nli_trainer_mbert

              precision    recall  f1-score   support

        Food       0.96      0.97      0.96       447
    Interior       0.95      0.92      0.94       228
       Price       0.92      0.94      0.93        36
     Service       0.94      0.94      0.94       328
       Whole       0.94      0.94      0.94       151

    accuracy                           0.95      1190
   macro avg       0.94      0.94      0.94      1190
weighted avg       0.95      0.95      0.95      1190



### Category and sentiment model

In [36]:
nli_trainer_mbert = NLITrainPipelineCatSent(
    train_category_sentiment_ds, eval_category_sentiment_ds,
    model_checkpoint, 'mbert-category-sentiment-nli_restaurants', CATEGORIES_SENTIMENT
)
nli_trainer_mbert.train()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Step,Training Loss
100,2.4536
200,1.6939
300,1.3324
400,1.1909
500,1.0413
600,1.0319
700,0.8614
800,0.8155
900,0.7151
1000,0.56


In [37]:
print(nli_trainer_mbert.inference(test_bert))
print(nli_trainer_mbert.sentiment_inference())
del nli_trainer_mbert

                   precision    recall  f1-score   support

    Food negative       0.39      0.40      0.39        30
     Food neutral       0.59      0.62      0.60       117
    Food positive       0.82      0.80      0.81       300
Interior negative       0.42      0.48      0.45        27
 Interior neutral       0.11      0.09      0.10        11
Interior positive       0.86      0.85      0.85       190
   Price negative       0.57      0.50      0.53         8
    Price neutral       0.00      0.00      0.00         6
   Price positive       0.72      0.82      0.77        22
 Service negative       0.69      0.77      0.73        56
  Service neutral       0.71      0.25      0.37        20
 Service positive       0.89      0.93      0.91       252
   Whole negative       0.00      0.00      0.00         8
    Whole neutral       0.51      0.86      0.64        35
   Whole positive       0.86      0.72      0.78       108

         accuracy                           0.77      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## XLM-RoBERTa

In [38]:
model_checkpoint = 'xlm-roberta-base'

In [39]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [40]:
train_category_ds, train_sentiment_ds, train_category_sentiment_ds = get_dataset(train_xlmroberta, tokenizer)
eval_category_ds, eval_sentiment_ds, eval_category_sentiment_ds = get_dataset(eval_xlmroberta, tokenizer)

### Sentiment model

In [41]:
nli_trainer_xlmroberta = NLITrainPipeline(
    train_sentiment_ds, eval_sentiment_ds,
    model_checkpoint, 'xlmroberta-sentiment-nli_restaurants', SENTIMENT
)
nli_trainer_xlmroberta.train()

Downloading pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

Step,Training Loss
100,0.954
200,0.7537
300,0.6567
400,0.652
500,0.6038
600,0.5947
700,0.5356
800,0.5204
900,0.4632
1000,0.368


In [42]:
print(nli_trainer_xlmroberta.inference(test_xlmroberta, 'sentiment'))
del nli_trainer_xlmroberta

              precision    recall  f1-score   support

    negative       0.69      0.78      0.73       129
     neutral       0.64      0.61      0.62       189
    positive       0.92      0.91      0.91       872

    accuracy                           0.85      1190
   macro avg       0.75      0.76      0.76      1190
weighted avg       0.85      0.85      0.85      1190



### Category model

In [43]:
nli_trainer_xlmroberta = NLITrainPipeline(
    train_category_ds, eval_category_ds,
    model_checkpoint, 'xlmroberta-category-nli_restaurants', CATEGORIES
)
nli_trainer_xlmroberta.train()

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

Step,Training Loss
100,1.4739
200,0.725
300,0.4813
400,0.3766
500,0.3201
600,0.3779
700,0.3431
800,0.308
900,0.2087
1000,0.1943


In [44]:
print(nli_trainer_xlmroberta.inference(test_xlmroberta, 'category'))
del nli_trainer_xlmroberta

              precision    recall  f1-score   support

        Food       0.97      0.97      0.97       447
    Interior       0.95      0.94      0.95       228
       Price       1.00      0.97      0.99        36
     Service       0.93      0.95      0.94       328
       Whole       0.95      0.95      0.95       151

    accuracy                           0.95      1190
   macro avg       0.96      0.96      0.96      1190
weighted avg       0.95      0.95      0.95      1190



### Category and sentiment model

In [45]:
nli_trainer_xlmroberta = NLITrainPipelineCatSent(
    train_category_sentiment_ds, eval_category_sentiment_ds,
    model_checkpoint, 'xlmroberta-category-sentiment-nli_restaurants', CATEGORIES_SENTIMENT
)
nli_trainer_xlmroberta.train()

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

Step,Training Loss
100,2.5836
200,1.8371
300,1.2984
400,1.0938
500,0.9905
600,0.9329
700,0.7963
800,0.7804
900,0.731
1000,0.5812


In [46]:
print(nli_trainer_xlmroberta.inference(test_xlmroberta))
print(nli_trainer_xlmroberta.sentiment_inference())
del nli_trainer_xlmroberta

                   precision    recall  f1-score   support

    Food negative       0.39      0.50      0.44        30
     Food neutral       0.69      0.58      0.63       117
    Food positive       0.83      0.87      0.85       300
Interior negative       0.58      0.78      0.67        27
 Interior neutral       1.00      0.18      0.31        11
Interior positive       0.91      0.92      0.92       190
   Price negative       0.64      0.88      0.74         8
    Price neutral       0.00      0.00      0.00         6
   Price positive       0.80      0.91      0.85        22
 Service negative       0.70      0.79      0.74        56
  Service neutral       0.57      0.20      0.30        20
 Service positive       0.93      0.94      0.94       252
   Whole negative       0.00      0.00      0.00         8
    Whole neutral       0.58      0.83      0.68        35
   Whole positive       0.87      0.81      0.84       108

         accuracy                           0.81      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [63]:
!mv -f /content/rubert-sentiment-nli_restaurants /content/drive/MyDrive/models/
!mv -f /content/mbert-sentiment-nli_restaurants /content/drive/MyDrive/models/
!mv -f /content/xlmroberta-sentiment-nli_restaurants /content/drive/MyDrive/models/

mv: cannot stat '/content/mbert-sentiment-nli_restaurants': No such file or directory


# Evaluation on laptops

In [48]:
laptop_aspects = pd.read_csv('/content/drive/MyDrive/Summarization/laptop data/aspects-surface.tsv', delimiter='\t')

laptop_aspects.head()

Unnamed: 0,id,text_id,category,sent_from,sent_to,sentiment,sent,sent_term,normalized_sent_term,term_from,term_to,term,normalized_term,type
0,0,0,Non-performance,12,19,positive,Трекпад,Трекпад,Трекпад,12,19,Трекпад,Трекпад,explicit
1,1,0,Non-performance,20,32,positive,Качественный,Качественный экран,Качественный экран,33,38,экран,экран,explicit
2,2,0,Appearance,48,54,neutral,сборки,сборки,сборки,48,54,сборки,сборка,explicit
3,3,0,Performance,78,90,positive,аккумулятора,аккумулятора,аккумулятора,78,90,аккумулятора,аккумулятор,explicit
4,4,0,Non-performance,108,115,positive,колонок,колонок,колонки,108,115,колонок,колонки,explicit


In [49]:
laptop_reviews = pd.read_csv('/content/drive/MyDrive/Summarization/laptop data/reviews.tsv', delimiter='\t')

laptop_reviews.head()

Unnamed: 0,id,product_id,text,pluses,minuses,review,stars
0,0,0,Плюсы: Unix Трекпад Качественный экран Качеств...,Unix Трекпад Качественный экран Качество сборк...,Трекпад начал пощелкивать спустя 3 месяца посл...,Я работаю разработчиком и покупал ноутбук имен...,5
1,1,0,Плюсы: - качество картинки на мониторе - тачпа...,- качество картинки на мониторе - тачпад прост...,"- корпус хрупковат, чуть ударил - вмятина на а...","Успел купить пару месяцев назад, сейчас смотрю...",5
2,2,0,"Плюсы: алюминий, марка, батарейка, экран, вес,...","алюминий, марка, батарейка, экран, вес, звук",софт ооооооооочень дорогой и многое нет!,"купил, первое впечатление ВАУ . Потом когда до...",4
3,3,0,Плюсы: - Экран - Тачпад - Качество сборки - Фи...,- Экран - Тачпад - Качество сборки - Фишки Mac...,"- Отсутствие Ethernet порта, все-таки ethernet...",Отличный ноутбук. После работы за таким экрано...,5
4,4,0,Плюсы: + вес + производительность + Дизайн + у...,+ вес + производительность + Дизайн + удобная ...,- со временем появляются битые пиксели - кабел...,Пользуюсь моделью 2013 года уже 2.5 года. Моде...,5


In [50]:
LAPTOP_CATEGORIES = list(laptop_aspects.category.unique())
LAPTOP_SENTIMENT = SENTIMENT

LAPTOP_CATEGORIES_SENTIMENT = [f'{cat} {sent}' for cat in LAPTOP_CATEGORIES for sent in LAPTOP_SENTIMENT]

In [51]:
laptop_id2label_category = {i: label for i, label in enumerate(LAPTOP_CATEGORIES)}
laptop_label2id_category = {v: k for k, v in id2label_category.items()}

laptop_id2label_sentiment = {i: label for i, label in enumerate(LAPTOP_SENTIMENT)}
laptop_label2id_sentiment = {v: k for k, v in id2label_sentiment.items()}

laptop_id2label_category_sentiment = {i: label for i, label in enumerate(LAPTOP_CATEGORIES_SENTIMENT)}
laptop_label2id_category_sentiment = {v: k for k, v in id2label_category_sentiment.items()}

In [52]:
nlp = stanza.Pipeline('ru', processors='tokenize')

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |

INFO:stanza:Using device: cuda
INFO:stanza:Loading: tokenize
INFO:stanza:Done loading processors!


In [53]:
def get_laptop_data(reviews: pd.DataFrame, aspects: pd.DataFrame) -> tuple:

    review_ids = []
    data = []
    category_ids = []
    sentiment_ids = []

    reviews_sentences = defaultdict(list)

    logging.warning('Start getting data...')

    reviews = reviews[reviews['id'].isin(aspects['text_id'].unique())]

    for rev_idx, rev in reviews.iterrows():
        text_id = rev['id']
        product_id = rev['product_id']
        text = rev['text']

        logging.warning('Text ID: %s' % text_id)

        # stanza processing to parse sentences and start and end characters
        doc = nlp(text)
        logging.warning('Processed by stanza')
        sents_with_end = {}
        for sent in doc.sentences:
            sentence = []
            for token in sent.tokens:
                sentence.append(token.text)
            
            end_idx = token.end_char  # last token in the current sentence
            
            sentence = ' '.join(sentence)
            sents_with_end[end_idx] = sentence
            reviews_sentences['text_id'].append(sentence)

        logging.warning('Got sentences and their ends')

        # catch needed sentence
        rev_aspects = aspects[aspects['text_id'] == text_id]
        logging.warning('Got aspects for the current review')

        for asp_idx, asp in rev_aspects.iterrows():
            mention_end_char = int(asp['term_to'])

            for end_char in sents_with_end:
                if end_char >= mention_end_char:
                    sentence = sents_with_end[end_char]
                    mention = asp['term']
                    data.append(f'[CLS] {sentence} [SEP] {mention} [SEP]')

                    category_ids.append(LAPTOP_CATEGORIES.index(asp['category']))
                    sentiment_ids.append(SENTIMENT.index(asp['sentiment']))

                    review_ids.append(text_id)

                    break  # just one sentence

        logging.warning('Got data for the current review')

    return review_ids, data, category_ids, sentiment_ids, reviews_sentences

In [54]:
laptop_review_ids, laptop_data, laptop_category_ids, laptop_sentiment_ids, laptop_reviews_sentences = get_laptop_data(laptop_reviews, laptop_aspects)



In [55]:
new_sent_model = AutoModelForSequenceClassification.from_pretrained(
    '/content/drive/MyDrive/models/rubert-sentiment-nli_restaurants',
    num_labels=3,
    id2label=id2label_sentiment,
    label2id=label2id_sentiment)
new_sent_tokenizer = AutoTokenizer.from_pretrained('sberbank-ai/ruBert-base')

Downloading (…)lve/main/config.json:   0%|          | 0.00/590 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

In [56]:
new_sent_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(120138, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [57]:
true_sentiment = []
predicted_sentiment = []
for text, sent_id in zip(laptop_data, laptop_sentiment_ids):
    tokenized = new_sent_tokenizer(text, return_tensors='pt', truncation=True, max_length=100, padding=True, add_special_tokens=False)
    tokenized = tokenized.to(device)
    with torch.no_grad():
        classification_logits = new_sent_model(**tokenized).logits
    predicted_class_id = classification_logits.to('cpu').argmax().item()
    # label = new_cat_model.config.id2label[predicted_class_id]
    true_sentiment.append(new_sent_model.config.id2label[sent_id])
    predicted_sentiment.append(new_sent_model.config.id2label[predicted_class_id])

In [58]:
print(classification_report(true_sentiment, predicted_sentiment))

              precision    recall  f1-score   support

    negative       0.50      0.65      0.56        74
     neutral       0.38      0.05      0.08        63
    positive       0.66      0.81      0.73       155

    accuracy                           0.60       292
   macro avg       0.51      0.50      0.46       292
weighted avg       0.56      0.60      0.55       292

