# Aspect-based summarization: sequence labeling and clustering

## Imports

In [None]:
!pip3 install stanza transformers evaluate seqeval --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.5/802.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m353.7/353.7 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from abc import ABC, abstractmethod
from collections import defaultdict, Counter
from copy import deepcopy
from functools import reduce
from google.colab import drive
from ast import literal_eval
import logging
import os
import random
from string import punctuation
from typing import Union

import numpy as np
import pandas as pd

import spacy
import stanza

import torch

import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import pipeline

from sklearn.cluster import AffinityPropagation
from scipy.spatial import distance

import evaluate

In [None]:
PUNCTUATION = punctuation.replace('\'', '').replace('"', '')

In [None]:
ner = evaluate.load('seqeval')
stanza.download('ru')

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: ru (Russian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.5.0/models/default.zip:   0%|          | 0…

INFO:stanza:Finished downloading models and saved to /root/stanza_resources.


In [None]:
def seed_everything(seed=42) -> None:
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False

In [None]:
seed_everything()

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
BIO = ['B-ASPECT', 'I-ASPECT', 'O']
BIO_sent = ['B-POS', 'I-POS', 'B-NEG', 'I-NEG', 'B-NEUT', 'I-NEUT', 'O']

label2id = {label: i for i, label in enumerate(BIO)}
id2label = {i: label for i, label in enumerate(BIO)}
label2id_sent = {label: i for i, label in enumerate(BIO_sent)}
id2label_sent = {i: label for i, label in enumerate(BIO_sent)}

In [None]:
nlp = stanza.Pipeline('ru', processors='tokenize,pos,lemma', verbose=False)

## Pipeline

1. Load data and process it
2. Load model for sequence labeling, inference and evaluate it on the data
3. Cluster aspects and polarity
4. Evaluate

## Sentiment labeling

### Data

In [None]:
laptop_reviews = pd.read_csv('/content/drive/MyDrive/Summarization/laptop data/reviews.tsv', delimiter='\t')
laptop_aspects = pd.read_excel('/content/drive/MyDrive/Summarization/laptop data/datasets/aspects-surface.xlsx')

In [None]:
laptop_reviews.head()

Unnamed: 0,id,product_id,text,pluses,minuses,review,stars
0,0,0,Плюсы: Unix Трекпад Качественный экран Качеств...,Unix Трекпад Качественный экран Качество сборк...,Трекпад начал пощелкивать спустя 3 месяца посл...,Я работаю разработчиком и покупал ноутбук имен...,5
1,1,0,Плюсы: - качество картинки на мониторе - тачпа...,- качество картинки на мониторе - тачпад прост...,"- корпус хрупковат, чуть ударил - вмятина на а...","Успел купить пару месяцев назад, сейчас смотрю...",5
2,2,0,"Плюсы: алюминий, марка, батарейка, экран, вес,...","алюминий, марка, батарейка, экран, вес, звук",софт ооооооооочень дорогой и многое нет!,"купил, первое впечатление ВАУ . Потом когда до...",4
3,3,0,Плюсы: - Экран - Тачпад - Качество сборки - Фи...,- Экран - Тачпад - Качество сборки - Фишки Mac...,"- Отсутствие Ethernet порта, все-таки ethernet...",Отличный ноутбук. После работы за таким экрано...,5
4,4,0,Плюсы: + вес + производительность + Дизайн + у...,+ вес + производительность + Дизайн + удобная ...,- со временем появляются битые пиксели - кабел...,Пользуюсь моделью 2013 года уже 2.5 года. Моде...,5


In [None]:
laptop_aspects.head()

Unnamed: 0,id,text_id,category,sent_from,sent_to,sentiment,sent,sent_term,normalized_sent_term,term_from,term_to,term,normalized_term,type
0,0,10,Non-performance,12,19,positive,Трекпад,Трекпад,Трекпад,12,19,Трекпад,Трекпад,explicit
1,1,10,Non-performance,20,32,positive,Качественный,Качественный экран,Качественный экран,33,38,экран,экран,explicit
2,2,10,Appearance,48,54,neutral,сборки,сборки,сборки,48,54,сборки,сборка,explicit
3,3,10,Performance,78,90,positive,аккумулятора,аккумулятора,аккумулятора,78,90,аккумулятора,аккумулятор,explicit
4,4,10,Non-performance,108,115,positive,колонок,колонок,колонки,108,115,колонок,колонки,explicit


In [None]:
# функция должна быть для минимальной единицы, в нашем случае это текст
def get_data(review: pd.DataFrame, aspects: pd.DataFrame) -> tuple:
    '''
    Get tokens and labels for sentiment labeling.
    '''
    bad_labels = 0

    sentences = []
    data = []
    aspect_labels = []
    aspect_ids = []
    sentiment_labels = []
    tokens_starts = []
    tokens_ends = []

    # get text id and text
    text_id = int(review['id'])
    text = str(review['text'])

    logging.warning('Text ID: %s' % text_id)

    # stanza processing to parse sentences
    doc = nlp(text)
    logging.warning('Processed by stanza...')

    # get needed mentions and aspects
    rev_aspects = aspects[aspects['text_id'] == text_id]

    mentions = rev_aspects['term'].values.tolist()
    starts = rev_aspects['term_from'].values.tolist()
    ends = rev_aspects['term_to'].values.tolist()
    sentiment = rev_aspects['sentiment'].values.tolist()

    # print(mentions)
    # print(starts)

    assert len(starts) == len(ends)

    # parse sentences
    logging.warning('Parse sentences...')

    for sent in doc.sentences:

        sentences.append(sent.text)

        sentence = []
        sentence_aspect_labels = []
        sentence_aspect_ids = []
        sentence_sentiment_labels = []
        sentence_starts = []
        sentence_ends = []

        # current state for multiple token aspects
        current_start = None
        current_end = 0
        current_id = None

        aspect_start = None
        aspect_end = None

        # print(sent.text)

        for token_idx, token in enumerate(sent.tokens):
            sentence.append(token.text)

            # save state of the token and indexes
            token_text = token.text
            start_char = token.start_char
            end_char = token.end_char

            sentence_starts.append(start_char)
            sentence_ends.append(end_char)

            # print(token_text, start_char, end_char)

            # remove punctuation from token string
            if token_text not in PUNCTUATION:
                right_text = token_text.lstrip(PUNCTUATION)
                start_char = start_char + (len(token_text) - len(right_text))

                token_text = right_text.rstrip(PUNCTUATION)
                end_char = end_char - (len(right_text) - len(token_text))

            # if after that string does not match start or start+end
            # it is outside label
            # print(token_text, start_char, end_char)

            # if we have multiple tokens in one aspect
            # if aspects start are enclosed then prevent new beginning
            if current_start is not None:
                # print('CURRENT START')
                sentiment_value = sentiment[current_id]
                sentence_aspect_labels.append('I-ASPECT')
                if sentiment_value == 'positive':
                    sentence_sentiment_labels.append('I-POS')
                elif sentiment_value == 'negative':
                    sentence_sentiment_labels.append('I-NEG')
                elif sentiment_value == 'neutral' or sentiment_value == 'both':
                    sentence_sentiment_labels.append('I-NEUT')

                if end_char >= current_end:
                    # print('END OF INSIDE')
                    # print(token_idx, token.text, 'INSIDE')
                    # if it is the last token in the current aspect
                    # update state
                    current_start = None
                    current_end = 0
                    current_id = None
                    aspect_end = token_idx

                    sentence_aspect_ids.append((aspect_start, aspect_end))
                    aspect_start = None
                    aspect_end = None

            elif start_char in starts:
                # print('START')
                
                # starts may be not unique
                dupl_start_idxs = [i for i, x in enumerate(starts) if x == start_char]
                # get the largest span
                if len(dupl_start_idxs) > 1:
                    # print(dupl_start_idxs)
                    local_end = 0
                    for idx in dupl_start_idxs:
                        new_end = ends[idx]
                        if new_end > local_end:
                            local_end = new_end
                            current_id = idx
                            # current_end = new_end
                else:
                    current_id = dupl_start_idxs[0]

                # print(current_end)

                if end_char < ends[current_id]:
                    # print('doesnt match', ends[current_id])
                    # print(token_idx, token.text, 'BEGIN')

                    current_start = starts[current_id]
                    current_end = ends[current_id]

                sentiment_value = sentiment[current_id]
                sentence_aspect_labels.append('B-ASPECT')
                if sentiment_value == 'positive':
                    sentence_sentiment_labels.append('B-POS')
                elif sentiment_value == 'negative':
                    sentence_sentiment_labels.append('B-NEG')
                elif sentiment_value == 'neutral' or sentiment_value == 'both':
                    sentence_sentiment_labels.append('B-NEUT')

                aspect_start = token_idx
                aspect_end = token_idx  # default ending
                # print(aspect_start, aspect_end)

            else:  # other cases
                # print('OTHER')
                # print(token_idx, token.text, 'OUTSIDE')
                sentence_aspect_labels.append('O')
                sentence_sentiment_labels.append('O')
                # print(aspect_start, aspect_end)

                if aspect_start is not None and aspect_end is not None:
                    # print('Found!')
                    sentence_aspect_ids.append((aspect_start, aspect_end))
                    aspect_start = None
                    aspect_end = None

        if len(sentence_aspect_labels) != len(sentence) or\
        len(sentence_sentiment_labels) != len(sentence):
            print('MISMATCHED LABELING')
            print(sentence_aspect_labels)
            print('length of sentence aspect labels', len(sentence_aspect_labels))
            print('length of sentence sentiment labels', len(sentence_sentiment_labels))
            print(sentence)
            print('length of sentence', len(sentence))
            print(sent.text)

            bad_labels += 1

        data.append(sentence)
        sentiment_labels.append(sentence_sentiment_labels)
        aspect_ids.append(sentence_aspect_ids)
        tokens_starts.append(sentence_starts)
        tokens_ends.append(sentence_ends)

    # logging.warning('Bad labels %d' % bad_labels)

    return sentences, data, sentiment_labels, aspect_ids, tokens_starts, tokens_ends

In [None]:
laptop_sentences = []
laptop_data = []
laptop_sentiment_labels = []
laptop_aspect_ids = []
laptop_starts = []
laptop_ends = []

for i in range(1, 8):  # product
    product_sentences = []
    product_data = []
    product_sentiment_labels = []
    product_aspect_ids = []
    product_starts = []
    product_ends = []

    for idx, review in laptop_reviews[laptop_reviews['product_id'] == i].iterrows():  # review
        all_data_from_laptops = get_data(review, laptop_aspects)

        sentences = all_data_from_laptops[0]
        data = all_data_from_laptops[1]
        sentiment_labels = all_data_from_laptops[2]
        aspect_ids = all_data_from_laptops[3]
        starts = all_data_from_laptops[4]
        ends = all_data_from_laptops[5]

        product_sentences.append(sentences)
        product_data.append(data)
        product_sentiment_labels.append(sentiment_labels)
        product_aspect_ids.append(aspect_ids)
        product_starts.append(starts)
        product_ends.append(ends)

    laptop_sentences.append(product_sentences)
    laptop_data.append(product_data)
    laptop_sentiment_labels.append(product_sentiment_labels)
    laptop_aspect_ids.append(product_aspect_ids)
    laptop_starts.append(product_starts)
    laptop_ends.append(product_ends)



In [None]:
print(len(laptop_data))

7


In [None]:
print(len(laptop_sentences[0][1]))

20


In [None]:
print(laptop_sentences[0][0])
print(laptop_data[0][0])
print(laptop_sentiment_labels[0][0])
print(laptop_aspect_ids[0][0])

['Плюсы: Unix Трекпад Качественный экран Качество сборки Долгое время работы от аккумулятора Отличный звук от колонок Минусы:', 'Трекпад начал пощелкивать спустя 3 месяца после покупки Аллюминиевый корпус вминается, если уронить/ударить ноутбук  Отзыв:', 'Я работаю разработчиком и покупал ноутбук именно для работы.', 'Пользуюсь ноутбуком уже 5-й месяц.', 'До этого пару лет пользовался Thinkpad x220 и Linux.', 'Очень радует то, что операционная система семейства Unix, а также то, что в интернете огромное количество информации под macOS и OS X для разработчиков, когда нужно что-то скомпилировать/собрать и т.д.', 'Это как Linux, только очень качественный и доведенный до ума без его болячек, вроде того, что что-нибудь отвалится в процессе работы само по себе и т.д.', 'Из недостатков: - корпус можно было бы сделать из более прочного материала - я один раз уронил ноутбук на кафель и получил неплохую вмятину на крышке.', '- не знаю с чем это связано, но трекпад начал немного потрескивать при 

In [None]:
len(laptop_data)

7

## Sequence Labeling

In [None]:
rubert_model_checkpoint = '/content/drive/MyDrive/models/rubert-sentiment-seqlabeling_both'
mbert_model_checkpoint = '/content/drive/MyDrive/models/mbert-sentiment-seqlabeling_both'
xlmroberta_model_checkpoint = '/content/drive/MyDrive/models/xlmroberta-sentiment-seqlabeling_both'
rubert_tokenizer_checkpoint = 'ai-forever/ruBert-base'
mbert_tokenizer_checkpoint = 'bert-base-multilingual-cased'
xlmroberta_tokenizer_checkpoint = 'xlm-roberta-base'

In [None]:
def load_model_and_tokenizer(model_path: str, tokenizer_path: str) -> tuple:
    '''
    Load model and tokenizer from paths.
    '''
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return model, tokenizer

In [None]:
v = np.zeros((1, 16))
print('Empty v:', v)

new_v = np.random.rand(1, 16)
v += new_v

print('Full v:', v)

Empty v: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
Full v: [[0.37454012 0.95071431 0.73199394 0.59865848 0.15601864 0.15599452
  0.05808361 0.86617615 0.60111501 0.70807258 0.02058449 0.96990985
  0.83244264 0.21233911 0.18182497 0.18340451]]


In [None]:
# sample ner inference
def get_ner_sentiment(model: AutoModelForTokenClassification, tokenizer: AutoTokenizer, data: list):
    '''
    Get sentiment labels using sequence labeling model.
    '''
    predicted_labels = []
    predicted_aspects = []
    predicted_embeddings = []
    predicted_sentiment = []

    for sent in data:
        encodings = tokenizer(sent, truncation=True, padding=True, is_split_into_words=True)
        inputs = tokenizer.encode(sent, truncation=True, padding=True, is_split_into_words=True, return_tensors="pt")

        # get model output
        output = model(inputs, output_hidden_states=True)
        outputs = output[0].to(device)
        embeddings = output.hidden_states[0][0]
        preds = torch.argmax(outputs.to('cpu'), dim=2)[0].tolist()

        # align predictions
        aligned_preds = []
        aligned_embs = []
        word_ids = encodings.word_ids()
        # print(len(word_ids))
        # print(word_ids)
        previous_word_idx = None
        aligned_embedding = np.zeros(768)
        aligned_num = 0
        for idx, word_idx in enumerate(word_ids):
            # print(idx, word_idx)
            if word_idx is not None:
                # print(len(embeddings[idx]))
                # print(type(embeddings[idx]))
                aligned_embedding += embeddings[idx].detach().numpy()
                aligned_num += 1
                # print('Just prediction', preds[idx])
                if word_idx != previous_word_idx:
                    # print('End of word', word_idx)
                    previous_word_idx = word_idx
                    # print('Append', preds[idx])
                    aligned_preds.append(preds[idx])

                    aligned_embedding /= aligned_num
                    aligned_embs.append(aligned_embedding)

                    aligned_embedding = np.zeros(768)
                    aligned_num = 0

        assert len(aligned_preds) == len(aligned_embs)

        # get sentiment, aspects ids and embeddings of aspects
        sent_aspects = []
        sent_embs = []
        sent_labels = []
        sent_sentiment = []

        cur_embedding = np.zeros(768)
        cur_num = 0
        cur_start = None
        cur_end = None
        cur_sent = None
        for idx, (sent_idx, sent_emb) in enumerate(zip(aligned_preds, aligned_embs)):
            sent = id2label_sent.get(sent_idx, None)
            cur_sent = sent
            sent_labels.append(sent)

            # not O
            if sent != 'O':
                if sent.startswith('B'):
                    # if B but previous was I
                    if cur_start is not None:
                        cur_embedding /= cur_num
                        sent_embs.append(cur_embedding)

                        sent_aspects.append((cur_start, cur_end))
                        if 'NEUT' in cur_sent:
                            sent_sentiment.append('neutral')
                        elif 'NEG' in cur_sent:
                            sent_sentiment.append('negative')
                        else:
                            sent_sentiment.append('positive')

                        cur_embedding = np.zeros(768)
                        cur_num = 0
                        cur_start = None
                        cur_end = None

                # I and B
                cur_embedding += sent_emb
                # print(cur_embedding.shape)
                cur_num += 1
                if cur_start is None:
                    cur_start = idx
                cur_end = idx

            # O
            else:
                if cur_embedding is not None and \
                cur_num is not None and \
                cur_start is not None and \
                cur_end is not None:
                    cur_embedding /= cur_num
                    sent_embs.append(cur_embedding)

                    sent_aspects.append((cur_start, cur_end))
                    if 'NEUT' in cur_sent:
                        sent_sentiment.append('neutral')
                    elif 'NEG' in cur_sent:
                        sent_sentiment.append('negative')
                    else:
                        sent_sentiment.append('positive')

                    cur_embedding = np.zeros(768)
                    cur_num = 0
                    cur_start = None
                    cur_end = 0

        predicted_labels.append(sent_labels)
        predicted_embeddings.append(sent_embs)
        predicted_aspects.append(sent_aspects)
        predicted_sentiment.append(sent_sentiment)

        assert len(sent_aspects) == len(sent_embs)
        assert len(sent_aspects) == len(sent_sentiment)

    return predicted_labels, predicted_aspects, predicted_embeddings, predicted_sentiment

In [None]:
def evaluate_aspects(gold_aspects: list, predicted_aspects: list) -> tuple:
    '''
    Evaluate aspect extraction using exact and partial matching.
    '''
    exact = 0
    partial = 0
    review_predicted = 0
    for gold_asp, pred_asp in zip(gold_aspects, predicted_aspects):  # sentence
        review_predicted += len(pred_asp)
        for pred in pred_asp:
            part = False
            for gold in gold_asp:
                if pred == gold:
                    exact += 1
                if pred[0] in range(gold[0], gold[1]+1) or \
                pred[1] in range(gold[0], gold[1]+1) or \
                gold[0] in range(pred[0], pred[1]+1) or \
                gold[1] in range(pred[0], pred[1]+1):
                    if part is False:
                        partial += 1
                        part = True

    exact /= review_predicted
    partial /= review_predicted

    return exact, partial

### ruBERT

In [None]:
model, tokenizer = load_model_and_tokenizer(rubert_model_checkpoint, rubert_tokenizer_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/590 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

In [None]:
laptop_aspect_ids[0][0][0]

[(3, 3), (5, 5), (7, 7), (12, 12), (16, 16)]

In [None]:
len(laptop_aspect_ids[0][0])

9

In [None]:
laptop_reviews[laptop_reviews['product_id'] == 1].values[0][2]

'Плюсы: Unix Трекпад Качественный экран Качество сборки Долгое время работы от аккумулятора Отличный звук от колонок Минусы: Трекпад начал пощелкивать спустя 3 месяца после покупки Аллюминиевый корпус вминается, если уронить/ударить ноутбук  Отзыв: Я работаю разработчиком и покупал ноутбук именно для работы. Пользуюсь ноутбуком уже 5-й месяц. До этого пару лет пользовался Thinkpad x220 и Linux. Очень радует то, что операционная система семейства Unix, а также то, что в интернете огромное количество информации под macOS и OS X для разработчиков, когда нужно что-то скомпилировать/собрать и т.д. Это как Linux, только очень качественный и доведенный до ума без его болячек, вроде того, что что-нибудь отвалится в процессе работы само по себе и т.д. Из недостатков: - корпус можно было бы сделать из более прочного материала - я один раз уронил ноутбук на кафель и получил неплохую вмятину на крышке. - не знаю с чем это связано, но трекпад начал немного потрескивать при касании в верхнем правом 

In [None]:
rubert_products_labels = []
rubert_products_aspects = []
rubert_products_embeddings = []
rubert_products_sentiment = []

exact_match = 0
partial_match = 0
length_reviews = 0
for prod_idx, prod_data, prod_aspects, prod_starts, prod_ends in zip(range(1, 8), laptop_data, laptop_aspect_ids, laptop_starts, laptop_ends):
    product_labels = []
    product_aspects = []
    product_embeddings = []
    product_sentiment = []

    reviews = laptop_reviews[laptop_reviews['product_id'] == prod_idx].values

    for review_idx, review_data, review_ids, review_starts, review_ends in zip(range(10), prod_data, prod_aspects, prod_starts, prod_ends):
        text = reviews[review_idx][2]
        length_reviews += 1
        predicted_labels, predicted_aspects, predicted_embeddings, predicted_sentiment = get_ner_sentiment(model, tokenizer, review_data)

        exact, partial = evaluate_aspects(review_ids, predicted_aspects)
        exact_match += exact
        partial_match += partial

        product_labels.extend(predicted_labels)
        product_sentiment.extend(predicted_sentiment)
        product_embeddings.extend(predicted_embeddings)

        for sent_aspects, sent_starts, sent_ends in zip(predicted_aspects, review_starts, review_ends):
            for aspect in sent_aspects:
                start = sent_starts[aspect[0]]
                end = sent_ends[aspect[1]]
                mention = text[start:end]
                product_aspects.append(mention)

    rubert_products_labels.append(product_labels)
    rubert_products_aspects.append(product_aspects)
    rubert_products_embeddings.append(product_embeddings)
    rubert_products_sentiment.append(product_sentiment)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
rubert_products_embeddings[0][0][0]

array([ 4.96308059e-02, -1.34820843e-01,  2.09761679e-01,  4.84964408e-01,
       -1.11202141e+00,  1.90307733e-01,  3.97298895e-01, -2.65984874e-01,
        1.63795583e-01,  9.19599548e-01, -5.59066385e-02,  7.12144002e-02,
       -3.63126136e-01,  9.03446704e-01,  1.46868795e-01, -8.55608359e-02,
        1.33253038e-02, -1.41299907e-01,  6.95190929e-01, -9.13301744e-02,
        3.30272272e-01, -3.99382703e-01,  4.26105238e-02,  5.79230547e-01,
        6.09528005e-01,  1.39435023e-01, -1.34074710e-01,  2.16704856e-01,
        5.39025757e-01, -4.83687595e-02, -1.17410466e-01,  3.06925475e-01,
        3.20490356e-01,  6.94918483e-01,  6.50924973e-01,  9.37668979e-03,
       -1.14951648e+00, -1.22901317e-01, -8.21064532e-01,  1.78127065e-02,
       -2.86318589e-01, -5.02646081e-02, -6.51861925e-01, -2.64013712e-01,
        2.34653493e-01,  3.21980488e-01, -1.69091128e-01, -1.65062189e-01,
        6.67685866e-02,  2.20214650e-02, -4.48146321e-01,  4.03519850e-01,
        1.21982187e-01,  

In [None]:
print(rubert_products_aspects[0])

['Unix Трекпад', 'Качественный', 'экран', 'Качество сборки', 'аккумулятора', 'звук от колонок', 'Трекпад', 'пощелкивать', 'корпус', 'вминается', 'ноутбук', 'ноутбуком', 'Thinkpad x220', 'Linux', 'операционная система', 'Linux', 'качественный', 'отвалится', 'корпус', 'прочного', 'материала', 'уронил', 'ноутбук на кафель', 'вмятину', 'крышке', 'трекпад', 'потрескивать', 'качество картинки на мониторе', 'тачпад', 'шустрый', 'ssd', 'авторегулировка яркости и экрана', 'подсветки клавиатуры', 'разъемы USB, HDMI', 'отверстие для SD карточек', 'USB-С', 'аккумулятор', 'POSIX операционная система', 'Linux', 'графическим интерфейсом', 'интеграция с айфоном', 'выхлоп вентиляторов', 'корпус', 'хрупковат', 'вмятина на алюминии', 'экран', 'глянцевый', 'экране', 'рамке', 'подвывает вентиляторами', 'прошка', 'thinkpad w510', 'линуксами', 'синкпадом', 'макбуке', 'нет ничего лишнего', 'диалап модемов', 'expresscard слотов', 'разъемы', 'датчика света', 'модель', 'ноутбук', 'Экран', 'картинки', 'монитора',

In [None]:
print(rubert_products_labels[0][0])

['O', 'O', 'B-POS', 'I-POS', 'B-POS', 'B-POS', 'B-POS', 'I-POS', 'O', 'O', 'O', 'O', 'I-POS', 'O', 'B-POS', 'I-POS', 'I-POS', 'O', 'O']


In [None]:
print(rubert_products_sentiment[0][0])

['positive', 'positive', 'positive', 'positive', 'positive', 'positive']


In [None]:
print('Exact match:', exact_match / length_reviews)
print('Partial match:', partial_match / length_reviews)

Exact match: 0.3013932900783797
Partial match: 0.51099214950965


In [None]:
print(len(rubert_products_sentiment))
print(len(rubert_products_aspects))
print(len(rubert_products_embeddings))

7
7
7


In [None]:
predictions = [sent_labels for prod_labels in rubert_products_labels for sent_labels in prod_labels]
references = [sent_labels for prod_sentiment in laptop_sentiment_labels for review_sentiment in prod_sentiment for sent_labels in review_sentiment]

In [None]:
print(ner.compute(predictions=predictions, references=references))

{'NEG': {'precision': 0.13263785394932937, 'recall': 0.39555555555555555, 'f1': 0.19866071428571427, 'number': 225}, 'NEUT': {'precision': 0.09923664122137404, 'recall': 0.20967741935483872, 'f1': 0.13471502590673576, 'number': 124}, 'POS': {'precision': 0.31203007518796994, 'recall': 0.527542372881356, 'f1': 0.3921259842519686, 'number': 472}, 'overall_precision': 0.21028307336799537, 'overall_recall': 0.44336175395858707, 'overall_f1': 0.2852664576802508, 'overall_accuracy': 0.7674065321968099}


### mBERT

In [None]:
model, tokenizer = load_model_and_tokenizer(mbert_model_checkpoint, mbert_tokenizer_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
mbert_products_labels = []
mbert_products_aspects = []
mbert_products_embeddings = []
mbert_products_sentiment = []

exact_match = 0
partial_match = 0
length_reviews = 0
for prod_idx, prod_data, prod_aspects, prod_starts, prod_ends in zip(range(1, 8), laptop_data, laptop_aspect_ids, laptop_starts, laptop_ends):
    product_labels = []
    product_aspects = []
    product_embeddings = []
    product_sentiment = []

    reviews = laptop_reviews[laptop_reviews['product_id'] == prod_idx].values

    for review_idx, review_data, review_ids, review_starts, review_ends in zip(range(10), prod_data, prod_aspects, prod_starts, prod_ends):
        text = reviews[review_idx][2]
        length_reviews += 1
        predicted_labels, predicted_aspects, predicted_embeddings, predicted_sentiment = get_ner_sentiment(model, tokenizer, review_data)

        exact, partial = evaluate_aspects(review_ids, predicted_aspects)
        exact_match += exact
        partial_match += partial

        product_labels.extend(predicted_labels)
        product_sentiment.extend(predicted_sentiment)
        product_embeddings.extend(predicted_embeddings)

        for sent_aspects, sent_starts, sent_ends in zip(predicted_aspects, review_starts, review_ends):
            for aspect in sent_aspects:
                start = sent_starts[aspect[0]]
                end = sent_ends[aspect[1]]
                mention = text[start:end]
                product_aspects.append(mention)

    mbert_products_labels.append(product_labels)
    mbert_products_aspects.append(product_aspects)
    mbert_products_embeddings.append(product_embeddings)
    mbert_products_sentiment.append(product_sentiment)

In [None]:
print('Exact match:', exact_match / length_reviews)
print('Partial match:', partial_match / length_reviews)

Exact match: 0.28419888076162814
Partial match: 0.5186624366275082


In [None]:
predictions = [sent_labels for prod_labels in mbert_products_labels for sent_labels in prod_labels]
references = [sent_labels for prod_sentiment in laptop_sentiment_labels for review_sentiment in prod_sentiment for sent_labels in review_sentiment]

In [None]:
print(ner.compute(predictions=predictions, references=references))

{'NEG': {'precision': 0.10743801652892562, 'recall': 0.28888888888888886, 'f1': 0.1566265060240964, 'number': 225}, 'NEUT': {'precision': 0.08053691275167785, 'recall': 0.1935483870967742, 'f1': 0.11374407582938388, 'number': 124}, 'POS': {'precision': 0.31565329883570503, 'recall': 0.5169491525423728, 'f1': 0.39196787148594375, 'number': 472}, 'overall_precision': 0.19868735083532219, 'overall_recall': 0.4056029232643118, 'overall_f1': 0.26672006407689225, 'overall_accuracy': 0.7661406025824964}


### XLM-RoBERTa

In [None]:
model, tokenizer = load_model_and_tokenizer(xlmroberta_model_checkpoint, xlmroberta_tokenizer_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
xlmroberta_products_labels = []
xlmroberta_products_aspects = []
xlmroberta_products_embeddings = []
xlmroberta_products_sentiment = []

exact_match = 0
partial_match = 0
length_reviews = 0
for prod_idx, prod_data, prod_aspects, prod_starts, prod_ends in zip(range(1, 8), laptop_data, laptop_aspect_ids, laptop_starts, laptop_ends):
    product_labels = []
    product_aspects = []
    product_embeddings = []
    product_sentiment = []

    reviews = laptop_reviews[laptop_reviews['product_id'] == prod_idx].values

    for review_idx, review_data, review_ids, review_starts, review_ends in zip(range(10), prod_data, prod_aspects, prod_starts, prod_ends):
        text = reviews[review_idx][2]
        length_reviews += 1
        predicted_labels, predicted_aspects, predicted_embeddings, predicted_sentiment = get_ner_sentiment(model, tokenizer, review_data)

        exact, partial = evaluate_aspects(review_ids, predicted_aspects)
        exact_match += exact
        partial_match += partial

        product_labels.extend(predicted_labels)
        product_sentiment.extend(predicted_sentiment)
        product_embeddings.extend(predicted_embeddings)

        for sent_aspects, sent_starts, sent_ends in zip(predicted_aspects, review_starts, review_ends):
            for aspect in sent_aspects:
                start = sent_starts[aspect[0]]
                end = sent_ends[aspect[1]]
                mention = text[start:end]
                product_aspects.append(mention)

    print(len(product_aspects))

    xlmroberta_products_labels.append(product_labels)
    xlmroberta_products_aspects.append(product_aspects)
    xlmroberta_products_embeddings.append(product_embeddings)
    xlmroberta_products_sentiment.append(product_sentiment)

316
284
171
202
182
264
210


In [None]:
print('Exact match:', exact_match / length_reviews)
print('Partial match:', partial_match / length_reviews)

Exact match: 0.31426945727377303
Partial match: 0.5411090958891839


In [None]:
predictions = [sent_labels for product_labels in xlmroberta_products_labels for sent_labels in product_labels]
references = [sent_labels for product_sentiment in laptop_sentiment_labels for review_sentiment in product_sentiment for sent_labels in review_sentiment]

In [None]:
print(ner.compute(predictions=predictions, references=references))

{'NEG': {'precision': 0.1378299120234604, 'recall': 0.4177777777777778, 'f1': 0.20727673649393605, 'number': 225}, 'NEUT': {'precision': 0.07804878048780488, 'recall': 0.12903225806451613, 'f1': 0.0972644376899696, 'number': 124}, 'POS': {'precision': 0.35233160621761656, 'recall': 0.576271186440678, 'f1': 0.43729903536977494, 'number': 472}, 'overall_precision': 0.23025919228450875, 'overall_recall': 0.46528623629719856, 'overall_f1': 0.3080645161290323, 'overall_accuracy': 0.7760148535741412}


## Summarization

In [None]:
laptop_summarization = pd.read_excel('/content/drive/MyDrive/Summarization/laptop data/datasets/summarization.xlsx')

laptop_summarization.head()

Unnamed: 0,id,product_id,term,sentiment
0,0,1,тачпад,positive
1,1,1,экран,positive
2,2,1,аккумулятор,positive
3,3,1,колонки,positive
4,4,1,сборка,positive


In [None]:
class Clusterisator(ABC):

    def __init__(self, n_clusters=None, random_state=42):
        self.n_clusters = n_clusters
        self.random_state = random_state

    @abstractmethod
    def _clusterisation(self):
        pass

    def _get_aspects(self, embeddings: list):
        '''
        Identify aspects after clusterisation.
        '''
        labels, centers = self._clusterisation(embeddings)

        labels = list(labels)

        min_dist_states = {}

        for idx, (label, embedding) in enumerate(zip(labels, embeddings)):
            if label not in min_dist_states:
                min_dist_states[label] = {'min_dist': 1, 'min_idx': None, 'embedding': None}
            # get embedding with minimum cosine distance
            # from center of cluster
            dist = distance.cosine(embedding, centers[label])
            min_dist = min_dist_states[label].get('min_dist', None)
            if min_dist:
                if min_dist > dist:
                    if min_dist_states[label]['embedding'] is not None:
                        # return previous embedding with minimum distance into the list
                        emb_idx = min_dist_states[label]['min_idx']
                        emb = min_dist_states[label]['embedding']
                        embeddings[emb_idx] = emb

                    min_dist_states[label]['min_dist'] = dist
                    min_dist_states[label]['min_idx'] = idx
                    min_dist_states[label]['embedding'] = embedding

                    # remove main aspect and it's embedding from clusters
                    # leave other embeddings of clusters
                    # to get summarized polarity
                    embeddings[idx] = None

        return labels, embeddings

    def get_summarized_aspects(self, embeddings: list, mentions: list, sentiment: list):
        '''
        Summarize sentiment by aspect in the clusters.
        '''
        embeddings = deepcopy(embeddings)
        labels, embeddings = self._get_aspects(embeddings)

        labels = [i - 1 for i in deepcopy(labels)]

        aspects = [0 for _ in range(len(set(labels)))]
        other_sentiment = [Counter() for _ in range(len(set(labels)))]

        for idx, (label, embedding) in enumerate(zip(labels, embeddings)):

            # get main aspect
            if embedding is None:
                aspects[label] = mentions[idx]
            other_sentiment[label][sentiment[idx]] += 1

        sentiments = [sentiment.most_common()[0][0] for sentiment in other_sentiment]

        return aspects, sentiments

In [None]:
class AffinityPropagationClusterisator(Clusterisator):

    def _clusterisation(self, embeddings: list) -> tuple:

        affp = AffinityPropagation(random_state=self.random_state, damping=0.7).fit(embeddings)
        labels = affp.labels_
        centers = affp.cluster_centers_

        return labels, centers

In [None]:
affp_clusterisator = AffinityPropagationClusterisator()

In [None]:
gold_summarization = laptop_summarization[laptop_summarization['product_id'] == 1]

gold_summarization.head()

Unnamed: 0,id,product_id,term,sentiment
0,0,1,тачпад,positive
1,1,1,экран,positive
2,2,1,аккумулятор,positive
3,3,1,колонки,positive
4,4,1,сборка,positive


In [None]:
gold_summarization.groupby('sentiment')['term'].apply(lambda x: x.tolist()).to_dict()

{'negative': ['алюминий',
  'корпус',
  'вентиляторы',
  'Отсутствие Ethernet порта',
  'Отсутствие поддержки NTFS',
  'ремонтопригодность',
  '2 usb',
  'кулер',
  'цена',
  'тепловыделению'],
 'neutral': ['Клавиатура', 'наворотики'],
 'positive': ['тачпад',
  'экран',
  'аккумулятор',
  'колонки',
  'сборка',
  'ноутбук',
  'операционная система',
  'ssd',
  'разъемы',
  'датчика света',
  'можно класть ноутбук на любой бок',
  'есть самые нужные разъемы',
  'марка',
  'вес',
  'звук',
  'железо',
  'габаритах',
  'производительность',
  'Thunderbolt',
  'дизайн',
  'Эргономичность',
  'открывания одной рукой',
  'отклик']}

In [None]:
def evaluate_summarization(summarization: list, gold_summarization: list) -> None:
    '''
    Evaluate summarization by exact and partial match.
    '''

    gold_summarization = [gold_coll.lower() for gold_coll in gold_summarization]
    exact_match = len([coll for coll in summarization if coll.lower() in gold_summarization])

    partial_match = []
    for coll in summarization:
        coll = coll.lower()
        for gold_coll in gold_summarization:
            if coll in gold_coll or gold_coll in coll:
                if coll not in partial_match:
                    partial_match.append(coll)
    partial_match = len(partial_match)

    return exact_match, partial_match

### ruBERT

In [None]:
len(rubert_products_aspects[0])

324

In [None]:
for l in rubert_products_aspects:
    print(len(l))

324
307
180
198
204
268
218


In [None]:
for l in mbert_products_aspects:
    print(len(l))

292
285
179
185
186
285
208


In [None]:
for l in xlmroberta_products_aspects:
    print(len(l))

316
284
171
202
182
264
210


In [None]:
rubert_summarization = []

summarization_length = 0
exact_match = 0
partial_match = 0

for product_embeddings, product_aspects, product_sentiment, product_id in\
zip(rubert_products_embeddings, rubert_products_aspects, rubert_products_sentiment, range(1, 8)):

    # now all sentences form one big review
    # it is needed to do summarization

    predicted_embeddings = reduce(lambda x, y: x + y, product_embeddings)
    predicted_sentiment = reduce(lambda x, y: x + y, product_sentiment)

    # print(len(product_aspects))
    # print(len(predicted_sentiment))
    # print(len(predicted_embeddings))

    aspects, sentiments = affp_clusterisator.get_summarized_aspects(
        predicted_embeddings,
        product_aspects,
        predicted_sentiment
        )
    
    summarization = defaultdict(list)
    for aspect, sentiment in zip(aspects, sentiments):
        summarization[sentiment].append(aspect)

    g_sum = gold_summarization.groupby('sentiment')['term'].apply(lambda x: x.tolist()).to_dict()

    for sent in ['positive', 'negative', 'neutral']:
        gold_colls = g_sum.get(sent, None)
        colls = summarization.get(sent, None)
        if gold_colls and colls:
            summarization_length += len(colls)
            sent_exact_match, sent_partial_match = evaluate_summarization(colls, gold_colls)
            exact_match += sent_exact_match
            partial_match += sent_partial_match

    rubert_summarization.append(summarization)

In [None]:
rubert_summarization[0]

defaultdict(list,
            {'positive': ['Linux',
              'корпус',
              'качество картинки на мониторе',
              'тачпад',
              'авторегулировка яркости и экрана',
              'вмятина на алюминии',
              'модель',
              'Аккумулятор',
              'софту',
              'красивый',
              'батарейка',
              'экран',
              'Windows',
              'Mac OS',
              'точка',
              'поддержку NTFS',
              'ssd диск',
              'зарядка',
              'качественный',
              'звук',
              'ноутбук',
              'дороже',
              'изоляция кабеля зарядного устройства',
              'apple',
              'Цена',
              'Дисплей',
              'удобно',
              'ремонт',
              'Мышка',
              'Клавиатура',
              'запятая',
              'Количество',
              'подсветка клавиатуры',
              'usb',
              'переход

In [None]:
print('Exact match:', exact_match / summarization_length)
print('Partial match:', partial_match / summarization_length)

Exact match: 0.09268292682926829
Partial match: 0.16097560975609757


### mBERT

In [None]:
mbert_summarization = []

summarization_length = 0
exact_match = 0
partial_match = 0

for product_embeddings, product_aspects, product_sentiment, product_id in\
zip(mbert_products_embeddings, mbert_products_aspects, mbert_products_sentiment, range(1, 8)):

    # now all sentences form one big review
    # it is needed to do summarization

    predicted_embeddings = reduce(lambda x, y: x + y, product_embeddings)
    predicted_sentiment = reduce(lambda x, y: x + y, product_sentiment)

    # print(len(product_aspects))
    # print(len(predicted_sentiment))
    # print(len(predicted_embeddings))

    aspects, sentiments = affp_clusterisator.get_summarized_aspects(
        predicted_embeddings,
        product_aspects,
        predicted_sentiment
        )
    
    summarization = defaultdict(list)
    for aspect, sentiment in zip(aspects, sentiments):
        summarization[sentiment].append(aspect)

    g_sum = gold_summarization.groupby('sentiment')['term'].apply(lambda x: x.tolist()).to_dict()

    for sent in ['positive', 'negative', 'neutral']:
        gold_colls = g_sum.get(sent, None)
        colls = summarization.get(sent, None)
        if gold_colls and colls:
            summarization_length += len(colls)
            sent_exact_match, sent_partial_match = evaluate_summarization(colls, gold_colls)
            exact_match += sent_exact_match
            partial_match += sent_partial_match

    mbert_summarization.append(summarization)

In [None]:
mbert_summarization[0]

defaultdict(list,
            {'positive': ['корпус',
              'трекпад',
              'качество картинки на мониторе',
              'хрупковат',
              'отпечатки',
              'экране',
              'корпус алюминиевый',
              'софту',
              'Интеграция с айфоном',
              'марка',
              'батарейка',
              'в appstore',
              'Экран',
              'Фишки Mac OS',
              'Много',
              'роутер',
              'не ацки тяжелый',
              'производительность',
              'качественный',
              'звук',
              'уголок',
              'ремонтопригодный',
              'ноутбук',
              'цена',
              'дисплеи',
              'изоляция кабеля зарядного устройства',
              'Цена',
              'тратой денег',
              'не шумят',
              'мышку',
              'дешевле',
              'Трекпад',
              'Корпус',
              'запятая',
              'п

In [None]:
print('Exact match:', exact_match / summarization_length)
print('Partial match:', partial_match / summarization_length)

Exact match: 0.08333333333333333
Partial match: 0.13157894736842105


### XLM-RoBERTa

In [None]:
xlmroberta_summarization = []

summarization_length = 0
exact_match = 0
partial_match = 0

for product_embeddings, product_aspects, product_sentiment, product_id in\
zip(xlmroberta_products_embeddings, xlmroberta_products_aspects, xlmroberta_products_sentiment, range(1, 8)):

    # now all sentences form one big review
    # it is needed to do summarization

    predicted_embeddings = reduce(lambda x, y: x + y, product_embeddings)
    predicted_sentiment = reduce(lambda x, y: x + y, product_sentiment)

    # print(len(product_aspects))
    # print(len(predicted_sentiment))
    # print(len(predicted_embeddings))

    aspects, sentiments = affp_clusterisator.get_summarized_aspects(
        predicted_embeddings,
        product_aspects,
        predicted_sentiment
        )
    
    summarization = defaultdict(list)
    for aspect, sentiment in zip(aspects, sentiments):
        summarization[sentiment].append(aspect)

    g_sum = gold_summarization.groupby('sentiment')['term'].apply(lambda x: x.tolist()).to_dict()

    for sent in ['positive', 'negative', 'neutral']:
        gold_colls = g_sum.get(sent, None)
        colls = summarization.get(sent, None)
        if gold_colls and colls:
            summarization_length += len(colls)
            sent_exact_match, sent_partial_match = evaluate_summarization(colls, gold_colls)
            exact_match += sent_exact_match
            partial_match += sent_partial_match

    xlmroberta_summarization.append(summarization)

In [None]:
xlmroberta_summarization[0]

defaultdict(list,
            {'positive': ['Трекпад',
              'пощелкивать',
              'вминается',
              'авторегулировка яркости и экрана',
              'разъемы USB, HDMI',
              'айфоном',
              'продуманный',
              'залапывается',
              'красивый',
              'опенсорс софт',
              'алюминий',
              'Софт',
              'Экран',
              'Mac OS',
              'тачпадом',
              'удобно',
              'производительность',
              'Дизайн',
              'удобная',
              'качественный',
              'переходник',
              'ноутбук',
              'не устарел',
              'корпусы',
              'дисплеи',
              'изоляция кабеля зарядного устройства',
              'Цена',
              'ноут',
              'тишине',
              'мышку',
              'трещать',
              'Трекпад',
              'Клавиатура',
              'дешевым',
              'Трекпад с

In [None]:
print('Exact match:', exact_match / summarization_length)
print('Partial match:', partial_match / summarization_length)

Exact match: 0.08947368421052632
Partial match: 0.1736842105263158
