# Итоговый проект по АвтОбрЕЯ
## Aspect-Based Sentiment Analysis
### Команда №4: Алла Горбунова, Елизавета Клыкова, Анастасия Панасюк, Яна Шишкина

## Импорты и подготовка данных

In [1]:
!pip install stanza
!pip install transformers
!pip install fasttext

Collecting stanza
  Downloading stanza-1.3.0-py3-none-any.whl (432 kB)
     |████████████████████████████████| 432 kB 4.4 MB/s            
Installing collected packages: stanza
Successfully installed stanza-1.3.0


In [2]:
import re

import random
import numpy as np
import pandas as pd

from collections import defaultdict, Counter
from string import punctuation
from tqdm.auto import tqdm
from copy import copy

import stanza

import torch
from transformers import BertTokenizerFast, BertConfig, \
    BertForTokenClassification, AutoModelForSequenceClassification

import fasttext

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [3]:
seed = 200
random.seed(seed)
np.random.seed(seed)

In [4]:
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/dev_reviews.txt

--2021-12-27 12:19:01--  https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/dev_reviews.txt
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_reviews.txt [following]
--2021-12-27 12:19:01--  https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_reviews.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 110515 (108K) [text/plain]
Saving to: ‘dev_reviews.txt’


2021-12-27 12:19:01 (5.21 MB/s) - ‘dev_reviews.txt’ saved [110515/110515]



In [5]:
reviews = {}
with open('dev_reviews.txt') as f:
    for line in f:
        line = line.rstrip('\r\n').split('\t')
        reviews[int(line[0])] = line[1]

In [6]:
len(reviews)

71

## Этап 1: парсим тексты Станзой

In [7]:
stanza.download('ru')
nlp = stanza.Pipeline('ru', processors='tokenize,pos,depparse,lemma')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.3.0/models/default.zip:   0%|          | 0…

In [8]:
def parse_with_stanza(nlp, reviews: dict):
    parsed_reviews = {}
    for text_idx, text in tqdm(list(reviews.items())):
        doc = nlp(text)
        doc_dict = {}
        for sent_num, sentence in enumerate(doc.sentences):
            sent_dict = {}
            for tok_num, token in enumerate(sentence.tokens):
                tok_dict = {'id': token.id[0] - 1,
                            'text': token.text,
                            'start': token.start_char,
                            'end': token.end_char,
                            'pos': token.words[0].upos,
                            'head': token.words[0].head - 1,
                            'deprel': token.words[0].deprel,
                            'sent_index': sent_num,
                            'text_index': text_idx}
                sent_dict[tok_num] = tok_dict
            doc_dict[sent_num] = sent_dict
        parsed_reviews[text_idx] = doc_dict
    return parsed_reviews

In [9]:
test_parsed = parse_with_stanza(nlp, reviews)

  0%|          | 0/71 [00:00<?, ?it/s]

In [10]:
test_parsed[13823][0][0]

{'id': 0,
 'text': 'Зашли',
 'start': 0,
 'end': 5,
 'pos': 'VERB',
 'head': -1,
 'deprel': 'root',
 'sent_index': 0,
 'text_index': 13823}

## Этап 2: размечаем аспекты Бертом
Обучение модели [здесь](https://colab.research.google.com/drive/1e37Ek7kQjaOvyuutef2AHx57XrfTI7BJ?usp=sharing); в этой тетрадке будем использовать готовую. Сама модель лежит [тут](https://drive.google.com/drive/folders/1BIz1jpXK4GLQYHFL3-vy7NXuApIEhMbW?usp=sharing).

In [11]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [14]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [15]:
tokenizer = BertTokenizerFast.from_pretrained('DeepPavlov/rubert-base-cased')

aspect_model = BertForTokenClassification.from_pretrained(
    'DeepPavlov/rubert-base-cased', num_labels=3)
aspect_model.to(device)

# поменять пути, если в Колабе
aspect_model_path = '../input/aspect-model/aspect_model/pytorch_model.bin'
aspect_model.load_state_dict(torch.load(aspect_model_path))

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initializ

<All keys matched successfully>

In [16]:
def bio_sentence(sentence, tokenizer, model, device, ids_to_labels):
    # подаем токены из Станзы
    input_tokens = [tok['text'] for tok in list(sentence.values())]
    inputs = tokenizer(input_tokens,
                       is_split_into_words=True,
                       return_offsets_mapping=True,
                       padding='max_length',
                       truncation=True,
                       max_length=512,
                       return_tensors='pt')

    ids = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)
    outputs = model(ids, attention_mask=mask)
    logits = outputs[0]

    active_logits = logits.view(-1, model.num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1)

    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [ids_to_labels[i]
                         for i in flattened_predictions.cpu().numpy()]
    wp_preds = list(zip(tokens, token_predictions))

    sent_prediction = []
    for token_pred, mapping in zip(wp_preds,
                                   inputs["offset_mapping"].squeeze().tolist()):
        if mapping[0] == 0 and mapping[1] != 0:
            sent_prediction.append(token_pred[1])
        else:
            continue

    output = copy(sentence)
    for i, tag in enumerate(sent_prediction):
        output[i]['bio'] = tag

    return output

In [17]:
def bio_corpus(reviews, tokenizer, model, device):
    bio_tags = ['O', 'B', 'I']
    ids_to_labels = {v: k for v, k in enumerate(bio_tags)}

    tagged_reviews = {}
    for idx, review in tqdm(list(reviews.items())):
        tagged_text = {}
        for i, sentence in list(review.items()):
            tagged_sent = bio_sentence(sentence, tokenizer, model,
                                         device, ids_to_labels)
            tagged_text[i] = tagged_sent
        tagged_reviews[idx] = tagged_text

    return tagged_reviews

In [18]:
test_with_bio = bio_corpus(test_parsed, tokenizer, aspect_model, device)

  0%|          | 0/71 [00:00<?, ?it/s]

In [19]:
test_with_bio[13823][0][0]

{'id': 0,
 'text': 'Зашли',
 'start': 0,
 'end': 5,
 'pos': 'VERB',
 'head': -1,
 'deprel': 'root',
 'sent_index': 0,
 'text_index': 13823,
 'bio': 'O'}

## Этап 3: оцениваем тональность

### 3.1. Fine-Tuned BERT
Обучение модели [здесь](https://colab.research.google.com/drive/1uqYLiWqBpdiDZKCBMFMiLFA88lJitTa8?usp=sharing) (это не та же, что в предыдущем пункте); в этой тетрадке будем использовать готовую. Сама модель лежит [тут](https://drive.google.com/drive/folders/191PmR-fVBdkCs3Xxx4n7KVTeBnVJhY8V?usp=sharing), в той же тетрадке можно посмотреть на ее оценку.

При таком подходе нам не нужно думать о выделении однословных или многословных аспектов: модель приписывает сентимент-тег каждому токену.

In [20]:
tokenizer = BertTokenizerFast.from_pretrained('DeepPavlov/rubert-base-cased')

sentim_model = BertForTokenClassification.from_pretrained(
    'DeepPavlov/rubert-base-cased', num_labels=4)
sentim_model.to(device)

sentim_model_path = '../input/aspect-sentiment-model/aspect_sentiment_model/pytorch_model.bin'
sentim_model.load_state_dict(torch.load(sentim_model_path))

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initializ

<All keys matched successfully>

In [21]:
def sentiment_sentence(sentence, tokenizer, model, device, ids_to_labels):
    # подаем токены из Станзы
    input_tokens = [tok['text'] for tok in list(sentence.values())]
    inputs = tokenizer(input_tokens,
                       is_split_into_words=True,
                       return_offsets_mapping=True,
                       padding='max_length',
                       truncation=True,
                       max_length=512,
                       return_tensors='pt')

    ids = inputs['input_ids'].to(device)
    mask = inputs['attention_mask'].to(device)
    outputs = model(ids, attention_mask=mask)
    logits = outputs[0]

    active_logits = logits.view(-1, model.num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1)
    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [ids_to_labels[i]
                         for i in flattened_predictions.cpu().numpy()]
    wp_preds = list(zip(tokens, token_predictions))

    sent_prediction = []
    for token_pred, mapping in zip(wp_preds,
                                   inputs['offset_mapping'].squeeze().tolist()):
        if mapping[0] == 0 and mapping[1] != 0:
            sent_prediction.append(token_pred[1])
        else:
            continue

    output = copy(sentence)
    for i, tag in enumerate(sent_prediction):
        output[i]['sentiment1'] = tag

    return output

In [22]:
def sentiment_corpus(reviews, tokenizer, model, device):
    sent_tags = ['neutral', 'positive', 'negative', 'both']
    ids_to_labels = {v: k for v, k in enumerate(sent_tags)}
    # дизайнерское решение, объясняется в тетрадке с обучением
    ids_to_labels[-100] = 'O'

    tagged_reviews = {}
    for idx, review in tqdm(list(reviews.items())):
        tagged_text = {}
        for i, sentence in list(review.items()):
            tagged_sent = sentiment_sentence(sentence, tokenizer, model,
                                             device, ids_to_labels)
            tagged_text[i] = tagged_sent
        tagged_reviews[idx] = tagged_text

    return tagged_reviews

In [23]:
test_with_sentiment = sentiment_corpus(test_with_bio, tokenizer,
                                       sentim_model, device)

  0%|          | 0/71 [00:00<?, ?it/s]

In [24]:
test_with_sentiment[13823][0][0]

{'id': 0,
 'text': 'Зашли',
 'start': 0,
 'end': 5,
 'pos': 'VERB',
 'head': -1,
 'deprel': 'root',
 'sent_index': 0,
 'text_index': 13823,
 'bio': 'O',
 'sentiment1': 'neutral'}

### 3.2. Выделение аспектов
Для каждого токена мы получили указание на то, является ли токен частью аспекта, и его тональность. Теперь нам нужно собрать части аспектов в целые сущности.

In [25]:
def get_sentence_aspects(one_sent):

    aspects = []
    one_aspect = []
    prev = None

    for i, token in list(one_sent.items()):

        # если токен -- начало сущности, запоминаем
        if token['bio'] == 'B' and prev != 'B' and prev != 'I':
            one_aspect.append(token)

        # если токен -- начало нового аспекта, запоминаем предыдущий и новый
        elif token['bio'] == 'B' and (prev == 'B' or prev == 'I'):
            aspects.append(one_aspect)
            one_aspect = [token]

        # если токен -- продолжение сущности, запоминаем
        # сюда же попадут сущности, по ошибке не начинающиеся с B
        elif token['bio'] == 'I':
            one_aspect.append(token)

        # если сущность закончилась, запоминаем и обнуляем аспект
        elif token['bio'] == 'O' and (prev == 'I' or prev == 'B'):
            aspects.append(one_aspect)
            one_aspect = []

        # запоминаем, какой тег только что видели
        prev = token['bio']

    return aspects

Мы научились получать аспекты на уровне одного предложения. Теперь приведем их в нужный формат: соединим через пробел, припишем тональность, индекс начала и конца и т.д. Тональность аспекта выбираем как тональность первого слова.

In [26]:
all_aspects = []  # здесь все аспекты корпуса, чтобы было удобнее смотреть
text_aspects = []  # здесь аспекты, разделенные по текстам
for idx, text in tqdm(list(test_with_sentiment.items())):
    one_text = []
    for i, sent in list(text.items()):
        one_sent = []
        sent_aspects = [asp for asp in get_sentence_aspects(sent) if asp]
        for aspect in sent_aspects:
            clean_aspect = {'asp_idx': aspect[0]['text_index'],
                            'asp_text': ' '.join([pt['text'] for pt in aspect]),
                            'asp_start': aspect[0]['start'],
                            'asp_end': aspect[-1]['end'],
                            'sentiment1': aspect[0]['sentiment1']}
            all_aspects.append(clean_aspect)
            one_sent.append(clean_aspect)
        one_text.append(one_sent)
    text_aspects.append(one_text)

  0%|          | 0/71 [00:00<?, ?it/s]

Ниже еще один способ выбирать тональность -- с подсчетом тональностей всех слов и выбором наиболее частотной. На второй задаче этот подход показал себя чуть хуже, и мы остановились на предыдущем.

In [27]:
# all_aspects = []  # здесь все аспекты корпуса, чтобы было удобнее смотреть
# text_aspects = []  # здесь аспекты, разделенные по текстам
# for idx, text in tqdm(list(test_with_sentiment.items())):
#     one_text = []
#     for i, sent in list(text.items()):
#         sent_aspects = [asp for asp in get_sentence_aspects(sent) if asp]
#         for aspect in sent_aspects:
#             clean_aspect = {'asp_idx': aspect[0]['text_index'],
#                             'asp_text': ' '.join([pt['text'] for pt in aspect]),
#                             'asp_start': aspect[0]['start'],
#                             'asp_end': aspect[-1]['end']}
#             tones = Counter([pt['sentiment1'] for pt in aspect]).most_common()
#             if len(tones) == 1:
#                 clean_aspect['sentiment1'] = tones[0][0]
#             else:
#                 if tones[0][1] > tones[1][1]:
#                     clean_aspect['sentiment1'] = tones[0][0]
#                 else:
#                     clean_aspect['sentiment1'] = 'both'
#             all_aspects.append(clean_aspect)
#             one_text.append(clean_aspect)
#     text_aspects.append(one_text)

In [28]:
len(all_aspects)

1180

Посмотрим, сколько аспектов вообще должно быть в этих данных.

In [31]:
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/dev_aspects.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
--2021-12-27 12:27:13--  https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/dev_aspects.txt
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_aspects.txt [following]
--2021-12-27 12:27:13--  https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_aspects.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109

In [32]:
gold_aspects = []
keys = ('asp_idx', 'category', 'asp_text', 'asp_start', 'asp_end', 'sentiment1')

with open('dev_aspects.txt') as f:
    for line in f:
        line = line.rstrip('\r\n').split('\t')
        line_dict = {}
        for i, key in enumerate(keys):
            line_dict[key] = line[i]
        gold_aspects.append(line_dict)

In [33]:
len(gold_aspects)

1190

## Этап 4: определяем категории
Для определения категорий будем использовать классификацию. Попробуем несколько алгоритмов.

#### Подготовка данных
Нам нужно взять те же данные, на которых обучались предыдущие две модели, и таким же образом поделить их на обучающие и тестовые.

In [34]:
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/train_aspects.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
--2021-12-27 12:27:14--  https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/train_aspects.txt
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/train_aspects.txt [following]
--2021-12-27 12:27:14--  https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/train_aspects.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.1

In [35]:
asp_df = pd.read_csv('train_aspects.txt', sep='\t', usecols=[0, 1, 2],
                     names=['text_idx', 'category', 'aspect'], header=None)
asp_df

Unnamed: 0,text_idx,category,aspect
0,3976,Whole,ресторане
1,3976,Whole,ресторанах
2,3976,Whole,ресторане
3,3976,Service,Столик бронировали
4,3976,Service,администратор
...,...,...,...
4758,16630,Service,обслуживание
4759,16630,Food,Еда
4760,16630,Service,персоналу
4761,16630,Whole,ресторан


In [36]:
mapper = {topic: t for t, topic in enumerate(asp_df['category'].unique())}
print(mapper)

asp_df['target'] = asp_df['category'].map(mapper)
asp_df.head(5)

{'Whole': 0, 'Service': 1, 'Food': 2, 'Interior': 3, 'Price': 4}


Unnamed: 0,text_idx,category,aspect,target
0,3976,Whole,ресторане,0
1,3976,Whole,ресторанах,0
2,3976,Whole,ресторане,0
3,3976,Service,Столик бронировали,1
4,3976,Service,администратор,1


In [37]:
def process(text):
    splitted = re.split(' |/', text.lower())
    tokens = [re.sub('[^а-яёa-z0-9-]', '', word) for word in splitted]
    return tokens

In [38]:
asp_df['aspect'] = asp_df['aspect'].apply(process)
asp_df.head(5)

Unnamed: 0,text_idx,category,aspect,target
0,3976,Whole,[ресторане],0
1,3976,Whole,[ресторанах],0
2,3976,Whole,[ресторане],0
3,3976,Service,"[столик, бронировали]",1
4,3976,Service,[администратор],1


#### Fasttext-эмбеддинги

In [39]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
--2021-12-27 12:27:15--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4496459151 (4.2G) [application/octet-stream]
Saving to: ‘cc.ru.300.bin.gz’


2021-12-27 12:30:48 (20.2 MB/s) - ‘cc.ru.300.bin.gz’ saved [4496459151/4496459151]



In [40]:
!gunzip cc.ru.300.bin.gz

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [41]:
ft_model = fasttext.load_model('cc.ru.300.bin')



In [42]:
ft_model['ристоран'].shape

(300,)

#### Обучение
Чтобы обучить классификатор, поделим данные на обучающие и тестовые.

In [43]:
# тут мы поняли, что поделить данные мы поделили, а вот сохранили без индексов))
test_indices = [30808, 2495, 6668, 34956, 27629, 6376, 3152, 22015, 7824, 19503,
                12203, 12341, 2692, 1751, 1511, 26887, 3906, 21059, 33252,
                33816, 11770, 15335, 19746, 35904, 785, 1749, 17999, 19817,
                36948, 34402, 3552, 11355, 33798, 2006, 2364, 10825, 37070,
                35620, 1308, 16378, 7193, 7154, 33043]

In [44]:
test_df = asp_df[asp_df['text_idx'].isin(test_indices)]
train_df = asp_df[~asp_df['text_idx'].isin(test_indices)]

test_df

Unnamed: 0,text_idx,category,aspect,target
20,30808,Whole,[ресторане],0
21,30808,Interior,"[первом, этаже]",3
22,30808,Whole,"[руководству, ресторана]",0
23,30808,Service,"[обслуживающему, персоналу]",1
24,30808,Service,[сотрудникам],1
...,...,...,...,...
4751,33043,Service,[заказ],1
4752,33043,Service,[принесли],1
4753,33043,Food,[приготовили],2
4754,33043,Service,[оставил],1


In [45]:
x_train = []
for wordlist in train_df['aspect']:
    if len(wordlist) == 1:
        vector = ft_model[wordlist[0]]
        x_train.append(vector)
    else:
        vectors = [ft_model[word] for word in wordlist]
        vector = np.mean(vectors, axis=0)
        x_train.append(vector)

x_test = []
for wordlist in test_df['aspect']:
    if len(wordlist) == 1:
        vector = ft_model[wordlist[0]]
        x_test.append(vector)
    else:
        vectors = [ft_model[word] for word in wordlist]
        vector = np.mean(vectors, axis=0)
        x_test.append(vector)

y_train = train_df['target'].values
y_test = test_df['target'].values

In [46]:
classifiers = [LogisticRegression(random_state=200),
               MLPClassifier(max_iter=300, random_state=200),
               GaussianNB(),
               KNeighborsClassifier(), SVC(random_state=200),
               DecisionTreeClassifier(random_state=200)]
names = []
f1s = []
accuracies = []
for clf in tqdm(classifiers):
    clf.fit(x_train, y_train)
    preds = clf.predict(x_test)
    f1 = f1_score(y_test, preds, average='weighted')
    accuracy = f1_score(y_test, preds, average='micro')

    names.append(str(clf))
    f1s.append(f1)
    accuracies.append(accuracy)

  0%|          | 0/6 [00:00<?, ?it/s]

In [47]:
clf_df = pd.DataFrame({'classifier': names,
                        'f1': f1s,
                        'accuracy': accuracies})
clf_df.style.highlight_max(['f1', 'accuracy'])

Unnamed: 0,classifier,f1,accuracy
0,LogisticRegression(random_state=200),0.925796,0.926145
1,"MLPClassifier(max_iter=300, random_state=200)",0.934825,0.935007
2,GaussianNB(),0.801363,0.800591
3,KNeighborsClassifier(),0.910671,0.911374
4,SVC(random_state=200),0.94061,0.940916
5,DecisionTreeClassifier(random_state=200),0.889412,0.889217


Видим, что для нашей задачи хороши MLP и SVC (на другой выборке MLP лидировал с незначительными отличиями).

#### Ошибки SVC

In [48]:
svc = SVC(random_state=200)
svc.fit(x_train, y_train)

preds = svc.predict(x_test)
print(f1_score(y_test, preds, average='weighted'))
print(f1_score(y_test, preds, average='micro'))

0.9406100129182599
0.9409158050221565


In [49]:
inv_mapper = {i: target for target, i in list(mapper.items())}

In [50]:
mistakes = []
test_aspects = test_df['aspect'].tolist()
for i, (true, pred) in enumerate(zip(y_test, preds)):
    if true != pred:
        mistakes.append((inv_mapper[true], inv_mapper[pred], test_aspects[i]))

In [51]:
mistakes_df = pd.DataFrame(mistakes, columns=['true', 'pred', 'aspect'])
mistakes_df

Unnamed: 0,true,pred,aspect
0,Whole,Service,"[руководству, ресторана]"
1,Food,Interior,[стол]
2,Service,Food,[поварам]
3,Food,Service,[бивштексы]
4,Whole,Food,"[квартиру, 55]"
5,Whole,Food,"[сетевых, заведений, японской, кухни]"
6,Interior,Service,[караоке-программа]
7,Price,Whole,"[оставили, 10, тысяч]"
8,Whole,Interior,"[чердак, художника]"
9,Interior,Whole,[обтановка]


### Классифицируем наши аспекты
Ура, у нас есть классификатор! Применим его к полученным ранее аспектам.

In [52]:
def prepare_for_classification(aspects_list):
    aspects_sent = [asp['asp_text'] for asp in aspects_list]
    asp_test = []
    # если в предложении нет аспектов, возвращаем вектор нулей
    # потом просто выбросим это предсказание
    if not aspects_sent:
        asp_test = [np.zeros(300)]
    for aspect in aspects_sent:
        asp_parts = aspect.split()
        if len(asp_parts) == 1:
            vector = ft_model[asp_parts[0]]
            asp_test.append(vector)
        else:
            vectors = [ft_model[word] for word in asp_parts]
            vector = np.mean(vectors, axis=0)
            asp_test.append(vector)
    return asp_test

In [53]:
full_aspects_for_clf = []
for one_text_asps in text_aspects:
    asps_for_clf = []
    for one_sent_asps in one_text_asps:
        asps_for_clf.append(prepare_for_classification(one_sent_asps))
    full_aspects_for_clf.append(asps_for_clf)

In [54]:
def classify_aspects(aspects_for_clf, svc, inv_mapper):
    preds_by_sent = []
    for sent_asps in aspects_for_clf:
        preds = svc.predict(sent_asps)
        mapped_preds = [inv_mapper[pred] for pred in preds]
        preds_by_sent.append(mapped_preds)
    return preds_by_sent

In [55]:
all_pred_labels = []
for one_text in full_aspects_for_clf:
    pred_labels = classify_aspects(one_text, svc, inv_mapper)
    all_pred_labels.append(pred_labels)

In [56]:
all_pred_labels[0]  # для первого текста

[['Whole'],
 ['Service'],
 ['Service', 'Service', 'Service', 'Service', 'Service'],
 ['Service'],
 ['Service', 'Service'],
 ['Service'],
 ['Whole', 'Whole'],
 ['Service'],
 ['Food', 'Price'],
 ['Whole', 'Food', 'Price', 'Food', 'Service'],
 ['Whole']]

Добавляем предсказанные классы к выделенным ранее аспектам:

In [57]:
for i, text in enumerate(text_aspects):
    for j, sentence in enumerate(text):
        for n, aspect in enumerate(sentence):
            aspect['category'] = all_pred_labels[i][j][n]

In [58]:
text_aspects[0][0:3]  # аспекты первых 3 предложений первого текста

[[{'asp_idx': 13823,
   'asp_text': 'аппетит "',
   'asp_start': 8,
   'asp_end': 16,
   'sentiment1': 'neutral',
   'category': 'Whole'}],
 [],
 [{'asp_idx': 13823,
   'asp_text': 'встретил',
   'asp_start': 138,
   'asp_end': 146,
   'sentiment1': 'neutral',
   'category': 'Service'},
  {'asp_idx': 13823,
   'asp_text': 'менеджер',
   'asp_start': 147,
   'asp_end': 155,
   'sentiment1': 'neutral',
   'category': 'Service'},
  {'asp_idx': 13823,
   'asp_text': 'девушка',
   'asp_start': 179,
   'asp_end': 186,
   'sentiment1': 'neutral',
   'category': 'Service'},
  {'asp_idx': 13823,
   'asp_text': 'проводила к столу',
   'asp_start': 188,
   'asp_end': 205,
   'sentiment1': 'neutral',
   'category': 'Service'},
  {'asp_idx': 13823,
   'asp_text': 'дала меню',
   'asp_start': 208,
   'asp_end': 217,
   'sentiment1': 'neutral',
   'category': 'Service'}]]

Для подсчета тональности по категориям пока выбран самый простой способ: нет упоминания -- *absence*, все упоминания одной тональности -- эта тональность, есть упоминания разных тональностей -- *both*.

In [59]:
text_sentiments = {}

for text in text_aspects:
    text_sent = {}
    whole, interior, food, service, price = [], [], [], [], []
    
    for sentence in text:

        for aspect in sentence:

            text_idx = aspect['asp_idx']
            categ = aspect['category']
            sentim = aspect['sentiment1']
            if categ == 'Whole':
                whole.append(sentim)
            elif categ == 'Interior':
                interior.append(sentim)
            elif categ == 'Food':
                food.append(sentim)
            elif categ == 'Service':
                service.append(sentim)
            elif categ == 'Price':
                price.append(sentim)

    food = Counter(food).most_common()
    interior = Counter(interior).most_common()
    price = Counter(price).most_common()
    whole = Counter(whole).most_common()
    service = Counter(service).most_common()

    if len(food) == 0:
        text_sent['Food'] = 'absence'
    elif len(food) == 1:
        text_sent['Food'] = food[0][0]
    elif food[0][1] > food[1][1]:
        text_sent['Food'] = food[0][0]
    else:
        text_sent['Food'] = 'both'

    if len(interior) == 0:
        text_sent['Interior'] = 'absence'
    elif len(interior) == 1:
        text_sent['Interior'] = interior[0][0]
    elif interior[0][1] > interior[1][1]:
        text_sent['Interior'] = interior[0][0]
    else:
        text_sent['Interior'] = 'both'

    if len(price) == 0:
        text_sent['Price'] = 'absence'
    elif len(price) == 1:
        text_sent['Price'] = price[0][0]
    elif price[0][1] > price[1][1]:
        text_sent['Price'] = price[0][0]
    else:
        text_sent['Price'] = 'both'

    if len(whole) == 0:
        text_sent['Whole'] = 'absence'
    elif len(whole) == 1:
        text_sent['Whole'] = whole[0][0]
    elif whole[0][1] > whole[1][1]:
        text_sent['Whole'] = whole[0][0]
    else:
        text_sent['Whole'] = 'both'

    if len(service) == 0:
        text_sent['Service'] = 'absence'
    elif len(service) == 1:
        text_sent['Service'] = service[0][0]
    elif service[0][1] > service[1][1]:
        text_sent['Service'] = service[0][0]
    else:
        text_sent['Service'] = 'both'

    text_sentiments[text_idx] = text_sent

In [60]:
text_sentiments[13823]

{'Food': 'positive',
 'Interior': 'absence',
 'Price': 'positive',
 'Whole': 'positive',
 'Service': 'neutral'}

Запишем результаты разметки в итоговые файлы.

In [61]:
all_aspects_with_cats = []
for text in text_aspects:
    for sent in text:
        all_aspects_with_cats.extend(sent)

In [62]:
all_aspects_with_cats[0]

{'asp_idx': 13823,
 'asp_text': 'аппетит "',
 'asp_start': 8,
 'asp_end': 16,
 'sentiment1': 'neutral',
 'category': 'Whole'}

In [63]:
full_tagged_df = pd.DataFrame(all_aspects_with_cats)
full_tagged_df = full_tagged_df[['asp_idx', 'category', 'asp_text',
                                 'asp_start', 'asp_end', 'sentiment1']]
full_tagged_df

Unnamed: 0,asp_idx,category,asp_text,asp_start,asp_end,sentiment1
0,13823,Whole,"аппетит """,8,16,neutral
1,13823,Service,встретил,138,146,neutral
2,13823,Service,менеджер,147,155,neutral
3,13823,Service,девушка,179,186,neutral
4,13823,Service,проводила к столу,188,205,neutral
...,...,...,...,...,...,...
1175,11770,Food,стейк,831,836,positive
1176,11770,Food,блюдо тартар с сырой рыбой и сырым яйцом,896,936,positive
1177,11770,Service,Официанты,938,947,positive
1178,11770,Service,обстановкая,976,987,positive


In [64]:
full_tagged_df.to_csv('sentiment_by_aspect.tsv', sep='\t', index=False, header=False)

In [65]:
categs_for_df = []
for text, sentiments in list(text_sentiments.items()):
    for categ, sentim in list(sentiments.items()):
        categs_for_df.append([text, categ, sentim])

In [66]:
categ_tagged_df = pd.DataFrame(categs_for_df, columns=['idx', 'category', 'sentiment'])
categ_tagged_df

Unnamed: 0,idx,category,sentiment
0,13823,Food,positive
1,13823,Interior,absence
2,13823,Price,positive
3,13823,Whole,positive
4,13823,Service,neutral
...,...,...,...
350,11770,Food,both
351,11770,Interior,absence
352,11770,Price,absence
353,11770,Whole,neutral


In [67]:
categ_tagged_df.to_csv('sentiment_by_category.tsv', sep='\t', index=False, header=False)

## Этап 5 (эксперимент): оцениваем тональность готовой моделью

Мы будем определять тональность не каждого слова, а n-граммы "аспект + контекст".

Что у нас есть?
* test_with_sentiment -- потокенная разметка с делением на предложения и тексты
* text_aspects -- аспекты в виде списков токенов с делением на предложения и тексты

Что нужно получить?
* для каждого аспекта -- его контекст в рамках предложения
* для пары "аспект - контекст" -- тональность с помощью [готовой модели](https://huggingface.co/blanchefort/rubert-base-cased-sentiment)
* для каждого текста -- список входящих в него аспектов, их индексов и тональностей

Проблема:
* выбранная модель (и все другие, которые мы нашли) не умеет выделять *both* (такие употребления она помечает как *neutral*)
* попробуем решить это, добавив свой способ выбора сентимента из полученных вероятностей

In [68]:
window = 3
new_text_aspects = []
# идем по текстам
for text_idx, (rev_idx, text) in enumerate(list(test_with_sentiment.items())):
    one_text_aspects = []  # аспекты одного предложения в контексте
    # идем по предложению
    for sent_idx, sentence in enumerate(list(text.values())):
        one_sent_aspects = []
        # получаем аспекты, относящиеся к этому предложению
        sent_aspects = get_sentence_aspects(sentence)
        # проходим по аспектам и получаем контекст
        for aspect in sent_aspects:
            asp_with_context = []  # сюда положим аспект с контекстом
            first_word = aspect[0]['id']  # первое слово аспекта
            last_word = aspect[-1]['id']  # последнее слово аспекта
            first_context = first_word - window  # индекс первого слова контекста
            # если мы в рамках предложения, берем нужные токены
            for i in range(3):  # берем три слова до аспекта
                if first_context >= 0:
                    asp_with_context.append(sentence[first_context]['text'])
                    first_context += 1  # переходим к следующему слову
            for word in aspect:  # берем все слова аспекта
                asp_with_context.append(word['text'])
            last_context = last_word
            for i in range(3):  # берем три слова после аспекта
                last_context += 1
                # проверяем, что предложение еще не кончилось
                if last_context in set(sentence.keys()):
                    asp_with_context.append(sentence[last_context]['text'])
            asp_info = {'asp_idx': rev_idx,  # индекс ревью в файле
                        'asp_text': ' '.join([pt['text'] for pt in aspect]),
                        'asp_start': aspect[0]['start'],
                        'asp_end': aspect[-1]['end'],
                        'asp_with_context': ' '.join(asp_with_context)}
            one_sent_aspects.append(asp_info)
        one_text_aspects.append(one_sent_aspects)
    new_text_aspects.append(one_text_aspects)

In [69]:
new_text_aspects[0][2]  # аспекты третьего предложения первого текста

[{'asp_idx': 13823,
  'asp_text': 'встретил',
  'asp_start': 138,
  'asp_end': 146,
  'asp_with_context': 'встретил менеджер - темноволосая'},
 {'asp_idx': 13823,
  'asp_text': 'менеджер',
  'asp_start': 147,
  'asp_end': 155,
  'asp_with_context': 'менеджер - темноволосая стройная'},
 {'asp_idx': 13823,
  'asp_text': 'девушка',
  'asp_start': 179,
  'asp_end': 186,
  'asp_with_context': '- темноволосая стройная девушка , проводила к'},
 {'asp_idx': 13823,
  'asp_text': 'проводила к столу',
  'asp_start': 188,
  'asp_end': 205,
  'asp_with_context': 'стройная девушка , проводила к столу и дала меню'},
 {'asp_idx': 13823,
  'asp_text': 'дала меню',
  'asp_start': 208,
  'asp_end': 217,
  'asp_with_context': 'к столу и дала меню . .'}]

Теперь получим для каждого аспекта в контексте его тональность.

In [70]:
tokenizer = BertTokenizerFast.from_pretrained(
    'blanchefort/rubert-base-cased-sentiment')
pretrained_model = AutoModelForSequenceClassification.from_pretrained(
    'blanchefort/rubert-base-cased-sentiment', return_dict=True)

Downloading:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/943 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/679M [00:00<?, ?B/s]

In [71]:
@torch.no_grad()
def predict_sentiment(text, tokenizer, model):
    inputs = tokenizer(text, max_length=512, padding=True,
                       truncation=True, return_tensors='pt')
    outputs = model(**inputs)
    predicted = torch.nn.functional.softmax(outputs.logits, dim=1)
    return predicted

In [72]:
predict_sentiment(new_text_aspects[0][0][0]['asp_with_context'],
                  tokenizer, pretrained_model)

tensor([[0.8271, 0.1146, 0.0583]])

In [73]:
def get_sentiment_labels(aspect, tokenizer, model):
    text = aspect['asp_with_context']
    preds = predict_sentiment(text, tokenizer, model)
    pred_dict = {'neutral': preds[0][0],
                 'positive': preds[0][1],
                 'negative': preds[0][2]}
    pred_dict = {k: v for k, v in sorted(pred_dict.items(),
                                         key=lambda item: item[1],
                                         reverse=True)}
    most_prob_label = list(pred_dict.items())[0][0]
    highest_prob = list(pred_dict.items())[0][1]
    if highest_prob > 0.65:
        aspect['sentiment2'] = most_prob_label
    else:
        aspect['sentiment2'] = 'both'

In [74]:
for text in tqdm(new_text_aspects):
    for sentence in text:
        for aspect in sentence:
            get_sentiment_labels(aspect, tokenizer, pretrained_model)

  0%|          | 0/71 [00:00<?, ?it/s]

In [75]:
new_text_aspects[0][2]

[{'asp_idx': 13823,
  'asp_text': 'встретил',
  'asp_start': 138,
  'asp_end': 146,
  'asp_with_context': 'встретил менеджер - темноволосая',
  'sentiment2': 'neutral'},
 {'asp_idx': 13823,
  'asp_text': 'менеджер',
  'asp_start': 147,
  'asp_end': 155,
  'asp_with_context': 'менеджер - темноволосая стройная',
  'sentiment2': 'neutral'},
 {'asp_idx': 13823,
  'asp_text': 'девушка',
  'asp_start': 179,
  'asp_end': 186,
  'asp_with_context': '- темноволосая стройная девушка , проводила к',
  'sentiment2': 'neutral'},
 {'asp_idx': 13823,
  'asp_text': 'проводила к столу',
  'asp_start': 188,
  'asp_end': 205,
  'asp_with_context': 'стройная девушка , проводила к столу и дала меню',
  'sentiment2': 'both'},
 {'asp_idx': 13823,
  'asp_text': 'дала меню',
  'asp_start': 208,
  'asp_end': 217,
  'asp_with_context': 'к столу и дала меню . .',
  'sentiment2': 'negative'}]

Ощущение, что качество хуже, чем у предыдущего метода :(

**Дальше происходит все то же самое, что было выше для другого метода.**

In [76]:
new_full_aspects_for_clf = []
for one_text_asps in new_text_aspects:
    asps_for_clf = []
    for one_sent_asps in one_text_asps:
        asps_for_clf.append(prepare_for_classification(one_sent_asps))
    new_full_aspects_for_clf.append(asps_for_clf)

new_all_pred_labels = []
for one_text in new_full_aspects_for_clf:
    pred_labels = classify_aspects(one_text, svc, inv_mapper)
    new_all_pred_labels.append(pred_labels)

In [77]:
for i, text in enumerate(new_text_aspects):
    for j, sentence in enumerate(text):
        for n, aspect in enumerate(sentence):
            aspect['category'] = new_all_pred_labels[i][j][n]

In [78]:
new_text_aspects[0][2]

[{'asp_idx': 13823,
  'asp_text': 'встретил',
  'asp_start': 138,
  'asp_end': 146,
  'asp_with_context': 'встретил менеджер - темноволосая',
  'sentiment2': 'neutral',
  'category': 'Service'},
 {'asp_idx': 13823,
  'asp_text': 'менеджер',
  'asp_start': 147,
  'asp_end': 155,
  'asp_with_context': 'менеджер - темноволосая стройная',
  'sentiment2': 'neutral',
  'category': 'Service'},
 {'asp_idx': 13823,
  'asp_text': 'девушка',
  'asp_start': 179,
  'asp_end': 186,
  'asp_with_context': '- темноволосая стройная девушка , проводила к',
  'sentiment2': 'neutral',
  'category': 'Service'},
 {'asp_idx': 13823,
  'asp_text': 'проводила к столу',
  'asp_start': 188,
  'asp_end': 205,
  'asp_with_context': 'стройная девушка , проводила к столу и дала меню',
  'sentiment2': 'both',
  'category': 'Service'},
 {'asp_idx': 13823,
  'asp_text': 'дала меню',
  'asp_start': 208,
  'asp_end': 217,
  'asp_with_context': 'к столу и дала меню . .',
  'sentiment2': 'negative',
  'category': 'Service'}

In [79]:
new_text_sentiments = {}

for text in new_text_aspects:

    text_sent = {}
    whole, interior, food, service, price = [], [], [], [], []
    
    for sentence in text:

        for aspect in sentence:

            text_idx = aspect['asp_idx']
            categ = aspect['category']
            sentim = aspect['sentiment2']
            if categ == 'Whole':
                whole.append(sentim)
            elif categ == 'Interior':
                interior.append(sentim)
            elif categ == 'Food':
                food.append(sentim)
            elif categ == 'Service':
                service.append(sentim)
            elif categ == 'Price':
                price.append(sentim)

    food = Counter(food).most_common()
    interior = Counter(interior).most_common()
    price = Counter(price).most_common()
    whole = Counter(whole).most_common()
    service = Counter(service).most_common()

    if len(food) == 0:
        text_sent['Food'] = 'absence'
    elif len(food) == 1:
        text_sent['Food'] = food[0][0]
    elif food[0][1] > food[1][1]:
        text_sent['Food'] = food[0][0]
    else:
        text_sent['Food'] = 'both'

    if len(interior) == 0:
        text_sent['Interior'] = 'absence'
    elif len(interior) == 1:
        text_sent['Interior'] = interior[0][0]
    elif interior[0][1] > interior[1][1]:
        text_sent['Interior'] = interior[0][0]
    else:
        text_sent['Interior'] = 'both'

    if len(price) == 0:
        text_sent['Price'] = 'absence'
    elif len(price) == 1:
        text_sent['Price'] = price[0][0]
    elif price[0][1] > price[1][1]:
        text_sent['Price'] = price[0][0]
    else:
        text_sent['Price'] = 'both'

    if len(whole) == 0:
        text_sent['Whole'] = 'absence'
    elif len(whole) == 1:
        text_sent['Whole'] = whole[0][0]
    elif whole[0][1] > whole[1][1]:
        text_sent['Whole'] = whole[0][0]
    else:
        text_sent['Whole'] = 'both'

    if len(service) == 0:
        text_sent['Service'] = 'absence'
    elif len(service) == 1:
        text_sent['Service'] = service[0][0]
    elif service[0][1] > service[1][1]:
        text_sent['Service'] = service[0][0]
    else:
        text_sent['Service'] = 'both'

    new_text_sentiments[text_idx] = text_sent

In [80]:
new_all_aspects_with_cats = []
for text in new_text_aspects:
    for sent in text:
        new_all_aspects_with_cats.extend(sent)

In [81]:
new_all_aspects_with_cats[0]

{'asp_idx': 13823,
 'asp_text': 'аппетит "',
 'asp_start': 8,
 'asp_end': 16,
 'asp_with_context': 'Зашли в " аппетит " случайно .',
 'sentiment2': 'neutral',
 'category': 'Whole'}

In [82]:
new_full_tagged_df = pd.DataFrame(new_all_aspects_with_cats)
new_full_tagged_df = new_full_tagged_df[['asp_idx', 'category', 'asp_text', 'asp_start',
                                         'asp_end', 'sentiment2', 'asp_with_context']]
new_full_tagged_df.drop(columns=['asp_with_context'], inplace=True)
new_full_tagged_df.to_csv('new_sentiment_by_aspect.tsv', sep='\t', index=False, header=False)

In [83]:
new_categs_for_df = []
for text, sentiments in list(new_text_sentiments.items()):
    for categ, sentim in list(sentiments.items()):
        new_categs_for_df.append([text, categ, sentim])

In [84]:
new_categ_tagged_df = pd.DataFrame(new_categs_for_df, columns=['idx', 'category', 'sentiment'])
new_categ_tagged_df.to_csv('new_sentiment_by_category.tsv', sep='\t', index=False, header=False)

Ура! Оценка в другой тетрадке :)