# Importing Data

In [55]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

import spacy

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from collections import defaultdict, Counter
from tqdm import tqdm

import re

import torch
from torch import nn
from torch.utils.data import DataLoader

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/train_split_aspects.txt
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/train_split_reviews.txt
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/dev_aspects.txt
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/dev_reviews.txt

--2021-12-27 10:00:12--  https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/train_split_aspects.txt
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/train_split_aspects.txt [following]
--2021-12-27 10:00:12--  https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/train_split_aspects.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 170883 (167K) [text/plain]
Saving to: ‘train_split_aspects.txt’


2021-12-27 10:00:12 (7.40 MB/s) - ‘train_split_aspects.txt’ saved [170883/170883]

--2021-12-27 10:00:12--  https:

In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 40.6 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 485 kB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 29.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

# Train data preparation

In [4]:
train_asp = pd.read_csv('train_split_aspects.txt', 
                        delimiter='\t', 
                        names=['text_id', 'category', 'mention', 
                               'start', 'end', 'sentiment'])

train_texts = pd.read_csv('train_split_reviews.txt', 
                          delimiter='\t', names=['text_id','text'])

In [5]:
data = train_asp.merge(train_texts, on='text_id')

In [6]:
data

Unnamed: 0,text_id,category,mention,start,end,sentiment,text
0,30808,Whole,ресторане,16,25,neutral,Отмечали в этом ресторане день рождение на пер...
1,30808,Interior,первом этаже,43,55,neutral,Отмечали в этом ресторане день рождение на пер...
2,30808,Whole,руководству ресторана,124,145,positive,Отмечали в этом ресторане день рождение на пер...
3,30808,Service,обслуживающему персоналу,147,171,positive,Отмечали в этом ресторане день рождение на пер...
4,30808,Service,сотрудникам,189,200,positive,Отмечали в этом ресторане день рождение на пер...
...,...,...,...,...,...,...,...
3568,16630,Service,обслуживание,85,97,positive,Уютная и тёплая домашняя обстановка! Милый и о...
3569,16630,Food,Еда,99,102,positive,Уютная и тёплая домашняя обстановка! Милый и о...
3570,16630,Service,персоналу,244,253,positive,Уютная и тёплая домашняя обстановка! Милый и о...
3571,16630,Whole,ресторан,294,302,positive,Уютная и тёплая домашняя обстановка! Милый и о...


In [7]:
sentiment_vocab = {'positive': 1, 'negative': 0, 'both': 2, 'neutral': 3}

In [8]:
class Dataset():

    def __init__(self, df):
        self.df = df

    def __getitem__(self, index):
        text = self.df['text'].values[index]
        sentiment = self.df['sentiment'].values[index]
        masked = (text[:self.df['start'].values[index]] + '[MASK]' + 
                  text[self.df['end'].values[index]:])
        
        return masked, sentiment_vocab[sentiment]

    def __len__(self):

        return len(self.df)

In [9]:
train_data, val_data = train_test_split(data, test_size=0.25)

train_dataset = Dataset(train_data)
val_dataset = Dataset(val_data)

In [10]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [11]:
class Classifier(nn.Module):

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(p=0.3)
        self.linear_1 = nn.Linear(312, 4)
        self.softmax = nn.Softmax()

    def forward(self, x, attention_mask=None):
        output  = self.bert(x, attention_mask=attention_mask)
        x = output['last_hidden_state'][:, 0]
        x = self.dropout(x)
        x = self.linear_1(x)
        x = self.softmax(x)
        
        return x

In [12]:
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup

# Import the BERT model

In [13]:
MODEL_NAME = "cointegrated/rubert-tiny"
EPOCHS = 4
BATCH_SIZE = 8

### Create dataloaders

In [14]:
train_loss = []
val_loss = []

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
criterion = nn.CrossEntropyLoss()

model = AutoModel.from_pretrained(
    MODEL_NAME
)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, 
    do_lower_case=True
)

optimizer = AdamW(model.parameters(),
                  weight_decay = 1e-4,
                  lr = 4e-5,
                  eps = 1e-8
                )

total_steps = len(train_dataloader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [16]:
model_classifier = Classifier(model)

In [17]:
from sklearn.metrics import accuracy_score, f1_score

### Training loop

In [18]:
import random
import numpy as np

seed_val = 2020

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

for epoch in tqdm(range(EPOCHS)):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, EPOCHS))
    print('Training...')

    current_train_loss = 0 
    model_classifier.train()

    for text, y in tqdm(train_dataloader, leave=False):
        tokens = tokenizer.batch_encode_plus(list(text),
            pad_to_max_length=True,
            return_tensors='pt')

        logits = model_classifier(tokens['input_ids'], 
                                  attention_mask=tokens['attention_mask'])

        loss = criterion(logits, y)
        current_train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model_classifier.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    
    train_loss.append(current_train_loss / len(train_dataloader))

    model_classifier.eval()
    current_val_loss = 0
    predictions = []
    actuals = [] 

    for text, y in val_dataloader:
        tokens = tokenizer.batch_encode_plus(
            list(text),
            pad_to_max_length=True,
            return_tensors='pt')

        logits = model_classifier(tokens['input_ids'], 
                                  attention_mask=tokens['attention_mask'])
        
        loss = criterion(logits, y)
        current_val_loss += loss.item()

        predictions += list(logits.argmax(axis=1).data.numpy())
        actuals += list(y.data.numpy())
    
    print('Accuracy:', accuracy_score(actuals, predictions))    
    val_loss.append(current_val_loss / len(val_dataloader))

  0%|          | 0/4 [00:00<?, ?it/s]


Training...



  from ipykernel import kernelapp as app

  0%|          | 1/335 [00:04<25:00,  4.49s/it][A
  1%|          | 2/335 [00:07<18:54,  3.41s/it][A
  1%|          | 3/335 [00:10<18:01,  3.26s/it][A
  1%|          | 4/335 [00:13<17:03,  3.09s/it][A
  1%|▏         | 5/335 [00:15<15:58,  2.91s/it][A
  2%|▏         | 6/335 [00:18<15:30,  2.83s/it][A
  2%|▏         | 7/335 [00:21<15:26,  2.83s/it][A
  2%|▏         | 8/335 [00:23<14:56,  2.74s/it][A
  3%|▎         | 9/335 [00:25<13:09,  2.42s/it][A
  3%|▎         | 10/335 [00:27<12:41,  2.34s/it][A
  3%|▎         | 11/335 [00:29<12:37,  2.34s/it][A
  4%|▎         | 12/335 [00:32<12:28,  2.32s/it][A
  4%|▍         | 13/335 [00:34<12:16,  2.29s/it][A
  4%|▍         | 14/335 [00:36<11:43,  2.19s/it][A
  4%|▍         | 15/335 [00:39<12:44,  2.39s/it][A
  5%|▍         | 16/335 [00:41<12:58,  2.44s/it][A
  5%|▌         | 17/335 [00:44<13:41,  2.58s/it][A
  5%|▌         | 18/335 [00:46<13:02,  2.47s/it][A
  6%|▌         | 19/335 [00:49

Accuracy: 0.6621923937360179

Training...



  from ipykernel import kernelapp as app

  0%|          | 1/335 [00:02<14:28,  2.60s/it][A
  1%|          | 2/335 [00:04<12:09,  2.19s/it][A
  1%|          | 3/335 [00:07<13:53,  2.51s/it][A
  1%|          | 4/335 [00:10<14:28,  2.62s/it][A
  1%|▏         | 5/335 [00:12<14:16,  2.60s/it][A
  2%|▏         | 6/335 [00:15<14:21,  2.62s/it][A
  2%|▏         | 7/335 [00:18<14:36,  2.67s/it][A
  2%|▏         | 8/335 [00:20<14:23,  2.64s/it][A
  3%|▎         | 9/335 [00:22<12:48,  2.36s/it][A
  3%|▎         | 10/335 [00:24<12:27,  2.30s/it][A
  3%|▎         | 11/335 [00:27<12:30,  2.32s/it][A
  4%|▎         | 12/335 [00:29<12:21,  2.30s/it][A
  4%|▍         | 13/335 [00:31<12:11,  2.27s/it][A
  4%|▍         | 14/335 [00:33<12:04,  2.26s/it][A
  4%|▍         | 15/335 [00:36<12:51,  2.41s/it][A
  5%|▍         | 16/335 [00:39<13:00,  2.45s/it][A
  5%|▌         | 17/335 [00:41<13:37,  2.57s/it][A
  5%|▌         | 18/335 [00:44<12:56,  2.45s/it][A
  6%|▌         | 19/335 [00:46

Accuracy: 0.6621923937360179

Training...



  from ipykernel import kernelapp as app

  0%|          | 1/335 [00:02<14:35,  2.62s/it][A
  1%|          | 2/335 [00:04<12:10,  2.19s/it][A
  1%|          | 3/335 [00:07<13:55,  2.52s/it][A
  1%|          | 4/335 [00:10<14:33,  2.64s/it][A
  1%|▏         | 5/335 [00:12<14:20,  2.61s/it][A
  2%|▏         | 6/335 [00:15<14:21,  2.62s/it][A
  2%|▏         | 7/335 [00:18<14:36,  2.67s/it][A
  2%|▏         | 8/335 [00:20<14:22,  2.64s/it][A
  3%|▎         | 9/335 [00:22<12:45,  2.35s/it][A
  3%|▎         | 10/335 [00:24<12:21,  2.28s/it][A
  3%|▎         | 11/335 [00:26<12:22,  2.29s/it][A
  4%|▎         | 12/335 [00:29<12:20,  2.29s/it][A
  4%|▍         | 13/335 [00:31<12:08,  2.26s/it][A
  4%|▍         | 14/335 [00:33<11:36,  2.17s/it][A
  4%|▍         | 15/335 [00:36<12:31,  2.35s/it][A
  5%|▍         | 16/335 [00:38<12:47,  2.41s/it][A
  5%|▌         | 17/335 [00:41<13:30,  2.55s/it][A
  5%|▌         | 18/335 [00:43<12:52,  2.44s/it][A
  6%|▌         | 19/335 [00:46

Accuracy: 0.6621923937360179

Training...



  from ipykernel import kernelapp as app

  0%|          | 1/335 [00:02<14:22,  2.58s/it][A
  1%|          | 2/335 [00:04<12:09,  2.19s/it][A
  1%|          | 3/335 [00:07<14:00,  2.53s/it][A
  1%|          | 4/335 [00:10<14:38,  2.66s/it][A
  1%|▏         | 5/335 [00:12<14:28,  2.63s/it][A
  2%|▏         | 6/335 [00:15<14:30,  2.65s/it][A
  2%|▏         | 7/335 [00:18<14:46,  2.70s/it][A
  2%|▏         | 8/335 [00:20<14:34,  2.67s/it][A
  3%|▎         | 9/335 [00:22<12:52,  2.37s/it][A
  3%|▎         | 10/335 [00:24<12:28,  2.30s/it][A
  3%|▎         | 11/335 [00:27<12:27,  2.31s/it][A
  4%|▎         | 12/335 [00:29<12:19,  2.29s/it][A
  4%|▍         | 13/335 [00:31<12:04,  2.25s/it][A
  4%|▍         | 14/335 [00:33<11:33,  2.16s/it][A
  4%|▍         | 15/335 [00:36<12:32,  2.35s/it][A
  5%|▍         | 16/335 [00:38<12:48,  2.41s/it][A
  5%|▌         | 17/335 [00:41<13:29,  2.55s/it][A
  5%|▌         | 18/335 [00:43<12:49,  2.43s/it][A
  6%|▌         | 19/335 [00:46

Accuracy: 0.6621923937360179





In [19]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
torch.save(model_classifier.state_dict(), '/content/drive/My Drive/sentiment_classifier')

In [None]:
model_classifier.load_state_dict(torch.load(
    '/content/drive/My Drive/sentiment_classifier'))

In [None]:
model_classifier

### Preparing test data

In [21]:
test_asp = pd.read_csv('dev_aspects.txt', 
                        delimiter='\t', 
                        names=['text_id', 'category', 'mention', 
                               'start', 'end', 'sentiment'])

test_texts = pd.read_csv('dev_reviews.txt', 
                          delimiter='\t', names=['text_id','text'])

In [22]:
test_data = test_asp.merge(test_texts, on='text_id')

In [23]:
test_data

Unnamed: 0,text_id,category,mention,start,end,sentiment,text
0,3976,Whole,ресторане,71,80,neutral,"День 8-го марта прошёл, можно и итоги подвести..."
1,3976,Whole,ресторанах,198,208,neutral,"День 8-го марта прошёл, можно и итоги подвести..."
2,3976,Whole,ресторане,256,265,neutral,"День 8-го марта прошёл, можно и итоги подвести..."
3,3976,Service,Столик бронировали,267,285,neutral,"День 8-го марта прошёл, можно и итоги подвести..."
4,3976,Service,администратор,322,335,positive,"День 8-го марта прошёл, можно и итоги подвести..."
...,...,...,...,...,...,...,...
1185,33043,Service,заказ,792,797,positive,Мне так там нравитсяяяя!!!!!!!!! Интерьер модн...
1186,33043,Service,принесли,798,806,positive,Мне так там нравитсяяяя!!!!!!!!! Интерьер модн...
1187,33043,Food,приготовили,880,891,positive,Мне так там нравитсяяяя!!!!!!!!! Интерьер модн...
1188,33043,Service,оставил,1017,1024,negative,Мне так там нравитсяяяя!!!!!!!!! Интерьер модн...


In [24]:
test_dataset = Dataset(test_data)

In [25]:
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

### Model Evaluation on Test Data

In [45]:
predictions = []
true_sentiment = []

print('Predicting sentiment for {:,} sentiment mention...'.format(len(test_data)))

model_classifier.eval()

for text, y in tqdm(test_dataloader):
    tokens = tokenizer.batch_encode_plus(list(text),
        pad_to_max_length=True,
        return_tensors='pt')

    logits = model_classifier(tokens['input_ids'],
        attention_mask=tokens['attention_mask']).argmax(dim=1)
    
    predictions += list(logits.data.numpy())
    true_sentiment += list(y.data.numpy())

Predicting sentiment for 1,190 sentiment mention...


  from ipykernel import kernelapp as app
100%|██████████| 149/149 [01:29<00:00,  1.66it/s]


In [46]:
print('Accuracy:', accuracy_score(true_sentiment, predictions))

Accuracy: 0.6680672268907563


In [47]:
pred_data = pd.read_csv('dev_aspects.txt', 
                        delimiter='\t', 
                        names=['text_id', 'category', 'mention', 
                               'start', 'end', 'sentiment'])

In [48]:
pred_data['pred_sentiment'] = predictions

In [49]:
pred_data['pred_sentiment'] = pred_data['pred_sentiment'].replace([0, 1, 2, 3], 
                                                   ['negative', 
                                                    'positive', 
                                                    'both',
                                                    'neutral'])

In [50]:
pred_data.pred_sentiment.unique()

array(['positive'], dtype=object)

In [51]:
pred_data = pred_data.drop('sentiment', axis=1)

In [52]:
pred_data = pred_data.rename(columns={'pred_sentiment': 'sentiment'})

In [53]:
pred_data

Unnamed: 0,text_id,category,mention,start,end,sentiment
0,3976,Whole,ресторане,71,80,positive
1,3976,Whole,ресторанах,198,208,positive
2,3976,Whole,ресторане,256,265,positive
3,3976,Service,Столик бронировали,267,285,positive
4,3976,Service,администратор,322,335,positive
...,...,...,...,...,...,...
1185,33043,Service,заказ,792,797,positive
1186,33043,Service,принесли,798,806,positive
1187,33043,Food,приготовили,880,891,positive
1188,33043,Service,оставил,1017,1024,positive


In [54]:
pred_data.to_csv('/content/drive/My Drive/dev_pred_aspects.txt')