In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

import spacy

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from collections import defaultdict, Counter
from tqdm import tqdm

import re

import torch
from torch import nn
from torch.utils.data import DataLoader

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/train_split_aspects.txt
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/train_split_reviews.txt
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/dev_aspects.txt
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/dev_reviews.txt

--2021-12-27 11:39:21--  https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/train_split_aspects.txt
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/train_split_aspects.txt [following]
--2021-12-27 11:39:21--  https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/train_split_aspects.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 170883 (167K) [text/plain]
Saving to: ‘train_split_aspects.txt’


2021-12-27 11:39:21 (7.64 MB/s) - ‘train_split_aspects.txt’ saved [170883/170883]

--2021-12-27 11:39:21--  https:

In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 24.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 77 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 26.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 29.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  

In [4]:
train_asp = pd.read_csv('train_split_aspects.txt', 
                        delimiter='\t', 
                        names=['text_id', 'category', 'mention', 
                               'start', 'end', 'sentiment'])

train_texts = pd.read_csv('train_split_reviews.txt', 
                          delimiter='\t', names=['text_id','text'])

In [5]:
data = train_asp.merge(train_texts, on='text_id')

In [6]:
data

Unnamed: 0,text_id,category,mention,start,end,sentiment,text
0,30808,Whole,ресторане,16,25,neutral,Отмечали в этом ресторане день рождение на пер...
1,30808,Interior,первом этаже,43,55,neutral,Отмечали в этом ресторане день рождение на пер...
2,30808,Whole,руководству ресторана,124,145,positive,Отмечали в этом ресторане день рождение на пер...
3,30808,Service,обслуживающему персоналу,147,171,positive,Отмечали в этом ресторане день рождение на пер...
4,30808,Service,сотрудникам,189,200,positive,Отмечали в этом ресторане день рождение на пер...
...,...,...,...,...,...,...,...
3568,16630,Service,обслуживание,85,97,positive,Уютная и тёплая домашняя обстановка! Милый и о...
3569,16630,Food,Еда,99,102,positive,Уютная и тёплая домашняя обстановка! Милый и о...
3570,16630,Service,персоналу,244,253,positive,Уютная и тёплая домашняя обстановка! Милый и о...
3571,16630,Whole,ресторан,294,302,positive,Уютная и тёплая домашняя обстановка! Милый и о...


In [7]:
data['sentiment'].value_counts()

positive    2309
neutral      694
negative     500
both          70
Name: sentiment, dtype: int64

In [11]:
temp = data.drop(data.query('sentiment ==  "positive"').sample(frac=.6).index)

In [12]:
temp['sentiment'].value_counts()

positive    924
neutral     694
negative    500
both         70
Name: sentiment, dtype: int64

In [13]:
sentiment_vocab = {'positive': 1, 'negative': 0, 'both': 2, 'neutral': 3}

In [14]:
class Dataset():

    def __init__(self, df):
        self.df = df

    def __getitem__(self, index):
        text = self.df['text'].values[index]
        sentiment = self.df['sentiment'].values[index]
        masked = (text[:self.df['start'].values[index]] + '[MASK]' + 
                  text[self.df['end'].values[index]:])
        
        return masked, sentiment_vocab[sentiment]

    def __len__(self):

        return len(self.df)

In [15]:
train_data, val_data = train_test_split(temp, test_size=0.25, stratify=temp.sentiment.values)

train_dataset = Dataset(train_data)
val_dataset = Dataset(val_data)

In [16]:
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
DEVICE

device(type='cuda')

In [17]:
class Classifier(nn.Module):

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(p=0.3)
        self.linear_1 = nn.Linear(312, 4)
        self.softmax = nn.Softmax()

    def forward(self, x, attention_mask=None):
        output  = self.bert(x, attention_mask=attention_mask)
        x = output['last_hidden_state'][:, 0]
        x = self.dropout(x)
        x = self.linear_1(x)
        x = self.softmax(x)
        
        return x

In [18]:
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup

In [19]:
MODEL_NAME = "cointegrated/rubert-tiny"
EPOCHS = 4
BATCH_SIZE = 8

In [20]:
train_loss = []
val_loss = []

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
criterion = nn.CrossEntropyLoss()

In [21]:
model = AutoModel.from_pretrained(
    MODEL_NAME
)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME, 
    do_lower_case=True
)

optimizer = AdamW(model.parameters(),
                  weight_decay = 1e-4,
                  lr = 4e-5,
                  eps = 1e-8
                )

total_steps = len(train_dataloader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

Downloading:   0%|          | 0.00/632 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.5M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/457k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [22]:
model_classifier = Classifier(model)

In [23]:
from sklearn.metrics import accuracy_score, f1_score

In [24]:
import random
import numpy as np

seed_val = 2020

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

for epoch in tqdm(range(EPOCHS)):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, EPOCHS))
    print('Training...')

    current_train_loss = 0 
    model_classifier.train()

    for text, y in tqdm(train_dataloader, leave=False):
        tokens = tokenizer.batch_encode_plus(list(text),
            pad_to_max_length=True,
            return_tensors='pt')

        logits = model_classifier(tokens['input_ids'], 
                                  attention_mask=tokens['attention_mask'])

        loss = criterion(logits, y)
        current_train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model_classifier.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    
    train_loss.append(current_train_loss / len(train_dataloader))

    model_classifier.eval()
    current_val_loss = 0
    predictions = []
    actuals = [] 

    for text, y in val_dataloader:
        tokens = tokenizer.batch_encode_plus(
            list(text),
            pad_to_max_length=True,
            return_tensors='pt')

        logits = model_classifier(tokens['input_ids'], 
                                  attention_mask=tokens['attention_mask'])
        
        loss = criterion(logits, y)
        current_val_loss += loss.item()

        predictions += list(logits.argmax(axis=1).data.numpy())
        actuals += list(y.data.numpy())
    
    print('Accuracy:', accuracy_score(actuals, predictions))    
    val_loss.append(current_val_loss / len(val_dataloader))

  0%|          | 0/4 [00:00<?, ?it/s]


Training...



  from ipykernel import kernelapp as app

  0%|          | 1/206 [00:03<10:34,  3.09s/it][A
  1%|          | 2/206 [00:05<08:57,  2.64s/it][A
  1%|▏         | 3/206 [00:07<08:43,  2.58s/it][A
  2%|▏         | 4/206 [00:09<08:00,  2.38s/it][A
  2%|▏         | 5/206 [00:12<08:04,  2.41s/it][A
  3%|▎         | 6/206 [00:14<07:45,  2.33s/it][A
  3%|▎         | 7/206 [00:17<07:51,  2.37s/it][A
  4%|▍         | 8/206 [00:19<07:59,  2.42s/it][A
  4%|▍         | 9/206 [00:22<08:10,  2.49s/it][A
  5%|▍         | 10/206 [00:25<08:34,  2.63s/it][A
  5%|▌         | 11/206 [00:27<08:30,  2.62s/it][A
  6%|▌         | 12/206 [00:30<08:26,  2.61s/it][A
  6%|▋         | 13/206 [00:32<08:24,  2.61s/it][A
  7%|▋         | 14/206 [00:37<09:44,  3.05s/it][A
  7%|▋         | 15/206 [00:39<09:27,  2.97s/it][A
  8%|▊         | 16/206 [00:42<09:06,  2.88s/it][A
  8%|▊         | 17/206 [00:45<08:54,  2.83s/it][A
  9%|▊         | 18/206 [00:47<08:45,  2.80s/it][A
  9%|▉         | 19/206 [00:50

Accuracy: 0.48811700182815354

Training...



  from ipykernel import kernelapp as app

  0%|          | 1/206 [00:02<08:13,  2.41s/it][A
  1%|          | 2/206 [00:04<07:48,  2.30s/it][A
  1%|▏         | 3/206 [00:07<07:57,  2.35s/it][A
  2%|▏         | 4/206 [00:09<07:26,  2.21s/it][A
  2%|▏         | 5/206 [00:11<07:38,  2.28s/it][A
  3%|▎         | 6/206 [00:13<07:25,  2.23s/it][A
  3%|▎         | 7/206 [00:15<07:35,  2.29s/it][A
  4%|▍         | 8/206 [00:18<07:48,  2.36s/it][A
  4%|▍         | 9/206 [00:21<07:58,  2.43s/it][A
  5%|▍         | 10/206 [00:23<08:20,  2.55s/it][A
  5%|▌         | 11/206 [00:26<08:15,  2.54s/it][A
  6%|▌         | 12/206 [00:28<08:12,  2.54s/it][A
  6%|▋         | 13/206 [00:31<08:09,  2.54s/it][A
  7%|▋         | 14/206 [00:34<08:16,  2.59s/it][A
  7%|▋         | 15/206 [00:36<07:52,  2.48s/it][A
  8%|▊         | 16/206 [00:39<08:00,  2.53s/it][A
  8%|▊         | 17/206 [00:41<08:04,  2.56s/it][A
  9%|▊         | 18/206 [00:44<08:08,  2.60s/it][A
  9%|▉         | 19/206 [00:46

Accuracy: 0.5118829981718465

Training...



  from ipykernel import kernelapp as app

  0%|          | 1/206 [00:02<08:11,  2.40s/it][A
  1%|          | 2/206 [00:04<07:48,  2.30s/it][A
  1%|▏         | 3/206 [00:07<07:56,  2.35s/it][A
  2%|▏         | 4/206 [00:09<07:28,  2.22s/it][A
  2%|▏         | 5/206 [00:11<07:38,  2.28s/it][A
  3%|▎         | 6/206 [00:13<07:23,  2.22s/it][A
  3%|▎         | 7/206 [00:15<07:32,  2.27s/it][A
  4%|▍         | 8/206 [00:18<07:41,  2.33s/it][A
  4%|▍         | 9/206 [00:20<07:52,  2.40s/it][A
  5%|▍         | 10/206 [00:23<08:13,  2.52s/it][A
  5%|▌         | 11/206 [00:26<08:19,  2.56s/it][A
  6%|▌         | 12/206 [00:28<08:12,  2.54s/it][A
  6%|▋         | 13/206 [00:31<08:08,  2.53s/it][A
  7%|▋         | 14/206 [00:34<08:14,  2.58s/it][A
  7%|▋         | 15/206 [00:36<07:50,  2.46s/it][A
  8%|▊         | 16/206 [00:38<07:55,  2.51s/it][A
  8%|▊         | 17/206 [00:41<08:01,  2.55s/it][A
  9%|▊         | 18/206 [00:44<08:06,  2.59s/it][A
  9%|▉         | 19/206 [00:46

Accuracy: 0.5191956124314442

Training...



  from ipykernel import kernelapp as app

  0%|          | 1/206 [00:02<08:20,  2.44s/it][A
  1%|          | 2/206 [00:04<07:51,  2.31s/it][A
  1%|▏         | 3/206 [00:07<07:58,  2.36s/it][A
  2%|▏         | 4/206 [00:09<07:27,  2.22s/it][A
  2%|▏         | 5/206 [00:11<07:38,  2.28s/it][A
  3%|▎         | 6/206 [00:13<07:21,  2.21s/it][A
  3%|▎         | 7/206 [00:15<07:31,  2.27s/it][A
  4%|▍         | 8/206 [00:18<07:41,  2.33s/it][A
  4%|▍         | 9/206 [00:20<07:53,  2.41s/it][A
  5%|▍         | 10/206 [00:23<08:17,  2.54s/it][A
  5%|▌         | 11/206 [00:26<08:14,  2.54s/it][A
  6%|▌         | 12/206 [00:28<08:10,  2.53s/it][A
  6%|▋         | 13/206 [00:31<08:07,  2.53s/it][A
  7%|▋         | 14/206 [00:34<08:13,  2.57s/it][A
  7%|▋         | 15/206 [00:36<07:49,  2.46s/it][A
  8%|▊         | 16/206 [00:38<07:56,  2.51s/it][A
  8%|▊         | 17/206 [00:41<08:01,  2.55s/it][A
  9%|▊         | 18/206 [00:44<08:05,  2.58s/it][A
  9%|▉         | 19/206 [00:46

Accuracy: 0.5319926873857403





In [25]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
torch.save(model_classifier.state_dict(), '/content/drive/My Drive/sentiment_classifier')

In [27]:
model_classifier

Classifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29564, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
 

In [28]:
test_asp = pd.read_csv('dev_aspects.txt', 
                        delimiter='\t', 
                        names=['text_id', 'category', 'mention', 
                               'start', 'end', 'sentiment'])

test_texts = pd.read_csv('dev_reviews.txt', 
                          delimiter='\t', names=['text_id','text'])

In [29]:
test_data = test_asp.merge(test_texts, on='text_id')

In [30]:
test_data

Unnamed: 0,text_id,category,mention,start,end,sentiment,text
0,3976,Whole,ресторане,71,80,neutral,"День 8-го марта прошёл, можно и итоги подвести..."
1,3976,Whole,ресторанах,198,208,neutral,"День 8-го марта прошёл, можно и итоги подвести..."
2,3976,Whole,ресторане,256,265,neutral,"День 8-го марта прошёл, можно и итоги подвести..."
3,3976,Service,Столик бронировали,267,285,neutral,"День 8-го марта прошёл, можно и итоги подвести..."
4,3976,Service,администратор,322,335,positive,"День 8-го марта прошёл, можно и итоги подвести..."
...,...,...,...,...,...,...,...
1185,33043,Service,заказ,792,797,positive,Мне так там нравитсяяяя!!!!!!!!! Интерьер модн...
1186,33043,Service,принесли,798,806,positive,Мне так там нравитсяяяя!!!!!!!!! Интерьер модн...
1187,33043,Food,приготовили,880,891,positive,Мне так там нравитсяяяя!!!!!!!!! Интерьер модн...
1188,33043,Service,оставил,1017,1024,negative,Мне так там нравитсяяяя!!!!!!!!! Интерьер модн...


In [31]:
test_dataset = Dataset(test_data)

In [32]:
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [33]:
predictions = []
true_sentiment = []

print('Predicting sentiment for {:,} sentiment mention...'.format(len(test_data)))

model_classifier.eval()

for text, y in tqdm(test_dataloader):
    tokens = tokenizer.batch_encode_plus(list(text),
        pad_to_max_length=True,
        return_tensors='pt')

    logits = model_classifier(tokens['input_ids'],
        attention_mask=tokens['attention_mask']).argmax(dim=1)
    
    predictions += list(logits.data.numpy())
    true_sentiment += list(y.data.numpy())

Predicting sentiment for 1,190 sentiment mention...


  from ipykernel import kernelapp as app
100%|██████████| 149/149 [01:23<00:00,  1.77it/s]


In [34]:
print('Accuracy:', accuracy_score(true_sentiment, predictions))

Accuracy: 0.5815126050420169


In [35]:
pred_data = pd.read_csv('dev_aspects.txt', 
                        delimiter='\t', 
                        names=['text_id', 'category', 'mention', 
                               'start', 'end', 'sentiment'])

In [36]:
pred_data['pred_sentiment'] = predictions

In [37]:
pred_data['pred_sentiment'] = pred_data['pred_sentiment'].replace([0, 1, 2, 3], 
                                                   ['negative', 
                                                    'positive', 
                                                    'both',
                                                    'neutral'])

In [38]:
pred_data.pred_sentiment.unique()

array(['positive', 'negative', 'neutral'], dtype=object)

In [39]:
pred_data = pred_data.drop('sentiment', axis=1)

In [40]:
pred_data = pred_data.rename(columns={'pred_sentiment': 'sentiment'})

In [41]:
pred_data

Unnamed: 0,text_id,category,mention,start,end,sentiment
0,3976,Whole,ресторане,71,80,positive
1,3976,Whole,ресторанах,198,208,positive
2,3976,Whole,ресторане,256,265,positive
3,3976,Service,Столик бронировали,267,285,positive
4,3976,Service,администратор,322,335,positive
...,...,...,...,...,...,...
1185,33043,Service,заказ,792,797,positive
1186,33043,Service,принесли,798,806,positive
1187,33043,Food,приготовили,880,891,positive
1188,33043,Service,оставил,1017,1024,positive


In [42]:
pred_data.to_csv('/content/drive/My Drive/dev_pred_aspects.txt')