In [1]:
!pip install transformers huggingface_hub sentencepiece pymorphy2 swifter datasets

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting swifter
  Downloading swifter-1.4.0.tar.gz (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m25.4 MB/s[0m eta [36m0:00:

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from huggingface_hub import hf_hub_download
import pandas as pd
from tqdm import tqdm
import numpy as np
import torch.optim as optim

REPO_ID = "MonoHime/ru_sentiment_dataset"
FILENAME = "datasets.csv"

dataset = pd.read_csv(
    hf_hub_download(repo_id=REPO_ID, filename=FILENAME, repo_type="dataset"), index_col=0
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


datasets.csv:   0%|          | 0.00/307M [00:00<?, ?B/s]

In [4]:
import nltk
import re
import swifter
import pymorphy2
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
morph = pymorphy2.MorphAnalyzer()

def preprocess_text(text):

    text = text.lower()

    text = re.sub(r'@\w+|#[\w-]+|http\S+|\n', '', text)

    text = re.sub(r'[^\w\s]', ' ', text)

    words = word_tokenize(text, language='russian')

    words = [word for word in words if not re.match(r'^_+$', word)]

    processed_words = []

    for word in words:
        try:
            p = morph.parse(word)[0]
            word = p.normal_form
        except:
            pass

        processed_words.append(word)

    return ' '.join(processed_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
dataset.reset_index(drop=True, inplace=True)
dataset['text'] = dataset['text'].swifter.apply(preprocess_text)
file_ = 'drive/MyDrive/temp/texts.txt'
with open(file_, 'a') as f:
  for text in dataset.text:
    f.writelines(text + '\n')

Pandas Apply:   0%|          | 0/210989 [00:00<?, ?it/s]

In [6]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(dataset, test_size=0.2, random_state=42)

In [7]:
from sentencepiece import SentencePieceTrainer, SentencePieceProcessor

In [8]:
import os
import torch
from typing import Union, List, Tuple
from sentencepiece import SentencePieceTrainer, SentencePieceProcessor
from torch.utils.data import Dataset, DataLoader
import random
from sklearn.model_selection import train_test_split


class TextDataset(Dataset):
    TRAIN_VAL_RANDOM_SEED = 42
    VAL_RATIO = 0.05

    def __init__(self, data_file: str, dataset, train: bool = True, sp_model_prefix: str = None,
                 vocab_size: int = 5000, normalization_rule_name: str = 'nmt_nfkc_cf',
                 model_type: str = 'bpe', max_length: int = 256):

        if not os.path.isfile(sp_model_prefix + '.model'):
            SentencePieceTrainer.train(
                input=data_file, vocab_size=vocab_size,
                model_type=model_type, model_prefix=sp_model_prefix,
                normalization_rule_name=normalization_rule_name,
                unk_id=0, bos_id=1, eos_id=2, pad_id=3
            )
            print(1)
        self.sp_model = SentencePieceProcessor(model_file=sp_model_prefix + '.model')

        with open(data_file) as file:
            texts = file.readlines()

        random.seed(self.TRAIN_VAL_RANDOM_SEED)
        random.shuffle(texts)
        df_train, df_val = train_test_split(dataset, test_size=self.VAL_RATIO, random_state=self.TRAIN_VAL_RANDOM_SEED)

        self.df = df_train if train else df_val
        self.indices = self.sp_model.encode(self.df.text.tolist())

        self.pad_id, self.unk_id, self.bos_id, self.eos_id = \
            self.sp_model.pad_id(), self.sp_model.unk_id(), \
            self.sp_model.bos_id(), self.sp_model.eos_id()
        self.max_length = max_length
        self.vocab_size = self.sp_model.vocab_size()

    def text2ids(self, texts: Union[str, List[str]]) -> Union[List[int], List[List[int]]]:
        return self.sp_model.encode(texts)

    def ids2text(self, ids: Union[torch.Tensor, List[int], List[List[int]]]) -> Union[str, List[str]]:

        return self.sp_model.decode(ids)

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, item: int) -> Tuple[torch.Tensor, int]:
        indices = torch.tensor([self.bos_id, *self.indices[item], self.eos_id])
        if len(indices) < self.max_length:
            padding = torch.full((self.max_length - len(indices),), self.pad_id)
            indices = torch.cat((indices, padding))
        else:
            indices = indices[:self.max_length-1]
            indices = torch.cat((indices, torch.tensor([self.eos_id])))
        return indices, self.df.sentiment.iloc[item]


In [9]:
VOCAB_SIZE = 5000
MAX_LENGTH = 256

train_set = TextDataset(data_file='drive/MyDrive/temp/texts.txt', dataset=dataset, vocab_size=VOCAB_SIZE, train=True, sp_model_prefix='bpe', max_length=MAX_LENGTH)
valid_set = TextDataset(data_file='drive/MyDrive/temp/texts.txt', dataset=dataset, vocab_size=VOCAB_SIZE, train=False, sp_model_prefix='bpe', max_length=MAX_LENGTH)

1


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
from torch import nn

class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim, 3)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        output = self.dropout(output)
        last_hidden = output[:, -1, :]
        logits = self.fc(last_hidden)
        return logits

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
def train(model, criterion, optimizer, train_loader, val_loader, epoch, scheduler=None):
  for epoch in range(num_epochs):
      model.train()
      total_loss = 0.0
      total_samples = 0
      train_acc = []


      for inputs, targets in tqdm(train_loader):
          inputs = inputs.to(device)
          targets = targets.to(device)

          optimizer.zero_grad()
          outputs = model(inputs)
          loss = criterion(outputs.view(-1, 3), targets.view(-1))
          loss.backward()
          optimizer.step()

          total_loss += loss.item() * len(inputs)
          acc = (targets.view(-1) == outputs.view(-1, 3).argmax(-1)).sum() / len(inputs)
          train_acc.append(acc.cpu())
          total_samples += len(inputs)
          if scheduler is not None:
            scheduler.step()

      model.eval()
      total_val_loss = 0.0
      total_val_samples = 0
      val_acc = []
      with torch.no_grad():
          for inputs, targets in valid_loader:
              inputs = inputs.to(device)
              targets = targets.to(device)
              outputs = model(inputs)
              val_loss = criterion(outputs.view(-1, 3), targets.view(-1))

              total_val_loss += val_loss.item() * len(inputs)
              acc = (targets.view(-1) == outputs.view(-1, 3).argmax(-1)).sum() / len(inputs)
              val_acc.append(acc.cpu())
              total_val_samples += len(inputs)

      avg_loss = total_loss / total_samples
      train_acc = np.mean(train_acc)
      val_acc = np.mean(val_acc)
      avg_val_loss = total_val_loss / total_val_samples

      print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")
  return model

In [15]:
vocab_size = train_set.vocab_size
embedding_dim = 256
hidden_dim = 128
batch_size = 32
num_epochs = 5
learning_rate = 0.001

model_rnn = TextClassifier(vocab_size, embedding_dim, hidden_dim).to(device)

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model_rnn.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=4, eta_min=0)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False, pin_memory=True)

model_rnn = train(model_rnn, criterion, optimizer, train_loader, valid_loader, num_epochs, scheduler)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

100%|██████████| 6264/6264 [00:39<00:00, 156.93it/s]


Epoch 1/5, Train Loss: 1.0128, Val Loss: 0.7811, Train Acc: 0.5024, Val Acc: 0.6359


100%|██████████| 6264/6264 [00:38<00:00, 162.13it/s]


Epoch 2/5, Train Loss: 0.6441, Val Loss: 0.5730, Train Acc: 0.7009, Val Acc: 0.7389


100%|██████████| 6264/6264 [00:38<00:00, 162.84it/s]


Epoch 3/5, Train Loss: 0.5448, Val Loss: 0.5416, Train Acc: 0.7506, Val Acc: 0.7493


100%|██████████| 6264/6264 [00:38<00:00, 162.13it/s]


Epoch 4/5, Train Loss: 0.5000, Val Loss: 0.5272, Train Acc: 0.7740, Val Acc: 0.7603


100%|██████████| 6264/6264 [00:38<00:00, 162.64it/s]


Epoch 5/5, Train Loss: 0.4624, Val Loss: 0.5371, Train Acc: 0.7943, Val Acc: 0.7532


In [16]:
torch.save(model_rnn.state_dict(), 'drive/MyDrive/temp/model_rnn.pth')

Точность на простой рекурентной модели получилось ~0.76 на лучшей эпохе.
Попробуем что-нибудь потяжелее (трансформер)

In [17]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch

model_name = "DeepPavlov/rubert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [19]:
from datasets import load_dataset, load_metric

dataset.rename({'sentiment': 'labels'}, axis=1, inplace=True)
dataset.to_csv('drive/MyDrive/temp/dataset.csv', index=False)

full_dataset = load_dataset('csv', data_files='drive/MyDrive/temp/dataset.csv')
full_dataset = full_dataset.filter(lambda example: example['text'] is not None and example['labels'] is not None)
dataset = full_dataset['train'].train_test_split(test_size=0.2)
dataset = dataset.map(lambda e: tokenizer(e['text'], truncation = True, max_length=80, padding='max_length'), batched=True) # Оставим длину небольшой чтобы не поседеть за время обучения
dataset = dataset.remove_columns('text')
dataset.set_format(type='torch', device=device)

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/210989 [00:00<?, ? examples/s]

Map:   0%|          | 0/168783 [00:00<?, ? examples/s]

Map:   0%|          | 0/42196 [00:00<?, ? examples/s]

In [20]:
import numpy as np
from tqdm.auto import tqdm
from tqdm.notebook import tqdm


def train(model, train_dataloader, test_dataloader, optimizer, lr_scheduler, num_epochs, device):
  for epoch in range(num_epochs):

      model.train()
      pbar = tqdm(train_dataloader, total=len(train_dataloader), desc='Training')

      for i, batch in enumerate(pbar):
          batch = {k: v.to(device) for k, v in batch.items()}
          outputs = model(**batch)
          loss = outputs.loss

          loss.backward()
          optimizer.step()
          lr_scheduler.step()
          optimizer.zero_grad()

      model.eval()

      accuracy = load_metric('accuracy', trust_remote_code=True)

      pbar = tqdm(test_dataloader, total=len(test_dataloader), desc='Validation')

      for batch in tqdm(pbar):

          batch = {k: v.to(device) for k, v in batch.items()}

          with torch.no_grad():
              outputs = model(**batch)

          logits = outputs.logits.detach().cpu()
          predictions = torch.argmax(logits, dim=-1)

          accuracy.add_batch(predictions=predictions, references=batch["labels"].detach().cpu())

      accuracy = accuracy.compute()['accuracy']

      print(f'[{epoch+1}] Test accuracy: {accuracy:.4f}')

In [21]:
from transformers import AdamW, get_scheduler
from torch.utils.data import DataLoader

train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=64)
test_dataloader = DataLoader(dataset['test'], shuffle=False, batch_size=64)

optimizer = AdamW(model.parameters(), lr=1e-4)
num_epochs = 7
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps= int(0.1 * num_epochs * len(train_dataloader)),
    num_training_steps=num_training_steps
)

In [22]:
train(model, train_dataloader, test_dataloader, optimizer, lr_scheduler, num_epochs, device)

Training:   0%|          | 0/2638 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Validation:   0%|          | 0/660 [00:00<?, ?it/s]

  0%|          | 0/660 [00:00<?, ?it/s]

[1] Test accuracy: 0.7659


Training:   0%|          | 0/2638 [00:00<?, ?it/s]

Validation:   0%|          | 0/660 [00:00<?, ?it/s]

  0%|          | 0/660 [00:00<?, ?it/s]

[2] Test accuracy: 0.7768


Training:   0%|          | 0/2638 [00:00<?, ?it/s]

Validation:   0%|          | 0/660 [00:00<?, ?it/s]

  0%|          | 0/660 [00:00<?, ?it/s]

[3] Test accuracy: 0.7741


Training:   0%|          | 0/2638 [00:00<?, ?it/s]

Validation:   0%|          | 0/660 [00:00<?, ?it/s]

  0%|          | 0/660 [00:00<?, ?it/s]

[4] Test accuracy: 0.7784


Training:   0%|          | 0/2638 [00:00<?, ?it/s]

Validation:   0%|          | 0/660 [00:00<?, ?it/s]

  0%|          | 0/660 [00:00<?, ?it/s]

[5] Test accuracy: 0.7795


Training:   0%|          | 0/2638 [00:00<?, ?it/s]

Validation:   0%|          | 0/660 [00:00<?, ?it/s]

  0%|          | 0/660 [00:00<?, ?it/s]

[6] Test accuracy: 0.7812


Training:   0%|          | 0/2638 [00:00<?, ?it/s]

Validation:   0%|          | 0/660 [00:00<?, ?it/s]

  0%|          | 0/660 [00:00<?, ?it/s]

[7] Test accuracy: 0.7835


In [23]:
model.save_pretrained("drive/MyDrive/temp/rubert", from_pt=True)

In [24]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("yqelz/xml-roberta-large-ner-russian")
ner_model = AutoModelForTokenClassification.from_pretrained("yqelz/xml-roberta-large-ner-russian")

tokenizer_config.json:   0%|          | 0.00/421 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [25]:
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=tokenizer)
sp_model = SentencePieceProcessor(model_file='bpe.model')
model_rnn = TextClassifier(vocab_size, embedding_dim, hidden_dim)
model_rnn.load_state_dict(torch.load('drive/MyDrive/temp/model_rnn.pth'), strict=False)

In [30]:
def split_by_title(text, ner_pipeline):
  sentences = text.split('.')
  result = []
  temp_text = ''
  for i in range(len(sentences)):
    text = sentences.pop(0)
    if ner_pipeline(text):
      if not result:
        result.append(temp_text)
      else:
        result[-1] = result[-1] + temp_text
      temp_text = text
    else:
      temp_text = '.'.join([temp_text, text])
  else:
    result.append(temp_text)
  return result


def get_sentiments(classification_model, classification_tokenizer, ner_pipeline, text, device):
  mapping = {
      0: 'neutral',
      1: 'positive',
      2: 'negative'
  }
  result = []
  classification_model = classification_model.to(device)
  texts = split_by_title(text, ner_pipeline)
  model.eval()
  for text in texts:
    ner_res = ner_pipeline(text)
    if ner_res:
      movie_title = ''
      end = 0
      for entity in ner_res:
        if entity.get('entity') == 'B-ORG':
          movie_title += entity.get('word', '').replace('▁', ' ')
          end = entity.get('end')
        if (entity.get('entity') == 'I-ORG'):
          movie_title += entity.get('word', '').replace('▁', ' ')
          end = entity.get('end')
    text = preprocess_text(text)
    tokens = torch.tensor(classification_tokenizer.encode(text)).unsqueeze(0).to(device)
    logits = classification_model(tokens).cpu()
    result.append({
          'title': movie_title.lstrip(),
          'setiment': mapping[logits.argmax(-1).numpy()[0]]
          })
  return result

In [None]:
text = 'Во все тяжкие это лучший сериал из всех что я смотрел. А вот Игра престолов оказалась абсолютным бредом'
result = get_sentiments(model_rnn, sp_model, ner_pipeline, text, device)
result

[{'title': 'Во все тяжкие', 'setiment': 'positive'},
 {'title': 'Игра престолов', 'setiment': 'negative'}]