# **Инициализация всех библиотек**

In [15]:
import pandas as pd
import numpy as np
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup


In [76]:
!pip install translate

Collecting translate
  Downloading translate-3.6.1-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting libretranslatepy==2.1.1 (from translate)
  Downloading libretranslatepy-2.1.1-py3-none-any.whl.metadata (233 bytes)
Downloading translate-3.6.1-py2.py3-none-any.whl (12 kB)
Downloading libretranslatepy-2.1.1-py3-none-any.whl (3.2 kB)
Installing collected packages: libretranslatepy, translate
Successfully installed libretranslatepy-2.1.1 translate-3.6.1


**Создание кастомного датасета для корректной подачи данных в модель**

In [58]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
  """Map-style dataset that tokenizes one text per item for classification.

  Args:
    texts: Indexable collection of texts; each item is converted with ``str``.
    targets: Indexable collection of integer class labels, aligned with ``texts``.
    tokenizer: Hugging Face-style tokenizer; it is called directly and must
      return ``input_ids`` and ``attention_mask`` tensors.
    max_len: Length every sequence is padded/truncated to.
  """

  def __init__(self, texts, targets, tokenizer, max_len=512):
    self.texts = texts
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of texts."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Tokenize the ``idx``-th text.

    Returns:
      dict with keys ``text`` (str), ``input_ids`` and ``attention_mask``
      (flattened to 1-D tensors) and ``targets`` (``torch.long`` tensor).
    """
    text = str(self.texts[idx])
    target = self.targets[idx]

    # Call the tokenizer directly: `encode_plus` is deprecated in favour of
    # `__call__`, which accepts the same keyword arguments.
    encoding = self.tokenizer(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )

    return {
      'text': text,
      # The tokenizer returns a leading batch dimension; drop it so the
      # DataLoader can stack items into a batch itself.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

Инициализация текстовой модели **BERT**, в конструкторе которой указываются количество классов, число эпох, путь для сохранения модели, предобученная модель и токенизатор


In [59]:
class BertClassifier:
    """Fine-tunes a pretrained BERT checkpoint for text classification.

    Typical use: construct, call ``preparation`` with the train/validation
    data, call ``train``, then ``predict`` on single texts.
    """

    def __init__(self, model_path, tokenizer_path, n_classes=2, epochs=3, model_save_path='/content/bert.pt'):
        """Load the pretrained model and tokenizer and attach a fresh head.

        Args:
            model_path: Name or path passed to
                ``BertForSequenceClassification.from_pretrained``.
            tokenizer_path: Name or path passed to ``BertTokenizer.from_pretrained``.
            n_classes: Number of output classes of the new classification head.
            epochs: Number of training epochs run by ``train``.
            model_save_path: File where the best weights are stored during training.
        """
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model_save_path = model_save_path
        self.max_len = 512
        self.epochs = epochs
        # Width of the features fed to the classification head. Read it from
        # the head that is being replaced instead of from a fixed encoder
        # layer index, which fails for a single-layer encoder.
        self.out_features = self.model.classifier.in_features
        self.model.classifier = torch.nn.Linear(self.out_features, n_classes)
        self.model.to(self.device)

    def preparation(self, X_train, y_train, X_valid, y_valid, batch_size=64):
        """Build datasets, data loaders, optimizer, scheduler and loss.

        Args:
            X_train: Training texts.
            y_train: Training labels (integer class indices).
            X_valid: Validation texts.
            y_valid: Validation labels (integer class indices).
            batch_size: Batch size of both loaders.
        """
        # create datasets; use the same sequence length as `predict`
        self.train_set = CustomDataset(X_train, y_train, self.tokenizer, max_len=self.max_len)
        self.valid_set = CustomDataset(X_valid, y_valid, self.tokenizer, max_len=self.max_len)

        # create data loaders; validation order is fixed so evaluation is
        # reproducible from epoch to epoch
        self.train_loader = DataLoader(self.train_set, batch_size=batch_size, shuffle=True)
        self.valid_loader = DataLoader(self.valid_set, batch_size=batch_size, shuffle=False)

        # helpers initialization
        # NOTE(review): `AdamW` is whatever the notebook's import cell binds
        # (currently `transformers.AdamW`); `correct_bias` is specific to that
        # implementation - revisit if the import is changed.
        self.optimizer = AdamW(self.model.parameters(), lr=2e-5, correct_bias=False)
        self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=0,
                num_training_steps=len(self.train_loader) * self.epochs
            )
        self.loss_fn = torch.nn.CrossEntropyLoss().to(self.device)

    def _forward_batch(self, data):
        """Run one batch through the model; return (loss, number of correct predictions)."""
        input_ids = data["input_ids"].to(self.device)
        attention_mask = data["attention_mask"].to(self.device)
        targets = data["targets"].to(self.device)

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask
            )

        preds = torch.argmax(outputs.logits, dim=1)
        loss = self.loss_fn(outputs.logits, targets)
        return loss, torch.sum(preds == targets)

    def fit(self):
        """Train for one epoch over ``train_loader``.

        Returns:
            Tuple ``(train_acc, train_loss)``: accuracy over the training set
            and the mean of the per-batch losses.
        """
        self.model = self.model.train()
        losses = []
        correct_predictions = 0

        for data in self.train_loader:
            loss, n_correct = self._forward_batch(data)
            correct_predictions += n_correct
            losses.append(loss.item())

            loss.backward()
            # Clip the gradient norm before the optimizer step.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()

        train_acc = correct_predictions.double() / len(self.train_set)
        train_loss = np.mean(losses)
        return train_acc, train_loss

    def eval(self):
        """Evaluate on ``valid_loader`` without computing gradients.

        Returns:
            Tuple ``(val_acc, val_loss)``: accuracy over the validation set
            and the mean of the per-batch losses.
        """
        self.model = self.model.eval()
        losses = []
        correct_predictions = 0

        with torch.no_grad():
            for data in self.valid_loader:
                loss, n_correct = self._forward_batch(data)
                correct_predictions += n_correct
                losses.append(loss.item())

        val_acc = correct_predictions.double() / len(self.valid_set)
        val_loss = np.mean(losses)
        return val_acc, val_loss

    def train(self):
        """Run ``epochs`` rounds of ``fit`` + ``eval`` and keep the best weights.

        After every epoch that improves validation accuracy the model's
        ``state_dict`` is written to ``model_save_path``; once training ends
        the best weights are loaded back into ``self.model``.
        """
        best_accuracy = 0
        checkpoint_saved = False
        for epoch in range(self.epochs):
            print(f'Epoch {epoch + 1}/{self.epochs}')
            train_acc, train_loss = self.fit()
            print(f'Train loss {train_loss} accuracy {train_acc}')

            val_acc, val_loss = self.eval()
            print(f'Val loss {val_loss} accuracy {val_acc}')
            print('-' * 10)

            if val_acc > best_accuracy:
                # Store only the weights: unpickling a whole module triggers
                # torch.load's `weights_only` warning and is rejected once
                # `weights_only=True` is the default.
                torch.save(self.model.state_dict(), self.model_save_path)
                best_accuracy = val_acc
                checkpoint_saved = True

        # Restore the best epoch only if this run actually wrote a checkpoint;
        # otherwise the file may be missing or left over from an earlier run.
        if checkpoint_saved:
            state_dict = torch.load(self.model_save_path, map_location=self.device)
            self.model.load_state_dict(state_dict)

    def predict(self, text):
        """Predict the class index for a single text.

        Args:
            text: Text to classify.

        Returns:
            Index of the largest logit, as a NumPy integer scalar.
        """
        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Shape both tensors as a batch of one sequence.
        input_ids = encoding['input_ids'].reshape(1, -1).to(self.device)
        attention_mask = encoding['attention_mask'].reshape(1, -1).to(self.device)

        # Inference: switch to eval mode and skip building the autograd graph.
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

        prediction = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]

        return prediction

Загрузка ***датасета***

In [89]:
# Load the review dataset (columns: Review, Rating).
# NOTE(review): the absolute '/content/...' path looks environment-specific - confirm before running elsewhere.
df = pd.read_csv('/content/data.csv')


Вывод информации о датасете, чтобы посмотреть размерность и названия столбцов

In [61]:
# Print row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


**Кол-во** объектов в каждом классе

In [62]:
# Number of reviews per rating value (shows the class imbalance).
df['Rating'].value_counts()

Unnamed: 0_level_0,count
Rating,Unnamed: 1_level_1
5,9054
4,6039
3,2184
2,1793
1,1421


Разделение датасета на выборки **train**, **val** и **test**

In [63]:
# Sequential split by row position (no shuffling): train / validation / test.
TRAIN_END, VAL_END, TEST_END = 12000, 17000, 17501

train_data = df.iloc[:TRAIN_END]
val_data = df.iloc[TRAIN_END:VAL_END]
test_data = df.iloc[VAL_END:TEST_END]

Настройка **модели**

In [64]:
# The same checkpoint name is used for both the weights and the tokenizer.
CHECKPOINT = 'cointegrated/rubert-tiny'

# Ratings 1..5 are passed to the model unchanged as class indices, so the head
# gets 6 outputs (index 0 is never a target).
classifier = BertClassifier(
    model_path=CHECKPOINT,
    tokenizer_path=CHECKPOINT,
    n_classes=6,
    epochs=10,
    model_save_path='/content/bert.pt',
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Настройка **токенизатора** и подготовка загрузчиков данных

In [65]:
# Convert the DataFrame columns to plain Python lists before handing them to the classifier.
train_texts = train_data['Review'].tolist()
train_labels = train_data['Rating'].tolist()
val_texts = val_data['Review'].tolist()
val_labels = val_data['Rating'].tolist()

classifier.preparation(
    X_train=train_texts,
    y_train=train_labels,
    X_valid=val_texts,
    y_valid=val_labels,
)



Обучение **модели**

In [66]:
# Run the training loop; per-epoch train/validation loss and accuracy are printed.
classifier.train()

Epoch 1/10
Train loss 1.047251488934172 accuracy 0.54375
Val loss 0.8675375905218004 accuracy 0.6204000000000001
----------
Epoch 2/10
Train loss 0.8352451749304508 accuracy 0.6365
Val loss 0.7996029438851755 accuracy 0.6584
----------
Epoch 3/10
Train loss 0.7793711401680683 accuracy 0.66125
Val loss 0.7878778773018077 accuracy 0.662
----------
Epoch 4/10
Train loss 0.7434933889419475 accuracy 0.6845
Val loss 0.790104290352592 accuracy 0.6648000000000001
----------
Epoch 5/10
Train loss 0.7174855058497571 accuracy 0.69025
Val loss 0.7825516718852369 accuracy 0.665
----------
Epoch 6/10
Train loss 0.6942767673667442 accuracy 0.7035833333333333
Val loss 0.7906637508657914 accuracy 0.6616000000000001
----------
Epoch 7/10
Train loss 0.6789907193247308 accuracy 0.7108333333333333
Val loss 0.7785571563658835 accuracy 0.6672
----------
Epoch 8/10
Train loss 0.6698125426439528 accuracy 0.7195
Val loss 0.7909971807576432 accuracy 0.6674
----------
Epoch 9/10
Train loss 0.6570026349831135 accu

  self.model = torch.load(self.model_save_path)


Разделение датафрейма **test** на объекты и целевую переменную для тестирования качества модели

In [39]:
# Test texts and their true ratings as plain Python lists.
review, rating = (test_data[column].tolist() for column in ('Review', 'Rating'))

Запись предсказаний модели в массив для определения среднего значения рейтинга отеля

In [87]:
# Predicted class for every test review, in the same order as `review`.
preds = [classifier.predict(txt) for txt in review]

Демонстрация работы модели путём сравнения среднего рейтинга **тестового** датасета и среднего значения предсказаний модели

In [70]:
# Mean true rating vs. mean predicted rating over the test reviews.
np.mean(rating), np.mean(preds)

(4.06187624750499, 4.043912175648702)

Для демонстрации работы модели использовался готовый датасет с отзывами на английском языке, так как языковые модели лучше его воспринимают. Отзывы на российских сайтах написаны на русском, поэтому перед классификацией их предлагается переводить на английский с помощью следующего алгоритма:

In [None]:
from translate import Translator

# Translate a Russian text into English and show the result.
translator = Translator(to_lang="english", from_lang="russian")
translation = translator.translate("your text")
print(translation)