[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)

# Введение

В данном ноутбуке анализ тональности отзывов реализован с использованием предобученной на русских текстах BERT-модели DeepPavlov/rubert-base-cased. Модель дообучается на отзывах на мобильные телефоны, программно загруженных с сайта irecommend.ru и затем отфильтрованных. 

Загрузка отзывов реализована в скрипте reviews_spider.py (каталог spider).

Фильтрация датасета осуществлялась на основании сходства с отзывами из тестовой выборки. Она реализована в отдельном ноутбуке — filter_reviews_by_similarity.ipynb.

# Установка и импорт библиотек

In [None]:
!pip install datasets transformers[sentencepiece]

In [None]:
import pandas as pd
import numpy as np
import gc
import os
import random

from tqdm.auto import tqdm

import torch
from torch import nn
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler

from datasets import Dataset, load_dataset
from datasets.dataset_dict import DatasetDict

# Подготовка данных

Загружаем датасет и разбиваем его на обучающую и проверочную выборки

In [None]:
# Tab-separated file with the filtered reviews, fetched straight from GitHub.
url = ('https://raw.githubusercontent.com/chekhovana/courses/main/ml_stepik/'
       '6_final_project/week6_kaggle/data/reviews_filtered_2000.csv')

df = pd.read_csv(url, sep='\t')

# Sequential 80/20 split: the leading rows go to training, the remaining
# trailing rows to validation. Rows are not shuffled before splitting.
n_samples = len(df)
n_train = int(0.8 * n_samples)
n_valid = n_samples - n_train

df_train = df.iloc[:n_train]
df_valid = df.iloc[n_train:]

# Wrap both frames as Hugging Face datasets and group them by split name.
ds_train = Dataset.from_pandas(df_train)
ds_valid = Dataset.from_pandas(df_valid)

raw_datasets = DatasetDict({'train': ds_train, 'validation': ds_valid})

Проверяем сбалансированность выборок - обучающей и проверочной

In [None]:
# Ratio of the label sum to the row count for each split.
# NOTE(review): this equals the share of positive reviews only if 'label'
# is encoded as 0/1 — confirm against the dataset.
balance = df_train['label'].sum() / len(df_train)
print(f'Доля положительных отзывов в обучающей выборке: {balance}')
balance = df_valid['label'].sum() / len(df_valid)
print(f'Доля положительных отзывов в проверочной выборке: {balance}')

Доля положительных отзывов в обучающей выборке: 0.48875
Доля положительных отзывов в проверочной выборке: 0.545


Определяем функции для токенизации текста и создания DataLoader

In [None]:
def tokenize_data(tokenizer, dataset):
    """Tokenize the ``review`` column of *dataset* and switch it to torch format.

    Args:
        tokenizer: Callable tokenizer applied to batches of review texts.
        dataset: Dataset exposing a ``review`` column and, optionally, a
            ``label`` column.

    Returns:
        The tokenized dataset with ``review`` removed, ``label`` (when
        present) renamed to ``labels``, and its format set to ``"torch"``.
    """
    def encode_batch(batch):
        # Truncate long reviews to 512 tokens; no padding is applied here.
        return tokenizer(batch["review"], truncation=True, max_length=512)

    encoded = dataset.map(encode_batch, batched=True)
    # The raw text is no longer needed once it has been tokenized.
    encoded = encoded.remove_columns(['review'])
    # Unlabelled data (e.g. a test set) simply has no column to rename.
    has_label = 'label' in encoded.features
    if has_label:
        encoded = encoded.rename_column("label", "labels")
    encoded.set_format("torch")
    return encoded

def create_dataloader(checkpoint, dataset, batch_size=8, shuffle=False):
    """Build a ``DataLoader`` over *dataset* tokenized for *checkpoint*.

    Args:
        checkpoint: Name or path passed to ``AutoTokenizer.from_pretrained``.
        dataset: Dataset with a ``review`` column (see ``tokenize_data``).
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles samples on every pass.
            Defaults to ``False`` so existing callers keep dataset order,
            which is what evaluation and inference need; pass ``True``
            for a training loader.

    Returns:
        A ``torch.utils.data.DataLoader`` whose batches are padded
        by ``DataCollatorWithPadding``.
    """
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenized_dataset = tokenize_data(tokenizer, dataset)
    # The collator pads when a batch is assembled, using the same tokenizer.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    return DataLoader(tokenized_dataset, shuffle=shuffle,
                      batch_size=batch_size,
                      collate_fn=data_collator)

Вспомогательная функция: генерирует имя каталога для сохранения модели

In [None]:
def generate_folder_name(checkpoint):
    """Turn a checkpoint name into a folder name.

    The name is lower-cased and every ``/`` and ``-`` is replaced with ``_``.
    """
    # One translation table handles both separators in a single pass.
    separators_to_underscore = str.maketrans('/-', '__')
    return checkpoint.lower().translate(separators_to_underscore)

# Обучение модели

Функции для инициализации генераторов псевдослучайных чисел и переключения на GPU

In [None]:
def seed_all(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and switches cuDNN to deterministic mode with autotuning disabled.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42, the
            value that was previously hard-coded.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU, not just the current one; silently ignored
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Trade some speed for reproducible cuDNN kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def get_device():
    """Return the CUDA device when one is available, otherwise the CPU."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

# Module-level device shared by the training and inference cells below;
# the bare expression displays it in the notebook.
device = get_device()
device

device(type='cuda')

Создание класса, реализующего алгоритм дообучения модели под конкретную задачу

In [None]:
class Trainer():
    """Fine-tune a sequence-classification checkpoint and keep the best epoch.

    After every epoch the model is evaluated on the validation split; when
    accuracy improves, the model is saved to the folder returned by
    ``generate_folder_name(checkpoint)``. Training stops early once
    ``patience`` consecutive epochs bring no improvement.

    Relies on the module-level ``device`` for model and batch placement.
    """

    def __init__(self, checkpoint, datasets):
        """Load the data loaders and the model for *checkpoint*.

        Args:
            checkpoint: Name or path of the pretrained checkpoint.
            datasets: Mapping with ``'train'`` and ``'validation'`` splits.
        """
        self.checkpoint = checkpoint
        # NOTE(review): both loaders use create_dataloader's defaults, so
        # training batches are served in dataset order — consider shuffling
        # the training split.
        self.train_loader = create_dataloader(checkpoint, datasets['train'])
        self.eval_loader = create_dataloader(checkpoint, datasets['validation'])
        self.model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
        self.model.to(device)
        self.bar_params = {'bar_format': '{l_bar}{r_bar}'}
        # Created in train() so that they live for the whole run.
        self.optimizer = None
        self.lr_scheduler = None

    def train(self, num_epochs, patience):
        """Run up to *num_epochs* epochs with early stopping.

        Args:
            num_epochs: Maximum number of training epochs.
            patience: Number of consecutive non-improving epochs tolerated
                before training stops.
        """
        self.num_epochs = num_epochs
        self.best_accuracy = 0
        self.patience = patience

        # Build the optimizer and the linear schedule ONCE for the whole run.
        # Recreating them every epoch would reset the learning rate to its
        # initial value and discard AdamW's moment estimates each epoch.
        self.optimizer = AdamW(self.model.parameters(), lr=5e-5)
        num_training_steps = num_epochs * len(self.train_loader)
        self.lr_scheduler = get_scheduler(
            "linear", optimizer=self.optimizer,
            num_warmup_steps=0, num_training_steps=num_training_steps)

        self.train_bar = tqdm(range(len(self.train_loader) * num_epochs), 'train',
                              **self.bar_params)
        self.eval_bar = tqdm(range(len(self.eval_loader)), 'eval',
                             **self.bar_params)

        # Use the checkpoint this trainer was built with, not a global that
        # may be missing or point elsewhere.
        fname = generate_folder_name(self.checkpoint)
        remaining = patience
        for epoch in range(num_epochs):
            self.train_loop()
            accuracy = self.eval_loop()
            print(f'epoch {epoch}, validation accuracy {accuracy}')
            if accuracy > self.best_accuracy:
                self.best_accuracy = accuracy
                print(f'saving model with accuracy {accuracy} to {fname}')
                self.model.save_pretrained(fname)
                # An improvement restores the full patience budget.
                remaining = patience
            else:
                remaining -= 1
                # "<=" also terminates when patience was given as 0 or less.
                if remaining <= 0:
                    print('patience is exhausted')
                    return
            # Release cached memory between epochs.
            gc.collect()
            torch.cuda.empty_cache()

    def train_loop(self):
        """Train for one epoch using the optimizer/scheduler set up by ``train``."""
        self.model.train()
        for batch in self.train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = self.model(**batch)
            loss = outputs.loss
            loss.backward()
            self.optimizer.step()
            self.lr_scheduler.step()
            self.optimizer.zero_grad()
            self.train_bar.update(1)

    def eval_loop(self):
        """Return the accuracy on the validation loader.

        Returns:
            Fraction of correctly predicted labels, or ``0.0`` when the
            validation loader yields no samples.
        """
        self.model.eval()
        total, correct = 0, 0
        self.eval_bar.reset()
        for batch in self.eval_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch['labels']
            with torch.no_grad():
                outputs = self.model(**batch)

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            total += len(labels)
            correct += (predictions == labels).sum().item()
            self.eval_bar.update(1)
        # Guard against an empty validation split.
        return correct / total if total else 0.0

Обучение модели

In [None]:
# Fix the random seeds before the model is instantiated.
seed_all()
checkpoint = 'DeepPavlov/rubert-base-cased'
trainer = Trainer(checkpoint, raw_datasets)
# At most 10 epochs; stop after 5 consecutive epochs without improvement.
trainer.train(num_epochs=10, patience=5)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train:   0%|| 0/2000 [00:00<?, ?it/s]

eval:   0%|| 0/50 [00:00<?, ?it/s]

epoch 0, validation accuracy 0.455
saving model with accuracy 0.455 to deeppavlov_rubert_base_cased
epoch 1, validation accuracy 0.46
saving model with accuracy 0.46 to deeppavlov_rubert_base_cased
epoch 2, validation accuracy 0.6075
saving model with accuracy 0.6075 to deeppavlov_rubert_base_cased
epoch 3, validation accuracy 0.455
epoch 4, validation accuracy 0.7475
saving model with accuracy 0.7475 to deeppavlov_rubert_base_cased
epoch 5, validation accuracy 0.82
saving model with accuracy 0.82 to deeppavlov_rubert_base_cased
epoch 6, validation accuracy 0.825
saving model with accuracy 0.825 to deeppavlov_rubert_base_cased
epoch 7, validation accuracy 0.67
epoch 8, validation accuracy 0.8025
epoch 9, validation accuracy 0.8325
saving model with accuracy 0.8325 to deeppavlov_rubert_base_cased


# Расчет прогноза на тестовой выборке

Загружаем тестовую выборку

In [None]:
import requests
from bs4 import BeautifulSoup

url = 'https://raw.githubusercontent.com/chekhovana/courses/main/ml_stepik/' + \
      '6_final_project/week6_kaggle/data/test.csv'

# Fail loudly on a hung connection or an HTTP error instead of silently
# parsing an error page into an empty review list.
response = requests.get(url, timeout=30)
response.raise_for_status()
content = response.content

# Despite the .csv extension the reviews are extracted from <review> tags.
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
bs = BeautifulSoup(content, 'html.parser')
reviews = [r.text for r in bs.find_all('review')]
test_ds = Dataset.from_dict({'review': reviews})

Восстанавливаем модель с лучшей точностью (она была сохранена в процессе обучения) и используем ее для расчета прогноза

In [None]:
checkpoint = 'DeepPavlov/rubert-base-cased'
# Folder into which training saved the best model.
model_checkpoint = generate_folder_name(checkpoint)
# The tokenizer still comes from the original checkpoint.
test_loader = create_dataloader(checkpoint, test_ds)

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
model.to(device)
model.eval()

predictions = []
# Inference only: gradients are not needed for any batch.
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # The predicted class is the index of the largest logit.
        predictions.append(logits.argmax(dim=-1))

# Merge the per-batch tensors and move the result to a NumPy array.
predictions = torch.cat(predictions).cpu().numpy()
predictions

  0%|          | 0/1 [00:00<?, ?ba/s]

array([0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1])

Сохраняем прогноз в файл для отправки на Kaggle

In [None]:
# One row per prediction; the index becomes the 'Id' column of the CSV.
df = pd.DataFrame(predictions, columns=['y'])
# Truthy class ids become 'pos', falsy ones 'neg'.
df['y'] = df['y'].map(lambda class_id: 'pos' if class_id else 'neg')
df = df.rename_axis('Id')
df.to_csv('submission.csv')