In [None]:
# Задача: продемонстрировать различные подходы к решению задачи классификации на основе выборки с отзывами

In [None]:
!pip install pymorphy2

In [None]:
import pymorphy2
from pymorphy2 import MorphAnalyzer

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import Adam, AdamW

from transformers import AutoModel, AutoTokenizer, BertForSequenceClassification,AutoModelForSequenceClassification, BertTokenizerFast, get_linear_schedule_with_warmup, AutoConfig

from typing import Dict
from numpy import asarray
from functools import reduce
from tqdm.notebook import tqdm
import os
import re
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score

torch.manual_seed(42)

import nltk

nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


from nltk.corpus import stopwords


In [None]:
# Labelled reviews and the unlabelled test set live in the same Kaggle dataset folder.
labelled_xlsx = '/kaggle/input/testtaskreviews/DS_task_NLP_20240406/data.xlsx'
unlabelled_xlsx = '/kaggle/input/testtaskreviews/DS_task_NLP_20240406/test.xlsx'

df = pd.read_excel(labelled_xlsx)
test = pd.read_excel(unlabelled_xlsx)


In [None]:
# Keep the original labels in their own column so this cell is idempotent.
# Re-running the original version re-fitted the encoder on already-encoded
# integers, which silently replaced le.classes_ with integers and made
# le.inverse_transform() on the final predictions return numbers, not labels.
if 'sentiment_raw' not in df.columns:
    df['sentiment_raw'] = df['sentiment']

le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment_raw'])

In [None]:
# Preview the first rows after the sentiment column has been label-encoded.
df.head()

In [None]:
# Class distribution of the encoded sentiment labels.
df.sentiment.value_counts()

In [None]:

# Runs of digits and punctuation are replaced by a single space before
# lemmatization. Written as a raw string: the previous non-raw literal relied on
# the invalid escape sequences "\]" and "\-", which trigger a warning on recent
# Python versions. The set of matched characters is unchanged.
patterns = r"[0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
morph = MorphAnalyzer()


def lemmatize(doc):
    """Strip digits/punctuation from ``doc`` and return its space-joined lemmas.

    Args:
        doc: text of a single review.

    Returns:
        A string in which every whitespace-separated token is replaced by the
        lower-cased first normal form reported by ``morph``; a token whose
        normal form is empty is kept as is.
    """
    doc = re.sub(patterns, ' ', doc)
    tokens = []
    # str.split() already drops surrounding whitespace, so no per-token strip().
    for token in doc.split():
        token_norm = morph.normal_forms(token)[0]
        tokens.append(token_norm.lower() if token_norm else token)
    return ' '.join(tokens)
    

In [None]:
# Lemmatized copy of every review, stored next to the raw text.
df['lem'] = df['review'].apply(lemmatize)

In [None]:
# Sanity check: number of missing values in the lemmatized column.
df['lem'].isna().sum()

In [None]:
# Eyeball the first 20 rows to compare raw reviews with their lemmatized form.
df.head(20)


In [None]:
# Word-level TF-IDF over uni-, bi- and trigrams; the matrix is stored as float32.
tf_idf = TfidfVectorizer(lowercase=True, analyzer="word", ngram_range=(1, 3), dtype=np.float32)

In [None]:
# Stratified 85/15 split with a fixed seed.
train, val = train_test_split(
    df,
    test_size=0.15,
    stratify=df['sentiment'],
    random_state=112,
)
train.shape, val.shape

In [None]:
# Targets are shared by every model trained on this split.
y_train, y_val = train['sentiment'], val['sentiment']

# The vocabulary and IDF weights are learned on the training part only.
x_train = tf_idf.fit_transform(train['lem'])
x_val = tf_idf.transform(val['lem'])

In [None]:
def evaluate(model, x, y):
    """Print a per-class report for ``model`` on ``(x, y)`` and return weighted F1.

    Args:
        model: fitted estimator exposing ``predict``.
        x: features passed to ``model.predict``.
        y: true labels for ``x``.

    Returns:
        The weighted-average F1 score of the predictions.
    """
    predicted = model.predict(x)
    report = classification_report(y, predicted)
    print(report)
    weighted_f1 = f1_score(y, predicted, average='weighted')
    return weighted_f1

In [None]:
# Baseline: cross-validated logistic regression, selected by weighted F1.
# When cells run top to bottom, x_train holds the TF-IDF matrix at this point.
logreg = LogisticRegressionCV(max_iter=1000, scoring='f1_weighted')
logreg.fit(x_train, y_train)



In [None]:
# Validation report and weighted F1 of the TF-IDF baseline.
evaluate(logreg, x_val, y_val)

## Взвешенные эмбеддинги fasttext + логистическая регрессия


In [None]:
!pip install fasttext

In [None]:
import fasttext
from huggingface_hub import hf_hub_download

# Download the file "model.bin" from the "facebook/fasttext-ru-vectors" repo on
# the Hugging Face Hub and load it as a fastText model.
model_path = hf_hub_download(repo_id="facebook/fasttext-ru-vectors", filename="model.bin")
model_ft = fasttext.load_model(model_path)



In [None]:
# Map every entry of the fitted TF-IDF vocabulary to its IDF weight.
# The vectorizer was built with ngram_range=(1, 3), so the keys include bi- and
# trigrams in addition to single words.
tfidf_dict = dict(zip(tf_idf.get_feature_names_out(), tf_idf.idf_))

In [None]:
def emb_weigh(text):
    """Return the IDF-weighted average fastText vector of ``text``.

    Each whitespace-separated word vector is multiplied by the word's IDF from
    ``tfidf_dict``; a word missing from that dictionary gets the largest IDF of
    the fitted vectorizer. The weighted vectors are summed and divided by the
    number of words.

    Args:
        text: a (lemmatized) review as a single string.

    Returns:
        A 1-D vector of the fastText model's dimension; all zeros for an empty
        or whitespace-only text.
    """
    words = text.split()
    if not words:
        # Ask the model for its dimension instead of hard-coding 300.
        return np.zeros(model_ft.get_dimension())

    # Loop-invariant: previously recomputed over the whole vocabulary for
    # every single out-of-vocabulary word.
    oov_weight = tf_idf.idf_.max()
    weighted = [
        model_ft.get_word_vector(word) * tfidf_dict.get(word, oov_weight)
        for word in words
    ]
    return np.sum(weighted, axis=0) / len(words)
    
    

In [None]:
# One weighted fastText vector per review, collected into plain Python lists.
x_train = [emb_weigh(review) for review in train['lem']]
x_val = [emb_weigh(review) for review in val['lem']]


In [None]:
# Same classifier as the baseline, trained on the features currently stored in
# x_train (the weighted fastText vectors when cells run top to bottom).
lr_ft  = LogisticRegressionCV(max_iter=1000, scoring='f1_weighted')
lr_ft.fit(x_train, y_train)
evaluate(lr_ft, x_val, y_val)

## Эмбеддинги предложений из предобученного берта  + логистическая регрессия


In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
# Sentence encoder built from the 'cointegrated/rubert-tiny2' checkpoint.
sbert = SentenceTransformer('cointegrated/rubert-tiny2')


In [None]:

# Encode each split with one batched call. The previous per-row
# Series.apply(sbert.encode) ran the model on a single review at a time, which
# is far slower for the same embeddings. The raw (non-lemmatized) text is used.
x_train = sbert.encode(train['review'].to_list(), show_progress_bar=False)
x_val = sbert.encode(val['review'].to_list(), show_progress_bar=False)


In [None]:
# Logistic regression on the features currently stored in x_train (the sentence
# embeddings when cells run top to bottom); max_iter is raised to 3000 here.
lr_bert = LogisticRegressionCV(max_iter = 3000, scoring = 'f1_weighted')
lr_bert.fit(x_train, y_train)
evaluate(lr_bert, x_val, y_val)

## токенизатор rubert-tiny2 + RNN

In [None]:
# токенизатор берта тут используется для простоты и с целью использования преимуществ токенизации трансформеров и уже готового словаря 


In [None]:
# Tokenizer shipped with the 'seara/rubert-tiny2-russian-sentiment' checkpoint;
# it is shared by the LSTM and the BERT experiments below.
tokenizer = AutoTokenizer.from_pretrained('seara/rubert-tiny2-russian-sentiment')

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
class CustomDataset(Dataset):
    """Tokenizes texts on the fly and optionally attaches a class label.

    Every item is a dict with ``'input_ids'`` and ``'attention_mask'`` (1-D
    tensors padded/truncated to ``max_len``) and, when targets were supplied,
    ``'targets'`` (a ``torch.long`` scalar tensor).

    Args:
        texts: iterable of strings.
        targets: iterable of integer labels aligned with ``texts``, or ``None``
            for unlabelled data.
        tokenizer: callable tokenizer that accepts the Hugging Face keyword
            arguments used in ``__getitem__`` and returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: length every sequence is padded or truncated to.

    Raises:
        ValueError: if ``targets`` is given but its length differs from ``texts``.
    """

    def __init__(self, texts, targets, tokenizer, max_len=50):
        self.texts = list(texts)
        self.targets = list(targets) if targets is not None else None
        if self.targets is not None and len(self.targets) != len(self.texts):
            raise ValueError(
                f'texts and targets differ in length: {len(self.texts)} vs {len(self.targets)}'
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Call the tokenizer directly: encode_plus is documented as deprecated
        # in favour of __call__ and takes the same keyword arguments.
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )
        # return_tensors='pt' yields a leading batch axis of size 1; flatten it away.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
        # Explicit None check: list truthiness conflates "no labels" with "empty labels".
        if self.targets is not None:
            item['targets'] = torch.tensor(self.targets[idx], dtype=torch.long)
        return item

In [None]:

# One batch size for every loader. The test loader previously used
# batch_size=1, which only made inference slower: every sequence is padded to
# the same max_len and predictions are computed per sample.
BATCH_SIZE = 216

train_dataset = CustomDataset(texts=train['review'], targets=train['sentiment'], tokenizer=tokenizer)
val_dataset = CustomDataset(texts=val['review'], targets=val['sentiment'], tokenizer=tokenizer)
# The test set has no labels, so its items carry no 'targets' key.
test_dataset = CustomDataset(texts=test['review'], targets=None, tokenizer=tokenizer)

# Only the training loader is shuffled; validation/test order must stay stable
# so that predictions line up with the rows of the source frames.
train_data = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_data = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_data = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
 

In [None]:
VOCAB_SIZE = 83828  # rubert-tiny2 vocab size
EMBEDDING_DIM = 2048


class LSTMClassifier(nn.Module):
    """Embedding -> (bi)LSTM -> mean pooling -> two-layer tanh head.

    Args:
        n_classes: number of output logits.
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size (per direction) and width of the head.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
    """

    def __init__(self, n_classes=3, vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM,
                 hidden_dim=500, n_layers=2, bidirectional=True):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            n_layers,
            bidirectional=bidirectional,
            # nn.LSTM applies dropout only between stacked layers and warns
            # when it is requested for a single-layer network.
            dropout=0.3 if n_layers > 1 else 0.0,
            batch_first=True,
        )
        self.hidden_dim = hidden_dim
        self.output_dim = n_classes
        # The LSTM output is 2 * hidden_dim wide only when it is bidirectional;
        # the head was previously hard-wired to hidden_dim * 2, so
        # bidirectional=False failed with a shape mismatch.
        rnn_out_dim = hidden_dim * (2 if bidirectional else 1)
        self.linear = nn.Linear(rnn_out_dim, self.hidden_dim)
        self.projection = nn.Linear(self.hidden_dim, self.output_dim)
        self.func = nn.Tanh()
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, inputs, attention_mask=None):
        """Return class logits of shape ``(batch, n_classes)``.

        Args:
            inputs: integer token ids of shape ``(batch, seq_len)``.
            attention_mask: optional ``(batch, seq_len)`` tensor with 1 for real
                tokens and 0 for padding. When given, padded positions are
                excluded from the mean over time; when omitted, all positions
                are averaged (the original behaviour).
        """
        embedded = self.embedding(inputs)
        outputs, _ = self.rnn(embedded)

        if attention_mask is None:
            pooled = torch.mean(outputs, dim=1)
        else:
            mask = attention_mask.unsqueeze(-1).to(outputs.dtype)
            # clamp guards against division by zero for an all-padding row
            pooled = (outputs * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

        hidden = self.dropout(self.linear(self.func(pooled)))
        return self.projection(self.func(hidden))
    


In [None]:
def fit(model, train_loader, epochs=None, optimizer=None, lr=2e-5):
    """Train ``model`` for one pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: module called as ``model(input_ids)`` that returns class logits.
        train_loader: iterable of batches with ``'input_ids'`` and ``'targets'`` tensors.
        epochs: unused; kept so existing ``fit(model, loader, epochs)`` calls keep working.
        optimizer: optimizer to step. When omitted a fresh ``Adam`` is created,
            which discards any previous optimizer state, so pass one in when
            training for several epochs.
        lr: learning rate of the optimizer created when ``optimizer`` is None.

    Returns:
        Mean of the per-batch cross-entropy losses.
    """
    model.to(device)
    model.train()

    if optimizer is None:
        optimizer = Adam(model.parameters(), lr=lr)
    loss_fn = torch.nn.CrossEntropyLoss()

    losses = []
    for data in train_loader:
        targets = data["targets"].to(device)
        inputs = data['input_ids'].to(device)

        optimizer.zero_grad()
        loss = loss_fn(model(inputs), targets)
        loss.backward()
        # Clip before the step so that exploding LSTM gradients are bounded.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        losses.append(loss.item())

    return np.mean(losses)


# NOTE(review): this name shadows the built-in eval(); it is kept because
# existing callers use it.
def eval(model, valid_loader):
    """Evaluate ``model`` on ``valid_loader``.

    Prints a classification report and returns ``(loss, score)`` where ``loss``
    is the cross-entropy over all validation samples and ``score`` is the
    weighted F1 of the argmax predictions.
    """
    model.to(device)
    model.eval()

    all_logits = []
    all_labels = []
    with torch.no_grad():
        for data in valid_loader:
            inputs = data['input_ids'].to(device)
            all_logits.append(model(inputs))
            all_labels.append(data["targets"].to(device))

    all_logits = torch.cat(all_logits)
    all_labels = torch.cat(all_labels)

    # One loss over the concatenated logits gives the exact per-sample mean
    # regardless of a smaller last batch.
    loss = torch.nn.CrossEntropyLoss()(all_logits, all_labels).item()
    all_preds = all_logits.argmax(1).tolist()
    y_true = all_labels.cpu()

    score = f1_score(y_true, all_preds, average='weighted')
    print(classification_report(y_true, all_preds, zero_division=0))

    return loss, score


def train_model(model, train_loader, valid_loader, epochs,
                save_path='/kaggle/working/lstm.pt', lr=2e-5):
    """Train for ``epochs`` epochs, checkpointing the best weighted-F1 model.

    Args:
        model: model to train (moved to the global ``device``).
        train_loader: training batches.
        valid_loader: validation batches.
        epochs: number of epochs.
        save_path: file the best ``state_dict`` is written to.
        lr: Adam learning rate.

    Returns:
        The best validation weighted F1 that was reached.
    """
    model.to(device)
    # A single optimizer for the whole run: creating a new Adam inside every
    # epoch (as before) threw away its moment estimates at each epoch boundary.
    optimizer = Adam(model.parameters(), lr=lr)

    best_score = 0
    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        train_loss = fit(model, train_loader, epochs, optimizer=optimizer)
        print(f'Train loss {train_loss}')

        val_loss, val_score = eval(model, valid_loader)
        print(f'Val loss {val_loss} f1_weighted {val_score}')
        print('-' * 10)

        if val_score > best_score:
            best_score = val_score
            torch.save(model.state_dict(), save_path)

    return best_score
            
            


In [None]:
# LSTM classifier with all default hyper-parameters (see LSTMClassifier.__init__).
model = LSTMClassifier()

In [None]:
# Train the LSTM for 30 epochs, checkpointing on validation weighted F1.
train_model(model, train_data, val_data, epochs=30)

## Fine-Tuned Bert (with frozen encoder)

In [None]:
# Settings consumed by BertTrainer.
trainer_config = {

# Initial best validation F1; a checkpoint is written only when it is exceeded.
'save_score': 0,
# Number of epochs run by a single BertTrainer.fit() call.
'n_epochs': 50,
# Device the model and the batches are moved to.
'device': device,
# When True, data loaders are wrapped in tqdm progress bars.
'verbose':True, 
# Weight-decay coefficient.
# NOTE(review): confirm that BertTrainer actually passes this key to its optimizer.
"weight_decay": 1e-4,
}
class BertTrainer:
    """Fine-tunes ``seara/rubert-tiny2-russian-sentiment`` with a fresh 3-class head.

    Config keys:
        n_epochs (required): epochs per ``fit`` call.
        device (required): device for the model and the batches.
        save_score (required): initial best validation F1; a checkpoint is
            written only when a validation F1 exceeds the current best.
        verbose (default True): wrap loaders in tqdm progress bars.
        lr (default 2e-5): AdamW learning rate.
        weight_decay (default 0.01): AdamW weight decay.
        save_path (default 'bert_ft.pt'): directory passed to ``save_pretrained``.
    """

    def __init__(self, config: Dict):
        self.config = config
        self.n_epochs = config['n_epochs']
        self.device = config['device']
        self.model = AutoModelForSequenceClassification.from_pretrained('seara/rubert-tiny2-russian-sentiment', return_dict=True)
        # Input width of the head being replaced, read from the head itself
        # instead of from a hard-coded encoder layer index.
        self.out_features = self.model.classifier.in_features
        self.model.classifier = torch.nn.Linear(self.out_features, 3)
        self.history = None
        self.verbose = config.get('verbose', True)
        self.best_score = config['save_score']
        self.save_path = config.get('save_path', 'bert_ft.pt')

    def fit(self, train_dataloader, val_dataloader, trainable=True):
        """Train for ``self.n_epochs`` epochs and return the model in eval mode.

        A new optimizer and linear-decay scheduler are created on every call and
        ``self.history`` is reset. ``self.best_score`` is updated whenever a
        checkpoint is written, so consecutive ``fit`` calls keep comparing
        against the best score seen so far.

        Args:
            train_dataloader: batches with 'input_ids', 'attention_mask', 'targets'.
            val_dataloader: validation batches with the same keys.
            trainable: unused; kept for backward compatibility.
        """
        self.train_loader = train_dataloader
        # weight_decay from the config used to be ignored; the fallback equals
        # the AdamW default so configs without the key behave as before.
        self.optimizer = AdamW(
            self.model.parameters(),
            lr=self.config.get('lr', 2e-5),
            weight_decay=self.config.get('weight_decay', 0.01),
        )
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=len(self.train_loader) * self.n_epochs
            )
        self.loss_fn = torch.nn.CrossEntropyLoss().to(self.device)

        self.history = {
            'train_loss': [],
            'val_loss': [],
            'val_fscore': []
          }

        for epoch in range(self.n_epochs):
            print(f"Epoch {epoch + 1}/{self.n_epochs}")
            train_info = self.train_epoch(train_dataloader)
            val_info = self.val_epoch(val_dataloader)
            self.history['train_loss'].append(train_info['loss'])
            self.history['val_loss'].append(val_info['loss'])
            self.history['val_fscore'].append(val_info['fscore'])
            if val_info['fscore'] > self.best_score:
                # Persist on the instance: a local copy was lost when fit returned.
                self.best_score = val_info['fscore']
                self.model.save_pretrained(self.save_path)

        return self.model.eval()

    def train_epoch(self, train_dataloader):
        """Run one training pass and return ``{'loss': mean batch loss}``.

        Expects ``self.optimizer``, ``self.scheduler`` and ``self.loss_fn`` to
        have been created (``fit`` does this).
        """
        self.model.to(self.device)
        self.model = self.model.train()
        losses = []
        if self.verbose:
            train_dataloader = tqdm(train_dataloader)
        for batch in train_dataloader:
            ids = batch['input_ids'].to(self.device)
            mask = batch['attention_mask'].to(self.device)
            targets = batch['targets'].to(self.device)
            outputs = self.model(input_ids=ids, attention_mask=mask)
            loss = self.loss_fn(outputs.logits, targets)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            # The schedule is defined per optimizer step, not per epoch.
            self.scheduler.step()
            losses.append(loss.item())
        mean_loss = np.mean(losses)
        print('Train loss: ', mean_loss)
        return {'loss': mean_loss}

    def val_epoch(self, val_dataloader):
        """Evaluate on ``val_dataloader``.

        Prints the loss, weighted F1 and a classification report, and returns
        ``{'fscore': weighted F1, 'loss': cross-entropy over all samples}``.
        """
        self.model.to(self.device)
        self.model.eval()
        all_logits = []
        all_labels = []
        all_preds = []
        if self.verbose:
            val_dataloader = tqdm(val_dataloader)
        with torch.no_grad():
            for batch in val_dataloader:
                ids = batch['input_ids'].to(self.device)
                mask = batch['attention_mask'].to(self.device)
                targets = batch['targets'].to(self.device)
                outputs = self.model(input_ids=ids, attention_mask=mask).logits
                all_logits.append(outputs)
                all_labels.append(targets)
                all_preds.extend(outputs.argmax(1).tolist())
        all_labels = torch.cat(all_labels).to(self.device)
        all_logits = torch.cat(all_logits).to(self.device)
        # Loss over the concatenated logits: exact mean over all validation samples.
        loss = self.loss_fn(all_logits, all_labels).item()
        report = classification_report(all_labels.cpu(), all_preds, zero_division=0)
        fscore = f1_score(all_labels.cpu(), all_preds, average='weighted' )
        print('Val loss:', loss)
        print('F1_score: ', fscore)
        print(report)

        return {
            'fscore': fscore,
            'loss': loss
         }

In [None]:
trainer = BertTrainer(trainer_config)

# Freeze the whole encoder so that only the classification head is trained.
trainer.model.bert.requires_grad_(False)

trainer.fit(train_data, val_data)

In [None]:
# Unfreeze the encoder and fine-tune it briefly on top of the trained head.
for param in trainer.model.bert.parameters():
    param.requires_grad=True
trainer.n_epochs = 10
# Checkpoint only when this phase beats the best validation F1 reached so far.
# The threshold is taken from the recorded history instead of a number copied
# by hand from an earlier run (was 0.767), so the cell reproduces on a fresh kernel.
trainer.best_score = max([trainer.best_score, *trainer.history['val_fscore']])
trainer.fit(train_data, val_data)

## Fine-tuned BERT (with encoder training)

In [None]:
# Warm up the new head with a frozen encoder first, so that the random head
# does not damage the pretrained weights during the first steps.
trainer_ = BertTrainer(trainer_config)
for param in trainer_.model.bert.parameters():
    param.requires_grad=False
trainer_.n_epochs = 5
# Both trainers write to the same checkpoint, so start from the best validation
# F1 of the previous experiment: the saved model is replaced only by a better one.
# Derived from the first trainer's state instead of a hand-copied 0.791.
trainer_.best_score = max([trainer.best_score, *trainer.history['val_fscore']])
trainer_.fit(train_data, val_data)

# Then train the encoder together with the head.
for param in trainer_.model.bert.parameters():
    param.requires_grad=True
trainer_.n_epochs = 25
trainer_.fit(train_data, val_data)

In [None]:
# Reload the checkpoint from '/kaggle/working/bert_ft.pt'.
# NOTE(review): BertTrainer saves to the relative path 'bert_ft.pt', so this
# assumes the notebook's working directory is /kaggle/working — confirm.
saved_model = AutoModelForSequenceClassification.from_pretrained('/kaggle/working/bert_ft.pt')

In [None]:
# Predict a class index for every test review with the reloaded model.
saved_model.to(device)
saved_model.eval()

preds = []
with torch.no_grad():
    for batch in test_data:
        logits = saved_model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
        ).logits
        preds += logits.argmax(1).tolist()
        

In [None]:
# Map the predicted class indices back to the original sentiment labels.
test['predicted'] = le.inverse_transform(preds)


In [None]:
# Spot-check the first 20 test reviews together with their predicted labels.
test.head(20)

In [None]:
# Write the test frame with predictions to 'res.csv' in the working directory.
# The DataFrame index is written as the first column (pandas default).
test.to_csv('res.csv')

## Выводы: 
Целью было не добиться максимальной достижимой метрики, а продемонстрировать разные подходы к задаче, поэтому препроцессинг базовый, а гиперпараметры не подбирались. 

Были реализованы подходы: линейный классификатор на основе tf_idf, линейный классификатор с использованием взвешенных эмбеддингов fasttext, линейный классификатор на эмбеддингах предложений rubert-tiny2, LSTM с токенизацией bert, обучение только классификатора без обучения кодировщика bert, дообучение кодировщика bert. 

Датасет изначально проблемный, т.к. помимо дисбаланса классов содержит отзывы как на русском, так и на английском. Если задача не ставится как мультиязычная, такие отзывы нужно вычищать. Соответственно, если ставится, то нужно ориентироваться на мультиязычные модели или строить разные пайплайны для обработки разных языков. Я не вычищала эти отзывы из обучающей выборки, т.к. тестовая тоже их содержит, и не пыталась реализовать разные пайплайны, т.к. это, кажется, чрезмерно в данной задаче. 

В качестве метрики на несбалансированных данных используется f1_weighted

В качестве бейзлайна лучшую метрику показал fine-tuned Bert, при этом метрика без обучения энкодера лишь на несколько процентов уступает метрике, полученной с дообучением кодировщика. 

Хороший бейзлайн дает tf_idf + логистическая регрессия. Варианты с использованием предобученных векторов проявили себя не очень, вероятно, из-за зашумленности данных англоязычными отзывами. 

  
