In [None]:
import pandas as pd
from collections import Counter
import fasttext
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler, SequentialSampler)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

#Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#BERT
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import re
import numpy as np
import math
from functools import reduce

import time
import datetime
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.preprocessing import normalize


#import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('snowball_data')
#nltk.download('perluniprops')
#nltk.download('universal_tagset')
#nltk.download('stopwords')
#nltk.download('nonbreaking_prefixes')
#nltk.download('wordnet')
#from nltk import tokenize
#from nltk.tokenize import word_tokenize

import time
import random
import gc

## Data Preparation

In [None]:
# News articles to be labelled; their 'content' column is passed to the trained
# classifier at the end of the notebook.
test_news = pd.read_csv('/kaggle/input/testnews/test_news.csv')

In [None]:
# lenta.ru: one CSV export per news section.
# The trailing comment on each path is the class id that section is given in
# the `target` column further down.
(care, ussr, police, science, sport, tourism, society, economy) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/lentaru/care_lenta_2012-03-01_2023-12-22.csv',      # 5
        '/kaggle/input/lentaru/ex_ussr_lenta_2012-03-01_2023-12-22.csv',   # 3
        '/kaggle/input/lentaru/police_lenta_2012-03-01_2023-12-22.csv',    # 2
        '/kaggle/input/lentaru/science_lenta_2012-03-01_2023-12-22.csv',   # 8
        '/kaggle/input/lentaru/sport_lenta_2012-03-01_2023-12-22.csv',     # 4
        '/kaggle/input/lentaru/tourism_lenta_2012-03-01_2023-12-22.csv',   # 7
        '/kaggle/input/lentaru/society_lenta_2012-03-01_2023-12-22.csv',   # 0
        '/kaggle/input/lentaru/economy_lenta_2012-03-01_2023-12-22.csv',   # 1
    )
)

In [None]:
# ria.ru
# File listings of each topic directory, captured from the shell
# (`name = !ls ...` stores the command's output lines in `name`).
realty_list = !ls /kaggle/input/ria-custom-topics/ria_realty
construction_list = !ls /kaggle/input/ria-custom-topics/ria_construction
health_list = !ls /kaggle/input/ria-custom-topics/ria_health
society_list= !ls /kaggle/input/ria-custom-topics/ria_society

# Topic name -> sub-directory (relative to `base`) holding that topic's CSV files.
cat_dict = {'realty':'ria_realty/', 'construction':'ria_construction/', 'health':'ria_health/', 'society' :'ria_society/'}

# Root directory of the RIA dataset; ria_set() reads this global when it is called.
base = '/kaggle/input/ria-custom-topics/'

# Compiled once instead of on every call. Written as a raw string because
# `\s` and `\/` are regex escapes, not valid Python string escapes.
_RIA_SOURCE_PATTERN = re.compile(
    r'([-—]\sРИА\sН(едвижимость|овости)\.)|(\/\sРадио\sSputnik\.)|(\n)'
)


def clean_ria_source(somestring):
    """Strip agency source markers and newlines from an article text.

    Removes every occurrence of:
      * a hyphen or em dash followed by "РИА Новости." or "РИА Недвижимость.",
      * "/ Радио Sputnik.",
      * newline characters.

    Args:
        somestring: article text. Non-string values (e.g. the NaN pandas
            produces for an empty CSV cell) are returned unchanged instead of
            raising TypeError inside `re`.

    Returns:
        The cleaned string, or the input itself when it is not a string.
    """
    if not isinstance(somestring, str):
        return somestring
    return _RIA_SOURCE_PATTERN.sub('', somestring)

def ria_set(some, cat):
    """Load all CSV files of one RIA topic into a single cleaned DataFrame.

    Args:
        some: iterable of CSV file names located in the topic directory.
        cat: key of `cat_dict` selecting the topic sub-directory under `base`.

    Returns:
        DataFrame with columns 'id' and 'content' renamed to 'docid' and
        'text', and with 'text' passed through clean_ria_source(). The
        original per-file row indices are kept (no index reset).

    Raises:
        ValueError: if `some` lists no files.
        KeyError: if `cat` is not a key of `cat_dict`.
    """
    frames = [pd.read_csv(base + cat_dict[cat] + str(item)) for item in some]
    if not frames:
        raise ValueError(f'no CSV files listed for category {cat!r}')

    # One concat instead of re-copying the accumulated frame on every iteration.
    # The list is reversed so the last-listed file comes first, which is the row
    # order the earlier prepend-in-a-loop implementation produced.
    res = pd.concat(frames[::-1])
    res = res.rename(columns={'id': 'docid', 'content': 'text'})
    res['text'] = res['text'].apply(clean_ria_source)
    return res

In [None]:
# One cleaned frame per RIA topic, built from that topic's file listing.
realty, construction, health, ria_society = (
    ria_set(file_list, topic)
    for file_list, topic in (
        (realty_list, 'realty'),
        (construction_list, 'construction'),
        (health_list, 'health'),
        (society_list, 'society'),
    )
)

In [None]:
#iz.ru
construction_list_iz = !ls /kaggle/input/iz-construction
base = '/kaggle/input/iz-construction/'
res = pd.DataFrame()
for item in construction_list_iz:
     res = pd.concat([pd.read_csv(base+str(item)), res])
        
construction_iz = res.rename(columns={'id': 'docid', 'content':'text'})
construction_iz['text'] =  construction_iz['text'].apply(lambda x: clean_ria_source(x))



In [None]:
# Quick look at the first rows of the iz.ru frame after renaming and cleaning.
construction_iz.head()

In [None]:
# Inspect the cleaned article texts from iz.ru.
construction_iz['text']

In [None]:
# Inspect the cleaned article texts of the RIA realty topic.
realty['text']

In [None]:
# Fold the extra sources into the class they belong to:
#   realty  <- RIA realty + RIA construction + iz.ru construction
#   society <- lenta.ru society + RIA society
#   care    <- RIA health + lenta.ru care
# NOTE: each name is rebound to the merged frame, so running this cell a second
# time appends the extra sources again.
realty = pd.concat([realty, construction, construction_iz], ignore_index=True)
society = pd.concat([society, ria_society], ignore_index=True)
care = pd.concat([health, care], ignore_index=True)

In [None]:
# Write the integer class id of every topic frame into a `target` column.
for frame, class_id in (
    (care, 5),
    (ussr, 3),
    (police, 2),
    (science, 8),
    (sport, 4),
    (tourism, 7),
    (society, 0),
    (economy, 1),
    (realty, 6),
):
    frame.loc[:, 'target'] = class_id


In [None]:
# The three frames merged from several sources may hold the same article twice;
# drop rows that repeat both text and title (in place).
for merged_frame in (realty, care, society):
    merged_frame.drop_duplicates(['text', 'title'], inplace=True)

In [None]:
# Row count of every class frame, in the order listed.
lens = [
    len(frame)
    for frame in (care, ussr, police, science, sport, tourism, society, economy, realty)
]
lens

In [None]:
# Size of the smallest class; used as the base sample size when balancing.
minlen = min(lens)


In [None]:
# Balance the training set: every class is down-sampled to the size of the
# smallest class (`minlen`); police keeps up to 2x and society up to 3x that.
SAMPLE_SEED = 112  # fixed seed (same value as the train/test split) so sampling is reproducible
n = minlen


def _downsample(frame, size):
    """Return a seeded random sample of `size` rows, capped at len(frame).

    The cap prevents the ValueError that DataFrame.sample raises when asked
    for more rows than the frame holds (possible for the 2x / 3x classes).
    """
    return frame.sample(min(int(size), len(frame)), random_state=SAMPLE_SEED)


care = _downsample(care, n)
ussr = _downsample(ussr, n)
police = _downsample(police, 2 * n)
science = _downsample(science, n)
sport = _downsample(sport, n)
tourism = _downsample(tourism, n)
society = _downsample(society, 3 * n)
economy = _downsample(economy, n)
realty = _downsample(realty, n)

In [None]:
# Columns retained in the combined dataset; every other column is dropped.
cols_to_keep = ['docid','url', 'title', 'text', 'target']

In [None]:
# Stack all class frames under a fresh 0..N-1 index and keep only the shared columns.
df = pd.concat(
    [care, ussr, police, science, sport, tourism, society, economy, realty],
    ignore_index=True,
)[cols_to_keep]

In [None]:
# Number of rows per class id after balancing.
df.target.value_counts()

In [None]:
# Number of columns that contain at least one missing value (0 = no NaN anywhere).
df.isna().any().sum()

In [None]:
# Preview of the combined dataset.
df.head()

In [None]:
# Stratified 90/10 split so both parts keep the class proportions of `df`.
data_train, data_test = train_test_split(
    df,
    test_size=0.1,
    stratify=df['target'],
    random_state=112,
)

In [None]:
# Persist both splits to the working directory. With the default to_csv
# settings the DataFrame index is written as well.
data_train.to_csv('data_train.csv')
data_test.to_csv('data_test.csv')

## BERT

In [None]:
# --- BERT fine-tuning configuration ---
MODEL_PATH = 'cointegrated/rubert-tiny2'      # model id used for both weights and tokenizer
SAVE_PATH = '/kaggle/working/tynyrubert2.pt'  # file the best model is saved to
MAX_LEN = 2048                                # token length each text is padded / truncated to
BATCH_SIZE = 2                                # training batch size
EPOCHS = 1                                    # passes over the training set


In [None]:


from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Dataset that tokenises one text per item, on access.

    `texts` and `targets` are indexed positionally with the same integer
    index, so they must be parallel sequences.
    """

    def __init__(self, texts, targets, tokenizer, max_len=MAX_LEN):
        """Store the raw inputs; no tokenisation happens here."""
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = texts
        self.targets = targets

    def __len__(self):
        """Number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the text, its token ids, attention mask and label tensor."""
        raw_text = str(self.texts[idx])
        label = self.targets[idx]

        # Pad / truncate to exactly `max_len` tokens and return PyTorch tensors.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        sample = {'text': raw_text}
        # flatten() turns each returned tensor into a 1-D tensor.
        sample['input_ids'] = encoded['input_ids'].flatten()
        sample['attention_mask'] = encoded['attention_mask'].flatten()
        sample['targets'] = torch.tensor(label, dtype=torch.long)
        return sample

In [None]:
class BertClassifier:
    """Fine-tune a pretrained BERT model for single-label text classification.

    Usage: construct, call `preparation(...)` with train / validation texts
    and integer labels, call `train()`, then `predict(text)` per document.
    """

    def __init__(self, model_path=MODEL_PATH, tokenizer_path=MODEL_PATH, n_classes=9, epochs=EPOCHS, model_save_path=SAVE_PATH):
        """Load model and tokenizer and attach a new classification layer.

        Args:
            model_path: name or path given to
                BertForSequenceClassification.from_pretrained.
            tokenizer_path: name or path given to BertTokenizer.from_pretrained.
            n_classes: output size of the new linear classification layer.
            epochs: number of passes `train()` makes over the training loader.
            model_save_path: file the best model is saved to during `train()`.
        """
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        # Fix: honour `tokenizer_path`; the global MODEL_PATH used to be
        # hard-coded here, silently ignoring the argument.
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model_save_path = model_save_path
        self.max_len = MAX_LEN
        self.epochs = epochs
        # Input width of the new head = output width of an encoder layer's dense block.
        self.out_features = self.model.bert.encoder.layer[1].output.dense.out_features
        self.model.classifier = torch.nn.Linear(self.out_features, n_classes)
        self.model.to(self.device)

    def preparation(self, X_train, y_train, X_valid, y_valid):
        """Build datasets, data loaders, optimizer, LR scheduler and loss.

        Args:
            X_train, X_valid: positionally indexable sequences of texts.
            y_train, y_valid: parallel sequences of integer class ids.
        """
        # create datasets
        self.train_set = CustomDataset(X_train, y_train, self.tokenizer)
        self.valid_set = CustomDataset(X_valid, y_valid, self.tokenizer)

        # create data loaders
        self.train_loader = DataLoader(self.train_set, batch_size=BATCH_SIZE, shuffle=True)
        self.valid_loader = DataLoader(self.valid_set, batch_size=1, shuffle=True)

        # helpers initialization
        self.optimizer = AdamW(self.model.parameters(), lr=2e-5, correct_bias=False)
        # Linear decay over all training steps, no warm-up.
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=len(self.train_loader) * self.epochs
        )
        self.loss_fn = torch.nn.CrossEntropyLoss().to(self.device)

    def fit(self):
        """Run one training epoch over `self.train_loader`.

        Returns:
            (train_acc, train_loss): accuracy as a float64 tensor and the mean
            of the per-batch losses.
        """
        self.model = self.model.train()
        losses = []
        correct_predictions = 0

        for data in self.train_loader:
            input_ids = data["input_ids"].to(self.device)
            attention_mask = data["attention_mask"].to(self.device)
            targets = data["targets"].to(self.device)

            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            preds = torch.argmax(outputs.logits, dim=1)
            loss = self.loss_fn(outputs.logits, targets)

            correct_predictions += torch.sum(preds == targets)

            losses.append(loss.item())

            loss.backward()
            # Clip the gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()

        train_acc = correct_predictions.double() / len(self.train_set)
        train_loss = np.mean(losses)

        return train_acc, train_loss

    def eval(self):
        """Evaluate on `self.valid_loader` without computing gradients.

        Returns:
            (val_acc, val_loss): accuracy as a float64 tensor and the mean of
            the per-batch losses.
        """
        self.model = self.model.eval()
        losses = []
        correct_predictions = 0

        with torch.no_grad():
            for data in self.valid_loader:
                input_ids = data["input_ids"].to(self.device)
                attention_mask = data["attention_mask"].to(self.device)
                targets = data["targets"].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

                preds = torch.argmax(outputs.logits, dim=1)
                loss = self.loss_fn(outputs.logits, targets)
                correct_predictions += torch.sum(preds == targets)
                losses.append(loss.item())

        val_acc = correct_predictions.double() / len(self.valid_set)
        val_loss = np.mean(losses)
        return val_acc, val_loss

    def train(self):
        """Train for `self.epochs` epochs and keep the best model.

        After each epoch the model is saved to `self.model_save_path` if its
        validation accuracy beats the best seen so far; the saved model is
        reloaded at the end.
        """
        # Start below any reachable accuracy so the first epoch is always saved.
        # Starting at 0 meant an all-wrong epoch saved nothing, and the reload
        # below then failed or picked up a stale file from an earlier run.
        best_accuracy = -1.0
        for epoch in range(self.epochs):
            print(f'Epoch {epoch + 1}/{self.epochs}')
            train_acc, train_loss = self.fit()
            print(f'Train loss {train_loss} accuracy {train_acc}')

            val_acc, val_loss = self.eval()
            print(f'Val loss {val_loss} accuracy {val_acc}')
            print('-' * 10)

            if val_acc > best_accuracy:
                torch.save(self.model, self.model_save_path)
                best_accuracy = val_acc

        # The file is a full pickled module written by this object just above,
        # so it is trusted; weights_only=False is required for that on PyTorch
        # versions where weights-only loading is the default.
        self.model = torch.load(
            self.model_save_path,
            map_location=self.device,
            weights_only=False,
        )

    def predict(self, text):
        """Predict the class id of a single text.

        Args:
            text: document to classify; converted with str() first, as
                CustomDataset does for training samples.

        Returns:
            The index of the highest logit (a numpy integer scalar).
        """
        encoding = self.tokenizer.encode_plus(
            str(text),
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = encoding['input_ids'].flatten().to(self.device)
        attention_mask = encoding['attention_mask'].flatten().to(self.device)

        # Inference only: disable dropout and skip building the autograd graph.
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(
                input_ids=input_ids.unsqueeze(0),
                attention_mask=attention_mask.unsqueeze(0)
            )

        prediction = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]

        return prediction
        

In [None]:
# Classifier with all defaults: MODEL_PATH weights and tokenizer, 9 classes, EPOCHS epochs.
tinybert = BertClassifier()


In [None]:
# Plain lists are passed because CustomDataset indexes by position (0..len-1),
# while the split frames keep their original, non-contiguous index labels.
tinybert.preparation(
    X_train=data_train['text'].tolist(),
    y_train=data_train['target'].tolist(),
    X_valid=data_test['text'].tolist(),
    y_valid=data_test['target'].tolist(),
)

In [None]:
# Fine-tune; prints train / validation loss and accuracy after each epoch.
tinybert.train()

In [None]:
# Predict a topic id for every test article and write them to CSV, with the
# index column named 'index'.
res = test_news['content'].apply(tinybert.predict)
new = pd.DataFrame({'topic': pd.Series(res)})
new.index.name = 'index'
new.to_csv('tinybert_fin.csv')

In [None]:
# Per-class precision / recall / F1 on the 10% split held out by train_test_split.
preds = data_test['text'].apply(tinybert.predict)
report = classification_report(data_test['target'], preds)
print(report)