In [None]:
class Config: 
    # Notebook-wide settings, read by later cells through the shared `cfg` instance.

    # Path to the Lenta.ru news CSV on Kaggle.
    # NOTE(review): not referenced in the visible cells -- the training loop
    # walks /kaggle/input instead; confirm whether this is still needed.
    datapath = '/kaggle/input/lenta-ru-private-dataset-for-tatar-hackathon/lenta_ru_news_2019_2023.csv'
    
    # Hugging Face model id for NER.
    # NOTE(review): not referenced in the visible cells.
    ner_preset = "Davlan/distilbert-base-multilingual-cased-ner-hrl"
    # Hugging Face model id used to load both the tokenizer and the classifier.
    sentiment_preset = 'cointegrated/rubert-tiny-sentiment-balanced'

    # --- preprocessing ---
    # Constant added to every averaged sentiment score in `mean`.
    bias = 0.2 
    # Fraction of examples held out for validation by train_test_split.
    test_size = 0.25
    
    # --- training ---
    # Per-device batch size for both training and evaluation.
    batch_size = 64
    
# Single shared configuration instance used throughout the notebook.
cfg = Config() 

In [None]:
! pip install -q evaluate

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import torch
from torch.utils.data import DataLoader

from transformers import (AutoTokenizer, 
                          AutoModelForTokenClassification, 
                          AutoModelForSequenceClassification, 
                          pipeline
                         )
from sklearn.model_selection import train_test_split

# Read the Hugging Face token from Kaggle's secret store and expose it through
# the HF_TOKEN environment variable (training is configured with push_to_hub=True).
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HF_TOKEN")

os.environ['HF_TOKEN'] = secret_value_0

# Enable tqdm's pandas integration.
from tqdm import tqdm 
tqdm.pandas() 

# Initialise Weights & Biases in 'disabled' mode.
import wandb
wandb.init(mode='disabled')

# Prefer the GPU when one is available.
# NOTE(review): `device` is not referenced in the visible cells.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
# Gather the full path of every file found anywhere beneath /kaggle/input,
# in the order os.walk visits them.
data_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]

In [None]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint (cfg.sentiment_preset). Both are module-level objects
# shared by the preprocessing and training cells below.
tokenizer = AutoTokenizer.from_pretrained(cfg.sentiment_preset)
model = AutoModelForSequenceClassification.from_pretrained(cfg.sentiment_preset)

# DATA

In [None]:
import ast 
import datasets 

def mean(i, bias=None):
    """Return the arithmetic mean of ``i`` shifted by a constant bias.

    Args:
        i: Sequence of numeric sentiment scores.
        bias: Offset added to the mean. When ``None`` (the default) the
            notebook-level ``cfg.bias`` is used, which keeps existing
            single-argument calls unchanged.

    Returns:
        ``sum(i) / len(i) + bias``, or ``0`` (without bias) when ``i`` is
        empty or is not a sized collection of numbers.
    """
    try:
        average = sum(i) / len(i)
    except (ZeroDivisionError, TypeError):
        # Only "no usable scores" is treated as 0; unrelated failures such as
        # a missing `cfg` are no longer silently hidden by a bare `except`.
        return 0
    return average + (cfg.bias if bias is None else bias)

def get_inputs_outputs(item):
    """Build model inputs and target scores for a single news row.

    ``item['organizations_sentiments']`` is a string containing a Python
    literal; it is parsed with ``ast.literal_eval`` into a sequence of
    entries, each read through its ``'word'`` and ``'sentiment'`` keys.

    Returns:
        A ``(inputs, outputs)`` tuple of two parallel lists: one prompt per
        entry (the entry's word prefixed to the article text) and the
        corresponding ``mean`` of that entry's sentiment values.
    """
    article = item['text']
    entries = ast.literal_eval(item['organizations_sentiments'])

    prompts = []
    for entry in entries:
        prompts.append(f"[focus: {entry['word']}] \n{article}")

    targets = []
    for entry in entries:
        targets.append(mean(entry['sentiment']))

    return prompts, targets

In [None]:
def pp1(item):
    """Convert a continuous sentiment score into a 3-class label.

    Mapping:
        score >  0.5          -> 2
        -0.5 <= score <= 0.5  -> 1
        score < -0.5          -> 0

    The boundary values 0.5 and -0.5 previously matched no branch and raised
    ``UnboundLocalError``; they now fall into the middle class.

    Args:
        item: Mapping with a numeric ``'score'`` entry.

    Returns:
        ``{'label': <0-dim torch tensor holding the class index>}``.

    Raises:
        ValueError: If the score is NaN, which cannot be assigned a class.
    """
    score = item['score']

    # NaN is the only value that is not equal to itself.
    if score != score:
        raise ValueError("cannot derive a label from a NaN score")

    if score > .5:
        label = 2
    elif score < -.5:
        label = 0
    else:
        label = 1

    return {
        'label': torch.tensor(int(label))
    }

def pp2(item):
    """Tokenize ``item['text']`` with the module-level ``tokenizer``.

    Padding and truncation are enabled and PyTorch tensors are requested;
    the tokenizer's output is returned unchanged.
    """
    encoded = tokenizer(
        item['text'],
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    return encoded

def to_dataset(X, y):
    """Wrap texts and scores in a ``datasets.Dataset`` and preprocess it.

    The dataset is built with columns ``'text'`` (from ``X``) and ``'score'``
    (from ``y``), then ``pp1`` is mapped per example and ``pp2`` is mapped in
    batched mode.
    """
    raw = datasets.Dataset.from_dict({'text': X, 'score': y})
    labelled = raw.map(pp1)
    tokenized = labelled.map(pp2, batched=True)
    return tokenized

def train_val_datasets(X, y, random_state=None):
    """Split examples into train/validation parts and build both datasets.

    Args:
        X: Input texts.
        y: Target scores, parallel to ``X``.
        random_state: Seed forwarded to ``train_test_split``. Pass an integer
            to make the split reproducible across runs; the default ``None``
            keeps the previous unseeded behaviour.

    Returns:
        ``(train_dataset, val_dataset)``, each produced by ``to_dataset``.
        The validation share is ``cfg.test_size``.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X,
        y,
        test_size=cfg.test_size,
        random_state=random_state,
    )

    return to_dataset(X_train, y_train), to_dataset(X_val, y_val)

In [None]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer

# Collator built from the shared tokenizer; handed to the Trainer in start_train.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    # Output location and Hub publishing.
    output_dir="lenta-ru-sentiments",
    push_to_hub=True,
    # Optimisation hyper-parameters.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    # The same batch size (cfg.batch_size) is used for training and evaluation.
    per_device_train_batch_size=cfg.batch_size,
    per_device_eval_batch_size=cfg.batch_size,
    # Evaluate and checkpoint once per epoch, keep a single checkpoint on
    # disk and reload the best one when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
)

def start_train(train_dataset, eval_dataset):
    """Fine-tune the module-level ``model`` on the given datasets.

    A ``Trainer`` is built from the notebook-level ``model``,
    ``training_args``, ``tokenizer``, ``data_collator`` and
    ``compute_metrics`` (looked up when this function is called), then
    ``train()`` is run.

    Args:
        train_dataset: Dataset used for training.
        eval_dataset: Dataset used for evaluation.

    Returns:
        The ``Trainer`` after training. Previously the trainer was discarded,
        so callers could not evaluate, predict or save with it; callers that
        ignore the return value are unaffected.
    """
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return trainer

In [None]:
# Show the loaded model's configuration as this cell's output.
model.config

In [None]:
import evaluate

# Load the "accuracy" metric from the `evaluate` library; compute_metrics uses it.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(predictions, labels)``. The
            predictions are reduced with ``argmax`` over axis 1 before being
            compared with the labels.

    Returns:
        Whatever the module-level ``accuracy`` metric's ``compute`` returns.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    result = accuracy.compute(predictions=predicted_classes, references=references)
    return result

In [None]:
# Train on every CSV found under /kaggle/input, one file at a time.
for f in data_files:
    # Files that cannot be read as CSV are skipped. The previous bare
    # `except: break` stopped the whole loop at the first such file, silently
    # dropping every file that came after it.
    try:
        df = pd.read_csv(f)
    except (OSError, UnicodeDecodeError,
            pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        print(f"Skipping {f}: {err}")
        continue

    # Flatten each row's per-organization prompts and scores into two
    # parallel lists.
    X = list()
    y = list()

    for _, item in df.iterrows():
        inputs, outputs = get_inputs_outputs(item)
        X.extend(inputs)
        y.extend(outputs)

    # train_test_split cannot produce a non-empty train and validation part
    # from fewer than two examples.
    if len(X) < 2:
        print(f"Skipping {f}: only {len(X)} example(s) extracted")
        continue

    train_dataset, eval_dataset = train_val_datasets(X, y)
    start_train(train_dataset, eval_dataset)