In [None]:
# Mount Google Drive at /content/drive; later cells copy feature CSVs into drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Installs

In [None]:
!pip install -q transformers sentencepiece sentence-transformers catboost

[K     |████████████████████████████████| 4.4 MB 5.1 MB/s 
[K     |████████████████████████████████| 1.2 MB 53.1 MB/s 
[K     |████████████████████████████████| 85 kB 4.8 MB/s 
[K     |████████████████████████████████| 76.6 MB 89 kB/s 
[K     |████████████████████████████████| 6.6 MB 44.3 MB/s 
[K     |████████████████████████████████| 101 kB 9.2 MB/s 
[K     |████████████████████████████████| 596 kB 35.5 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


# Imports

In [None]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments, default_data_collator, DebertaV2Tokenizer, PegasusForConditionalGeneration, PegasusTokenizer
from tqdm.notebook import tqdm
import torch 
from sentence_transformers import SentenceTransformer
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostRegressor
import datetime
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
import numpy as np
import requests
from bs4 import BeautifulSoup

In [None]:
def create_folds(data, target, num_splits=3):
    """Add a ``kfold`` column that assigns every row to a validation fold.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to annotate. It is modified in place and also returned.
    target : str
        Name of the target column. Plain ``KFold`` does not stratify, so the
        column is only checked for presence (when ``num_splits > 1``).
    num_splits : int, default 3
        Number of folds. Values of 1 or less put every row into fold 0.

    Returns
    -------
    pandas.DataFrame
        ``data`` with an integer ``kfold`` column.

    Raises
    ------
    KeyError
        If ``num_splits > 1`` and ``target`` is not a column of ``data``.
    """
    if num_splits > 1:
        if target not in data.columns:
            raise KeyError(target)
        data.loc[:, 'kfold'] = -1
        kfold_pos = data.columns.get_loc('kfold')
        splitter = KFold(n_splits=num_splits, shuffle=True, random_state=42)
        for fold, (_, val_pos) in enumerate(splitter.split(data)):
            # KFold.split yields positional indices, so write by position;
            # label-based .loc is only equivalent for a default RangeIndex.
            data.iloc[val_pos, kfold_pos] = fold
    else:
        data.loc[:, 'kfold'] = 0

    return data

# Parse Features

In [None]:
# Load the raw train and test tables from /content.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train_dataset_train.csv', '/content/test_dataset_test.csv')
)

In [None]:
# Scrape every training article from rbc.ru and collect one [is_image, text] row per article.
train_parse = []
for i in tqdm(range(len(train))):
  # Only the first 24 characters of document_id are used in the article URL.
  url = f'https://www.rbc.ru/rbcfreenews/{train.document_id.iloc[i][:24]}'
  # A timeout keeps one stalled request from hanging the whole scrape.
  response = requests.get(url, timeout=30)
  soup = BeautifulSoup(response.text, 'html.parser')
  # Look the article container up once and reuse it.
  article = soup.find('div', class_='article__text article__text_free')
  is_image = 0
  text = ''
  # A page without the container yields an empty row (keeping train_parse aligned
  # with train) instead of raising AttributeError on None.
  if article is not None:
    if article.find('div', class_='article__main-image'):
      is_image = 1
    for p in article.find_all('p'):
      # Paragraphs that contain a nested <div> are skipped.
      if not p.find('div'):
        text += p.text.strip() + ' '
  train_parse.append([is_image, text])

# Train Features

In [None]:
# Training table joined (by row index) with the columns of train_rbk_parse.csv.
train = pd.read_csv('/content/train_dataset_train.csv')
train = train.join(pd.read_csv('train_rbk_parse.csv'))
# Cast `text` so that every entry is a str.
train = train.assign(text=train['text'].astype(str))

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by ``attention_mask``.

    ``model_output[0]`` is taken as the token embeddings and moved to the CPU.
    The mask is broadcast over the embedding dimension; the divisor is clamped
    to at least 1e-9 so an all-zero mask does not divide by zero.
    """
    hidden = model_output[0].detach().cpu()
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    weighted_total = (hidden * mask).sum(1)
    token_count = mask.sum(1).clamp(min=1e-9)
    return weighted_total / token_count


def make_features_transformers(df, model_name, df_model, col, max_len, use_attention_mask=False):
  """Embed every value of ``df[col]`` with a Hugging Face model and mean-pool the tokens.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the text column.
  model_name : str
      Checkpoint name passed to ``AutoTokenizer`` / ``AutoModel``.
  df_model : str
      Prefix used in the generated column names.
  col : str
      Name of the text column to embed.
  max_len : int
      Every input is padded / truncated to this many tokens.
  use_attention_mask : bool, default False
      When True the attention mask is also passed to the model, so padded
      positions are masked during the forward pass. The default keeps the
      previous behaviour (only ``input_ids`` are passed) so existing feature
      files stay reproducible.
      NOTE(review): with ``padding='max_length'`` the mask should normally be
      passed; switch it on for train and test together, never for one side only.

  Returns
  -------
  pandas.DataFrame
      One row per row of ``df`` (same index) with columns
      ``{df_model}_{col}_feature_{i}``. A frame with no columns is returned
      when ``df`` has no rows.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModel.from_pretrained(model_name).cuda()
  text_features = []
  for sentence in tqdm(df[col]):
    encoded_input = tokenizer([sentence], padding='max_length', truncation=True, max_length=max_len, return_tensors='pt')
    model_inputs = {'input_ids': encoded_input['input_ids'].cuda()}
    if use_attention_mask:
      model_inputs['attention_mask'] = encoded_input['attention_mask'].cuda()
    with torch.no_grad():
      model_output = model(**model_inputs)
    # Pooling happens on the CPU, so the CPU copy of the mask is used here.
    sentence_embeddings = list(mean_pooling(model_output, encoded_input['attention_mask']).numpy())
    text_features.extend(sentence_embeddings)
  if not text_features:
    # No rows: the embedding width is unknown, so return an empty frame rather than index text_features[0].
    return pd.DataFrame(index=df.index)
  columns = [f'{df_model}_{col}_feature_{i}' for i in range(len(text_features[0]))]
  # Reuse df's index so that df.join(result) aligns row by row for any index.
  return pd.DataFrame(text_features, columns=columns, index=df.index)

In [None]:
# Title embeddings for train: one checkpoint at a time, rewriting the CSV after each model.
models = [
    'sberbank-ai/ruRoberta-large',
    'sberbank-ai/sbert_large_nlu_ru',
    'sberbank-ai/sbert_large_mt_nlu_ru',
    'sberbank-ai/ruBert-large',
    'sberbank-ai/ruBert-base',
    'cointegrated/rubert-tiny2',
    'DeepPavlov/rubert-base-cased-conversational',
    'cointegrated/LaBSE-en-ru',
    'microsoft/mdeberta-v3-base',
    'vicgalle/xlm-roberta-large-xnli-anli',
    'MoritzLaurer/mDeBERTa-v3-base-mnli-xnli',
    'facebook/bart-large-mnli',
]
for model_id in models:
  print(model_id)
  # The part after the slash becomes the column-name prefix.
  short_name = model_id.split('/')[1]
  title_features = make_features_transformers(train, model_id, short_name, 'title', 128)
  train = train.join(title_features)
  train.to_csv('rbk_transformers_features.csv', index=False)

In [None]:
# Article-text embeddings for train: (checkpoint, max token length) pairs, CSV rewritten after each model.
models = [
    ('sberbank-ai/ruRoberta-large', 512),
    ('sberbank-ai/sbert_large_nlu_ru', 512),
    ('sberbank-ai/sbert_large_mt_nlu_ru', 512),
    ('sberbank-ai/ruBert-large', 512),
    ('sberbank-ai/ruBert-base', 512),
    ('cointegrated/rubert-tiny2', 2048),
    ('DeepPavlov/rubert-base-cased-conversational', 512),
    ('cointegrated/LaBSE-en-ru', 512),
    ('microsoft/mdeberta-v3-base', 512),
    ('vicgalle/xlm-roberta-large-xnli-anli', 512),
    ('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 512),
    ('facebook/bart-large-mnli', 1024),
]
for model_spec in models:
  print(model_spec)
  model_id, max_len = model_spec
  text_embeddings = make_features_transformers(train, model_id, model_id.split('/')[1], 'text', max_len)
  train = train.join(text_embeddings)
  train.to_csv('rbk_transformers_text_features.csv', index=False)

In [None]:
def make_labse(df, col):
  """Encode every value of ``df[col]`` with sentence-transformers/LaBSE.

  Returns a DataFrame with one row per value and columns named
  ``labse_{col}_feature_{i}``.
  """
  df_model = 'labse'
  encoder = SentenceTransformer('sentence-transformers/LaBSE')
  rows = []
  for sentence in tqdm(df[col]):
    # Each value is encoded on its own as a one-item batch.
    rows += list(encoder.encode([sentence]))
  column_names = [f'{df_model}_{col}_feature_{i}' for i in range(len(rows[0]))]
  return pd.DataFrame(rows, columns=column_names)

In [None]:
# Append LaBSE embeddings of the titles, then rewrite the feature CSV.
labse_title = make_labse(train, 'title')
train = train.join(labse_title)
train.to_csv('rbk_transformers_features.csv', index=False)

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/461 [00:00<?, ?B/s]

  0%|          | 0/7000 [00:00<?, ?it/s]

In [None]:
# Append LaBSE embeddings of the article text, then rewrite the text-feature CSV.
labse_text = make_labse(train, 'text')
train = train.join(labse_text)
train.to_csv('rbk_transformers_text_features.csv', index=False)

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/461 [00:00<?, ?B/s]

  0%|          | 0/7000 [00:00<?, ?it/s]

In [None]:
# Copy rbk_transformers_features.csv from /content into drive/MyDrive (destination is relative to the working directory).
!cp /content/rbk_transformers_features.csv drive/MyDrive/rbk_transformers_features.csv 

In [None]:
# Copy rbk_transformers_text_features.csv from /content into drive/MyDrive (destination is relative to the working directory).
!cp /content/rbk_transformers_text_features.csv drive/MyDrive/rbk_transformers_text_features.csv 

# Test Features

In [None]:
# Test table joined (by row index) with the columns of test_rbk_parse.csv.
test = pd.read_csv('/content/test_dataset_test.csv')
test = test.join(pd.read_csv('test_rbk_parse.csv'))
# Cast `text` so that every entry is a str.
test = test.assign(text=test['text'].astype(str))

In [None]:
def mean_pooling(model_output, attention_mask):
    """Return the attention-mask-weighted mean of the token embeddings.

    The first element of ``model_output`` holds the token embeddings; it is
    detached and moved to the CPU. The divisor is clamped to a minimum of 1e-9.
    """
    token_embeddings = model_output[0].detach().cpu()
    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    numerator = torch.sum(token_embeddings * weights, dim=1)
    denominator = torch.clamp(torch.sum(weights, dim=1), min=1e-9)
    return numerator / denominator


def make_features_transformers(df, model_name, df_model, col, max_len, use_attention_mask=False):
  """Embed every value of ``df[col]`` with a Hugging Face model and mean-pool the tokens.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the text column.
  model_name : str
      Checkpoint name passed to ``AutoTokenizer`` / ``AutoModel``.
  df_model : str
      Prefix used in the generated column names.
  col : str
      Name of the text column to embed.
  max_len : int
      Every input is padded / truncated to this many tokens.
  use_attention_mask : bool, default False
      When True the attention mask is also passed to the model, so padded
      positions are masked during the forward pass. The default keeps the
      previous behaviour (only ``input_ids`` are passed) so existing feature
      files stay reproducible.
      NOTE(review): with ``padding='max_length'`` the mask should normally be
      passed; switch it on for train and test together, never for one side only.

  Returns
  -------
  pandas.DataFrame
      One row per row of ``df`` (same index) with columns
      ``{df_model}_{col}_feature_{i}``. A frame with no columns is returned
      when ``df`` has no rows.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModel.from_pretrained(model_name).cuda()
  text_features = []
  for sentence in tqdm(df[col]):
    encoded_input = tokenizer([sentence], padding='max_length', truncation=True, max_length=max_len, return_tensors='pt')
    model_inputs = {'input_ids': encoded_input['input_ids'].cuda()}
    if use_attention_mask:
      model_inputs['attention_mask'] = encoded_input['attention_mask'].cuda()
    with torch.no_grad():
      model_output = model(**model_inputs)
    # Pooling happens on the CPU, so the CPU copy of the mask is used here.
    sentence_embeddings = list(mean_pooling(model_output, encoded_input['attention_mask']).numpy())
    text_features.extend(sentence_embeddings)
  if not text_features:
    # No rows: the embedding width is unknown, so return an empty frame rather than index text_features[0].
    return pd.DataFrame(index=df.index)
  columns = [f'{df_model}_{col}_feature_{i}' for i in range(len(text_features[0]))]
  # Reuse df's index so that df.join(result) aligns row by row for any index.
  return pd.DataFrame(text_features, columns=columns, index=df.index)

In [None]:
# Title embeddings for test: one checkpoint at a time, rewriting the CSV after each model.
models = ['sberbank-ai/ruRoberta-large', 'sberbank-ai/sbert_large_nlu_ru', 'sberbank-ai/sbert_large_mt_nlu_ru', 'sberbank-ai/ruBert-large', 'sberbank-ai/ruBert-base', 'cointegrated/rubert-tiny2', 'DeepPavlov/rubert-base-cased-conversational', 'cointegrated/LaBSE-en-ru', 'microsoft/mdeberta-v3-base', 'vicgalle/xlm-roberta-large-xnli-anli', 'MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 'facebook/bart-large-mnli']
for m in models:
  print(m)
  # The column name ('title') was missing from this call, which shifted 128 into
  # `col` and left `max_len` unfilled (TypeError). Same arguments as the train cell.
  test = test.join(make_features_transformers(test, m, m.split('/')[1], 'title', 128))
  test.to_csv('test_rbk_transformers_features.csv', index=False)

In [None]:
# Article-text embeddings for test: (checkpoint, max token length) pairs, CSV rewritten after each model.
models = [
    ('sberbank-ai/ruRoberta-large', 512),
    ('sberbank-ai/sbert_large_nlu_ru', 512),
    ('sberbank-ai/sbert_large_mt_nlu_ru', 512),
    ('sberbank-ai/ruBert-large', 512),
    ('sberbank-ai/ruBert-base', 512),
    ('cointegrated/rubert-tiny2', 2048),
    ('DeepPavlov/rubert-base-cased-conversational', 512),
    ('cointegrated/LaBSE-en-ru', 512),
    ('microsoft/mdeberta-v3-base', 512),
    ('vicgalle/xlm-roberta-large-xnli-anli', 512),
    ('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 512),
    ('facebook/bart-large-mnli', 1024),
]
for model_spec in models:
  print(model_spec)
  model_id, max_len = model_spec
  text_embeddings = make_features_transformers(test, model_id, model_id.split('/')[1], 'text', max_len)
  test = test.join(text_embeddings)
  test.to_csv('test_rbk_transformers_text_features.csv', index=False)

('sberbank-ai/ruRoberta-large', 512)


Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.73M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to b

  0%|          | 0/3000 [00:00<?, ?it/s]

('sberbank-ai/sbert_large_nlu_ru', 512)


Downloading:   0%|          | 0.00/323 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/655 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.59G [00:00<?, ?B/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

('sberbank-ai/sbert_large_mt_nlu_ru', 512)


Downloading:   0%|          | 0.00/331 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/752 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.59G [00:00<?, ?B/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

('sberbank-ai/ruBert-large', 512)


Downloading:   0%|          | 0.00/591 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.59G [00:00<?, ?B/s]

Some weights of the model checkpoint at sberbank-ai/ruBert-large were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/3000 [00:00<?, ?it/s]

('sberbank-ai/ruBert-base', 512)


Downloading:   0%|          | 0.00/590 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.70M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/683M [00:00<?, ?B/s]

Some weights of the model checkpoint at sberbank-ai/ruBert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/3000 [00:00<?, ?it/s]

('cointegrated/rubert-tiny2', 2048)


Downloading:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/3000 [00:00<?, ?it/s]

('DeepPavlov/rubert-base-cased-conversational', 512)


Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/3000 [00:00<?, ?it/s]

('cointegrated/LaBSE-en-ru', 512)


Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/492M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/3000 [00:00<?, ?it/s]

('microsoft/mdeberta-v3-base', 512)


Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/534M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/mdeberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/3000 [00:00<?, ?it/s]

('vicgalle/xlm-roberta-large-xnli-anli', 512)


Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

Some weights of the model checkpoint at vicgalle/xlm-roberta-large-xnli-anli were not used when initializing XLMRobertaModel: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at vicgalle/xlm-roberta-large-xnli-anli and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predict

  0%|          | 0/3000 [00:00<?, ?it/s]

('MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 512)


Downloading:   0%|          | 0.00/463 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/532M [00:00<?, ?B/s]

Some weights of the model checkpoint at MoritzLaurer/mDeBERTa-v3-base-mnli-xnli were not used when initializing DebertaV2Model: ['pooler.dense.weight', 'classifier.weight', 'classifier.bias', 'pooler.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/3000 [00:00<?, ?it/s]

('facebook/bart-large-mnli', 1024)


Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/bart-large-mnli were not used when initializing BartModel: ['classification_head.out_proj.bias', 'classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/3000 [00:00<?, ?it/s]

In [None]:
def make_labse(df, col):
  """Build LaBSE sentence-embedding features for the text column ``col`` of ``df``.

  The result has one row per value of ``df[col]`` and columns
  ``labse_{col}_feature_{i}``.
  """
  df_model = 'labse'
  labse = SentenceTransformer('sentence-transformers/LaBSE')
  vectors = []
  for sentence in tqdm(df[col]):
    # One value per encode() call; the single returned vector is appended.
    vectors.extend(list(labse.encode([sentence])))
  width = len(vectors[0])
  feature_names = [f'{df_model}_{col}_feature_{i}' for i in range(width)]
  return pd.DataFrame(vectors, columns=feature_names)

In [None]:
# Append LaBSE embeddings of the titles, then rewrite the feature CSV.
# make_labse requires the column name; it was missing here (TypeError). The file
# written matches the train title-features cell, so 'title' is the column.
test = test.join(make_labse(test, 'title'))
test.to_csv('test_rbk_transformers_features.csv', index=False)

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/461 [00:00<?, ?B/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

In [None]:
# Append LaBSE embeddings of the article text, then rewrite the text-feature CSV.
labse_text = make_labse(test, 'text')
test = test.join(labse_text)
test.to_csv('test_rbk_transformers_text_features.csv', index=False)

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/461 [00:00<?, ?B/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

In [None]:
# Copy test_rbk_transformers_features.csv from /content into drive/MyDrive (destination is relative to the working directory).
!cp /content/test_rbk_transformers_features.csv drive/MyDrive/test_rbk_transformers_features.csv 

In [None]:
# Copy test_rbk_transformers_text_features.csv from /content into drive/MyDrive (destination is relative to the working directory).
!cp /content/test_rbk_transformers_text_features.csv drive/MyDrive/test_rbk_transformers_text_features.csv 

# Base Features

In [None]:
# Start again from the raw tables joined with their *_rbk_parse.csv columns; `text` is cast to str.
train = pd.read_csv('/content/train_dataset_train.csv')
train = train.join(pd.read_csv('train_rbk_parse.csv'))
train = train.assign(text=train['text'].astype(str))
test = pd.read_csv('/content/test_dataset_test.csv')
test = test.join(pd.read_csv('test_rbk_parse.csv'))
test = test.assign(text=test['text'].astype(str))

In [None]:
# Character / word counts of title and text, plus the parsed publication timestamp.
train = train.assign(
    len_title=train['title'].apply(len),
    len_word_title=train['title'].apply(lambda s: len(s.split())),
    len_text=train['text'].apply(len),
    len_word_text=train['text'].apply(lambda s: len(s.split())),
    publish_date=pd.to_datetime(train['publish_date'], format='%Y-%m-%d %H:%M:%S'),
)
# Calendar parts of the timestamp.
published = train['publish_date']
train = train.assign(
    year=published.apply(lambda ts: ts.year),
    month=published.apply(lambda ts: ts.month),
    day=published.apply(lambda ts: ts.day),
    hour=published.apply(lambda ts: ts.hour),
)
# Quarter-of-year and six-hour bucket as strings, and whole days between publication and 2022-06-26.
train = train.assign(
    season=train['month'].apply(lambda mo: (mo - 1) // 3).astype(str),
    part_day=train['hour'].apply(lambda hr: hr // 6).astype(str),
    days_from_public=published.apply(lambda ts: (datetime.datetime(2022, 6, 26) - ts).days),
)

In [None]:
# Character / word counts of title and text, plus the parsed publication timestamp.
test = test.assign(
    len_title=test['title'].apply(len),
    len_word_title=test['title'].apply(lambda s: len(s.split())),
    len_text=test['text'].apply(len),
    len_word_text=test['text'].apply(lambda s: len(s.split())),
    publish_date=pd.to_datetime(test['publish_date'], format='%Y-%m-%d %H:%M:%S'),
)
# Calendar parts of the timestamp.
published = test['publish_date']
test = test.assign(
    year=published.apply(lambda ts: ts.year),
    month=published.apply(lambda ts: ts.month),
    day=published.apply(lambda ts: ts.day),
    hour=published.apply(lambda ts: ts.hour),
)
# Quarter-of-year and six-hour bucket as strings, and whole days between publication and 2022-06-26.
test = test.assign(
    season=test['month'].apply(lambda mo: (mo - 1) // 3).astype(str),
    part_day=test['hour'].apply(lambda hr: hr // 6).astype(str),
    days_from_public=published.apply(lambda ts: (datetime.datetime(2022, 6, 26) - ts).days),
)

In [None]:
# Flatten the stringified author lists of each split into one flat list of names.
# NOTE(review): entries are split on ',' (no space) and only single quotes are stripped,
# so an entry preceded by a space would keep that space and its opening quote - confirm
# against the raw `authors` format.
train_authors = [
    name.strip("'")
    for raw in train.authors
    for name in raw.strip('[]').split(',')
]
test_authors = [
    name.strip("'")
    for raw in test.authors
    for name in raw.strip('[]').split(',')
]

In [None]:
# Authors present in both splits, without the empty name. Built once here instead of
# being rebuilt for every author of every row; the expression is unchanged, so the
# resulting columns and their order are the same as before.
common_authors = set(train_authors) & set(test_authors)  - set([''])
for elem in common_authors:
  train['author_' + elem] = 0
# Count, per row, how often each shared author occurs in that row's author list.
for i in range(len(train)):
  for elem in train.authors.iloc[i].strip('[]').split(','):
    author = elem.strip("'")
    if author in common_authors:
      train.loc[i, 'author_' + author] += 1

  


In [None]:
# Authors present in both splits, without the empty name. Built once here instead of
# being rebuilt for every author of every row; the expression is unchanged, so the
# resulting columns and their order are the same as before.
common_authors = set(train_authors) & set(test_authors) - set([''])
for elem in common_authors:
  test['author_' + elem] = 0
# Count, per row, how often each shared author occurs in that row's author list.
for i in range(len(test)):
  for elem in test.authors.iloc[i].strip('[]').split(','):
    author = elem.strip("'")
    if author in common_authors:
      test.loc[i, 'author_' + author] += 1

In [None]:
# Flatten the stringified tag lists of each split into one flat list of tag names.
train_tags = [
    tag.strip("'")
    for raw in train.tags
    for tag in raw.strip('[]').split(', ')
]
test_tags = [
    tag.strip("'")
    for raw in test.tags
    for tag in raw.strip('[]').split(', ')
]

In [None]:
# Tags present in both splits, without the empty name. Built once here instead of
# being rebuilt for every tag of every row; the expression is unchanged, so the
# resulting columns and their order are the same as before.
common_tags = set(train_tags) & set(test_tags) - set([''])
for elem in common_tags:
  train['tag_' + elem] = 0
# Count, per row, how often each shared tag occurs in that row's tag list.
for i in range(len(train)):
  for elem in train.tags.iloc[i].strip('[]').split(', '):
    tag = elem.strip("'")
    if tag in common_tags:
      train.loc[i, 'tag_' + tag] += 1
# Tag columns whose total over train is below 25 are dropped; the list is kept in
# `drop_tags` because the test cell drops the same columns.
drop_tags = []
for elem in common_tags:
  if train[ 'tag_' +elem].sum() < 25:
    drop_tags.append('tag_' +elem)
train.drop(columns=drop_tags, inplace=True)

  


In [None]:
# Tags present in both splits, without the empty name. Built once here instead of
# being rebuilt for every tag of every row; the expression is unchanged, so the
# resulting columns and their order are the same as before.
common_tags = set(train_tags) & set(test_tags) - set([''])
for elem in common_tags:
  test['tag_' + elem] = 0
# Count, per row, how often each shared tag occurs in that row's tag list.
for i in range(len(test)):
  for elem in test.tags.iloc[i].strip('[]').split(', '):
    tag = elem.strip("'")
    if tag in common_tags:
      test.loc[i, 'tag_' + tag] += 1
# Drop the columns listed in `drop_tags` (set by the train tag cell) that exist in test.
drop_tags = [x for x in drop_tags if x in test.columns]
test.drop(columns=drop_tags, inplace=True)

  


In [None]:
# Submission frame: one row per test document; the three target columns start at the placeholder -1.
submit = pd.DataFrame({'document_id': test['document_id'], 'views': -1, 'depth': -1, 'full_reads_percent': -1})

# Views

In [None]:
# Embedding columns for the `views` model: the first 500 names listed in need_features_views.csv ...
need_features = pd.read_csv('/content/need_features_views.csv').iloc[:500]
title_columns = need_features.feature_names
train_features = pd.read_csv("drive/MyDrive/rbk_transformers_features.csv")[title_columns]
test_features = pd.read_csv("drive/MyDrive/test_rbk_transformers_features.csv")[title_columns]
# ... and the first 1000 names listed in need_text_features_views.csv.
need_text_features = pd.read_csv('/content/need_text_features_views.csv').iloc[:1000]
text_columns = need_text_features.feature_names
train_text_features = pd.read_csv("drive/MyDrive/rbk_transformers_text_features.csv")[text_columns]
test_text_features = pd.read_csv("drive/MyDrive/test_rbk_transformers_text_features.csv")[text_columns]

In [None]:
# Raw identifier / text columns that are not fed to the regressor.
drop_features = ['document_id', 'title', 'session', 'publish_date', 'authors', 'tags', 'text']
# Training matrix: base features plus the selected embedding columns, with a 5-fold `kfold` column.
train_wide = train.join(train_features).join(train_text_features)
df = create_folds(train_wide.drop(columns=drop_features), 'views', num_splits=5)
# Test matrix assembled the same way (no fold column).
test_wide = test.join(test_features).join(test_text_features)
test_df = test_wide.drop(columns=drop_features)

In [None]:
# The three target columns are removed from the inputs; `kfold` stays in both X and y for fold selection.
target_features = ['views', 'depth', 'full_reads_percent']
y = df[['views', 'kfold']]
X = df.drop(columns=target_features)

In [None]:
# Cross-validated CatBoost training for `views`.
regs = []
scores = []
# Number of folds; has to match the num_splits given to create_folds for this frame.
N = 5
# Out-of-fold predictions next to the true values. The placeholder is a float so
# that float predictions can be written into the column without a dtype change.
val_ds = pd.DataFrame({'true': y['views']})
val_ds['pred'] = -1.0
for kfold in range(N):
    X_train, X_test = X[X.kfold!=kfold].drop('kfold', axis=1), X[X.kfold==kfold].drop('kfold', axis=1)
    y_train, y_test = y[y.kfold!=kfold].drop('kfold', axis=1), y[y.kfold==kfold].drop('kfold', axis=1)
    print(X_train.shape)
    print(f'--------------------------------{kfold}-fold-------------------------------')
    reg = CatBoostRegressor(iterations = 1000,
                          loss_function='RMSE',
                          task_type='GPU', eval_metric='R2',
                          verbose=10
                          )

    reg.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model = True, plot = False, cat_features=['category', 'part_day', 'season'])
    # Predict the held-out fold once and reuse the result for the score and the OOF column.
    fold_pred = reg.predict(X_test)
    scores.append(r2_score(y_test, fold_pred))
    val_ds.loc[y_test.index, 'pred'] = fold_pred
    regs.append(reg)


assert len(regs) == N

# Array for the final result: the test prediction is the mean over the fold models.
y_pred = np.zeros(test_df.shape[0])
for reg in regs:
    y_pred += reg.predict(test_df)
y_pred /= N
submit['views'] = y_pred
print('mean R2', r2_score(val_ds['true'], val_ds['pred']))

# Depth

In [None]:
# Feature selection for the `depth` model.
# The first 500 rows of need_features_depth.csv name the columns to keep from
# the transformer feature tables (train and test).
need_features = pd.read_csv('/content/need_features_depth.csv').head(500)
train_features, test_features = (
    pd.read_csv(path)[need_features['feature_names']]
    for path in ("drive/MyDrive/rbk_transformers_features.csv",
                 "drive/MyDrive/test_rbk_transformers_features.csv")
)
# Same idea for the transformer *text* feature tables, keeping the first 1000 names.
need_text_features = pd.read_csv('/content/need_text_features_depth.csv').head(1000)
train_text_features, test_text_features = (
    pd.read_csv(path)[need_text_features['feature_names']]
    for path in ("drive/MyDrive/rbk_transformers_text_features.csv",
                 "drive/MyDrive/test_rbk_transformers_text_features.csv")
)

In [None]:
# Raw identifier / text columns removed from both frames before modelling.
drop_features = ['document_id', 'title', 'session', 'publish_date', 'authors', 'tags', 'text']

# Training frame: base table joined (on index) with both selected feature
# tables, minus the raw columns; create_folds then adds a `kfold` column
# with 5 splits.
df = create_folds(
    train.join(train_features).join(train_text_features).drop(drop_features, axis=1),
    'depth',
    num_splits=5,
)

# Test frame assembled the same way (no folds needed).
test_df = (
    test
    .join(test_features)
    .join(test_text_features)
    .drop(drop_features, axis=1)
)

In [None]:
# Split the fold-annotated frame into model inputs and the `depth` target.
# `kfold` is kept in both so the training loop can select rows per fold.
target_features = ['views', 'depth', 'full_reads_percent']
X = df.drop(target_features, axis=1)
y = df.loc[:, ['depth', 'kfold']]

In [None]:
# Cross-validated CatBoost training for `depth`:
#   1. fit one regressor per fold, validating on the held-out fold;
#   2. collect out-of-fold (OOF) predictions in `val_ds`;
#   3. average the fold models' predictions on `test_df` into `submit['depth']`.
regs = []    # fitted regressor of every fold
scores = []  # R2 of every fold on its own held-out part
N = 5        # number of folds; must match the num_splits given to create_folds
val_ds = pd.DataFrame({'true': y['depth']})
# Float placeholder: the column is filled with float predictions below, and
# writing floats into an integer column is deprecated in recent pandas.
val_ds['pred'] = -1.0
for kfold in range(N):
    # Rows of the current fold are validation data, everything else is training data.
    X_train = X[X.kfold != kfold].drop('kfold', axis=1)
    X_test = X[X.kfold == kfold].drop('kfold', axis=1)
    y_train = y[y.kfold != kfold].drop('kfold', axis=1)
    y_test = y[y.kfold == kfold].drop('kfold', axis=1)
    print(X_train.shape)
    print(f'--------------------------------{kfold}-fold-------------------------------')
    # Disabled in the original cell (kept here for reference): l2_leaf_reg=1, depth=7,
    # min_data_in_leaf=32, max_bin=220, verbose=100, leaf_estimation_iterations=30,
    # od_wait=500, grow_policy='Lossguide', learning_rate=0.05, bootstrap_type='Poisson'.
    reg = CatBoostRegressor(
        iterations=1000,
        loss_function='RMSE',
        task_type='GPU',
        eval_metric='R2',
        verbose=10,
    )

    reg.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        use_best_model=True,
        plot=False,
        cat_features=['category', 'part_day', 'season'],
    )
    # Predict the held-out fold once and reuse it for the score and the OOF table.
    val_pred = reg.predict(X_test)
    scores.append(r2_score(y_test, val_pred))
    val_ds.loc[y_test.index, 'pred'] = val_pred
    regs.append(reg)


assert len(regs) == N

# Accumulator for the final result: mean of the fold models' test predictions.
y_pred = np.zeros(test_df.shape[0])
for reg in regs:
    y_pred += reg.predict(test_df)
y_pred /= N
submit['depth'] = y_pred
# NOTE: despite the label, this is R2 over the pooled out-of-fold predictions,
# not the arithmetic mean of the per-fold values stored in `scores`.
print('mean R2', r2_score(val_ds['true'], val_ds['pred']))

# Percent

In [None]:
# Feature selection for the `full_reads_percent` model.
# The first 500 rows of need_features_percent.csv name the columns to keep from
# the transformer feature tables (train and test).
need_features = pd.read_csv('/content/need_features_percent.csv').head(500)
train_features, test_features = (
    pd.read_csv(path)[need_features['feature_names']]
    for path in ("drive/MyDrive/rbk_transformers_features.csv",
                 "drive/MyDrive/test_rbk_transformers_features.csv")
)
# Same idea for the transformer *text* feature tables, keeping the first 1000 names.
need_text_features = pd.read_csv('/content/need_text_features_percent.csv').head(1000)
train_text_features, test_text_features = (
    pd.read_csv(path)[need_text_features['feature_names']]
    for path in ("drive/MyDrive/rbk_transformers_text_features.csv",
                 "drive/MyDrive/test_rbk_transformers_text_features.csv")
)

In [None]:
# Raw identifier / text columns removed from both frames before modelling.
drop_features = ['document_id', 'title', 'session', 'publish_date', 'authors', 'tags', 'text']

# Training frame: base table joined (on index) with both selected feature
# tables, minus the raw columns; create_folds then adds a `kfold` column
# with 5 splits.
df = create_folds(
    train.join(train_features).join(train_text_features).drop(drop_features, axis=1),
    'full_reads_percent',
    num_splits=5,
)

# Test frame assembled the same way (no folds needed).
test_df = (
    test
    .join(test_features)
    .join(test_text_features)
    .drop(drop_features, axis=1)
)

In [None]:
# Split the fold-annotated frame into model inputs and the
# `full_reads_percent` target.
# `kfold` is kept in both so the training loop can select rows per fold.
target_features = ['views', 'depth', 'full_reads_percent']
X = df.drop(target_features, axis=1)
y = df.loc[:, ['full_reads_percent', 'kfold']]

In [None]:
# Cross-validated CatBoost training for `full_reads_percent`:
#   1. fit one regressor per fold, validating on the held-out fold;
#   2. collect out-of-fold (OOF) predictions in `val_ds`;
#   3. average the fold models' predictions on `test_df` into
#      `submit['full_reads_percent']`.
regs = []    # fitted regressor of every fold
scores = []  # R2 of every fold on its own held-out part
N = 5        # number of folds; must match the num_splits given to create_folds
val_ds = pd.DataFrame({'true': y['full_reads_percent']})
# Float placeholder: the column is filled with float predictions below, and
# writing floats into an integer column is deprecated in recent pandas.
val_ds['pred'] = -1.0
for kfold in range(N):
    # Rows of the current fold are validation data, everything else is training data.
    X_train = X[X.kfold != kfold].drop('kfold', axis=1)
    X_test = X[X.kfold == kfold].drop('kfold', axis=1)
    y_train = y[y.kfold != kfold].drop('kfold', axis=1)
    y_test = y[y.kfold == kfold].drop('kfold', axis=1)
    print(X_train.shape)
    print(f'--------------------------------{kfold}-fold-------------------------------')
    # Unlike the other two targets, this model sets depth, grow_policy and
    # learning_rate explicitly.
    # Disabled in the original cell (kept here for reference): l2_leaf_reg=1,
    # min_data_in_leaf=32, max_bin=220, verbose=100, leaf_estimation_iterations=30,
    # od_wait=500, bootstrap_type='Poisson'.
    reg = CatBoostRegressor(
        iterations=1000,
        loss_function='RMSE',
        depth=5,
        task_type='GPU',
        eval_metric='R2',
        grow_policy='Lossguide',
        learning_rate=0.05,
        verbose=10,
    )

    reg.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        use_best_model=True,
        plot=False,
        cat_features=['category', 'part_day', 'season'],
    )
    # Predict the held-out fold once and reuse it for the score and the OOF table.
    val_pred = reg.predict(X_test)
    scores.append(r2_score(y_test, val_pred))
    val_ds.loc[y_test.index, 'pred'] = val_pred
    regs.append(reg)


assert len(regs) == N

# Accumulator for the final result: mean of the fold models' test predictions.
y_pred = np.zeros(test_df.shape[0])
for reg in regs:
    y_pred += reg.predict(test_df)
y_pred /= N
submit['full_reads_percent'] = y_pred
# NOTE: despite the label, this is R2 over the pooled out-of-fold predictions,
# not the arithmetic mean of the per-fold values stored in `scores`.
print('mean R2', r2_score(val_ds['true'], val_ds['pred']))

In [None]:
# Write the finished submission to disk; the DataFrame index is not exported.
submit.to_csv(path_or_buf='rbk_submit_2.csv', index=False)