In [1]:
pip install razdel



In [2]:
pip install pymorphy3



In [3]:
pip install WordCloud



In [4]:
pip install datasets



In [5]:
pip install graphviz



#Библиотеки

In [6]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import warnings
from collections import Counter
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score


import nltk
from nltk.corpus import stopwords
from razdel import tokenize as razdel_tokenize
import pymorphy3
from wordcloud import WordCloud


import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoModel

from datasets import Dataset as HFDataset

from graphviz import Digraph

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

if torch.backends.mps.is_available():
    device = torch.device("mps")
    torch.mps.manual_seed(RANDOM_STATE)
elif torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(RANDOM_STATE)
else:
    device = torch.device("cpu")

print(f"✅ Используемое устройство: {device}")


✅ Используемое устройство: cuda


#Первоначальная обработка данных

In [7]:
def prepare_data(df: pd.DataFrame, text_cols: list) -> pd.DataFrame:
  prepared_df = df.copy()

  prepared_df['skills'] = prepared_df['skills'].fillna('')

  prepared_df['full_text'] = prepared_df[text_cols].agg(' | '.join, axis=1)

  prepared_df['full_text'] = prepared_df['full_text'].str.replace(r'<[^>]+>', ' ', regex=True)
  prepared_df['full_text'] = prepared_df['full_text'].str.replace(r'\n\n+', '\n', regex=True)
  prepared_df['full_text'] = prepared_df['full_text'].str.replace(r'\t+', ' ', regex=True)
  prepared_df['full_text'] = prepared_df['full_text'].str.replace(r' +', ' ', regex=True)

  return prepared_df

In [8]:
full_train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [9]:
full_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16667 entries, 0 to 16666
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            16667 non-null  object 
 1   location         16667 non-null  object 
 2   company          16667 non-null  object 
 3   skills           10842 non-null  object 
 4   description      16667 non-null  object 
 5   experience_from  16667 non-null  float64
 6   salary_from      16667 non-null  float64
 7   log_salary_from  16667 non-null  float64
dtypes: float64(3), object(5)
memory usage: 1.0+ MB


In [10]:
full_train_df.isna().sum()

Unnamed: 0,0
title,0
location,0
company,0
skills,5825
description,0
experience_from,0
salary_from,0
log_salary_from,0


In [11]:
test_df.isna().sum()

Unnamed: 0,0
title,0
location,0
company,0
skills,2014
description,0
experience_from,0


In [12]:
full_train_df = prepare_data(full_train_df, ['title', 'location', 'company', 'skills', 'description'])

In [13]:
test_df = prepare_data(test_df, ['title', 'location', 'company', 'skills', 'description'])

In [14]:
train_df, val_df = train_test_split(
    full_train_df,
    test_size=0.2,
    random_state=42
)

In [15]:
y_train = train_df['log_salary_from']
y_val = val_df['log_salary_from']

In [16]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
save_path = "/content/drive/MyDrive/train_df.csv"
train_df.to_csv(save_path, index=False, encoding='utf-8')

In [20]:
train_df.to_csv("train_df.csv")

In [21]:
val_df.to_csv("val_df.csv")

In [22]:
test_df.to_csv("test_df.csv")

In [18]:
save_path = "/content/drive/MyDrive/val_df.csv"
val_df.to_csv(save_path, index=False, encoding='utf-8')

In [19]:
save_path = "/content/drive/MyDrive/test_df.csv"
test_df.to_csv(save_path, index=False, encoding='utf-8')

# Обучение Берта+линейная часть

In [None]:
class BertRegressor(nn.Module):
    def __init__(self, bert_name):
        super(BertRegressor, self).__init__()
        self.bert = AutoModel.from_pretrained(
            bert_name,
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1
        )
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)
        self.loss_fn = nn.HuberLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits = self.regressor(pooled_output).squeeze(-1)

        if labels is not None:
            loss = self.loss_fn(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}

In [None]:
MODEL_NAME = 'ai-forever/ruBert-base'
tokenizer   = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
model = BertRegressor(MODEL_NAME).to(device)

In [None]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

In [None]:
train_bert_df = train_df.copy()
val_bert_df = val_df.copy()

In [None]:
train_bert_df = train_bert_df[['full_text', 'log_salary_from']].rename(columns={'full_text': 'text', 'log_salary_from': 'label'})
val_bert_df = val_bert_df[['full_text', 'log_salary_from']].rename(columns={'full_text': 'text', 'log_salary_from': 'label'})

In [None]:
train_hf_dataset = HFDataset.from_pandas(train_bert_df)
val_hf_dataset = HFDataset.from_pandas(val_bert_df)

In [None]:
tokenized_train = train_hf_dataset.map(tokenize_function, batched=True, remove_columns=['text'])
tokenized_val = val_hf_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

In [None]:
training_args = TrainingArguments(
    output_dir="/content/Model_bert",
            eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="steps",
        logging_steps=200,
        learning_rate= 3.5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        save_total_limit=2,
        num_train_epochs=4,
        weight_decay= 0.09,
        warmup_steps= 1500,
        lr_scheduler_type="polynomial",
        bf16=torch.cuda.is_bf16_supported(),
        fp16=not torch.cuda.is_bf16_supported(),
        seed=42,
        report_to="tensorboard",
        load_best_model_at_end=True,
        metric_for_best_model="r2",
        greater_is_better=True,
    )

In [None]:
def compute_r2_score(y_true, y_pred):
    """Computes and prints the R2 score."""
    score = r2_score(y_true, y_pred)
    print(f"R2 Score: {score:.6f}")
    return score

In [None]:
def compute_metrics_for_trainer(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.flatten()
    labels = labels.flatten()
    r2 = compute_r2_score(labels, predictions)
    return {"r2": r2}

In [None]:
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics_for_trainer,
        processing_class=tokenizer
    )

In [None]:
trainer.train()


# Дообучение с последнего чекпоинта

In [None]:
OUTPUT_DIR = "/content/trainer_output"

import os, glob
def get_last_ckpt(path):
    cks = glob.glob(os.path.join(path, "checkpoint-*"))
    return max(cks, key=os.path.getmtime) if cks else None

last_ckpt = get_last_ckpt(OUTPUT_DIR)
print("LAST CKPT:", last_ckpt)

In [None]:
from transformers import TrainingArguments

new_training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    learning_rate=3.5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    save_total_limit=2,
    weight_decay=0.09,
    warmup_steps=1500,
    lr_scheduler_type="polynomial",
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),
    seed=42,
    report_to="tensorboard",
    load_best_model_at_end=True,
    metric_for_best_model="r2",
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=new_training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics_for_trainer,
    processing_class=tokenizer
)

trainer.train(resume_from_checkpoint=last_ckpt)

# Сохранение модели

In [None]:
FINAL_DIR = "/content/Model_bert_final"
trainer.save_model(FINAL_DIR)
tokenizer.save_pretrained(FINAL_DIR)
print("Saved to:", FINAL_DIR)

In [None]:
from google.colab import drive
drive.mount("/content/drive")

!cp -r /content/Model_bert_final /content/drive/MyDrive/trainer_output

# Загрузка готовой модели и eval

In [23]:
class BertRegressor(nn.Module):
    def __init__(self, bert_name):
        super(BertRegressor, self).__init__()
        self.bert = AutoModel.from_pretrained(
            bert_name,
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1
        )
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)
        self.loss_fn = nn.HuberLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits = self.regressor(pooled_output).squeeze(-1)

        if labels is not None:
            loss = self.loss_fn(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}

model_dir = "/content/drive/MyDrive/Model_bert_final"

In [25]:
from safetensors.torch import load_file

In [36]:
tokenizer = AutoTokenizer.from_pretrained(model_dir)
state_dict = load_file(f"{model_dir}/model.safetensors")

In [39]:
model = BertRegressor("ai-forever/ruBert-base")
model.load_state_dict(state_dict)
model.to(device)
model.eval()

BertRegressor(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(120138, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [40]:
texts_to_predict = train_df["full_text"].tolist()

In [42]:
predictions_train = []
for i in range(0, len(texts_to_predict), 32):
  if i % 3000 == 0:
    print('3000')
  batch_texts = texts_to_predict[i:i + 32]
  inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True,
                           max_length=512)
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
  inputs.pop("token_type_ids", None)
  with torch.no_grad():
    outputs = model(**inputs)
    preds = outputs["logits"].cpu().numpy().flatten()
    predictions_train.extend(preds)


3000
3000


In [45]:
texts_to_predict = val_df["full_text"].tolist()

In [46]:
len(texts_to_predict)

3334

In [47]:
predictions_val = []
for i in range(0, len(texts_to_predict), 32):
  if i % 3000 == 0:
    print('3000')
  batch_texts = texts_to_predict[i:i + 32]
  inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True,
                           max_length=512)
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
  inputs.pop("token_type_ids", None)
  with torch.no_grad():
    outputs = model(**inputs)
    preds = outputs["logits"].cpu().numpy().flatten()
    predictions_val.extend(preds)


3000


In [48]:
len(predictions_val)

3334

In [49]:
texts_to_predict = test_df["full_text"].tolist()

In [50]:
len(texts_to_predict)

5556

In [51]:
predictions_test = []
for i in range(0, len(texts_to_predict), 32):
  if i % 3000 == 0:
    print('3000')
  batch_texts = texts_to_predict[i:i + 32]
  inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True,
                           max_length=512)
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
  inputs.pop("token_type_ids", None)
  with torch.no_grad():
    outputs = model(**inputs)
    preds = outputs["logits"].cpu().numpy().flatten()
    predictions_test.extend(preds)


3000


In [52]:
len(predictions_test)

5556

In [53]:
import numpy as np
from google.colab import files

np.save("/content/predictions_train.npy", np.array(predictions_train))

In [54]:
import numpy as np
from google.colab import files

np.save("/content/predictions_val.npy", np.array(predictions_val))

In [55]:
import numpy as np
from google.colab import files

np.save("/content/predictions_test.npy", np.array(predictions_test))