## Get the data and dependencies

In [47]:
%pip install spacy gensim -qqq
!python -m spacy download ru_core_news_md -qqq

Note: you may need to restart the kernel to use updated packages.
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_md')


In [3]:
!mkdir data
!mkdir data/history
# !wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1iqg1FIPfbrZWlung6gZqve1MeQWc0Je4&export=download&authuser=1&confirm=t' -O './data/dataset.csv'
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import re
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import gc
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#####################
# Utility functions #
#####################

def memory_cleanup():
    gc.collect()
    torch.cuda.empty_cache()

def set_seed(seed: int) -> None:
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

#####################
# Data Preprocessing#
#####################

df = pd.read_csv('../data/dataset.csv')
df.info()


mkdir: data: File exists
mkdir: data/history: File exists
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22224 entries, 0 to 22223
Data columns (total 15 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   title                                 22224 non-null  object 
 1   company                               22224 non-null  object 
 2   location                              22224 non-null  object 
 3   skills                                14384 non-null  object 
 4   source                                22224 non-null  object 
 5   description_no_numbers_with_skills    22224 non-null  object 
 6   experience_from                       22224 non-null  float64
 7   experience_to_adjusted_10             22224 non-null  float64
 8   description_size                      22224 non-null  int64  
 9   description                           22224 non-null  object 
 10  description_no_numbers  

In [6]:
SEED = 42

## Data preprocessing

##### Create merged title/skills/location/source feature

In [4]:
df['skills'].fillna('Не указаны', inplace=True)

title_company_location_skills_feature_template = """
Позиция: {position}
Компания: {company}
Место: {location}
Навыки: {skills}
Источник: {source}
"""

df['title_company_location_skills_source'] = df.apply(lambda x: title_company_location_skills_feature_template.format(
    position=x['title'],
    company=x['company'],
    location=x['location'],
    skills=x['skills'],
    source=x['source']
), axis=1)

In [7]:
# Предположим, что у нас есть столбец 'description_no_numbers' или 'description',
# и таргет 'log_salary_from'. Если нет, адаптируйте поля.
text_col = 'description_no_numbers'
text_col2 = 'title_company_location_skills_source'
target_col = 'log_salary_from'

# Удаляем пропуски
df = df.dropna(subset=[text_col, text_col2, target_col])

# get a subset of the data to try tokenization on
sample = df.sample(100, random_state=SEED)

In [30]:
def token_lookup(word):
    """
    Generate a dict to turn punctuation into a token.
    :return: Token corresponding to the punctuation.
    """
    token = dict()
    token['.'] = ' <PERIOD>'
    token[','] = ' <COMMA>'
    token[','] = ' <COMMA>'
    token['"'] = ' <QUOTATION_MARK>'
    token[';'] = ' <SEMICOLON>'
    token['!'] = ' <EXCLAMATION_MARK>'
    token['?'] = ' <QUESTION_MARK>'
    token['('] = ' <LEFT_PAREN>'
    token[')'] = ' <RIGHT_PAREN>'
    token['-'] = ' <DASH>'
    token['\n'] = ' <NEW_LINE>'
    return token.get(word, word)

def replace_punctuation(text):
    """
    Replace punctuation with tokens so we can use them in our model.
    :param text: The text to be modified.
    :return: The text with punctuation replaced.
    """
    return ''.join([token_lookup(word) for word in text.lower()])

# Простая токенизация: разбиение по словам, очистка
def simple_tokenize(text):
    # В реальном случае можно добавить более сложную предобработку, лемматизацию и т.д.
    tokens = re.findall(r'\b\w+\b|<\w+>', text)
    return tokens

sample_tokens = sample[text_col].apply(replace_punctuation).apply(simple_tokenize)
print(sample_tokens.iloc[0])

['в', 'аккредитованную', 'it', 'компанию', 'подбираем', 'ведущий', 'сетевой', 'инженер', '<PERIOD>', 'чем', 'предстоит', 'заниматься', 'выполнять', 'настройку', '<COMMA>', 'установку', 'и', 'поддержку', 'оборудования', 'локальных', '<COMMA>', 'глобальных', '<COMMA>', 'беспроводных', 'вычислительных', 'сетей', '<PERIOD>', 'обеспечивать', 'бесперебойную', 'работу', 'служб', 'удаленного', 'доступа', 'к', 'ресурсам', 'вычислительных', 'сетей', '<PERIOD>', 'выполнять', 'настойку', 'и', 'поддержку', 'систем', 'мониторинга', 'и', 'систем', 'сбора', 'статистики', 'по', 'использованию', 'вычислительных', 'сетей', '<COMMA>', 'каналов', 'связи', '<PERIOD>', 'устранять', 'нештатные', 'ситуации', '<PERIOD>', 'выполнять', 'заявки', '<COMMA>', 'поступающие', 'от', 'оператора', 'технической', 'поддержки', '<PERIOD>', 'оказывать', 'техническую', 'помощь', 'и', 'консультации', 'структурным', 'подразделениям', 'организации', '<PERIOD>', 'разрабатывать', 'и', 'доводить', 'до', 'сведения', 'руководителя', 

In [31]:
print(sample_tokens.iloc[50])

['softmedialab', '<DASH>', 'резидент', 'сколково', '<COMMA>', 'аккредитованная', 'ит', 'компания', '<COMMA>', 'мы', 'занимаемся', 'цифровизацией', 'с', '2015', 'года', '<PERIOD>', 'специализируемся', 'на', 'сложных', 'решениях', 'для', 'крупных', 'корпоративных', 'клиентов', 'в', 'различных', 'отраслях', '<COMMA>', 'с', 'высокими', 'требованиями', 'к', 'качеству', '<COMMA>', 'надежности', 'и', 'производительности', 'решений', '<PERIOD>', 'на', 'данный', 'момент', 'сделали', '80', 'проектов', 'в', 'россии', '<COMMA>', 'канаде', '<COMMA>', 'европе', '<COMMA>', 'израиле', '<COMMA>', 'сингапуре', 'и', 'сша', '<PERIOD>', 'цель', 'компании', 'на', '2024', 'год', '<DASH>', 'вырасти', 'в', 'два', 'раза', '<COMMA>', 'как', 'по', 'выручке', 'так', 'и', 'по', 'численности', '<PERIOD>', 'мы', 'наращиваем', 'продуктовую', 'экспертизу', '<COMMA>', 'создавая', 'специализированные', 'направления', '<PERIOD>', 'ищем', 'devops', '<DASH>', 'инженера', 'в', 'аутстафф', 'проект', '<PERIOD>', 'мы', 'рассмат

In [33]:
simple_tokenize(replace_punctuation('Hello, world!'))

['hello', '<COMMA>', 'world', '<EXCLAMATION_MARK>']

to be incorp

In [48]:
from nltk.tokenize import word_tokenize
import spacy
import gensim.downloader as api
gensim_model = api.load("word2vec-ruscorpora-300")



# Load the pre-trained Russian model
token_to_pos = spacy.load("ru_core_news_md")

In [49]:
text = 'Мама  мыла 1 раму'
doc = token_to_pos(text)
for token in doc:
    print(token.text, token.pos_)

Мама NOUN
  SPACE
мыла NOUN
1 NUM
раму NOUN


In [51]:
def tokenize_with_pos(text):
    doc = token_to_pos(text.lower())
    token_w_pos_lst = [f"{token.text}_{token.pos_}" for token in doc]
    return token_w_pos_lst


def average_word_vectors(token_w_pos_lst, word2vec_model):
    mean = gensim_model.vectors.mean(1).mean()
    std = gensim_model.vectors.std(1).mean()
    word_vectors = [word2vec_model.get_vector(token_w_pos) for token_w_pos in token_w_pos_lst if token_w_pos in word2vec_model]
    if len(word_vectors) == 0:
        word_vectors = [np.random.normal(mean, std, word2vec_model.vector_size)]
        # print(f'No words in model for sentence')
        # return np.zeros(word2vec_model.vector_size)
    return np.mean(word_vectors, axis=0)

# write a function that takes in a text (like sentence) 
# and returns the list of word vectors up to a certain length
# if the sentence is shorter than the length, pad with with padding vectors at the beginning
# if there are words that are not in the model, use a random vector with the same mean and std as the model
# do not average the vectors, just return the list of vectors

def get_word_vectors(text, word2vec_model, max_len=100):
    token_w_pos_lst = tokenize_with_pos(text)
    mean = gensim_model.vectors.mean(1).mean()
    std = gensim_model.vectors.std(1).mean()
    word_vectors = [word2vec_model.get_vector(token_w_pos) for token_w_pos in token_w_pos_lst if token_w_pos in word2vec_model]
    # now pad the vectors
    # padding = [np.random.normal(mean, std, word2vec_model.vector_size)] * (max_len - len(word_vectors))
    padding = [np.zeros(word2vec_model.vector_size)] * (max_len - len(word_vectors))
    return padding + word_vectors[:max_len]


In [None]:

# Разделяем на train/test
X_train, X_test, y_train, y_test = train_test_split(df[text_col], df[target_col], test_size=0.2, random_state=42)

# Простая токенизация: разбиение по словам, очистка
def simple_tokenize(text):
    # В реальном случае можно добавить более сложную предобработку, лемматизацию и т.д.
    text = text.lower()
    tokens = re.findall(r'\b\w+\b', text)
    return tokens

# Строим словарь
def build_vocab(texts, min_freq=2):
    freq = {}
    for t in texts:
        for w in t:
            freq[w] = freq.get(w, 0) + 1
    # Фильтруем по частоте
    freq = {k:v for k,v in freq.items() if v >= min_freq}
    # Создаем словарь слово->индекс
    # 0 - <PAD>, 1 - <UNK>
    word2idx = {'<PAD>':0, '<UNK>':1}
    for w in freq:
        word2idx[w] = len(word2idx)
    return word2idx

X_train_tokens = [simple_tokenize(txt) for txt in X_train]
X_test_tokens = [simple_tokenize(txt) for txt in X_test]

word2idx = build_vocab(X_train_tokens, min_freq=2)
vocab_size = len(word2idx)

def text_to_seq(tokens, word2idx):
    return [word2idx.get(w, 1) for w in tokens]  # 1 для <UNK>

# Считаем максимальную длину
row_max_length = max(max(len(t) for t in X_train_tokens), max(len(t) for t in X_test_tokens))
print("Max sequence length:", row_max_length)

def pad_sequence(seq, max_len, pad_idx=0):
    if len(seq) < max_len:
        seq = seq + [pad_idx]*(max_len - len(seq))
    else:
        seq = seq[:max_len]
    return seq

X_train_seq = [pad_sequence(text_to_seq(t, word2idx), row_max_length) for t in X_train_tokens]
X_test_seq = [pad_sequence(text_to_seq(t, word2idx), row_max_length) for t in X_test_tokens]

X_train_seq = np.array(X_train_seq)
X_test_seq = np.array(X_test_seq)

y_train = y_train.values.astype(np.float32)
y_test = y_test.values.astype(np.float32)

In [None]:
######################################
# PyTorch Dataset and DataLoaders    #
######################################

class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

batch_size = 64
train_dataset = TextDataset(X_train_seq, y_train)
test_dataset = TextDataset(X_test_seq, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
###################################
# Bi_GRU_CNN Model in PyTorch     #
###################################

class BiGRUCNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, gru_units=128, conv_filters=64, kernel_size=3):
        super(BiGRUCNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # BiGRU: return_sequences = True аналогично: batch, seq, hidden*2
        self.gru = nn.GRU(embed_dim, gru_units, bidirectional=True, batch_first=True)

        # Conv1D: в PyTorch Conv1d ожидает (batch, channels, seq_len)
        # У нас выход GRU (batch, seq_len, hidden*2), нужно переставить оси
        self.conv = nn.Conv1d(in_channels=gru_units*2, out_channels=conv_filters, kernel_size=kernel_size, padding='same')
        self.pool = nn.MaxPool1d(kernel_size=2)

        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(conv_filters * (row_max_length//2), 1) # после пула длина примерно в 2 раза меньше
        # Важно: после пула seq_len уменьшается в 2 раза. Если row_max_length нечет, потоки аккуратны, либо подбираем размер.

    def forward(self, x):
        # x: (batch, seq_len)
        emb = self.embedding(x)  # (batch, seq_len, embed_dim)
        gru_out, _ = self.gru(emb)  # (batch, seq_len, hidden*2)
        # permute для Conv1d
        gru_out = gru_out.permute(0, 2, 1)  # (batch, hidden*2, seq_len)
        conv_out = self.conv(gru_out) # (batch, conv_filters, seq_len)
        pooled = self.pool(conv_out) # (batch, conv_filters, seq_len/2)
        # распрямляем
        flat = pooled.flatten(start_dim=1) # (batch, conv_filters*(seq_len/2))
        drop = self.dropout(flat)
        out = self.fc(drop)  # (batch, 1)
        return out.squeeze(1)

In [None]:
###################################
# Training loop with metrics      #
###################################

def fit_eval(model, train_dl, test_dl, num_epochs=10, lr=1e-3, device='cuda'):
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.L1Loss()  # MAE; можно поменять на MSELoss, HuberLoss

    history = {
        "train_loss": [],
        "test_loss": [],
        "train_mae": [],
        "test_mae": [],
        "train_r2": [],
        "test_r2": [],
        "train_rmse": [],
        "test_rmse": [],
        "y_pred": [],
        "y_test": []
    }

    for epoch in range(num_epochs):
        model.train()
        train_losses = []
        all_preds = []
        all_labels = []

        for Xb, yb in train_dl:
            Xb = Xb.to(device)
            yb = yb.to(device)

            optimizer.zero_grad()
            preds = model(Xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

            all_preds.extend(preds.detach().cpu().numpy())
            all_labels.extend(yb.cpu().numpy())

        train_loss = np.mean(train_losses)
        train_mae = mean_absolute_error(all_labels, all_preds)
        train_r2 = r2_score(all_labels, all_preds)
        train_rmse = np.sqrt(mean_squared_error(all_labels, all_preds))

        model.eval()
        test_losses = []
        all_preds_test = []
        all_labels_test = []
        with torch.no_grad():
            for Xb, yb in test_dl:
                Xb = Xb.to(device)
                yb = yb.to(device)
                preds = model(Xb)
                loss = criterion(preds, yb)
                test_losses.append(loss.item())
                all_preds_test.extend(preds.cpu().numpy())
                all_labels_test.extend(yb.cpu().numpy())

        test_loss = np.mean(test_losses)
        test_mae = mean_absolute_error(all_labels_test, all_preds_test)
        test_r2 = r2_score(all_labels_test, all_preds_test)
        test_rmse = np.sqrt(mean_squared_error(all_labels_test, all_preds_test))

        history["train_loss"].append(train_loss)
        history["test_loss"].append(test_loss)
        history["train_mae"].append(train_mae)
        history["test_mae"].append(test_mae)
        history["train_r2"].append(train_r2)
        history["test_r2"].append(test_r2)
        history["train_rmse"].append(train_rmse)
        history["test_rmse"].append(test_rmse)
        history["y_pred"].append(all_preds_test)
        history["y_test"].append(all_labels_test)

        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, "
              f"Train R2: {train_r2:.4f}, Test R2: {test_r2:.4f}, Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")

    return history

###################################
# Run training                    #
###################################

set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BiGRUCNNModel(vocab_size=vocab_size, embed_dim=256, gru_units=128, conv_filters=64, kernel_size=3)
history = fit_eval(model, train_dataloader, test_dataloader, num_epochs=10, lr=1e-3, device=device)

# Сохраним историю
with open('./data/history/bi_gru_cnn_history.pickle', 'wb') as f:
    pickle.dump(history, f, protocol=pickle.HIGHEST_PROTOCOL)


Epoch 1/10 | Train Loss: 0.4870, Test Loss: 0.4092, Train R2: -0.1599, Test R2: 0.3198, Train MAE: 0.4871, Test MAE: 0.4082
