# RNN

### Getting the data

#### Train/test data

In [None]:
!mkdir data
!mkdir models
!wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1_o4xDSF6j95vAiYdd97VavyWq4EHHPdP&export=download&authuser=1&confirm=t' -O './data/dataset.csv'

In [24]:
import pandas as pd

# Load the dataset and print its column summary.
# NOTE(review): the download cell writes to './data/dataset.csv' while this reads
# '../data/dataset.csv' - confirm the intended working directory.
df = pd.read_csv('../data/dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22224 entries, 0 to 22223
Data columns (total 15 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   title                                 22224 non-null  object 
 1   company                               22224 non-null  object 
 2   location                              22224 non-null  object 
 3   skills                                14384 non-null  object 
 4   source                                22224 non-null  object 
 5   description_no_numbers_with_skills    22224 non-null  object 
 6   experience_from                       22224 non-null  float64
 7   experience_to_adjusted_10             22224 non-null  float64
 8   description_size                      22224 non-null  int64  
 9   description                           22224 non-null  object 
 10  description_no_numbers                22224 non-null  object 
 11  description_no_

### Extra dependencies

In [30]:
%pip install pymystem3 swifter -qqq

Note: you may need to restart the kernel to use updated packages.


### Service functions

##### Text preprocessing

In [12]:
# import nltk
# nltk.download("stopwords")
#--------#

# from nltk.corpus import stopwords
from pymystem3 import Mystem
# from string import punctuation
# from nltk.stem import WordNetLemmatizer

# wnl = WordNetLemmatizer()

# Create the Mystem lemmatizer instance (used by preprocess_text).
mystem = Mystem() 
# The stop-word list is currently not created (line kept for reference):
# russian_stopwords = stopwords.words("russian")

In [20]:
# Punctuation character -> replacement token. Built once at definition time:
# token_lookup is mapped over every character of a text, so rebuilding the
# table inside the function would repeat the same work per character.
_PUNCTUATION_TOKENS = {
    '.': ' <PERIOD>',
    ',': ' <COMMA>',
    '"': ' <QUOTATION_MARK>',
    ';': ' <SEMICOLON>',
    '!': ' <EXCLAMATION_MARK>',
    '?': ' <QUESTION_MARK>',
    '(': ' <LEFT_PAREN>',
    ')': ' <RIGHT_PAREN>',
    '-': ' <DASH>',
    '\n': ' <NEW_LINE>',
}


def token_lookup(word):
    """
    Map a punctuation character to its textual token.

    :param word: A single character (or any string) to look up.
    :return: The token (prefixed with a space) if `word` is a known punctuation
        character, otherwise `word` unchanged.
    """
    return _PUNCTUATION_TOKENS.get(word, word)


#Preprocess function
def preprocess_text(text):
    """
    Replace punctuation with tokens, lowercase, and lemmatize a text.

    :param text: Raw input string.
    :return: The lemmas produced by `mystem.lemmatize`, concatenated into one string.
    """
    # Swap each punctuation character for its token, character by character.
    with_tokens = ''.join(token_lookup(char) for char in text)

    # Lowercasing happens after the substitution, so the inserted tokens are lowercased too.
    lemmas = mystem.lemmatize(with_tokens.lower())

    # Stop-word / punctuation filtering is currently disabled; every lemma is kept.
    return "".join(lemmas)

# Examples: run the preprocessing on a few sample sentences and print each result.
for _example in (
    "Ну что сказать, я вижу кто-то наступил на грабли, Ты разочаровал меня, ты был натравлен вот такие дела.",
    "По асфальту мимо цемента, looking at the strangers, Избегая зевак под аплодисменты. Обитатели спальных аррондисманов",
    "Ну что сказать, я вижу кто-то наступил на грабли, Ты разочаровал меня, ты был натравлен.",
):
    print(preprocess_text(_example))

ну что сказать <comma> я видеть кто <dash>то наступать на грабли <comma> ты разочаровывать я <comma> ты быть натравлять вот такой дело <period>

по асфальт мимо цемент <comma> looking at the strangers <comma> избегать зевака под аплодисменты <period> обитатель спальный аррондисман

ну что сказать <comma> я видеть кто <dash>то наступать на грабли <comma> ты разочаровывать я <comma> ты быть натравлять <period>



In [None]:
# Apply preprocess_text to every row of 'description_no_numbers_v2' through
# swifter's apply and store the result in a new column.
# NOTE(review): 'description_no_numbers_v2' is not visible in the truncated
# df.info() output above - confirm the column exists.
# import numpy as np
import swifter

# Alternative kept for reference: plain map + numpy array
# description_no_numbers_v2_lemmatized = map(preprocess_text, df['description_no_numbers_v2'])
# df['description_no_numbers_v2_lemmatized'] = np.array(list(description_no_numbers_v2_lemmatized))
df['description_no_numbers_v2_lemmatized'] = df['description_no_numbers_v2'].swifter.apply(preprocess_text)
# Alternative kept for reference: plain pandas apply
# df['description_no_numbers_v2_lemmatized'] = df['description_no_numbers_v2'].apply(preprocess_text)

In [None]:
import torch
from torch.utils.data import DataLoader
import numpy as np
import gc
import re
import os
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from numba import cuda


# Set WANDB_DISABLED for this process; presumably this keeps the training call
# below from starting Weights & Biases logging - TODO confirm.
os.environ["WANDB_DISABLED"] = "true"


def memory_cleanup():
    """Run Python garbage collection, then empty PyTorch's CUDA cache."""
    # Order matters: collect Python garbage first, then empty the CUDA cache.
    for release in (gc.collect, torch.cuda.empty_cache):
        release()


def get_sentence_lengths(text):
    """
    Split a text into sentences and count the words in each.

    The text is cut right after every '.', '!' or '?'; the terminator stays
    attached to its sentence and no whitespace is stripped.

    :param text: Input string.
    :return: Tuple `(sentences, sentence_lengths)` - the non-empty pieces and,
        for each piece, its number of whitespace-separated words.
    """
    # Zero-width split point after a sentence terminator; filter(None, ...) drops empty pieces.
    sentences = list(filter(None, re.split(r'(?<=[.!?])', text)))
    sentence_lengths = list(map(lambda piece: len(piece.split()), sentences))
    return sentences, sentence_lengths


def set_seed(seed: int) -> None:
    """
    Seed every random number generator used here, for reproducibility.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and all
    CUDA devices), and switches cuDNN to deterministic mode.

    :param seed: Seed value applied to all generators.
    """
    # Local import: `random` is not among the notebook's top-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (training may span several devices).
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning for run-to-run determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def train_tsdae_bert(model_name, train_sentences, batch_size=2, epochs=1, lr=3e-5):
    """Train a denoising auto-encoder (TSDAE) sentence model on top of a BERT-style model.

    More examples at https://sbert.net/examples/unsupervised_learning/TSDAE/README.html

    :param model_name: Name or path of the pretrained transformer; used for both
        the encoder and (with tied weights) the decoder.
    :param train_sentences: Sentences to train on; noise is added on the fly by
        the denoising dataset.
    :param batch_size: Batch size of the training DataLoader (default 2).
    :param epochs: Number of training epochs (default 1).
    :param lr: Learning rate passed to the optimizer (default 3e-5).
    :return: The trained SentenceTransformer model.
    """
    # Encoder: transformer + CLS-token pooling.
    word_embedding_model = models.Transformer(model_name)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Create the special denoising dataset that adds noise on-the-fly
    train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)

    # DataLoader to batch the data
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Use the denoising auto-encoder loss
    train_loss = losses.DenoisingAutoEncoderLoss(
        model, decoder_name_or_path=model_name, tie_encoder_decoder=True,
    )

    # Call the fit method: constant learning rate, no weight decay
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        weight_decay=0,
        scheduler="constantlr",
        optimizer_params={"lr": lr},
        show_progress_bar=True,
    )

    return model

Display model output

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import t


def _mean_confidence_interval(data, confidence=0.95):
    """Return (mean, lower, upper) of a two-sided Student-t interval for the mean of `data`."""
    values = np.asarray(data, dtype=float)
    n = values.size
    m = values.mean()
    if n < 2:
        # A single observation gives no spread estimate, so the interval is undefined.
        return m, np.nan, np.nan
    # ddof=1: the t-based interval uses the sample standard deviation;
    # the population formula (ddof=0) makes the interval too narrow.
    se = values.std(ddof=1) / np.sqrt(n)
    h = se * t.ppf((1 + confidence) / 2, n - 1)
    return m, m - h, m + h


def _metric_mean_and_ci(history, seeds, key):
    """Stack `history[seed][key]` over seeds; return the per-iteration mean and (mean, low, high) rows."""
    values = np.array([history[seed][key] for seed in seeds])
    mean = np.mean(values, axis=0)
    ci = np.array([_mean_confidence_interval(values[:, i]) for i in range(values.shape[1])])
    return mean, ci


def display_metrics_with_ci(history: dict):
    """
    Plot mean train/test R2 per iteration with 95% CIs and print the best-epoch test metrics.

    :param history: Mapping seed -> dict with keys 'train_r2', 'test_r2',
        'test_mae' and 'test_rmse'; each value is a per-iteration sequence, and
        all seeds must have the same number of iterations.
    :raises ValueError: If `history` contains no seeds.

    Means and confidence intervals are computed across seeds for every
    iteration. The "best epoch" is the iteration with the highest mean test R2.
    """
    seeds = list(history.keys())
    if not seeds:
        raise ValueError("history is empty: at least one seed is required")

    r2_train_mean, r2_train_ci = _metric_mean_and_ci(history, seeds, 'train_r2')
    r2_test_mean, r2_test_ci = _metric_mean_and_ci(history, seeds, 'test_r2')

    # Mean curve plus shaded CI band for train and test R2.
    plt.figure(figsize=(10, 6))
    for mean, ci, label in (
        (r2_train_mean, r2_train_ci, 'train'),
        (r2_test_mean, r2_test_ci, 'test'),
    ):
        plt.plot(mean, label=label)
        plt.fill_between(range(len(mean)), ci[:, 1], ci[:, 2], alpha=0.3)
    plt.title('Mean R2 by iteration, with 95% CI')
    plt.xlabel('Iteration')
    plt.ylabel('R2')
    plt.legend()
    plt.show()

    mae_test_mean, mae_test_ci = _metric_mean_and_ci(history, seeds, 'test_mae')
    rmse_test_mean, rmse_test_ci = _metric_mean_and_ci(history, seeds, 'test_rmse')

    # Index of the iteration where the mean test R2 is highest.
    best_epoch = np.argmax(r2_test_mean)

    print(f'TEST METRICS FOR THE BEST EPOCH: {best_epoch+1}')
    for name, mean, ci in (
        ('R2', r2_test_mean, r2_test_ci),
        ('MAE', mae_test_mean, mae_test_ci),
        ('RMSE', rmse_test_mean, rmse_test_ci),
    ):
        best_ci = ci[best_epoch]
        print(f'{name}: mean = {mean[best_epoch]:.4f}, 95% CI = [{best_ci[1]:.4f}, {best_ci[2]:.4f}]')

### Training-related classes

#### Dataset

##### Baseline dataset

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

# Dataset for dual textual features
class DualTextDataset(Dataset):
    """
    Dataset yielding two tokenized text features and a float target per row.

    Both text columns are tokenized once in the constructor (padded and
    truncated to `max_len`); indexing only slices the stored tensors.
    """

    def __init__(self, df, text_col_1, text_col_2, targets, tokenizer, max_len):
        """
        :param df: DataFrame holding the two text columns.
        :param text_col_1: Name of the first text column.
        :param text_col_2: Name of the second text column.
        :param targets: Target values; must provide `.tolist()`.
        :param tokenizer: Callable invoked with a list of texts plus the
            max_length / padding / truncation / return_tensors keyword arguments.
        :param max_len: Length every text is padded or truncated to.
        """
        print('Creating the dataset...')

        def encode(column):
            # Tokenize the whole column at once and keep the resulting tensors.
            return tokenizer(
                df[column].tolist(),
                max_length=max_len,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )

        self.tokenized_texts1 = encode(text_col_1)
        self.tokenized_texts2 = encode(text_col_2)
        self.targets = targets.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples (one per target)."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return `(inputs1, inputs2, target)` for row `idx`; inputs are dicts of per-row tensors."""
        first, second = (
            {name: tensor[idx] for name, tensor in encoded.items()}
            for encoded in (self.tokenized_texts1, self.tokenized_texts2)
        )
        label = torch.tensor(self.targets[idx], dtype=torch.float)
        return first, second, label
    

# Dataset and DataLoader
class SalaryDataset(Dataset):
    """Dataset pairing integer feature rows with float32 targets."""

    def __init__(self, X, y):
        """
        :param X: Features, converted to a `torch.long` tensor.
        :param y: Targets, converted to a `torch.float32` tensor.
        """
        self.X, self.y = (
            torch.tensor(X, dtype=torch.long),
            torch.tensor(y, dtype=torch.float32),
        )

    def __len__(self):
        """Number of samples, taken from the target tensor."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the `(features, target)` pair at position `idx`."""
        sample = (self.X[idx], self.y[idx])
        return sample



#### Model

##### Double-head BERT with [MASK] token-based regression and MLP

##### Double-head BERT with multitask learning and MLP

#### Training methods

##### Baseline regression

### Training-eval loop with experiments

#### Data preprocessing

##### Define text feature/target columns

In [None]:
# First text feature
text_col_1 = "description_no_numbers"

# Second text feature: the merged text column
text_col_2 = "title_company_location_skills_source"

# Regression target
target_col = "log_salary_from"

##### Create merged title/skills/location/source feature

In [None]:
# Fill missing skills with a placeholder. The result is assigned back to the
# column: calling fillna(..., inplace=True) on df['skills'] is chained
# assignment, which pandas deprecates and which does not update df under
# Copy-on-Write.
df['skills'] = df['skills'].fillna('Не указаны')

# Template for the merged text feature; one labelled line per source column.
title_company_location_skills_feature_template = """
Позиция: {position}
Компания: {company}
Место: {location}
Навыки: {skills}
Источник: {source}
"""

# Render the template row by row into the merged feature column.
df['title_company_location_skills_source'] = df.apply(
    lambda row: title_company_location_skills_feature_template.format(
        position=row['title'],
        company=row['company'],
        location=row['location'],
        skills=row['skills'],
        source=row['source'],
    ),
    axis=1,
)

#### Training code

In [18]:
memory_cleanup()

##### Experiment 1: BiGRU + CNN regressor, L1 (MAE) loss

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np
from collections import Counter

# Data preparation: raw texts as features, salary cast to int as target.
# NOTE(review): `df_new2` is not defined in any cell visible here, so this cell
# depends on state created elsewhere - confirm where it comes from.
X = df_new2['total'].values
y = df_new2['mean_salary_all'].values.astype(int)

# Tokenizer replacement using PyTorch
def tokenize_texts(texts, vocab=None):
    """
    Turn whitespace-split texts into lists of integer word indices.

    :param texts: Iterable of strings.
    :param vocab: Optional vocabulary. A `Counter` of word frequencies is
        converted to indices by descending frequency (starting at 1, leaving 0
        free for padding). Any other mapping is treated as a ready
        word -> index mapping, e.g. the one returned by a previous call, so
        the same indices can be reused for new texts. If None, a Counter is
        built from `texts`.
    :return: Tuple `(tokenized, word2idx)`; words missing from the vocabulary
        are dropped from the tokenized output.
    """
    # Materialize once: texts is traversed twice when the vocabulary is built here.
    texts = list(texts)
    if vocab is None:
        vocab = Counter(word for text in texts for word in text.split())
    if isinstance(vocab, Counter):
        word2idx = {word: idx + 1 for idx, (word, _) in enumerate(vocab.most_common())}
    else:
        # Already a word -> index mapping; reuse it as is.
        word2idx = dict(vocab)
    tokenized = [[word2idx[word] for word in text.split() if word in word2idx] for text in texts]
    return tokenized, word2idx

def pad_sequences(sequences, max_len):
    """
    Right-pad with zeros or truncate every sequence to `max_len`.

    :param sequences: Iterable of lists of token indices.
    :param max_len: Target length of each row.
    :return: NumPy array built from the padded/truncated rows.
    """
    rows = []
    for seq in sequences:
        shortfall = max_len - len(seq)
        if shortfall > 0:
            # Too short: append zeros up to max_len.
            rows.append(seq + [0] * shortfall)
        else:
            # Long enough: keep only the first max_len items.
            rows.append(seq[:max_len])
    return np.array(rows)

# Tokenize and pad sequences
# `vocab` here is the word -> index dict returned by tokenize_texts.
# NOTE(review): the vocabulary is built from all of X, i.e. before the
# train/test split below - confirm this is intended.
tokenized_texts, vocab = tokenize_texts(X)
vocabulary_size = len(vocab) + 1  # Add 1 for padding token
# Every row is padded to the length of the longest tokenized text.
row_max_length = max(len(seq) for seq in tokenized_texts)
X_padded = pad_sequences(tokenized_texts, max_len=row_max_length)

# Train-test split (80/20, fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=10)


# Wrap the splits in datasets
train_dataset = SalaryDataset(X_train, y_train)
test_dataset = SalaryDataset(X_test, y_test)

# Batches of 128; only the training data is shuffled
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Model definition
class BiGRUCNNModel(nn.Module):
    """
    Embedding -> bidirectional GRU -> 1D convolution -> max-pool -> linear regressor.

    The final linear layer is sized from `max_length`, so inputs must have
    exactly `max_length` tokens per row. The output has shape (batch, 1).
    """

    def __init__(self, vocab_size, embed_dim, max_length):
        """
        :param vocab_size: Number of rows in the embedding table.
        :param embed_dim: Embedding dimension (input size of the GRU).
        :param max_length: Sequence length of the inputs.
        """
        super().__init__()
        gru_hidden, conv_channels = 128, 64
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.bigru = nn.GRU(embed_dim, gru_hidden, bidirectional=True, batch_first=True)
        # The bidirectional GRU emits 2 * gru_hidden features per step.
        self.conv1d = nn.Conv1d(2 * gru_hidden, conv_channels, kernel_size=3, padding='same')
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(0.2)
        # Pooling with kernel 2 halves the sequence length (floor division).
        self.fc = nn.Linear(conv_channels * (max_length // 2), 1)

    def forward(self, x):
        """Map a (batch, max_length) tensor of token indices to (batch, 1) predictions."""
        embedded = self.embedding(x)
        sequence_features = self.bigru(embedded)[0]
        # Conv1d expects channels before the sequence dimension.
        convolved = self.conv1d(sequence_features.permute(0, 2, 1))
        flattened = self.flatten(self.pool(convolved))
        return self.fc(self.dropout(flattened))

# Initialize model, loss, optimizer
embed_dim = 256
model = BiGRUCNNModel(vocab_size=vocabulary_size, embed_dim=embed_dim, max_length=row_max_length)
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 15
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        # squeeze(-1) removes only the trailing output dimension. A bare
        # squeeze() would also drop the batch dimension when the last batch
        # holds a single sample, leaving a 0-d prediction against a 1-d target.
        predictions = model(batch_X).squeeze(-1)
        loss = criterion(predictions, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean batch loss of the epoch
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation: collect predictions and targets over the test set
model.eval()
y_pred, y_true = [], []
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        # squeeze(-1) keeps the batch dimension even for a one-sample batch.
        # A bare squeeze() would yield a 0-d array there, and extend() cannot
        # iterate over it.
        predictions = model(batch_X).squeeze(-1)
        y_pred.extend(predictions.cpu().numpy())
        y_true.extend(batch_y.cpu().numpy())

# Calculate R2 score (printed scaled by 100)
r2 = r2_score(y_true, y_pred)
print(f"R2 Score: {round(r2 * 100, 2)}%")


Starting for seed 42...
double_huber_multitask model...
Creating the dataset...
Creating the dataset...
Starting training/eval loop...
Starting training...
3
32
tensor([83835, 83836, 83836, 83836, 83836, 83836, 83833, 83836, 83836, 83835,
        83835, 83835, 83834, 83836, 83836, 83835], device='cuda:0')
mask_embedding shape: torch.Size([11, 312])
bin_embedding shape: torch.Size([16, 312])
tensor([83835, 83836, 83836, 83836, 83837, 83837, 83836, 83836, 83836, 83833,
        83836, 83836, 83836, 83835, 83835, 83837], device='cuda:1')
mask_embedding shape: torch.Size([5, 312])
bin_embedding shape: torch.Size([16, 312])


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/ipykernel_23/2258028468.py", line 92, in forward
    similarity_loss = self.criterion_similarity(mask_embedding, bin_embedding, torch.ones(mask_embedding.size(0)))
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/loss.py", line 1299, in forward
    return F.cosine_embedding_loss(input1, input2, target, margin=self.margin, reduction=self.reduction)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/functional.py", line 3581, in cosine_embedding_loss
    return torch.cosine_embedding_loss(input1, input2, target, margin, reduction_enum)
RuntimeError: The size of tensor a (11) must match the size of tensor b (16) at non-singleton dimension 0
