# Fine-tuning transformer

### Getting the data

#### Train/test data

In [1]:
!mkdir data
!mkdir models
!wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1_o4xDSF6j95vAiYdd97VavyWq4EHHPdP&export=download&authuser=1&confirm=t' -O './data/dataset.csv'

import pandas as pd

df = pd.read_csv('./data/dataset.csv')
df.info()

--2024-12-16 09:01:06--  https://drive.usercontent.google.com/download?id=1_o4xDSF6j95vAiYdd97VavyWq4EHHPdP&export=download&authuser=1&confirm=t
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 74.125.202.132, 2607:f8b0:4001:c06::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|74.125.202.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 295792946 (282M) [application/octet-stream]
Saving to: './data/dataset.csv'


2024-12-16 09:01:10 (153 MB/s) - './data/dataset.csv' saved [295792946/295792946]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22224 entries, 0 to 22223
Data columns (total 15 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   title                                 22224 non-null  object 
 1   company                               22224 non-null  object 
 2   location                             

#### Catboost predictions

In [2]:
!wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1gbA1owa-9aTxTaGABpTffGvE8y31qHze&export=download&authuser=1&confirm=t' -O './data/best_catboost_model_train_eval_history.pickle'

import pickle

file = './data/best_catboost_model_train_eval_history.pickle'
with open(file, 'rb') as f:
    catboost_history = pickle.load(f)

--2024-12-16 09:01:17--  https://drive.usercontent.google.com/download?id=1gbA1owa-9aTxTaGABpTffGvE8y31qHze&export=download&authuser=1&confirm=t
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 74.125.202.132, 2607:f8b0:4001:c06::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|74.125.202.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 428138 (418K) [application/octet-stream]
Saving to: './data/best_catboost_model_train_eval_history.pickle'


2024-12-16 09:01:19 (70.0 MB/s) - './data/best_catboost_model_train_eval_history.pickle' saved [428138/428138]



### Extra dependencies

In [3]:
%pip install -U sentence-transformers -qqq

Note: you may need to restart the kernel to use updated packages.


### Service functions

In [None]:
import torch
from torch.utils.data import DataLoader
import numpy as np
import gc
import re
import os
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from numba import cuda


os.environ["WANDB_DISABLED"] = "true"


def memory_cleanup():
    "Clean up memory"
    gc.collect()
    torch.cuda.empty_cache()


def get_sentence_lengths(text):
    "Get number of words in each sentence in the text"
    # pattern = r'(?<=[.!?])\s+'
    pattern = r'(?<=[.!?])'
    sentences = re.split(pattern, text)
    # remove empty strings
    sentences = [sentence for sentence in sentences if len(sentence) > 0]
    # get number of words in each sentence
    sentence_lengths = [len(sentence.split()) for sentence in sentences]
    return sentences, sentence_lengths


def set_seed(seed: int) -> None:
    "Set seed for reproducibility"
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def train_tsdae_bert(model_name, train_sentences):
    """Train a denoising auto-encoder model with BERT model.
    more examples at https://sbert.net/examples/unsupervised_learning/TSDAE/README.html"""
    word_embedding_model = models.Transformer(model_name)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Create the special denoising dataset that adds noise on-the-fly
    train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
    
    # DataLoader to batch your data
    train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
    
    # Use the denoising auto-encoder loss
    train_loss = losses.DenoisingAutoEncoderLoss(
        model, decoder_name_or_path=model_name, tie_encoder_decoder=True,
    )
    
    # Call the fit method
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,
        weight_decay=0,
        scheduler="constantlr",
        optimizer_params={"lr": 3e-5},
        show_progress_bar=True,
    )
    
    return model

Display model output

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import t


def display_metrics_with_ci(history: dict):
    # plot mean and ci for train and test r2 for all seeds and all iterations, averaged over seeds
    seeds = list(history.keys())
    def mean_confidence_interval(data, confidence=0.95):
        n = len(data)
        m, se = np.mean(data), np.std(data) / np.sqrt(n)
        h = se * t.ppf((1 + confidence) / 2, n-1)
        return m, m-h, m+h

    r2_train_values = [history[seed]['train_r2'] for seed in seeds]
    r2_test_values = [history[seed]['test_r2'] for seed in seeds]

    r2_train_values = np.array(r2_train_values)
    r2_test_values = np.array(r2_test_values)

    r2_train_mean = np.mean(r2_train_values, axis=0)
    r2_test_mean = np.mean(r2_test_values, axis=0)

    r2_train_ci = np.array([mean_confidence_interval(r2_train_values[:, i]) for i in range(r2_train_values.shape[1])])
    r2_test_ci = np.array([mean_confidence_interval(r2_test_values[:, i]) for i in range(r2_test_values.shape[1])])

    plt.figure(figsize=(10, 6))
    plt.plot(r2_train_mean, label='train')
    plt.fill_between(range(len(r2_train_mean)), r2_train_ci[:, 1], r2_train_ci[:, 2], alpha=0.3)

    plt.plot(r2_test_mean, label='test')
    plt.fill_between(range(len(r2_test_mean)), r2_test_ci[:, 1], r2_test_ci[:, 2], alpha=0.3)
    plt.title('Mean R2 by iteration, with 95% CI')
    plt.xlabel('Iteration')
    plt.ylabel('R2')
    plt.legend()
    plt.show()

    mae_test_values = [history[seed]['test_mae'] for seed in seeds]
    rmse_test_values = [history[seed]['test_rmse'] for seed in seeds]

    mae_test_values = np.array(mae_test_values)
    rmse_test_values = np.array(rmse_test_values)

    mae_test_mean = np.mean(mae_test_values, axis=0)
    rmse_test_mean = np.mean(rmse_test_values, axis=0)

    mae_test_ci = np.array([mean_confidence_interval(mae_test_values[:, i]) for i in range(mae_test_values.shape[1])])
    rmse_test_ci = np.array([mean_confidence_interval(rmse_test_values[:, i]) for i in range(rmse_test_values.shape[1])])

    # get an index of the epoch, where the test R2 is the highest
    # get mean and CI for this epoch
    best_epoch = np.argmax(r2_test_mean)
    best_epoch_r2 = r2_test_mean[best_epoch]
    best_epoch_mae = mae_test_mean[best_epoch]
    best_epoch_rmse = rmse_test_mean[best_epoch]
    best_epoch_r2_ci = r2_test_ci[best_epoch]
    best_epoch_mae_ci = mae_test_ci[best_epoch]
    best_epoch_rmse_ci = rmse_test_ci[best_epoch]

    print(f'TEST METRICS FOR THE BEST EPOCH: {best_epoch+1}')
    print(f'R2: mean = {best_epoch_r2:.4f}, 95% CI = [{best_epoch_r2_ci[1]:.4f}, {best_epoch_r2_ci[2]:.4f}]')
    print(f'MAE: mean = {best_epoch_mae:.4f}, 95% CI = [{best_epoch_mae_ci[1]:.4f}, {best_epoch_mae_ci[2]:.4f}]')
    print(f'RMSE: mean = {best_epoch_rmse:.4f}, 95% CI = [{best_epoch_rmse_ci[1]:.4f}, {best_epoch_rmse_ci[2]:.4f}]')

KeyboardInterrupt: 

In [2]:
import pickle

file = '../data/history/transfomers_single_bert_mse.pickle'

with open(file, 'rb') as f:
    history = pickle.load(f)

In [3]:
keys = list(history.keys())
keys

[42, 78687, 123123]

In [4]:
keys = list(history[42].keys())
keys

['train_loss',
 'test_loss',
 'train_rmse',
 'test_rmse',
 'train_r2',
 'test_r2',
 'train_mae',
 'test_mae',
 'y_pred',
 'y_test']

In [14]:
for seed, h in history.items():
    print(f'Seed: {seed}')
    print(len(h['y_pred']))
    print([len(_) for _ in h['y_pred']])
    arr = np.array(h['y_pred'])
    print(arr.shape)


Seed: 42
10
[40, 40, 40, 40, 40, 40, 40, 40, 40, 40]
(10, 40)
Seed: 78687
10
[40, 40, 40, 40, 40, 40, 40, 40, 40, 40]
(10, 40)
Seed: 123123
10
[40, 40, 40, 40, 40, 40, 40, 40, 40, 40]
(10, 40)


In [6]:
len(history[42]['y_pred'])

10

In [8]:
[len(_) for _ in history[42]['y_pred']]

[40, 40, 40, 40, 40, 40, 40, 40, 40, 40]

In [21]:
import numpy as np

y_pred = np.array(history[42]['y_pred'])
y_test = np.array(history[42]['y_test'])

# get r2 for each epoch, use sklearn r2_score
from sklearn.metrics import r2_score

r2_scores = [r2_score(test, pred) for test, pred in zip(y_test, y_pred)]
print(r2_scores)

[-307.69464111328125, -301.8771057128906, -296.1627502441406, -290.5736999511719, -285.07257080078125, -279.64605712890625, -274.1941223144531, -268.7493591308594, -263.2622985839844, -257.6695556640625]


In [25]:
# generate two arrays, n_rows = 1000, n_cols = 8 and 2
import numpy as np
# fix random seed for reproducibility
np.random.seed(42)
X = np.random.rand(1000, 10)
y = np.random.rand(1000, 2)

# do a train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_test.shape
print(y_test[:20])

[[0.92431636 0.71239403]
 [0.70296566 0.18266305]
 [0.60341758 0.38912301]
 [0.60290881 0.57907253]
 [0.95463751 0.12857726]
 [0.11054136 0.92209265]
 [0.89703852 0.90174168]
 [0.14938455 0.46916605]
 [0.25274267 0.31140565]
 [0.97926187 0.08863297]
 [0.0651928  0.04992249]
 [0.28021641 0.05312062]
 [0.56338374 0.11376246]
 [0.33407722 0.27244644]
 [0.80312417 0.42667398]
 [0.13258218 0.39507633]
 [0.11786011 0.95825117]
 [0.25698594 0.05218506]
 [0.21623289 0.06947998]
 [0.75081138 0.26554418]]


In [27]:
# generate two arrays, n_rows = 1000, n_cols = 8 and 2
import numpy as np
# fix random seed for reproducibility
np.random.seed(42)
X = np.random.rand(1000, 10)
y = np.random.rand(1000, 2)

# do a train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y[:, 0], test_size=0.2, random_state=42)

y_test.shape
print(y_test[:20])

[0.92431636 0.70296566 0.60341758 0.60290881 0.95463751 0.11054136
 0.89703852 0.14938455 0.25274267 0.97926187 0.0651928  0.28021641
 0.56338374 0.33407722 0.80312417 0.13258218 0.11786011 0.25698594
 0.21623289 0.75081138]


In [28]:
# generate two arrays, n_rows = 1000, n_cols = 8 and 2
import numpy as np
# fix random seed for reproducibility
np.random.seed(42)
X = np.random.rand(1000, 10)
y = np.random.rand(1000, 2)

# do a train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y[:, 1], test_size=0.2, random_state=42)

y_test.shape
print(y_test[:20])

[0.71239403 0.18266305 0.38912301 0.57907253 0.12857726 0.92209265
 0.90174168 0.46916605 0.31140565 0.08863297 0.04992249 0.05312062
 0.11376246 0.27244644 0.42667398 0.39507633 0.95825117 0.05218506
 0.06947998 0.26554418]


In [11]:
len(history[42]['y_test'])

10

In [12]:
[len(_) for _ in history[42]['y_test']]

[40, 40, 40, 40, 40, 40, 40, 40, 40, 40]

In [13]:
import numpy as np

arr = np.array(history[42]['y_test'])
arr.shape

(10, 40)

### Traning-related classes

#### Dataset

##### Baseline dataset

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

# Dataset for dual textual features
class DualTextDataset(Dataset):
    def __init__(self, df, text_col_1, text_col_2, targets, tokenizer, max_len):
        print('Creating the dataset...')
        # Pre-tokenize and store inputs
        self.tokenized_texts1 = tokenizer(df[text_col_1].tolist(), max_length=max_len, padding="max_length", truncation=True, return_tensors="pt")
        self.tokenized_texts2 = tokenizer(df[text_col_2].tolist(), max_length=max_len, padding="max_length", truncation=True, return_tensors="pt")
        self.targets = targets.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        # Return only the slice for idx
        inputs1 = {key: val[idx] for key, val in self.tokenized_texts1.items()}
        inputs2 = {key: val[idx] for key, val in self.tokenized_texts2.items()}
        target = torch.tensor(self.targets[idx], dtype=torch.float)
        return inputs1, inputs2, target


##### Dataset for multitask learning

In [None]:
class DualTextDatasetWithBins(Dataset):
    def __init__(self, df, text_col_1, text_col_2, targets_reg, targets_bin, tokenizer, max_len):
        print('Creating the dataset...')
        # Pre-tokenize and store inputs
        self.tokenized_texts1 = tokenizer(df[text_col_1].tolist(), max_length=max_len, padding="max_length", truncation=True, return_tensors="pt")
        self.tokenized_texts2 = tokenizer(df[text_col_2].tolist(), max_length=max_len, padding="max_length", truncation=True, return_tensors="pt")

        # Targets
        self.targets_reg = targets_reg.tolist()  # Log salary target for regression
        self.targets_bin = targets_bin.tolist()  # Bin IDs for mask token prediction
        
        # Get the bin token ID offset from the tokenizer's vocabulary
        self.bin_token_offset = tokenizer.convert_tokens_to_ids("[BIN_0]")  # This will give the token ID of the first bin token
        self.bin_to_id_mapping = {i: torch.tensor(i + self.bin_token_offset, dtype=torch.long) for i in range(15)}
        
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.targets_reg)

    def __getitem__(self, idx):
        # Return only the slice for idx
        inputs1 = {key: val[idx] for key, val in self.tokenized_texts1.items()}
        inputs2 = {key: val[idx] for key, val in self.tokenized_texts2.items()}
        target = torch.tensor(self.targets_reg[idx], dtype=torch.float)
        
        # Map the bin_targets (0 to 14) to the corresponding token IDs
        # bin_target = torch.tensor(self.bin_targets[idx] + self.bin_token_offset, dtype=torch.long)  # Add offset
        bin_target = self.bin_to_id_mapping[self.targets_bin[idx]]
        
        return inputs1, inputs2, target, bin_target


#### Model

##### Single-head BERT with MLP

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer


# single bert for two text features with MLP head for regression
class SingleBERTWithMLP(nn.Module):
    def __init__(self, config):
        super(SingleBERTWithMLP, self).__init__()
        model_name = config["model_name"]
        # Initialize a single BERT model
        self.bert = AutoModel.from_pretrained(model_name)

        # Define MLP head
        hidden_size = self.bert.config.hidden_size
        mlp_hidden_size = config["mlp_hidden_size"]
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),  # Double hidden size for concatenation
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1)  # Regression output
        )

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        # Pass both inputs through the same BERT model
        cls1 = self.bert(input_ids=input1, attention_mask=attention_mask1).last_hidden_state[:, 0, :]  # CLS token for input1
        cls2 = self.bert(input_ids=input2, attention_mask=attention_mask2).last_hidden_state[:, 0, :]  # CLS token for input2

        # Concatenate CLS embeddings
        combined_cls = torch.cat([cls1, cls2], dim=-1)  # Shape: [batch_size, 2 * hidden_size]

        # Pass through MLP head
        output = self.mlp(combined_cls)
        return output


##### Double-head BERT with MLP

In [None]:
# two berts for two text features with MLP head for regression
class DualBERTWithMLP(nn.Module):
    def __init__(self, config):
        super(DualBERTWithMLP, self).__init__()
        # Initialize two independent BERT models
        model_name = config['model_name']
        self.bert1 = AutoModel.from_pretrained(model_name)
        self.bert2 = AutoModel.from_pretrained(model_name)

        # Define MLP head
        hidden_size = config['hidden_size']
        mlp_hidden_size = config['mlp_hidden_size']
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1)  # Regression output
        )

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        # Forward pass through BERT1
        cls1 = self.bert1(input_ids=input1, attention_mask=attention_mask1).last_hidden_state[:, 0, :]  # CLS token

        # Forward pass through BERT2
        cls2 = self.bert2(input_ids=input2, attention_mask=attention_mask2).last_hidden_state[:, 0, :]  # CLS token

        # Concatenate CLS embeddings
        combined_cls = torch.cat([cls1, cls2], dim=-1)  # Shape: [batch_size, 2 * hidden_size]

        # Pass through MLP head
        output = self.mlp(combined_cls)

        return output



##### Double-head BERT with [MASK] token-based regression and MLP

In [None]:
# two berts for two text features with MLP head for regression over MASK token embedding
class MASKPoolDualBERTWithMLP(nn.Module):
    def __init__(self, config):
        super(MASKPoolDualBERTWithMLP, self).__init__()
        model_name = config['model_name']
        # Initialize two independent BERT models
        self.bert1 = AutoModel.from_pretrained(model_name)
        self.bert2 = AutoModel.from_pretrained(model_name)

        # Define MLP head
        hidden_size = config['hidden_size']
        mlp_hidden_size = config['mlp_hidden_size']
        self.mask_token_index = config['mask_token_index']
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1)  # Regression output
        )

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        # Forward pass through BERT1
        mask1 = self.bert1(input_ids=input1, attention_mask=attention_mask1).last_hidden_state[:, self.mask_token_index, :]  # mask token 

        # Forward pass through BERT2
        mask2 = self.bert2(input_ids=input2, attention_mask=attention_mask2).last_hidden_state[:, self.mask_token_index, :]  # mask token

        # concat mask embeddings
        combined_mask = torch.cat([mask1, mask2], dim=-1)  # Shape: [batch_size, 2 * hidden_size]

        # Pass through MLP head
        output = self.mlp(combined_mask)
        return output

##### Double-head BERT with TSDAE pre-tuning and MLP

In [None]:
class TSDAEDualBERTWithMLP(nn.Module):
    """two berts for two text features with MLP head for regression.
    The model is pre-tuned with TSDAE."""
    def __init__(self, config, bert1, bert2):
        super(TSDAEDualBERTWithMLP, self).__init__()
        # Load TSDAE-ed BERT models
        self.bert1 = bert1
        self.bert2 = bert2

        # Define MLP head
        hidden_size = config['hidden_size']
        mlp_hidden_size = config['mlp_hidden_size']
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1)  # Regression output
        )

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        # idea from: https://github.com/UKPLab/sentence-transformers/issues/2494
        # Forward pass through BERT1
        input_dict1 = {
            'input_ids': input1,
            'attention_mask': attention_mask1
        }
        cls1 = self.bert1(input_dict1)['sentence_embedding']
        
        # Forward pass through BERT2
        input_dict2 = {
            'input_ids': input2,
            'attention_mask': attention_mask2
        }
        cls2 = self.bert2(input_dict2)['sentence_embedding']

        # Concatenate CLS embeddings
        combined_cls = torch.cat([cls1, cls2], dim=-1)  # Shape: [batch_size, 2 * hidden_size]

        # Pass through MLP head
        output = self.mlp(combined_cls)
        return output

##### Double-head BERT with multitask learning and MLP

In [None]:
# two berts for two text features with MLP head for regression
class MaskBinDualBERTWithMLP(nn.Module):
    """two berts for two text features with regression over CLS token
    and cosine similarity loss between MASK token and bin embeddings."""
    def __init__(self, config):
        super(MaskBinDualBERTWithMLP, self).__init__()

        # Initialize two independent BERT models
        model_name = config['model_name']
        self.bert1 = AutoModel.from_pretrained(model_name)
        self.bert2 = AutoModel.from_pretrained(model_name)

        # Add new bin tokens to the tokenizer and resize the model embeddings
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        new_tokens = ["[BIN_0]", "[BIN_1]", "[BIN_2]", "[BIN_3]", "[BIN_4]", "[BIN_5]", "[BIN_6]", "[BIN_7]",
                      "[BIN_8]", "[BIN_9]", "[BIN_10]", "[BIN_11]", "[BIN_12]", "[BIN_13]", "[BIN_14]"]
        self.tokenizer.add_tokens(new_tokens)
        self.bert1.resize_token_embeddings(len(self.tokenizer))  # Resize the model to accommodate new tokens
        self.mask_token_index = config['mask_token_index']

        # Define MLP head for regression
        hidden_size = config['hidden_size']
        mlp_hidden_size = config['mlp_hidden_size']
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1)  # Regression output
        )

        # CosineEmbeddingLoss setup for MASK token prediction
        self.criterion_similarity = nn.CosineEmbeddingLoss(margin=0.0)


    def get_tokenizer(self):
        return self.tokenizer


    def forward(self, input1, attention_mask1, input2, attention_mask2, bin_ids=None):
        # Forward pass through BERT1 (for MASK token prediction and bin prediction)
        outputs1 = self.bert1(input_ids=input1, attention_mask=attention_mask1)
        cls1 = outputs1.last_hidden_state[:, 0, :]  # CLS token
        mask_idx = (input2 == self.mask_token_index).nonzero(as_tuple=True)  # Assume [MASK] token is at index self.mask_token_index
        mask_embedding = outputs1.last_hidden_state[mask_idx]  # Mask token embedding

        # Forward pass through BERT2 (for CLS token regression)
        cls2 = self.bert2(input_ids=input2, attention_mask=attention_mask2).last_hidden_state[:, 0, :]  # CLS token
        
        # Concatenate CLS tokens from BERT1 and BERT2 and pass through MLP for regression output
        combined_cls = torch.cat([cls1, cls2], dim=-1)  # Concatenate CLS embeddings
        
        # Pass through MLP head for salary regression
        salary_output = self.mlp(combined_cls)
        
        # If bin_ids are provided, compute cosine similarity between MASK token and bin embeddings
        if bin_ids is not None:
            # Retrieve bin embeddings from the BERT model's embedding layer
            print(bin_ids)
            bin_embedding = self.bert1.embeddings.word_embeddings(bin_ids)  # [batch_size, hidden_size]
            print(f"mask_embedding shape: {mask_embedding.shape}")
            print(f"bin_embedding shape: {bin_embedding.shape}")
            similarity_loss = self.criterion_similarity(mask_embedding, bin_embedding, torch.ones(mask_embedding.size(0)))
            return salary_output, similarity_loss
        
        return salary_output, None  # For inference, return salary output only

##### Double bert, cross attention

In [None]:
class CrossAttentionLayer(nn.Module):
    def __init__(self, hidden_size, num_heads=8): 
        super(CrossAttentionLayer, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=num_heads)
        self.layer_norm = nn.LayerNorm(hidden_size)


    def forward(self, query, key, value, attn_mask=None, key_padding_mask=None):
        # Apply multi-head attention
        attn_output, _ = self.attention(query, key, value, attn_mask=attn_mask, key_padding_mask=key_padding_mask)
        # Add residual connection and layer norm
        output = self.layer_norm(query + attn_output)  # residual connection with query
        return output


class DualBERTWithCrossAttention(nn.Module):
    """Dual BERT model with cross-attention between the two text features.
    Here, query is the first text feature and key, value are the second text feature."""
    def __init__(self, config):
        super(DualBERTWithCrossAttention, self).__init__()
        model_name = config['model_name']
        self.bert1 = AutoModel.from_pretrained(model_name)
        self.bert2 = AutoModel.from_pretrained(model_name)

        # Define MLP head
        hidden_size = config['hidden_size']
        mlp_hidden_size = config['mlp_hidden_size']
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1)  # Regression output
        )

        self.num_heads = config['num_heads']
        self.cross_attention = CrossAttentionLayer(hidden_size, num_heads=self.num_heads)
      
    def forward(self, input1: torch.Tensor, attention_mask1: torch.Tensor, input2: torch.Tensor, attention_mask2: torch.Tensor):
        # Get BERT outputs
        outputs1 = self.bert1(input_ids=input1, attention_mask=attention_mask1).last_hidden_state
        outputs2 = self.bert2(input_ids=input2, attention_mask=attention_mask2).last_hidden_state
        
        # Get raw CLS token before attention
        cls2 = outputs2[:, 0, :]

        # prepare key_padding_mask
        key_padding_mask = (attention_mask2 == 0).bool() # True indicates positions to exclude (padding tokens). Shape: (batch_size, source_len)

        # prepare attention mask
        # # Step 1: Expand to match query and key dimensions
        # original shape: (batch_size, target_len)
        attn_mask = attention_mask1.unsqueeze(2) # create a new dimension at the end to be able to expand. Shape: (batch_size, target_len, 1)
        attn_mask = attn_mask.expand(-1, -1, attention_mask2.size(1)) # expand to match the source_len. Shape: (batch_size, target_len, source_len)

        # # Step 2: Adjust for multi-head attention
        attn_mask = attn_mask.unsqueeze(1) # Add head dimension at position 1. Shape: (batch_size, 1, target_len, source_len)
        attn_mask = attn_mask.repeat(1, self.num_heads, 1, 1)  # Repeat for each head. Shape: (batch_size, num_heads, target_len, source_len)
        attn_mask = attn_mask.view(-1, attention_mask1.size(1), attention_mask2.size(1))  # Merge batch and head dimensions. Shape: (batch_size * num_heads, target_len, source_len)

        # # Step 3: Convert to boolean mask
        attn_mask = (attn_mask == 0).bool()  # True indicates positions to exclude (padding tokens). Shape: (batch_size * num_heads, target_len, source_len)

        # Apply cross-attention
        attended_features = self.cross_attention(
            query=outputs1,
            key=outputs2,
            value=outputs2,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
        )

        # Get CLS from attended features
        cls1_attended = attended_features[:, 0, :]

        # mean pooling:
        # # Handle padding tokens during pooling
        # valid_token_mask = attention_mask2.unsqueeze(-1).float()  # Shape: (batch_size, seq_len, 1)
        # attended_features = attended_features * valid_token_mask  # Zero-out padding embeddings
        # pooled_output = attended_features.sum(dim=1) / valid_token_mask.sum(dim=1)  # Mean pooling only over valid tokens


        # Concatenate the two [CLS] tokens
        combined = torch.cat([cls1_attended, cls2], dim=1) # Shape: (batch_size, 2 * hidden_size)

        # Pass through MLP head
        output = self.mlp(combined)

        return output

##### Single bert, cross attention

In [None]:
class SingleBERTWithCrossAttention(nn.Module):
    """Single BERT model with cross-attention between the two text features.
    Here, query is the first text feature and key, value are the second text feature."""
    def __init__(self, config):
        super(SingleBERTWithCrossAttention, self).__init__()
        model_name = config['model_name']
        self.bert = AutoModel.from_pretrained(model_name)

        # Define MLP head
        hidden_size = config['hidden_size']
        mlp_hidden_size = config['mlp_hidden_size']
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1)  # Regression output
        )

        self.num_heads = config['num_heads']
        self.cross_attention = CrossAttentionLayer(hidden_size, num_heads=self.num_heads)
      
    def forward(self, input1: torch.Tensor, attention_mask1: torch.Tensor, input2: torch.Tensor, attention_mask2: torch.Tensor):
        # Get BERT outputs
        outputs1 = self.bert(input_ids=input1, attention_mask=attention_mask1).last_hidden_state
        outputs2 = self.bert(input_ids=input2, attention_mask=attention_mask2).last_hidden_state
        
        # Get raw CLS token before attention
        cls2 = outputs2[:, 0, :]

        # prepare key_padding_mask
        key_padding_mask = (attention_mask2 == 0).bool() # True indicates positions to exclude (padding tokens). Shape: (batch_size, source_len)

        # prepare attention mask
        # # Step 1: Expand to match query and key dimensions
        # original shape: (batch_size, target_len)
        attn_mask = attention_mask1.unsqueeze(2) # create a new dimension at the end to be able to expand. Shape: (batch_size, target_len, 1)
        attn_mask = attn_mask.expand(-1, -1, attention_mask2.size(1)) # expand to match the source_len. Shape: (batch_size, target_len, source_len)

        # # Step 2: Adjust for multi-head attention
        attn_mask = attn_mask.unsqueeze(1) # Add head dimension at position 1. Shape: (batch_size, 1, target_len, source_len)
        attn_mask = attn_mask.repeat(1, self.num_heads, 1, 1)  # Repeat for each head. Shape: (batch_size, num_heads, target_len, source_len)
        attn_mask = attn_mask.view(-1, attention_mask1.size(1), attention_mask2.size(1))  # Merge batch and head dimensions. Shape: (batch_size * num_heads, target_len, source_len)

        # # Step 3: Convert to boolean mask
        attn_mask = (attn_mask == 0).bool()  # True indicates positions to exclude (padding tokens). Shape: (batch_size * num_heads, target_len, source_len)

        # Apply cross-attention
        attended_features = self.cross_attention(
            query=outputs1,
            key=outputs2,
            value=outputs2,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
        )

        # Get CLS from attended features
        cls1_attended = attended_features[:, 0, :]

        # Concatenate the two [CLS] tokens
        combined = torch.cat([cls1_attended, cls2], dim=1) # Shape: (batch_size, 2 * hidden_size)

        # Pass through MLP head
        output = self.mlp(combined)

        return output

In [1]:
import torch

# make 2 arrays from normal distribution:
# shape of output1:  torch.Size([16, 1024, 312])
# shape of output2:  torch.Size([16, 256, 312])

output1 = torch.randn(16, 1024, 312)
output2 = torch.randn(16, 256, 312)

# make 2 arrays, for each in dim 1 first 20% values are 1, the rest are 0:
# shape of att mask1:  torch.Size([16, 1024])
# shape of att mask2:  torch.Size([16, 256])

att_mask1 = torch.zeros(16, 1024)
att_mask1[:, :int(0.2 * 1024)] = 1

att_mask2 = torch.zeros(16, 256)
att_mask2[:, :int(0.2 * 256)] = 1

In [8]:
# attn_mask (Optional[Tensor]) – If specified, a 2D or 3D mask preventing attention to certain positions. 
# Must be of shape (L,S) or (N⋅num_heads,L,S),
# where N is the batch size, L is the target sequence length, and S is the source sequence length. 
# A 2D mask will be broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch. 
# Binary and float masks are supported. 
# For a binary mask, a True value indicates that the corresponding position is not allowed to attend.
# For a float mask, the mask values will be added to the attention weight.
# If both attn_mask and key_padding_mask are supplied, their types should match.

# shape: 16 * 8, 1024, 256

In [3]:
print(output1.shape, output2.shape, att_mask1.shape, att_mask2.shape, sep='\n')

torch.Size([16, 1024, 312])
torch.Size([16, 256, 312])
torch.Size([16, 1024])
torch.Size([16, 256])


In [7]:
key_padding_mask = (att_mask2 == 0).bool()  
print(key_padding_mask[0, :20], key_padding_mask[0, -20:], sep='\n')

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False])
tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True])


In [41]:
print(type(key_padding_mask))

<class 'torch.Tensor'>


In [40]:
print('original shape = ', att_mask1.shape)  # Shape: (batch_size, query_len)
original_mask = att_mask1
original_mask_bool = (original_mask == 0).bool()
# count True values
original_n_true = original_mask_bool.sum().item()
print('original n True = ', original_n_true)

# Step 1: Expand to match query and key dimensions
attn_mask = att_mask1.unsqueeze(2)
print('unsqueezed shape = ', attn_mask.shape)  # Expected shape: (16, 1024, 1)


attn_mask = attn_mask.expand(-1, -1, att_mask2.size(1))  # Shape: (batch_size, query_len, key_len)
print('expanded shape = ', attn_mask.shape)  # Expected shape: (16, 1024, 256)

slice_0 = attn_mask[:, :, 0]  # Shape: (batch_size, query_len)
print('slice 0 shape = ', slice_0.shape)  # Expected shape: (16, 1024)
slice_100 = attn_mask[:, :, 100]  # Shape: (batch_size, query_len)

# Ensure the slices are equal to the unsqueezed mask (after squeezing last dimension)
assert torch.equal(slice_0, slice_100), "Slices [:, :, 0] and [:, :, 100] are not equal!"
assert torch.equal(slice_0, original_mask), "Slice [:, :, 0] does not match the unsqueezed mask!"
assert torch.equal(slice_100, original_mask), "Slice [:, :, 100] does not match the unsqueezed mask!"


# Step 2: Adjust for multi-head attention
num_heads = 8  # Number of attention heads
attn_mask = attn_mask.unsqueeze(1)
print('unsqueezed for heads shape =', attn_mask.shape)  # Final shape: (16 * 8, 1024, 256)
attn_mask = attn_mask.repeat(1, num_heads, 1, 1)  # Add head dimension and repeat for each head
print('repeated for heads shape =', attn_mask.shape)  # Final shape: (16 * 8, 1024, 256)
attn_mask = attn_mask.view(-1, att_mask1.size(1), att_mask2.size(1))  # Merge batch and head dimensions
print('view shape =', attn_mask.shape)  # Final shape: (16 * 8, 1024, 256)

# ensure [:16, :, 1] equals original mask
assert torch.equal(attn_mask[:16, :, 1], original_mask), "Slice [:16, :, 1] does not match the original mask!"

print('num heads adjusted shape = ', attn_mask.shape)  # Final shape: (16 * 8, 1024, 256)

# Step 3: Convert to boolean mask
attn_mask = (attn_mask == 0).bool()  # True indicates positions to exclude
print('bool shape = ', attn_mask.shape)  # Final boolean mask shape: (16 * 8, 1024, 256)
# count True values
n_true = attn_mask.sum().item()
print('n True = ', n_true)

original shape =  torch.Size([16, 1024])
original n True =  13120
unsqueezed shape =  torch.Size([16, 1024, 1])
expanded shape =  torch.Size([16, 1024, 256])
slice 0 shape =  torch.Size([16, 1024])
unsqueezed for heads shape = torch.Size([16, 1, 1024, 256])
repeated for heads shape = torch.Size([16, 8, 1024, 256])
view shape = torch.Size([128, 1024, 256])
num heads adjusted shape =  torch.Size([128, 1024, 256])
bool shape =  torch.Size([128, 1024, 256])
n True =  26869760


#### Training methods

##### Baseline regression

In [None]:
from torch.utils.data import DataLoader
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


def fit_eval(
    seed,
    model,
    X_train,
    X_test,
    y_train_reg,
    y_test_reg,
    criterion,
    tokenizer,
    config,
    text_col_1,
    text_col_2,
):
    set_seed(seed)
    
    # Memory cleanup
    memory_cleanup()

    # Unpack config
    learning_rate = config["learning_rate"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    seq_length = config["seq_length"]

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Make datasets
    train_dataset = DualTextDataset(X_train, text_col_1, text_col_2, y_train_reg, tokenizer, seq_length)
    test_dataset = DualTextDataset(X_test, text_col_1, text_col_2, y_test_reg, tokenizer, seq_length)
    # Make dataloaders
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Training and Evaluation Loop
    history = {
                "train_loss": [],
               "test_loss": [], 
                "train_rmse": [],
               "test_rmse": [], 
               "train_r2": [],
               "train_r2": [],
               "test_r2": [],
               "train_mae": [],
               "test_mae": [],
               "y_pred": [],
               "y_test": [],
               }

    print('Starting training/eval loop...')
    for epoch in range(num_epochs):
        print('Starting training...')
        # Training Phase
        model.train()
        train_losses = []
        all_preds = []
        all_labels = []
        for batch in train_dataloader:
            inputs1, inputs2, targets = batch
            input1 = inputs1["input_ids"].squeeze(1).to(device)
            attention_mask1 = inputs1["attention_mask"].squeeze(1).to(device)
            input2 = inputs2["input_ids"].squeeze(1).to(device)
            attention_mask2 = inputs2["attention_mask"].squeeze(1).to(device)
            targets = targets.to(device)
    
            optimizer.zero_grad()
            outputs = model(input1, attention_mask1, input2, attention_mask2)
            outputs = outputs.flatten()

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

            all_preds.extend(outputs.cpu().detach().numpy())
            all_labels.extend(targets.cpu().numpy())
    
        train_loss = np.mean(train_losses)
        
        train_r2 = r2_score(all_labels, all_preds)

        train_rmse = mean_squared_error(all_labels, all_preds, squared=False)
        
        train_mae = mean_absolute_error(all_labels, all_preds)
        
        history["train_loss"].append(train_loss)
        history["train_r2"].append(train_r2)
        history["train_rmse"].append(train_rmse)
        history["train_mae"].append(train_mae)
    
        # Evaluation Phase
        print('Epoch done, evaluating...')
        model.eval()
        test_losses = []
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch in test_dataloader:
                inputs1, inputs2, targets = batch
                input1 = inputs1["input_ids"].squeeze(1).to(device)
                attention_mask1 = inputs1["attention_mask"].squeeze(1).to(device)
                input2 = inputs2["input_ids"].squeeze(1).to(device)
                attention_mask2 = inputs2["attention_mask"].squeeze(1).to(device)
                targets = targets.to(device)
    
                outputs = model(input1, attention_mask1, input2, attention_mask2)
                outputs = outputs.flatten()
                # loss = criterion(outputs.squeeze(), targets)
                loss = criterion(outputs, targets)
                test_losses.append(loss.item())
                # all_preds.extend(outputs.squeeze().cpu().numpy())
                all_preds.extend(outputs.cpu().numpy())
                all_labels.extend(targets.cpu().numpy())

        history["y_pred"].append(all_preds)
        history["y_test"].append(all_labels)

        test_loss = np.mean(test_losses)

        test_r2 = r2_score(all_labels, all_preds)
        
        test_rmse = mean_squared_error(all_labels, all_preds, squared=False)
        
        test_mae = mean_absolute_error(all_labels, all_preds)
    
        history["test_loss"].append(test_loss)
        
        history["test_r2"].append(test_r2)
        
        history["test_rmse"].append(test_rmse)
        
        history["test_mae"].append(test_mae)
    
        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, "
              f"Train R2: {train_r2:.4f}, Test R2: {test_r2:.4f}")

        print(f"Epoch {epoch + 1}/{num_epochs}, Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")

    return model, history

##### Multitask training

In [None]:
def fit_eval_with_bins(
    seed,
    model,
    X_train,
    X_test,
    y_train_reg,
    y_test_reg,
    y_train_bin,
    y_test_bin,
    criterion,
    tokenizer,
    config,
    text_col_1,
    text_col_2,
):
    set_seed(seed)
    
    # Memory cleanup
    memory_cleanup()

    # Unpack config
    learning_rate = config["learning_rate"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    seq_length = config["seq_length"]

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Make datasets
    train_dataset = DualTextDatasetWithBins(X_train, text_col_1, text_col_2, y_train_reg, y_train_bin, tokenizer, seq_length)
    test_dataset = DualTextDatasetWithBins(X_test, text_col_1, text_col_2, y_test_reg, y_test_bin, tokenizer, seq_length)
    # Make dataloaders
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Training and Evaluation Loop
    history = {
                "train_loss": [],
               "test_loss": [], 
                "train_rmse": [],
               "test_rmse": [], 
               "train_r2": [],
               "train_r2": [],
               "test_r2": [],
               "train_mae": [],
               "test_mae": [],
               "y_pred": [],
               "y_test": [],
               }

    print('Starting training/eval loop...')
    for epoch in range(num_epochs):
        print('Starting training...')
        # Training Phase
        model.train()
        train_losses = []
        all_preds = []
        all_labels = []
        for batch in train_dataloader:
            inputs1, inputs2, targets, bin_ids = batch  # Now bin_ids are included in the dataset
            print(len(inputs1))
            print(len(bin_ids))
            input1 = inputs1["input_ids"].squeeze(1).to(device)
            attention_mask1 = inputs1["attention_mask"].squeeze(1).to(device)
            input2 = inputs2["input_ids"].squeeze(1).to(device)
            attention_mask2 = inputs2["attention_mask"].squeeze(1).to(device)
            targets = targets.to(device)
            bin_ids = bin_ids.to(device)  # Ensure bin_ids are moved to the same device
    
            optimizer.zero_grad()
            # Forward pass: include bin_ids
            salary_output, similarity_loss = model(input1, attention_mask1, input2, attention_mask2, bin_ids)
            loss = criterion(salary_output, targets) + similarity_loss  # Combine regression loss and similarity loss
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
            all_preds.extend(salary_output.cpu().detach().numpy())
            all_labels.extend(targets.cpu().numpy())
            
            # outputs = model(input1, attention_mask1, input2, attention_mask2)
            # outputs = outputs.flatten()
            # # loss = criterion(outputs.squeeze(), targets)
            # loss = criterion(outputs, targets)
            # loss.backward()
            # optimizer.step()
            # train_losses.append(loss.item())
            # # all_preds.extend(outputs.squeeze().cpu().detach().numpy())
            # all_preds.extend(outputs.cpu().detach().numpy())
            # all_labels.extend(targets.cpu().numpy())
    
        train_loss = np.mean(train_losses)
        
        train_r2 = r2_score(all_labels, all_preds)

        train_rmse = mean_squared_error(all_labels, all_preds, squared=False)
        
        train_mae = mean_absolute_error(all_labels, all_preds)
        
        history["train_loss"].append(train_loss)
        history["train_r2"].append(train_r2)
        history["train_rmse"].append(train_rmse)
        history["train_mae"].append(train_mae)
    
        # Evaluation Phase
        print('Epoch done, evaluating...')
        model.eval()
        test_losses = []
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch in test_dataloader:
                inputs1, inputs2, targets, bin_ids = batch  # Get bin_ids during evaluation
                input1 = inputs1["input_ids"].squeeze(1).to(device)
                attention_mask1 = inputs1["attention_mask"].squeeze(1).to(device)
                input2 = inputs2["input_ids"].squeeze(1).to(device)
                attention_mask2 = inputs2["attention_mask"].squeeze(1).to(device)
                targets = targets.to(device)
                bin_ids = bin_ids.to(device)  # Ensure bin_ids are moved to the same device
    
                # Forward pass: include bin_ids
                salary_output, similarity_loss = model(input1, attention_mask1, input2, attention_mask2, bin_ids)
                loss = criterion(salary_output, targets) + similarity_loss  # Combine losses
                test_losses.append(loss.item())
                all_preds.extend(salary_output.cpu().numpy())
                all_labels.extend(targets.cpu().numpy())
            
        history["y_pred"].append(all_preds)
        history["y_test"].append(all_labels)

        test_loss = np.mean(test_losses)

        test_r2 = r2_score(all_labels, all_preds)
        
        test_rmse = mean_squared_error(all_labels, all_preds, squared=False)
        
        test_mae = mean_absolute_error(all_labels, all_preds)
    
        history["test_loss"].append(test_loss)
        
        history["test_r2"].append(test_r2)
        
        history["test_rmse"].append(test_rmse)
        
        history["test_mae"].append(test_mae)
    
        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, "
              f"Train R2: {train_r2:.4f}, Test R2: {test_r2:.4f}")

        print(f"Epoch {epoch + 1}/{num_epochs}, Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}")

    return model, history

### Training-eval loop with experiments

#### Data preprocessing

##### Define text feature/target columns

In [None]:
text_col_1 = 'description_no_numbers'
text_col_1_with_prompt = text_col_1 + '_with_prompt' # Add prompt to text for multitask learning

text_col_2 = 'title_company_location_skills_source' # Merged text column, second feature

bin_targets_col = 'salary_bin' # Bin targets for MASK token prediction and multitask learning
target_col = 'log_salary_from' # regression target

##### Create merged title/skills/location/source feature

In [None]:
df['skills'].fillna('Не указаны', inplace=True)

title_company_location_skills_feature_template = """
Позиция: {position}
Компания: {company}
Место: {location}
Навыки: {skills}
Источник: {source}
"""

df['title_company_location_skills_source'] = df.apply(lambda x: title_company_location_skills_feature_template.format(
    position=x['title'],
    company=x['company'],
    location=x['location'],
    skills=x['skills'],
    source=x['source']
), axis=1)

##### Create bins for target prediction

In [None]:
import numpy as np


# Compute bin edges for 15 equal bins
num_bins = 15
bin_edges = np.linspace(
    df[target_col].min(), df[target_col].max(), num_bins + 1
)

# Assign bins to a new column
df[bin_targets_col] = np.digitize(df[target_col], bins=bin_edges, right=True) - 1

# Ensure bin labels are within range
df[bin_targets_col] = df[bin_targets_col].clip(0, num_bins - 1)

# prompt to be added to feature 1 for multitask learning
prompt = """\
[CLS] Далее указано описание вакансии. \
Судя по описанию, зарплата на этой позиции составляет [MASK].[SEP]\
"""

df[text_col_1_with_prompt] = prompt + df[text_col_1]

#### Training code

In [18]:
memory_cleanup()

##### Experiment 1: Single BERT, MSE loss

In [None]:
import torch.nn as nn
import pickle
import warnings
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split


experiment_name = 'single_bert_mse'

# Suppress all FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

seeds = [42, 78687, 123123]
combined_history = {}

# Hyperparameters and configuration
config = {
    "model_name": "sergeyzh/rubert-tiny-turbo",
    "batch_size": 32,
    "seq_length": 1024,
    "hidden_size": 312,
    "mlp_hidden_size": 128,
    # "num_epochs": 10,
    "num_epochs": 2,
    "learning_rate": 5e-6,
    "mask_token_index": 17, # position of the [MASK] token in the feature 1 (to be used in MASK token prediction)
}

memory_cleanup()

model_name = config['model_name']
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset and DataLoader
# Prepare data
X = df[[text_col_1, text_col_1_with_prompt, text_col_2]][:200]
y = df[[target_col, bin_targets_col]][:200]

for seed in seeds:
    memory_cleanup()
    print(f'Starting for seed {str(seed)}...')
    print('-' * 100)

    combined_history[seed] = {}

    set_seed(seed)

    # Split train-test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    y_train_reg = y_train[target_col]
    y_test_reg = y_test[target_col]

    # Initialize the model
    model = SingleBERTWithMLP(config)
    model = torch.nn.DataParallel(model).to(device)

    # Loss Function
    criterion = nn.MSELoss()

    model, history = fit_eval(
        seed,
        model,
        X_train,
        X_test,
        y_train_reg,
        y_test_reg,
        criterion,
        tokenizer,
        config,
        text_col_1,
        text_col_2,
    )

    memory_cleanup()

    combined_history[seed] = history    

# Display metrics
display_metrics_with_ci(combined_history)

# save the history as pickle
with open(f'./history/transfomers_{experiment_name}.pickle', 'wb') as handle:
    pickle.dump(combined_history, handle, protocol=pickle.HIGHEST_PROTOCOL)

Starting for seed 42...
double_huber_multitask model...
Creating the dataset...
Creating the dataset...
Starting training/eval loop...
Starting training...
3
32
tensor([83835, 83836, 83836, 83836, 83836, 83836, 83833, 83836, 83836, 83835,
        83835, 83835, 83834, 83836, 83836, 83835], device='cuda:0')
mask_embedding shape: torch.Size([11, 312])
bin_embedding shape: torch.Size([16, 312])
tensor([83835, 83836, 83836, 83836, 83837, 83837, 83836, 83836, 83836, 83833,
        83836, 83836, 83836, 83835, 83835, 83837], device='cuda:1')
mask_embedding shape: torch.Size([5, 312])
bin_embedding shape: torch.Size([16, 312])


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/ipykernel_23/2258028468.py", line 92, in forward
    similarity_loss = self.criterion_similarity(mask_embedding, bin_embedding, torch.ones(mask_embedding.size(0)))
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/loss.py", line 1299, in forward
    return F.cosine_embedding_loss(input1, input2, target, margin=self.margin, reduction=self.reduction)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/functional.py", line 3581, in cosine_embedding_loss
    return torch.cosine_embedding_loss(input1, input2, target, margin, reduction_enum)
RuntimeError: The size of tensor a (11) must match the size of tensor b (16) at non-singleton dimension 0


In [None]:
import torch.nn as nn
import pickle
import warnings
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split


# Suppress all FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

seeds = [42, 78687, 123123]
combined_history = {}

# Hyperparameters and configuration
config = {
    "model_name": "sergeyzh/rubert-tiny-turbo",
    "batch_size": 32,
    "seq_length": 1024,
    "hidden_size": 312,
    "mlp_hidden_size": 128,
    # "num_epochs": 10,
    "num_epochs": 2,
    "learning_rate": 5e-6,
    "mask_token_index": 17,
    
}

memory_cleanup()
model_name = config['model_name']
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset and DataLoader
# Prepare data
X = df[[text_col_1, text_col_1_with_prompt, text_col_2]][:200]
y = df[[target_col, bin_targets_col]][:200]

for seed in seeds:
    
    memory_cleanup()
    print(f'Starting for seed {str(seed)}...')
    catboost_preds = catboost_history[seed]['y_pred'][:40]

    combined_history[seed] = {}

    set_seed(seed)

    # Split train-test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    y_train_reg = y_train[target_col]
    y_test_reg = y_test[target_col]
    y_train_bin = y_train[bin_targets_col]
    y_test_bin = y_test[bin_targets_col]


    # fit-eval non-TSDAE-ed model
    # model = DualBERTWithMLP(config)
    model = MaskBinDualBERTWithMLP(config)
    tokenizer = model.get_tokenizer()
    model = torch.nn.DataParallel(model).to(device)
    # Loss Function
    # criterion = nn.MSELoss()  # For regression
    criterion = nn.HuberLoss()

    print(f'double_huber_multitask model...')
    # model, history = fit_eval(seed, model, X_train, X_test, y_train, y_test, catboost_preds, criterion, tokenizer, config)
    model, history = fit_eval_with_bins(
        seed,
        model,
        X_train,
        X_test,
        y_train_reg,
        y_test_reg,
        y_train_bin,
        y_test_bin,
        catboost_preds, criterion, tokenizer, config,
        text_col_1 = text_col_1_with_prompt,
    )
    memory_cleanup()

    combined_history[seed]['double_huber_multitask'] = history    

    # # further split train data into regression train and tsdae train data
    # X_train_tsdae, X_tsdae, y_train_tsdae, y_tsdae = train_test_split(X_train, y_train, test_size=0.01, random_state=seed)

    # # convert text_col_1 data into set of sentences and select 20-60 word sentences as a feature column:
    # # Create a DataFrame of unique sentences and their lengths for X_tsdae
    # unique_sentences = []
    # unique_sentence_lengths = []
    # for text in X_tsdae[text_col_1]:
    #     sentences, sentence_lengths = get_sentence_lengths(text)
    #     unique_sentences.extend(sentences)
    #     unique_sentence_lengths.extend(sentence_lengths)

    # unique_sentences_df = pd.DataFrame({
    #     'sentence': unique_sentences,
    #     'length': unique_sentence_lengths
    # })

    # unique_sentences = unique_sentences_df[(unique_sentences_df.length >= 10) & (unique_sentences_df.length <= 60)]['sentence']

    # # get array with features for each bert
    # train_sentences_array = [
    #     unique_sentences.tolist(),
    #     # X_tsdae[text_col_1].tolist(),
    #     X_tsdae[text_col_2].tolist(),
    # ]

    # berts_after_tsdae = []
    # for index, train_sentences in enumerate(train_sentences_array):
    #     memory_cleanup()
    #     berts_after_tsdae.append(train_tsdae_bert(model_name, train_sentences))
    # memory_cleanup()

    # tsdae_bert1, tsdae_bert2 = berts_after_tsdae

    # # Initialize the non-TSDAE-ed BERT models
    # model = TSDAEDualBERTWithMLP(config, tsdae_bert1, tsdae_bert2)
    # model = torch.nn.DataParallel(model).to(device)
    # # Loss Function
    # # criterion = nn.MSELoss()  # For regression
    # criterion = nn.HuberLoss()

    # print(f'tsdae model...')
    # model, history = fit_eval(seed, model, X_train_tsdae, X_test, y_train_tsdae, y_test, catboost_preds, criterion, tokenizer, config)

    # combined_history[seed]['double_huber_1p_sentences_tsdae'] = history

# save the history as pickle
with open('./models/combined_history_exp_3.pickle', 'wb') as handle:
    pickle.dump(combined_history, handle, protocol=pickle.HIGHEST_PROTOCOL)

Starting for seed 42...
double_huber_multitask model...
Creating the dataset...
Creating the dataset...
Starting training/eval loop...
Starting training...
3
32
tensor([83835, 83836, 83836, 83836, 83836, 83836, 83833, 83836, 83836, 83835,
        83835, 83835, 83834, 83836, 83836, 83835], device='cuda:0')
mask_embedding shape: torch.Size([11, 312])
bin_embedding shape: torch.Size([16, 312])
tensor([83835, 83836, 83836, 83836, 83837, 83837, 83836, 83836, 83836, 83833,
        83836, 83836, 83836, 83835, 83835, 83837], device='cuda:1')
mask_embedding shape: torch.Size([5, 312])
bin_embedding shape: torch.Size([16, 312])


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/ipykernel_23/2258028468.py", line 92, in forward
    similarity_loss = self.criterion_similarity(mask_embedding, bin_embedding, torch.ones(mask_embedding.size(0)))
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/loss.py", line 1299, in forward
    return F.cosine_embedding_loss(input1, input2, target, margin=self.margin, reduction=self.reduction)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/functional.py", line 3581, in cosine_embedding_loss
    return torch.cosine_embedding_loss(input1, input2, target, margin, reduction_enum)
RuntimeError: The size of tensor a (11) must match the size of tensor b (16) at non-singleton dimension 0


In [None]:
# Initialize an empty list to store the rows
rows = []

# Iterate through the dictionary
for seed, experiments in combined_history.items():
    for experiment_name, metrics in experiments.items():
        num_epochs = len(metrics['train_loss'])
        for epoch in range(num_epochs):
            row = {
                'random_seed': seed,
                'experiment_name': experiment_name,
                'epoch': epoch + 1,
                
                'train_loss': metrics['train_loss'][epoch],
                'test_loss': metrics['test_loss'][epoch],
                
                'train_r2': metrics['train_r2'][epoch],
                'test_r2': metrics['test_r2'][epoch],
                'test_r2_with_catboost': metrics['test_r2_with_catboost'][epoch]
                
                'train_rmse': metrics['train_rmse'][epoch],
                'test_rmse': metrics['test_rmse'][epoch],
                'test_rmse_with_catboost': metrics['test_rmse_with_catboost'][epoch]
            
                'train_mae': metrics['train_mae'][epoch],
                'test_mae': metrics['test_mae'][epoch],
                'test_mae_with_catboost': metrics['test_mae_with_catboost'][epoch]
            }
            rows.append(row)

# Convert the list of rows into a DataFrame
history = pd.DataFrame(rows)

# Display the DataFrame
print(history)

In [None]:
# Group by experiment name and epoch, then calculate mean and std
grouped_history = history.groupby(['experiment_name', 'epoch']).agg(
    train_loss_mean=('train_loss', 'mean'),
    train_loss_std=('train_loss', 'std'),
    test_loss_mean=('test_loss', 'mean'),
    test_loss_std=('test_loss', 'std'),
    train_r2_mean=('train_r2', 'mean'),
    train_r2_std=('train_r2', 'std'),
    test_r2_mean=('test_r2', 'mean'),
    test_r2_std=('test_r2', 'std'),
    test_r2_with_catboost_mean=('test_r2_with_catboost', 'mean'),
    test_r2_with_catboost_std=('test_r2_with_catboost', 'std')
).reset_index()

# Display the grouped DataFrame
print("\nGrouped DataFrame with Mean and Std:")
print(grouped_history)

In [None]:
# model_name = "cointegrated/LaBSE-en-ru"


# Dataset and DataLoader
# Prepare data
# text_col_1 = 'description_no_numbers_v2_with_prompt' # for mask pooling
# text_col_2 = 'title_company_location_skills_source_with_prompt' # for mask pooling

# X = df[[text_col_1, text_col_2]]
# y = df['log_salary_from']

# # Split # already done in previous cell
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)


# Initialize the model
# model = SingleBERTWithMLP(hidden_size, mlp_hidden_size)


# word_embedding_model1 = models.Transformer(model_name)
# pooling_model1 = models.Pooling(word_embedding_model1.get_word_embedding_dimension(), "cls")
# bert1 = SentenceTransformer(modules=[word_embedding_model1, pooling_model1])

# word_embedding_model2 = models.Transformer(model_name)
# pooling_model2 = models.Pooling(word_embedding_model2.get_word_embedding_dimension(), "cls")
# bert2 = SentenceTransformer(modules=[word_embedding_model2, pooling_model2])


# # Save the trained model
# torch.save(model.state_dict(), "./models/final_model.pth")