# Fine-tuning transformer + MLP head

### Getting the data

#### Train/test data

In [1]:
!mkdir data
!mkdir models
!wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1_o4xDSF6j95vAiYdd97VavyWq4EHHPdP&export=download&authuser=1&confirm=t' -O './data/dataset.csv'

import pandas as pd

df = pd.read_csv('./data/dataset.csv')
df.info()

--2024-12-16 09:01:06--  https://drive.usercontent.google.com/download?id=1_o4xDSF6j95vAiYdd97VavyWq4EHHPdP&export=download&authuser=1&confirm=t
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 74.125.202.132, 2607:f8b0:4001:c06::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|74.125.202.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 295792946 (282M) [application/octet-stream]
Saving to: './data/dataset.csv'


2024-12-16 09:01:10 (153 MB/s) - './data/dataset.csv' saved [295792946/295792946]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22224 entries, 0 to 22223
Data columns (total 15 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   title                                 22224 non-null  object 
 1   company                               22224 non-null  object 
 2   location                             

#### Catboost predictions

In [2]:
!wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1gbA1owa-9aTxTaGABpTffGvE8y31qHze&export=download&authuser=1&confirm=t' -O './data/best_catboost_model_train_eval_history.pickle'

import pickle

file = './data/best_catboost_model_train_eval_history.pickle'
with open(file, 'rb') as f:
    catboost_history = pickle.load(f)

--2024-12-16 09:01:17--  https://drive.usercontent.google.com/download?id=1gbA1owa-9aTxTaGABpTffGvE8y31qHze&export=download&authuser=1&confirm=t
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 74.125.202.132, 2607:f8b0:4001:c06::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|74.125.202.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 428138 (418K) [application/octet-stream]
Saving to: './data/best_catboost_model_train_eval_history.pickle'


2024-12-16 09:01:19 (70.0 MB/s) - './data/best_catboost_model_train_eval_history.pickle' saved [428138/428138]



### Extra dependencies

In [3]:
%pip install -U sentence-transformers -qqq

Note: you may need to restart the kernel to use updated packages.


### Service functions

In [None]:
import torch
from torch.utils.data import DataLoader
import numpy as np
import gc
import re
import os
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from numba import cuda


os.environ["WANDB_DISABLED"] = "true"


def memory_cleanup():
    "Clean up memory"
    gc.collect()
    torch.cuda.empty_cache()


def get_sentence_lengths(text):
    "Get number of words in each sentence in the text"
    # pattern = r'(?<=[.!?])\s+'
    pattern = r'(?<=[.!?])'
    sentences = re.split(pattern, text)
    # remove empty strings
    sentences = [sentence for sentence in sentences if len(sentence) > 0]
    # get number of words in each sentence
    sentence_lengths = [len(sentence.split()) for sentence in sentences]
    return sentences, sentence_lengths


def set_seed(seed: int) -> None:
    "Set seed for reproducibility"
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def plot(history):
    "Plot training and validation metrics, skip the first epoch"
    # Plot Loss
    plt.plot(range(2, len(history["train_loss"]) + 1), history["train_loss"][1:], label="Train Loss")
    plt.plot(range(2, len(history["test_loss"]) + 1), history["test_loss"][1:], label="Test Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training and Test Loss Over Epochs 2-...")
    plt.legend()
    plt.show()
    
    # Plot R2 Score
    plt.plot(range(2, len(history["train_r2"]) + 1), history["train_r2"][1:], label="Train R2")
    plt.plot(range(2, len(history["test_r2"]) + 1), history["test_r2"][1:], label="Test R2")
    plt.xlabel("Epoch")
    plt.ylabel("R2 Score")
    plt.title("Train/Test R2 Score Over Epochs 2-...")
    plt.legend()
    plt.show()


def train_tsdae_bert(model_name, train_sentences):
    """Train a denoising auto-encoder model with BERT model.
    more examples at https://sbert.net/examples/unsupervised_learning/TSDAE/README.html"""
    word_embedding_model = models.Transformer(model_name)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Create the special denoising dataset that adds noise on-the-fly
    train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
    
    # DataLoader to batch your data
    train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
    
    # Use the denoising auto-encoder loss
    train_loss = losses.DenoisingAutoEncoderLoss(
        model, decoder_name_or_path=model_name, tie_encoder_decoder=True,
    )
    
    # Call the fit method
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,
        weight_decay=0,
        scheduler="constantlr",
        optimizer_params={"lr": 3e-5},
        show_progress_bar=True,
    )
    
    return model

#### Traning-related classes

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoModel

# Dataset for dual textual features
class DualTextDataset(Dataset):
    def __init__(self, df, text_col_1, text_col_2, targets, tokenizer, max_len):
        print('Creating the dataset...')
        # Pre-tokenize and store inputs
        self.tokenized_texts1 = tokenizer(df[text_col_1].tolist(), max_length=max_len, padding="max_length", truncation=True, return_tensors="pt")
        self.tokenized_texts2 = tokenizer(df[text_col_2].tolist(), max_length=max_len, padding="max_length", truncation=True, return_tensors="pt")
        self.targets = targets.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        # Return only the slice for idx
        inputs1 = {key: val[idx] for key, val in self.tokenized_texts1.items()}
        inputs2 = {key: val[idx] for key, val in self.tokenized_texts2.items()}
        target = torch.tensor(self.targets[idx], dtype=torch.float)
        return inputs1, inputs2, target


# single bert
class SingleBERTWithMLP(nn.Module):
    def __init__(self, hidden_size, mlp_hidden_size):
        super(SingleBERTWithMLP, self).__init__()
        # Initialize a single BERT model
        self.bert = AutoModel.from_pretrained(model_name)

        # Define MLP head
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),  # Double hidden size for concatenation
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1)  # Regression output
        )

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        # Pass both inputs through the same BERT model
        cls1 = self.bert(input_ids=input1, attention_mask=attention_mask1).last_hidden_state[:, 0, :]  # CLS token for input1
        cls2 = self.bert(input_ids=input2, attention_mask=attention_mask2).last_hidden_state[:, 0, :]  # CLS token for input2

        # Concatenate CLS embeddings
        combined_cls = torch.cat([cls1, cls2], dim=-1)  # Shape: [batch_size, 2 * hidden_size]

        # Pass through MLP head
        output = self.mlp(combined_cls)
        return output


# Define the dual BERT model with an MLP head
class DualBERTWithMLP(nn.Module):
    def __init__(self, config):
        super(DualBERTWithMLP, self).__init__()
        # Initialize two independent BERT models
        model_name = config['model_name']
        self.bert1 = AutoModel.from_pretrained(model_name)
        self.bert2 = AutoModel.from_pretrained(model_name)

        # Define MLP head
        hidden_size = config['hidden_size']
        mlp_hidden_size = config['mlp_hidden_size']
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1)  # Regression output
        )

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        # Forward pass through BERT1
        cls1 = self.bert1(input_ids=input1, attention_mask=attention_mask1).last_hidden_state[:, 0, :]  # CLS token
        # mask1 = self.bert1(input_ids=input1, attention_mask=attention_mask1).last_hidden_state[:, 23, :]  # mask token 

        # Forward pass through BERT2
        cls2 = self.bert2(input_ids=input2, attention_mask=attention_mask2).last_hidden_state[:, 0, :]  # CLS token
        # mask2 = self.bert2(input_ids=input2, attention_mask=attention_mask2).last_hidden_state[:, 23, :]  # mask token

        # Concatenate CLS embeddings
        combined_cls = torch.cat([cls1, cls2], dim=-1)  # Shape: [batch_size, 2 * hidden_size]
        # concat mask embeddings
        # combined_mask = torch.cat([mask1, mask2], dim=-1)  # Shape: [batch_size, 2 * hidden_size]

        # Pass through MLP head
        output = self.mlp(combined_cls)
        # output = self.mlp(combined_mask)
        return output

# Define the dual BERT model with an MLP head and MASK pooling
class MASKPoolDualBERTWithMLP(nn.Module):
    def __init__(self, hidden_size, mlp_hidden_size):
        super(MASKPoolDualBERTWithMLP, self).__init__()
        # Initialize two independent BERT models
        self.bert1 = AutoModel.from_pretrained(model_name)
        self.bert2 = AutoModel.from_pretrained(model_name)

        # Define MLP head
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1)  # Regression output
        )

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        # Forward pass through BERT1
        mask1 = self.bert1(input_ids=input1, attention_mask=attention_mask1).last_hidden_state[:, 23, :]  # mask token 

        # Forward pass through BERT2
        mask2 = self.bert2(input_ids=input2, attention_mask=attention_mask2).last_hidden_state[:, 23, :]  # mask token

        # concat mask embeddings
        combined_mask = torch.cat([mask1, mask2], dim=-1)  # Shape: [batch_size, 2 * hidden_size]

        # Pass through MLP head
        output = self.mlp(combined_mask)
        return output

# Define the dual BERT model with an MLP head
class TSDAEDualBERTWithMLP(nn.Module):
    def __init__(self, config, bert1, bert2):
        super(TSDAEDualBERTWithMLP, self).__init__()
        # Load TSDAE-ed BERT models
        self.bert1 = bert1
        self.bert2 = bert2

        # Define MLP head
        hidden_size = config['hidden_size']
        mlp_hidden_size = config['mlp_hidden_size']
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1)  # Regression output
        )

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        # idea from: https://github.com/UKPLab/sentence-transformers/issues/2494
        # Forward pass through BERT1
        input_dict1 = {
            'input_ids': input1,
            'attention_mask': attention_mask1
        }
        cls1 = self.bert1(input_dict1)['sentence_embedding']
        
        # Forward pass through BERT2
        input_dict2 = {
            'input_ids': input2,
            'attention_mask': attention_mask2
        }
        cls2 = self.bert2(input_dict2)['sentence_embedding']

        # Concatenate CLS embeddings
        combined_cls = torch.cat([cls1, cls2], dim=-1)  # Shape: [batch_size, 2 * hidden_size]

        # Pass through MLP head
        output = self.mlp(combined_cls)
        return output

In [6]:
class DualTextDatasetWithBins(Dataset):
    def __init__(self, df, text_col_1, text_col_2, bin_targets_col, targets, tokenizer, max_len):
        print('Creating the dataset...')
        # Pre-tokenize and store inputs
        self.tokenized_texts1 = tokenizer(df[text_col_1].tolist(), max_length=max_len, padding="max_length", truncation=True, return_tensors="pt")
        self.tokenized_texts2 = tokenizer(df[text_col_2].tolist(), max_length=max_len, padding="max_length", truncation=True, return_tensors="pt")

        # Targets
        self.targets = targets.tolist()  # Log salary target for regression
        self.bin_targets = df[bin_targets_col].tolist()  # Bin IDs for mask token prediction
        
        # Get the bin token ID offset from the tokenizer's vocabulary
        self.bin_token_offset = tokenizer.convert_tokens_to_ids("[BIN_0]")  # This will give the token ID of the first bin token
        self.bin_to_id_mapping = {i: torch.tensor(i + self.bin_token_offset, dtype=torch.long) for i in range(15)}
        
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        # Return only the slice for idx
        inputs1 = {key: val[idx] for key, val in self.tokenized_texts1.items()}
        inputs2 = {key: val[idx] for key, val in self.tokenized_texts2.items()}
        target = torch.tensor(self.targets[idx], dtype=torch.float)
        
        # Map the bin_targets (0 to 14) to the corresponding token IDs
        # bin_target = torch.tensor(self.bin_targets[idx] + self.bin_token_offset, dtype=torch.long)  # Add offset
        bin_target = self.bin_to_id_mapping[self.bin_targets[idx]]
        
        return inputs1, inputs2, target, bin_target


# Define the dual BERT model with an MLP head
class MaskBinDualBERTWithMLP(nn.Module):
    def __init__(self, config):
        super(MaskBinDualBERTWithMLP, self).__init__()

        # Initialize two independent BERT models
        model_name = config['model_name']
        self.bert1 = AutoModel.from_pretrained(model_name)
        self.bert2 = AutoModel.from_pretrained(model_name)

        # Add new bin tokens to the tokenizer and resize the model embeddings
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        new_tokens = ["[BIN_0]", "[BIN_1]", "[BIN_2]", "[BIN_3]", "[BIN_4]", "[BIN_5]", "[BIN_6]", "[BIN_7]",
                      "[BIN_8]", "[BIN_9]", "[BIN_10]", "[BIN_11]", "[BIN_12]", "[BIN_13]", "[BIN_14]"]
        self.tokenizer.add_tokens(new_tokens)
        self.bert1.resize_token_embeddings(len(self.tokenizer))  # Resize the model to accommodate new tokens
        self.mask_token_index = config['mask_token_index']

        # Define MLP head for regression
        hidden_size = config['hidden_size']
        mlp_hidden_size = config['mlp_hidden_size']
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1)  # Regression output
        )

        # CosineEmbeddingLoss setup for MASK token prediction
        self.criterion_similarity = nn.CosineEmbeddingLoss(margin=0.0)

    def get_tokenizer(self):
        return self.tokenizer


    def forward(self, input1, attention_mask1, input2, attention_mask2, bin_ids=None):
        # Forward pass through BERT1 (for MASK token prediction and bin prediction)
        outputs1 = self.bert1(input_ids=input1, attention_mask=attention_mask1)
        cls1 = outputs1.last_hidden_state[:, 0, :]  # CLS token
        mask_idx = (input2 == self.mask_token_index).nonzero(as_tuple=True)  # Assume [MASK] token is at index self.mask_token_index
        mask_embedding = outputs1.last_hidden_state[mask_idx]  # Mask token embedding

        # Forward pass through BERT2 (for CLS token regression)
        cls2 = self.bert2(input_ids=input2, attention_mask=attention_mask2).last_hidden_state[:, 0, :]  # CLS token
        
        # Concatenate CLS tokens from BERT1 and BERT2 and pass through MLP for regression output
        combined_cls = torch.cat([cls1, cls2], dim=-1)  # Concatenate CLS embeddings
        
        # Pass through MLP head for salary regression
        salary_output = self.mlp(combined_cls)
        
        # If bin_ids are provided, compute cosine similarity between MASK token and bin embeddings
        if bin_ids is not None:
            # Retrieve bin embeddings from the BERT model's embedding layer
            print(bin_ids)
            bin_embedding = self.bert1.embeddings.word_embeddings(bin_ids)  # [batch_size, hidden_size]
            print(f"mask_embedding shape: {mask_embedding.shape}")
            print(f"bin_embedding shape: {bin_embedding.shape}")
            similarity_loss = self.criterion_similarity(mask_embedding, bin_embedding, torch.ones(mask_embedding.size(0)))
            return salary_output, similarity_loss
        
        return salary_output, None  # For inference, return salary output only

#### Training method

In [17]:
from torch.utils.data import DataLoader
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


def fit_eval(
    seed,
    model,
    X_train,
    X_test,
    y_train,
    y_test,
    catboost_preds,
    criterion,
    tokenizer,
    config,
    text_col_1 = 'description_no_numbers_v2',
    text_col_2 = 'title_company_location_skills_source',
):
    set_seed(seed)
    
    # Memory cleanup
    memory_cleanup()

    # Unpack config
    learning_rate = config["learning_rate"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    seq_length = config["seq_length"]

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Make datasets
    train_dataset = DualTextDataset(X_train, text_col_1, text_col_2, y_train, tokenizer, seq_length)
    test_dataset = DualTextDataset(X_test, text_col_1, text_col_2, y_test, tokenizer, seq_length)
    # Make dataloaders
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Training and Evaluation Loop
    history = {
                "train_loss": [],
               "test_loss": [], 
                "train_rmse": [],
               "test_rmse": [], 
               "test_rmse_with_catboost": [],
               "train_r2": [],
               "test_r2": [],
               "test_r2_with_catboost": [],
               "train_mae": [],
               "test_mae": [],
               "test_mae_with_catboost": [],
               "max_test_loss": float('inf'),
               "max_test_r2": float('-inf'),
               "max_test_mae": float('inf'),
               "best_preds": []
               }
    
    # test_labels = y_test
    # test_preds = []
    print('Starting training/eval loop...')
    for epoch in range(num_epochs):
        print('Starting training...')
        # Training Phase
        model.train()
        train_losses = []
        all_preds = []
        all_labels = []
        for batch in train_dataloader:
            inputs1, inputs2, targets = batch
            input1 = inputs1["input_ids"].squeeze(1).to(device)
            attention_mask1 = inputs1["attention_mask"].squeeze(1).to(device)
            input2 = inputs2["input_ids"].squeeze(1).to(device)
            attention_mask2 = inputs2["attention_mask"].squeeze(1).to(device)
            targets = targets.to(device)
    
            optimizer.zero_grad()
            outputs = model(input1, attention_mask1, input2, attention_mask2)
            outputs = outputs.flatten()
            # loss = criterion(outputs.squeeze(), targets)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
            # all_preds.extend(outputs.squeeze().cpu().detach().numpy())
            all_preds.extend(outputs.cpu().detach().numpy())
            all_labels.extend(targets.cpu().numpy())
    
        train_loss = np.mean(train_losses)
        
        train_r2 = r2_score(all_labels, all_preds)

        train_rmse = mean_squared_error(all_labels, all_preds, squared=False)
        
        train_mae = mean_absolute_error(all_labels, all_preds)
        
        history["train_loss"].append(train_loss)
        history["train_r2"].append(train_r2)
        history["train_rmse"].append(train_rmse)
        history["train_mae"].append(train_mae)
    
        # Evaluation Phase
        print('Epoch done, evaluating...')
        model.eval()
        test_losses = []
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch in test_dataloader:
                inputs1, inputs2, targets = batch
                input1 = inputs1["input_ids"].squeeze(1).to(device)
                attention_mask1 = inputs1["attention_mask"].squeeze(1).to(device)
                input2 = inputs2["input_ids"].squeeze(1).to(device)
                attention_mask2 = inputs2["attention_mask"].squeeze(1).to(device)
                targets = targets.to(device)
    
                outputs = model(input1, attention_mask1, input2, attention_mask2)
                outputs = outputs.flatten()
                # loss = criterion(outputs.squeeze(), targets)
                loss = criterion(outputs, targets)
                test_losses.append(loss.item())
                # all_preds.extend(outputs.squeeze().cpu().numpy())
                all_preds.extend(outputs.cpu().numpy())
                all_labels.extend(targets.cpu().numpy())
                
        test_loss = np.mean(test_losses)

        test_r2 = r2_score(all_labels, all_preds)
        test_r2_with_catboost = r2_score(all_labels, (np.array(all_preds) + catboost_preds) / 2)
        
        test_rmse = mean_squared_error(all_labels, all_preds, squared=False)
        test_rmse_with_catboost = mean_squared_error(all_labels, (np.array(all_preds) + catboost_preds) / 2, squared=False)
        
        test_mae = mean_absolute_error(all_labels, all_preds)
        test_mae_with_catboost = mean_absolute_error(all_labels, (np.array(all_preds) + catboost_preds) / 2)
    
        history["test_loss"].append(test_loss)
        
        history["test_r2"].append(test_r2)
        history["test_r2_with_catboost"].append(test_r2_with_catboost)
        
        history["test_rmse"].append(test_rmse)
        history["test_rmse_with_catboost"].append(test_rmse_with_catboost)
        
        history["test_mae"].append(test_mae)
        history["test_mae_with_catboost"].append(test_mae_with_catboost)
        
        if test_r2 > history["max_test_r2"]:
            history["max_test_r2"] = test_r2
            history["best_preds"] = all_preds
    
        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, "
              f"Train R2: {train_r2:.4f}, Test R2: {test_r2:.4f}, Test R2 with catboost: {test_r2_with_catboost:.4f}")

        print(f"Epoch {epoch + 1}/{num_epochs}, Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}, "
              f"Test MAE with catboost: {test_mae_with_catboost:.4f}")

    return model, history


def fit_eval_with_bins(
    seed,
    model,
    X_train,
    X_test,
    y_train,
    y_test,
    bin_targets_col,
    catboost_preds,
    criterion,
    tokenizer,
    config,
    text_col_1 = 'description_no_numbers_v2',
    text_col_2 = 'title_company_location_skills_source',
):
    set_seed(seed)
    
    # Memory cleanup
    memory_cleanup()

    # Unpack config
    learning_rate = config["learning_rate"]
    num_epochs = config["num_epochs"]
    batch_size = config["batch_size"]
    seq_length = config["seq_length"]

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Make datasets
    train_dataset = DualTextDatasetWithBins(X_train, text_col_1, text_col_2, bin_targets_col, y_train, tokenizer, seq_length)
    test_dataset = DualTextDatasetWithBins(X_test, text_col_1, text_col_2, bin_targets_col, y_test, tokenizer, seq_length)
    # Make dataloaders
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Training and Evaluation Loop
    history = {
                "train_loss": [],
               "test_loss": [], 
                "train_rmse": [],
               "test_rmse": [], 
               "test_rmse_with_catboost": [],
               "train_r2": [],
               "test_r2": [],
               "test_r2_with_catboost": [],
               "train_mae": [],
               "test_mae": [],
               "test_mae_with_catboost": [],
               "max_test_loss": float('inf'),
               "max_test_r2": float('-inf'),
               "max_test_mae": float('inf'),
               "best_preds": []
               }
    
    # test_labels = y_test
    # test_preds = []
    print('Starting training/eval loop...')
    for epoch in range(num_epochs):
        print('Starting training...')
        # Training Phase
        model.train()
        train_losses = []
        all_preds = []
        all_labels = []
        for batch in train_dataloader:
            inputs1, inputs2, targets, bin_ids = batch  # Now bin_ids are included in the dataset
            print(len(inputs1))
            print(len(bin_ids))
            input1 = inputs1["input_ids"].squeeze(1).to(device)
            attention_mask1 = inputs1["attention_mask"].squeeze(1).to(device)
            input2 = inputs2["input_ids"].squeeze(1).to(device)
            attention_mask2 = inputs2["attention_mask"].squeeze(1).to(device)
            targets = targets.to(device)
            bin_ids = bin_ids.to(device)  # Ensure bin_ids are moved to the same device
    
            optimizer.zero_grad()
            # Forward pass: include bin_ids
            salary_output, similarity_loss = model(input1, attention_mask1, input2, attention_mask2, bin_ids)
            loss = criterion(salary_output, targets) + similarity_loss  # Combine regression loss and similarity loss
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
            all_preds.extend(salary_output.cpu().detach().numpy())
            all_labels.extend(targets.cpu().numpy())
            
            # outputs = model(input1, attention_mask1, input2, attention_mask2)
            # outputs = outputs.flatten()
            # # loss = criterion(outputs.squeeze(), targets)
            # loss = criterion(outputs, targets)
            # loss.backward()
            # optimizer.step()
            # train_losses.append(loss.item())
            # # all_preds.extend(outputs.squeeze().cpu().detach().numpy())
            # all_preds.extend(outputs.cpu().detach().numpy())
            # all_labels.extend(targets.cpu().numpy())
    
        train_loss = np.mean(train_losses)
        
        train_r2 = r2_score(all_labels, all_preds)

        train_rmse = mean_squared_error(all_labels, all_preds, squared=False)
        
        train_mae = mean_absolute_error(all_labels, all_preds)
        
        history["train_loss"].append(train_loss)
        history["train_r2"].append(train_r2)
        history["train_rmse"].append(train_rmse)
        history["train_mae"].append(train_mae)
    
        # Evaluation Phase
        print('Epoch done, evaluating...')
        model.eval()
        test_losses = []
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch in test_dataloader:
                inputs1, inputs2, targets, bin_ids = batch  # Get bin_ids during evaluation
                input1 = inputs1["input_ids"].squeeze(1).to(device)
                attention_mask1 = inputs1["attention_mask"].squeeze(1).to(device)
                input2 = inputs2["input_ids"].squeeze(1).to(device)
                attention_mask2 = inputs2["attention_mask"].squeeze(1).to(device)
                targets = targets.to(device)
                bin_ids = bin_ids.to(device)  # Ensure bin_ids are moved to the same device
    
                # Forward pass: include bin_ids
                salary_output, similarity_loss = model(input1, attention_mask1, input2, attention_mask2, bin_ids)
                loss = criterion(salary_output, targets) + similarity_loss  # Combine losses
                test_losses.append(loss.item())
                all_preds.extend(salary_output.cpu().numpy())
                all_labels.extend(targets.cpu().numpy())
                
        test_loss = np.mean(test_losses)

        test_r2 = r2_score(all_labels, all_preds)
        test_r2_with_catboost = r2_score(all_labels, (np.array(all_preds) + catboost_preds) / 2)
        
        test_rmse = mean_squared_error(all_labels, all_preds, squared=False)
        test_rmse_with_catboost = mean_squared_error(all_labels, (np.array(all_preds) + catboost_preds) / 2, squared=False)
        
        test_mae = mean_absolute_error(all_labels, all_preds)
        test_mae_with_catboost = mean_absolute_error(all_labels, (np.array(all_preds) + catboost_preds) / 2)
    
        history["test_loss"].append(test_loss)
        
        history["test_r2"].append(test_r2)
        history["test_r2_with_catboost"].append(test_r2_with_catboost)
        
        history["test_rmse"].append(test_rmse)
        history["test_rmse_with_catboost"].append(test_rmse_with_catboost)
        
        history["test_mae"].append(test_mae)
        history["test_mae_with_catboost"].append(test_mae_with_catboost)
        
        if test_r2 > history["max_test_r2"]:
            history["max_test_r2"] = test_r2
            history["best_preds"] = all_preds
    
        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, "
              f"Train R2: {train_r2:.4f}, Test R2: {test_r2:.4f}, Test R2 with catboost: {test_r2_with_catboost:.4f}")

        print(f"Epoch {epoch + 1}/{num_epochs}, Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}, "
              f"Test MAE with catboost: {test_mae_with_catboost:.4f}")

    return model, history

#### Training-eval loop with experiments

In [18]:
memory_cleanup()

In [19]:
# import logging
# Set the logging level for the transformers library to ERROR
# logging.getLogger("transformers").setLevel(logging.ERROR)
# logging.getLogger("transformers").setLevel(logging.WARNING)

import torch.nn as nn
import pickle
import warnings
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split


# Suppress all FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)


seeds = [42, 78687, 123123]
combined_history = {}

# Hyperparameters
# seq_length = 512
# hidden_size = 768  # BERT base hidden size
# mlp_hidden_size = 256

config = {
    "model_name": "sergeyzh/rubert-tiny-turbo",
    "batch_size": 32,
    "seq_length": 1024,
    "hidden_size": 312,
    "mlp_hidden_size": 128,
    "num_epochs": 10,
    # "num_epochs": 2,
    "learning_rate": 5e-6,
    "mask_token_index": 17,
    
}

memory_cleanup()
model_name = config['model_name']
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset and DataLoader
# Prepare data

# Compute bin edges for 15 equal bins
num_bins = 15
bin_edges = np.linspace(
    df["log_salary_from"].min(), df["log_salary_from"].max(), num_bins + 1
)

# Assign bins to a new column
df["salary_bin"] = np.digitize(df["log_salary_from"], bins=bin_edges, right=True) - 1

# Ensure bin labels are within range
df["salary_bin"] = df["salary_bin"].clip(0, num_bins - 1)

prompt = """\
[CLS] Далее указано описание вакансии. \
Судя по описанию, зарплата на этой позиции составляет [MASK].[SEP]\
"""

df['description_no_numbers_v2_with_prompt'] = prompt + df['description_no_numbers_v2']


text_col_1_with_prompt = 'description_no_numbers_v2_with_prompt'
text_col_1 = 'description_no_numbers_v2'
text_col_2 = 'title_company_location_skills_source'
bin_targets_col = 'salary_bin'


X = df[[text_col_1, text_col_1_with_prompt, text_col_2, bin_targets_col]][:200]
y = df['log_salary_from'][:200]

for seed in seeds:
    
    memory_cleanup()
    print(f'Starting for seed {str(seed)}...')
    catboost_preds = catboost_history[seed]['y_pred'][:40]

    combined_history[seed] = {}

    set_seed(seed)

    # Split train-test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    # fit-eval non-TSDAE-ed model
    # model = DualBERTWithMLP(config)
    model = MaskBinDualBERTWithMLP(config)
    tokenizer = model.get_tokenizer()
    model = torch.nn.DataParallel(model).to(device)
    # Loss Function
    # criterion = nn.MSELoss()  # For regression
    criterion = nn.HuberLoss()

    print(f'double_huber_multitask model...')
    # model, history = fit_eval(seed, model, X_train, X_test, y_train, y_test, catboost_preds, criterion, tokenizer, config)
    model, history = fit_eval_with_bins(
        seed, model, X_train, X_test, y_train, y_test, bin_targets_col, catboost_preds, criterion, tokenizer, config,
        text_col_1 = text_col_1_with_prompt,
    )
    memory_cleanup()

    combined_history[seed]['double_huber_multitask'] = history    

    # # further split train data into regression train and tsdae train data
    # X_train_tsdae, X_tsdae, y_train_tsdae, y_tsdae = train_test_split(X_train, y_train, test_size=0.01, random_state=seed)

    # # convert text_col_1 data into set of sentences and select 20-60 word sentences as a feature column:
    # # Create a DataFrame of unique sentences and their lengths for X_tsdae
    # unique_sentences = []
    # unique_sentence_lengths = []
    # for text in X_tsdae[text_col_1]:
    #     sentences, sentence_lengths = get_sentence_lengths(text)
    #     unique_sentences.extend(sentences)
    #     unique_sentence_lengths.extend(sentence_lengths)

    # unique_sentences_df = pd.DataFrame({
    #     'sentence': unique_sentences,
    #     'length': unique_sentence_lengths
    # })

    # unique_sentences = unique_sentences_df[(unique_sentences_df.length >= 10) & (unique_sentences_df.length <= 60)]['sentence']

    # # get array with features for each bert
    # train_sentences_array = [
    #     unique_sentences.tolist(),
    #     # X_tsdae[text_col_1].tolist(),
    #     X_tsdae[text_col_2].tolist(),
    # ]

    # berts_after_tsdae = []
    # for index, train_sentences in enumerate(train_sentences_array):
    #     memory_cleanup()
    #     berts_after_tsdae.append(train_tsdae_bert(model_name, train_sentences))
    # memory_cleanup()

    # tsdae_bert1, tsdae_bert2 = berts_after_tsdae

    # # Initialize the non-TSDAE-ed BERT models
    # model = TSDAEDualBERTWithMLP(config, tsdae_bert1, tsdae_bert2)
    # model = torch.nn.DataParallel(model).to(device)
    # # Loss Function
    # # criterion = nn.MSELoss()  # For regression
    # criterion = nn.HuberLoss()

    # print(f'tsdae model...')
    # model, history = fit_eval(seed, model, X_train_tsdae, X_test, y_train_tsdae, y_test, catboost_preds, criterion, tokenizer, config)

    # combined_history[seed]['double_huber_1p_sentences_tsdae'] = history

# save the history as pickle
with open('./models/combined_history_exp_3.pickle', 'wb') as handle:
    pickle.dump(combined_history, handle, protocol=pickle.HIGHEST_PROTOCOL)

Starting for seed 42...
double_huber_multitask model...
Creating the dataset...
Creating the dataset...
Starting training/eval loop...
Starting training...
3
32
tensor([83835, 83836, 83836, 83836, 83836, 83836, 83833, 83836, 83836, 83835,
        83835, 83835, 83834, 83836, 83836, 83835], device='cuda:0')
mask_embedding shape: torch.Size([11, 312])
bin_embedding shape: torch.Size([16, 312])
tensor([83835, 83836, 83836, 83836, 83837, 83837, 83836, 83836, 83836, 83833,
        83836, 83836, 83836, 83835, 83835, 83837], device='cuda:1')
mask_embedding shape: torch.Size([5, 312])
bin_embedding shape: torch.Size([16, 312])


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/ipykernel_23/2258028468.py", line 92, in forward
    similarity_loss = self.criterion_similarity(mask_embedding, bin_embedding, torch.ones(mask_embedding.size(0)))
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/loss.py", line 1299, in forward
    return F.cosine_embedding_loss(input1, input2, target, margin=self.margin, reduction=self.reduction)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/functional.py", line 3581, in cosine_embedding_loss
    return torch.cosine_embedding_loss(input1, input2, target, margin, reduction_enum)
RuntimeError: The size of tensor a (11) must match the size of tensor b (16) at non-singleton dimension 0


In [None]:
# Initialize an empty list to store the rows
rows = []

# Iterate through the dictionary
for seed, experiments in combined_history.items():
    for experiment_name, metrics in experiments.items():
        num_epochs = len(metrics['train_loss'])
        for epoch in range(num_epochs):
            row = {
                'random_seed': seed,
                'experiment_name': experiment_name,
                'epoch': epoch + 1,
                
                'train_loss': metrics['train_loss'][epoch],
                'test_loss': metrics['test_loss'][epoch],
                
                'train_r2': metrics['train_r2'][epoch],
                'test_r2': metrics['test_r2'][epoch],
                'test_r2_with_catboost': metrics['test_r2_with_catboost'][epoch]
                
                'train_rmse': metrics['train_rmse'][epoch],
                'test_rmse': metrics['test_rmse'][epoch],
                'test_rmse_with_catboost': metrics['test_rmse_with_catboost'][epoch]
            
                'train_mae': metrics['train_mae'][epoch],
                'test_mae': metrics['test_mae'][epoch],
                'test_mae_with_catboost': metrics['test_mae_with_catboost'][epoch]
            }
            rows.append(row)

# Convert the list of rows into a DataFrame
history = pd.DataFrame(rows)

# Display the DataFrame
print(history)

In [None]:
# Group by experiment name and epoch, then calculate mean and std
grouped_history = history.groupby(['experiment_name', 'epoch']).agg(
    train_loss_mean=('train_loss', 'mean'),
    train_loss_std=('train_loss', 'std'),
    test_loss_mean=('test_loss', 'mean'),
    test_loss_std=('test_loss', 'std'),
    train_r2_mean=('train_r2', 'mean'),
    train_r2_std=('train_r2', 'std'),
    test_r2_mean=('test_r2', 'mean'),
    test_r2_std=('test_r2', 'std'),
    test_r2_with_catboost_mean=('test_r2_with_catboost', 'mean'),
    test_r2_with_catboost_std=('test_r2_with_catboost', 'std')
).reset_index()

# Display the grouped DataFrame
print("\nGrouped DataFrame with Mean and Std:")
print(grouped_history)

In [None]:
# model_name = "cointegrated/LaBSE-en-ru"


# Dataset and DataLoader
# Prepare data
# text_col_1 = 'description_no_numbers_v2_with_prompt' # for mask pooling
# text_col_2 = 'title_company_location_skills_source_with_prompt' # for mask pooling

# X = df[[text_col_1, text_col_2]]
# y = df['log_salary_from']

# # Split # already done in previous cell
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)


# Initialize the model
# model = SingleBERTWithMLP(hidden_size, mlp_hidden_size)


# word_embedding_model1 = models.Transformer(model_name)
# pooling_model1 = models.Pooling(word_embedding_model1.get_word_embedding_dimension(), "cls")
# bert1 = SentenceTransformer(modules=[word_embedding_model1, pooling_model1])

# word_embedding_model2 = models.Transformer(model_name)
# pooling_model2 = models.Pooling(word_embedding_model2.get_word_embedding_dimension(), "cls")
# bert2 = SentenceTransformer(modules=[word_embedding_model2, pooling_model2])


# # Save the trained model
# torch.save(model.state_dict(), "./models/final_model.pth")