# Fine-tuning transformer + MLP head

In [None]:
!mkdir data
!mkdir models
!wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1_o4xDSF6j95vAiYdd97VavyWq4EHHPdP&export=download&authuser=1&confirm=t' -O './data/dataset.csv'

import pandas as pd

df = pd.read_csv('./data/dataset.csv')
df.info()

In [None]:
# Download a NumPy array file into ./data/catboost_preds.npy
# (presumably predictions of a separately trained CatBoost model — confirm with the author)
!wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1Hysy7OYhugP0lrvn7sCfckNk0AUx2N69&export=download&authuser=1&confirm=t' -O './data/catboost_preds.npy'

import numpy as np

# Load the array; note the experiment loop later rebinds `catboost_preds` to zeros
catboost_preds = np.load('./data/catboost_preds.npy')

Code example taken from: https://sbert.net/examples/unsupervised_learning/TSDAE/README.html

In [None]:
# Install/upgrade sentence-transformers (quiet output); the next cell imports from it
%pip install -U sentence-transformers -qqq

#### Service functions

In [None]:
import torch
from torch.utils.data import DataLoader
import numpy as np
import gc
import re
import os
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses


# Presumably stops the Hugging Face / sentence-transformers training code from
# starting a Weights & Biases run — TODO confirm against the installed versions
os.environ["WANDB_DISABLED"] = "true"


def memory_cleanup():
    """Run a garbage-collection pass, then ask PyTorch to empty its CUDA cache."""
    for release in (gc.collect, torch.cuda.empty_cache):
        release()

# get number of words in each sentence in the text
def get_sentence_lengths(text):
    """Split ``text`` into sentences and count the words in each one.

    The text is cut right after every '.', '!' or '?'. The punctuation mark stays
    attached to the sentence it ends, and any whitespace that follows it stays at
    the start of the next sentence. Fragments that contain no words at all (empty
    or whitespace-only, e.g. a trailing space after the last full stop) are
    dropped, so every returned sentence has a length of at least 1.

    Args:
        text: The string to split.

    Returns:
        A ``(sentences, sentence_lengths)`` tuple of two parallel lists: the
        sentence strings and their whitespace-separated word counts.
    """
    # Zero-width split: nothing is consumed, so separators are preserved
    fragments = re.split(r'(?<=[.!?])', text)

    sentences = []
    sentence_lengths = []
    for fragment in fragments:
        word_count = len(fragment.split())
        # Skip empty and whitespace-only pieces instead of reporting 0-word sentences
        if word_count == 0:
            continue
        sentences.append(fragment)
        sentence_lengths.append(word_count)
    return sentences, sentence_lengths


# Seed every random number generator used in the notebook
def set_seed(seed: int) -> None:
    """Seed NumPy and PyTorch (CPU and CUDA) and put cuDNN into deterministic mode.

    Args:
        seed: Value passed unchanged to each seeding function.
    """
    for seeder in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # Deterministic kernels on, auto-tuner off
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Draw the train/test learning curves
def plot(history):
    """Show two figures: train/test loss and train/test R2, both starting at epoch 2.

    Args:
        history: Mapping with the list-valued keys "train_loss", "test_loss",
            "train_r2" and "test_r2", one entry per epoch.
    """
    figures = (
        (
            "Loss",
            "Training and Test Loss Over Epochs 2-...",
            (("train_loss", "Train Loss"), ("test_loss", "Test Loss")),
        ),
        (
            "R2 Score",
            "Train/Test R2 Score Over Epochs 2-...",
            (("train_r2", "Train R2"), ("test_r2", "Test R2")),
        ),
    )

    for y_label, title, curves in figures:
        for key, label in curves:
            series = history[key]
            # Epochs are numbered from 1; the first epoch is left out of the curve
            plt.plot(range(2, len(series) + 1), series[1:], label=label)
        plt.xlabel("Epoch")
        plt.ylabel(y_label)
        plt.title(title)
        plt.legend()
        plt.show()


# Function to train the TSDAE model
def train_tsdae_bert(model_name, train_sentences, *, batch_size=2, epochs=1, learning_rate=3e-5):
    """Fit a CLS-pooled SentenceTransformer on raw texts with the denoising auto-encoder loss.

    Args:
        model_name: Checkpoint name or path given to ``models.Transformer``; the same
            value is used as the decoder of the denoising loss.
        train_sentences: List of texts to reconstruct.
        batch_size: Batch size of the training DataLoader (default 2, the value that
            used to be hard-coded).
        epochs: Number of passes over ``train_sentences`` (default 1).
        learning_rate: Optimizer learning rate (default 3e-5).

    Returns:
        The fitted ``SentenceTransformer`` (transformer followed by CLS pooling).
    """
    # Encoder: transformer followed by CLS-token pooling
    word_embedding_model = models.Transformer(model_name)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Dataset wrapper that adds noise to each text on the fly
    train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Decoder starts from the same checkpoint and shares its weights with the encoder
    train_loss = losses.DenoisingAutoEncoderLoss(
        model, decoder_name_or_path=model_name, tie_encoder_decoder=True,
    )

    # Constant learning rate, no weight decay
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        weight_decay=0,
        scheduler="constantlr",
        optimizer_params={"lr": learning_rate},
        show_progress_bar=True,
    )

    return model

#### Training-related classes

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoModel

# Dataset that serves two tokenized text fields plus a regression target per row
class DualTextDataset(Dataset):
    """Tokenizes two text columns once at construction and serves them row by row.

    Args:
        df: DataFrame holding both text columns.
        text_col_1: Name of the first text column.
        text_col_2: Name of the second text column.
        targets: Regression targets; must provide ``tolist()``.
        tokenizer: Callable invoked as ``tokenizer(list_of_texts, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``.
        max_len: Length every text is padded or truncated to.
    """

    def __init__(self, df, text_col_1, text_col_2, targets, tokenizer, max_len):
        print('Creating the dataset...')
        # Both columns are encoded with the same settings, first column first
        encode_options = dict(
            max_length=max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        self.tokenized_texts1, self.tokenized_texts2 = (
            tokenizer(df[column].tolist(), **encode_options)
            for column in (text_col_1, text_col_2)
        )
        self.targets = targets.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples, taken from the targets."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return ``(inputs1, inputs2, target)`` for row ``idx``."""

        def row_of(encoding):
            # Pick row `idx` out of every tensor in the encoding
            return {name: tensor[idx] for name, tensor in encoding.items()}

        return (
            row_of(self.tokenized_texts1),
            row_of(self.tokenized_texts2),
            torch.tensor(self.targets[idx], dtype=torch.float),
        )

# single bert
class SingleBERTWithMLP(nn.Module):
    """One shared encoder applied to both texts, followed by an MLP regression head.

    Args:
        hidden_size: Width of the encoder's hidden states; the head's input is twice
            this because the two CLS vectors are concatenated.
        mlp_hidden_size: Width of the head's hidden layer.
        pretrained_name: Checkpoint name or path for ``AutoModel.from_pretrained``.
            When omitted, the notebook-level ``model_name`` variable is used, which
            keeps existing two-argument calls working.
    """

    def __init__(self, hidden_size, mlp_hidden_size, pretrained_name=None):
        super().__init__()
        # Fall back to the notebook global only when no checkpoint is given explicitly
        if pretrained_name is None:
            pretrained_name = model_name
        self.bert = AutoModel.from_pretrained(pretrained_name)

        # Regression head over the two concatenated CLS vectors
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1),
        )

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        """Encode both inputs with the shared encoder and regress on their CLS vectors.

        Args:
            input1: Token ids of the first text.
            attention_mask1: Attention mask of the first text.
            input2: Token ids of the second text.
            attention_mask2: Attention mask of the second text.

        Returns:
            The MLP output for the concatenated CLS vectors (last dimension 1).
        """
        # Position 0 of the last hidden state is used as the sequence embedding
        cls1 = self.bert(input_ids=input1, attention_mask=attention_mask1).last_hidden_state[:, 0, :]
        cls2 = self.bert(input_ids=input2, attention_mask=attention_mask2).last_hidden_state[:, 0, :]

        combined_cls = torch.cat([cls1, cls2], dim=-1)
        return self.mlp(combined_cls)


# Two-encoder regressor: one encoder per text field, joined by an MLP head
class DualBERTWithMLP(nn.Module):
    """Two independently loaded encoders whose CLS vectors feed one MLP regression head.

    Args:
        config: Mapping with the keys 'model_name' (checkpoint loaded twice),
            'hidden_size' (encoder hidden width) and 'mlp_hidden_size'
            (head hidden width).
    """

    def __init__(self, config):
        super().__init__()
        checkpoint = config['model_name']
        # Each text field gets its own copy of the encoder weights
        self.bert1 = AutoModel.from_pretrained(checkpoint)
        self.bert2 = AutoModel.from_pretrained(checkpoint)

        # Head input is the two CLS vectors side by side
        joint_width = 2 * config['hidden_size']
        head_width = config['mlp_hidden_size']
        self.mlp = nn.Sequential(
            nn.Linear(joint_width, head_width),
            nn.ReLU(),
            nn.Linear(head_width, 1),
        )

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        """Return the head's prediction for a batch of text pairs.

        The first text goes through ``bert1`` and the second through ``bert2``; the
        position-0 vectors of their last hidden states are concatenated and passed
        to the MLP.
        """
        towers = (
            (self.bert1, input1, attention_mask1),
            (self.bert2, input2, attention_mask2),
        )
        cls_vectors = [
            encoder(input_ids=ids, attention_mask=mask).last_hidden_state[:, 0, :]
            for encoder, ids, mask in towers
        ]
        return self.mlp(torch.cat(cls_vectors, dim=-1))

# Define the dual BERT model with an MLP head and MASK pooling
class MASKPoolDualBERTWithMLP(nn.Module):
    """Two encoders pooled at a fixed token position, followed by an MLP regression head.

    Instead of the CLS vector, each encoder contributes the hidden state at
    ``mask_position`` (presumably where the prompt's mask token ends up after
    tokenization — TODO confirm against the prompt columns).

    Args:
        hidden_size: Width of the encoder's hidden states; the head's input is twice
            this.
        mlp_hidden_size: Width of the head's hidden layer.
        pretrained_name: Checkpoint name or path for ``AutoModel.from_pretrained``.
            When omitted, the notebook-level ``model_name`` variable is used, which
            keeps existing two-argument calls working.
        mask_position: Sequence index to pool from (default 23, the value that used
            to be hard-coded). Inputs must be longer than this index.
    """

    def __init__(self, hidden_size, mlp_hidden_size, pretrained_name=None, mask_position=23):
        super().__init__()
        # Fall back to the notebook global only when no checkpoint is given explicitly
        if pretrained_name is None:
            pretrained_name = model_name
        self.mask_position = mask_position

        # Two independent copies of the encoder, one per text field
        self.bert1 = AutoModel.from_pretrained(pretrained_name)
        self.bert2 = AutoModel.from_pretrained(pretrained_name)

        # Regression head over the two concatenated pooled vectors
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1),
        )

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        """Pool both encoders at ``mask_position`` and return the head's prediction."""
        position = self.mask_position
        mask1 = self.bert1(input_ids=input1, attention_mask=attention_mask1).last_hidden_state[:, position, :]
        mask2 = self.bert2(input_ids=input2, attention_mask=attention_mask2).last_hidden_state[:, position, :]

        combined_mask = torch.cat([mask1, mask2], dim=-1)
        return self.mlp(combined_mask)

# Dual-encoder regressor built on two already-prepared sentence encoders
class TSDAEDualBERTWithMLP(nn.Module):
    """MLP regression head over the sentence embeddings of two supplied encoders.

    Args:
        config: Mapping with the keys 'hidden_size' (embedding width of each encoder)
            and 'mlp_hidden_size' (head hidden width).
        bert1: Encoder for the first text; called with a feature dict and expected to
            return a mapping containing 'sentence_embedding'.
        bert2: Encoder for the second text, same calling convention.
    """

    def __init__(self, config, bert1, bert2):
        super().__init__()
        # Encoders are used as passed in
        self.bert1, self.bert2 = bert1, bert2

        # Head input is the two sentence embeddings side by side
        joint_width = 2 * config['hidden_size']
        head_width = config['mlp_hidden_size']
        self.mlp = nn.Sequential(
            nn.Linear(joint_width, head_width),
            nn.ReLU(),
            nn.Linear(head_width, 1),
        )

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        """Embed both texts and return the head's prediction.

        Calling convention taken from:
        https://github.com/UKPLab/sentence-transformers/issues/2494
        """

        def embed(encoder, ids, mask):
            # The encoder takes a single feature dict rather than keyword tensors
            features = {'input_ids': ids, 'attention_mask': mask}
            return encoder(features)['sentence_embedding']

        joined = torch.cat(
            [
                embed(self.bert1, input1, attention_mask1),
                embed(self.bert2, input2, attention_mask2),
            ],
            dim=-1,
        )
        return self.mlp(joined)


#### Training method

In [None]:
from torch.utils.data import DataLoader
from sklearn.metrics import r2_score


def fit_eval(
    seed,
    model,
    X_train,
    X_test,
    y_train,
    y_test,
    catboost_preds,
    criterion,
    tokenizer,
    config,
    text_col_1 = 'description_no_numbers_v2',
    text_col_2 = 'title_company_location_skills_source',
):
    """Train ``model`` and evaluate it on the test split after every epoch.

    Args:
        seed: Passed to ``set_seed`` before anything else happens.
        model: Module called as ``model(input1, attention_mask1, input2,
            attention_mask2)``; it is not moved to a device here.
        X_train, X_test: DataFrames holding the two text columns.
        y_train, y_test: Regression targets for the two splits.
        catboost_preds: Array averaged element-wise with the test predictions for
            the "with catboost" R2; must line up with the test set.
        criterion: Loss called as ``criterion(predictions, targets)``.
        tokenizer: Tokenizer handed to ``DualTextDataset``.
        config: Mapping with "learning_rate", "num_epochs", "batch_size" and
            "seq_length".
        text_col_1, text_col_2: Names of the two text columns.

    Returns:
        ``(model, history)``. ``history`` holds per-epoch lists "train_loss",
        "test_loss", "train_r2", "test_r2" and "test_r2_with_catboost", plus
        "max_test_r2" and the test predictions ("best_preds") of the epoch that
        reached it.
    """
    set_seed(seed)
    memory_cleanup()

    # Hyperparameters, read in a fixed order
    lr, n_epochs, per_batch, max_tokens = (
        config[key] for key in ("learning_rate", "num_epochs", "batch_size", "seq_length")
    )

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Tokenize both splits up front (train first), then wrap them in loaders
    train_set, test_set = (
        DualTextDataset(frame, text_col_1, text_col_2, labels, tokenizer, max_tokens)
        for frame, labels in ((X_train, y_train), (X_test, y_test))
    )
    train_loader = DataLoader(train_set, batch_size=per_batch, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=per_batch, shuffle=False)

    def forward_batch(batch):
        """Move one batch to the device; return (flattened predictions, targets)."""
        first, second, target_batch = batch
        model_args = [
            encoding[field].squeeze(1).to(device)
            for encoding in (first, second)
            for field in ("input_ids", "attention_mask")
        ]
        return model(*model_args).flatten(), target_batch.to(device)

    history = {
        "train_loss": [],
        "test_loss": [],
        "train_r2": [],
        "test_r2": [],
        "test_r2_with_catboost": [],
        "max_test_r2": float('-inf'),
        "best_preds": [],
    }

    print('Starting training/eval loop...')
    for epoch in range(n_epochs):
        # ---- training pass ----
        print('Starting training...')
        model.train()
        batch_losses, preds, labels = [], [], []
        for batch in train_loader:
            optimizer.zero_grad()
            outputs, targets = forward_batch(batch)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
            preds.extend(outputs.cpu().detach().numpy())
            labels.extend(targets.cpu().numpy())

        # Train metrics come from the predictions made while training
        train_loss = np.mean(batch_losses)
        train_r2 = r2_score(labels, preds)
        history["train_loss"].append(train_loss)
        history["train_r2"].append(train_r2)

        # ---- evaluation pass ----
        print('Epoch done, evaluating...')
        model.eval()
        batch_losses, preds, labels = [], [], []
        with torch.no_grad():
            for batch in test_loader:
                outputs, targets = forward_batch(batch)
                batch_losses.append(criterion(outputs, targets).item())
                preds.extend(outputs.cpu().numpy())
                labels.extend(targets.cpu().numpy())

        test_loss = np.mean(batch_losses)
        test_r2 = r2_score(labels, preds)
        # Plain average of the model's predictions and the supplied ones
        blended = (np.array(preds) + catboost_preds) / 2
        test_r2_with_catboost = r2_score(labels, blended)

        history["test_loss"].append(test_loss)
        history["test_r2"].append(test_r2)
        history["test_r2_with_catboost"].append(test_r2_with_catboost)

        # Remember the predictions of the best epoch so far
        if test_r2 > history["max_test_r2"]:
            history["max_test_r2"] = test_r2
            history["best_preds"] = preds

        print(f"Epoch {epoch + 1}/{n_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, "
              f"Train R2: {train_r2:.4f}, Test R2: {test_r2:.4f}, Test R2 with catboost: {test_r2_with_catboost:.4f}")

    return model, history

#### Training-eval loop with experiments

In [None]:
import torch.nn as nn
import pickle
import warnings
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

# Silence FutureWarning messages for the rest of the session
warnings.simplefilter('ignore', FutureWarning)


# Run bookkeeping: results store, passes over the seed list, and the seeds themselves
combined_history = {}
repetitions = 2
seeds = [42, 78687, 123123]

# Settings shared by the models and by fit_eval.
# Alternative values left by the author: seq_length=512, hidden_size=768 (BERT base),
# mlp_hidden_size=256, num_epochs=10.
config = dict(
    model_name="sergeyzh/rubert-tiny-turbo",
    batch_size=32,
    seq_length=1024,
    hidden_size=312,
    mlp_hidden_size=128,
    num_epochs=2,
    learning_rate=5e-6,
)

# Tokenizer and device follow the configured checkpoint
model_name = config["model_name"]
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The two text feature columns
text_col_1, text_col_2 = 'description_no_numbers_v2', 'title_company_location_skills_source'

# Only the first 200 rows are used for these runs
X = df[[text_col_1, text_col_2]].iloc[:200]
y = df['log_salary_from'].iloc[:200]

# Experiment driver: for every seed, fit a plain dual-encoder baseline and a variant whose
# encoders are first trained with TSDAE, and store both histories.
# NOTE(review): combined_history is keyed by seed only and reset at the top of the inner
# loop, so each repetition overwrites the entries written by the previous repetition.
for _ in range(repetitions):
    for seed in seeds:

        combined_history[seed] = {}

        set_seed(seed)

        # 80/20 train-test split, seeded per run
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
        # Rebinds the array loaded from disk earlier in the notebook. With zeros, the
        # "with catboost" R2 in fit_eval is computed on the model's predictions halved.
        catboost_preds = np.zeros(len(y_test)) # dont use catboost preds for now

        # Baseline: dual encoder without TSDAE training
        model = DualBERTWithMLP(config)
        model = torch.nn.DataParallel(model).to(device)
        # Loss Function
        # criterion = nn.MSELoss()  # For regression
        criterion = nn.HuberLoss()

        model, history = fit_eval(seed, model, X_train, X_test, y_train, y_test, catboost_preds, criterion, tokenizer, config)

        combined_history[seed]['double_huber'] = history    

        # Carve 1% of the training rows out as TSDAE training texts.
        # NOTE(review): only X_tsdae is used below; X_train_tsdae, y_train_tsdae and y_tsdae
        # are never read, and the regression fit further down still receives the full
        # X_train, so the TSDAE rows are not held out of it — confirm this is intended.
        X_train_tsdae, X_tsdae, y_train_tsdae, y_tsdae = train_test_split(X_train, y_train, test_size=0.01, random_state=seed)

        # # convert text_col_1 data into set of sentences and select 20-60 word sentences as a feature column:
        # # Create a DataFrame of unique sentences and their lengths for X_tsdae
        # unique_sentences = []
        # unique_sentence_lengths = []
        # for text in X_tsdae[text_col_1]:
        #     sentences, sentence_lengths = get_sentence_lengths(text)
        #     unique_sentences.extend(sentences)
        #     unique_sentence_lengths.extend(sentence_lengths)

        # unique_sentences_df = pd.DataFrame({
        #     'sentence': unique_sentences,
        #     'length': unique_sentence_lengths
        # })

        # unique_sentences = unique_sentences_df[(unique_sentences_df.length >= 10) & (unique_sentences_df.length <= 60)]['sentence']

        # One list of texts per encoder: first text column, then second text column
        train_sentences_array = [
            # unique_sentences.tolist(),
            X_tsdae[text_col_1].tolist(),
            X_tsdae[text_col_2].tolist(),
        ]

        # Train one TSDAE encoder per text column, freeing memory after each
        # (`index` from enumerate is not used)
        berts_after_tsdae = []
        for index, train_sentences in enumerate(train_sentences_array):
            berts_after_tsdae.append(train_tsdae_bert(model_name, train_sentences))
            memory_cleanup()

        tsdae_bert1, tsdae_bert2 = berts_after_tsdae

        # Build the dual model on top of the two TSDAE-trained encoders
        model = TSDAEDualBERTWithMLP(config, tsdae_bert1, tsdae_bert2)
        model = torch.nn.DataParallel(model).to(device)
        # Loss Function
        # criterion = nn.MSELoss()  # For regression
        criterion = nn.HuberLoss()

        model, history = fit_eval(seed, model, X_train, X_test, y_train, y_test, catboost_preds, criterion, tokenizer, config)

        combined_history[seed]['double_huber_1p_descriptions_tsdae'] = history

    # Rewrite the pickle with everything collected so far after each repetition
    with open('./models/combined_history.pickle', 'wb') as handle:
        pickle.dump(combined_history, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# model_name = "cointegrated/LaBSE-en-ru"


# Dataset and DataLoader
# Prepare data
# text_col_1 = 'description_no_numbers_v2_with_prompt' # for mask pooling
# text_col_2 = 'title_company_location_skills_source_with_prompt' # for mask pooling

# X = df[[text_col_1, text_col_2]]
# y = df['log_salary_from']

# # Split # already done in previous cell
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)


# Initialize the model
# model = SingleBERTWithMLP(hidden_size, mlp_hidden_size)


# word_embedding_model1 = models.Transformer(model_name)
# pooling_model1 = models.Pooling(word_embedding_model1.get_word_embedding_dimension(), "cls")
# bert1 = SentenceTransformer(modules=[word_embedding_model1, pooling_model1])

# word_embedding_model2 = models.Transformer(model_name)
# pooling_model2 = models.Pooling(word_embedding_model2.get_word_embedding_dimension(), "cls")
# bert2 = SentenceTransformer(modules=[word_embedding_model2, pooling_model2])


# # Save the trained model
# torch.save(model.state_dict(), "./models/final_model.pth")