# 2. Embeddings from pre-trained transformers + MLP, optionally combined with CatBoost

#### Code from task 2 to be adjusted

In [None]:
import warnings

# Suppress all FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

import random

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Also switches cuDNN to deterministic mode and turns its benchmark
    mode off.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # relevant when several GPUs are used
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def average_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token vectors along dim 1, ignoring positions whose mask is zero.

    Masked positions are zeroed before summing; the sum is divided by the
    per-row total of ``attention_mask``.
    """
    is_padding = ~attention_mask.unsqueeze(-1).bool()
    visible_tokens = last_hidden_states.masked_fill(is_padding, 0.0)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return visible_tokens.sum(dim=1) / token_counts


# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Return the attention-mask-weighted mean of the token embeddings.

    ``model_output[0]`` is taken as the token embeddings. The denominator is
    clamped to at least 1e-9, so a row whose mask is all zeros yields zeros.
    """
    token_embeddings = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = (token_embeddings * weights).sum(dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / weight_total


def pool(hidden_state, mask, pooling_method="cls"):
    """Reduce per-token hidden states to one vector per sequence.

    Args:
        hidden_state: Token representations, indexed as (row, token, feature).
        mask: Attention mask with one entry per token; zero marks padding.
        pooling_method: ``"cls"`` takes the vector at token position 0,
            ``"mean"`` takes the mask-weighted average over tokens.

    Returns:
        One pooled vector per row of ``hidden_state``.

    Raises:
        ValueError: If ``pooling_method`` is neither ``"cls"`` nor ``"mean"``
            (previously this silently returned ``None``).
    """
    if pooling_method == "cls":
        return hidden_state[:, 0]
    if pooling_method == "mean":
        weights = mask.unsqueeze(-1).float()
        summed = torch.sum(hidden_state * weights, dim=1)
        # Clamp the token count so a fully masked row yields zeros, not NaN.
        counts = torch.clamp(weights.sum(dim=1), min=1e-9)
        return summed / counts
    raise ValueError(
        f"Unknown pooling_method {pooling_method!r}; expected 'cls' or 'mean'"
    )


# Seed all RNGs before anything else in this cell runs
set_seed(42)
# Hugging Face model id used for both the tokenizer and the encoder below
model_name = "sismetanin/sbert-ru-sentiment-rureviews"
# Load tokenizer and encoder for `model_name`.
# The commented-out pairs are alternative checkpoints that can be swapped in.
# tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")
# model = AutoModel.from_pretrained("ai-forever/sbert_large_nlu_ru")
# tokenizer = AutoTokenizer.from_pretrained("ai-forever/ru-en-RoSBERTa")
# model = AutoModel.from_pretrained("ai-forever/ru-en-RoSBERTa")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# tokenizer = AutoTokenizer.from_pretrained("deepvk/USER-bge-m3")
# model = AutoModel.from_pretrained("deepvk/USER-bge-m3")
# tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-large-instruct")
# model = AutoModel.from_pretrained("intfloat/multilingual-e5-large-instruct")

# The model is only used for inference in this cell
model.eval()

# Pick the device; with more than one GPU, wrap the model in DataParallel.
# NOTE(review): device_ids is hard-coded to GPUs 0 and 1 even if more are present.
device = "cuda" if torch.cuda.is_available() else "cpu"
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model, device_ids=[0, 1])

model.to(device)

# Create dataset and dataloader
class FiveDataset(Dataset):
    """Map-style dataset that tokenizes one ``text`` row per item.

    Labels come from the ``rate`` column when the dataframe has one;
    otherwise items carry no ``targets`` entry.
    """

    def __init__(self, dataframe, tokenizer, max_seq_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.text = dataframe['text'].tolist()
        has_labels = 'rate' in dataframe
        self.targets = dataframe['rate'].tolist() if has_labels else None

    def __len__(self) -> int:
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its tensors as a dict."""
        encoded = self.tokenizer.encode_plus(
            str(self.text[index]),
            None,
            add_special_tokens=True,
            max_length=self.max_seq_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )
        item = {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ('input_ids', 'attention_mask')
        }
        # 'targets' is appended last so the key order matches the labelled case.
        if self.targets is not None:
            item['targets'] = torch.tensor(self.targets[index], dtype=torch.long)
        return item

# Assumes train_data, test_data and MAX_LEN are already defined elsewhere in the notebook
# train_dataset = FiveDataset(train_data_no_duplicates, tokenizer, MAX_LEN)
train_dataset = FiveDataset(train_data, tokenizer, MAX_LEN)
test_dataset = FiveDataset(test_data, tokenizer, MAX_LEN)

# shuffle=False keeps the embedding rows in the same order as the dataframe rows
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Function to compute and save pooled embeddings
def compute_and_save_embeddings(dataloader, model, device):
    """Embed every batch of ``dataloader`` and collect the results.

    Each row's feature vector is the mean-pooled embedding concatenated with
    the first-token embedding along dim 1. Despite the name, nothing is
    written to disk here; the caller saves the returned arrays.

    Returns:
        ``(embeddings, labels)``: the stacked feature rows, and the
        concatenated ``'targets'`` values or ``None`` when no batch had any.
    """
    embedding_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            features = torch.cat(
                (mean_pooling(outputs, attention_mask), outputs[0][:, 0]), 1
            )
            embedding_chunks.append(features.cpu().numpy())

            if 'targets' in batch:
                label_chunks.append(batch['targets'].cpu().numpy())

    embeddings = np.vstack(embedding_chunks)
    if not label_chunks:
        return embeddings, None
    return embeddings, np.hstack(label_chunks)

device = "cuda" if torch.cuda.is_available() else "cpu"
# name = 'e5_instr_en_full'
# name = 'USER-bge-m3_full'
# The model id contains '/', which would act as a path separator in the file names below
model_name = model_name.replace('/', '_')

# Train split: embed, then save embeddings and (when present) labels
train_embeddings, train_labels = compute_and_save_embeddings(train_dataloader, model, device)
np.save(f'train_embeddings_{model_name}.npy', train_embeddings)
# Labels are None when the dataframe has no 'rate' column. np.save would then
# write a pickled object array that np.load rejects by default, so skip it.
if train_labels is not None:
    np.save(f'train_labels_{model_name}.npy', train_labels)

# Test split: same procedure
test_embeddings, test_labels = compute_and_save_embeddings(test_dataloader, model, device)
np.save(f'test_embeddings_{model_name}.npy', test_embeddings)
if test_labels is not None:
    np.save(f'test_labels_{model_name}.npy', test_labels)

#### Code by GPT for this task

In [246]:
import warnings
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from typing import Dict, Tuple

# Suppress all FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Set the seed for reproducibility
def set_seed(seed: int) -> None:
    """Seed NumPy and PyTorch (CPU and CUDA) and make cuDNN deterministic."""
    for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed the RNGs before the model is loaded
set_seed(42)

# Parameters
MAX_LEN = 256  # Maximum sequence length passed to the tokenizer (longer texts are truncated)
BATCH_SIZE = 32  # Batch size used by every DataLoader in the embedding cell
# Hugging Face model id for both the tokenizer and the encoder
model_name = "intfloat/multilingual-e5-large-instruct"


# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings in ``model_output[0]`` weighted by the mask.

    The divisor is clamped to a minimum of 1e-9, so rows with an all-zero
    mask produce zeros.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    totals = torch.sum(token_embeddings * mask, dim=1)
    counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    return totals / counts

# CLS Pooling - Use when only the sentence embedding is needed
def cls_pooling(hidden_state, mask) -> torch.Tensor:
    """Return the vector at token position 0 for every row of ``hidden_state``.

    ``mask`` is accepted but not used.
    """
    first_token = hidden_state[:, 0]
    return first_token


# Dataset class
class JobDataset(Dataset):
    """Dataset that tokenizes one text column of a dataframe, one row per item."""

    def __init__(self, dataframe, tokenizer, max_seq_len: int, text_column: str):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.texts = dataframe[text_column].tolist()

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, index: int) -> Dict:
        """Tokenize row ``index`` and return its ``input_ids`` and ``attention_mask``."""
        encoded = self.tokenizer(
            str(self.texts[index]),
            add_special_tokens=True,
            max_length=self.max_seq_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # The tokenizer output carries a leading axis that squeeze(0) removes.
        return {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }

# Load the tokenizer and encoder for `model_name`, move the encoder to the
# available device and switch it to eval mode (it is only used for inference here)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# Compute embeddings
def compute_embeddings(dataloader: DataLoader, model, device) -> Tuple[np.ndarray, np.ndarray]:
    """Embed every batch of ``dataloader`` with ``model``.

    Args:
        dataloader: Yields dicts with ``'input_ids'`` and ``'attention_mask'``.
        model: Encoder called as ``model(input_ids=..., attention_mask=...)``;
            element 0 of its output is used as the token embeddings.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        ``(mean_embeddings, cls_embeddings)``: two arrays with one row per
        sample, holding the mask-weighted mean and the first-token vector.
    """
    mean_embeddings = []
    cls_embeddings = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Compute mean and CLS embeddings
            mean_emb = mean_pooling(outputs, attention_mask)
            # cls_pooling takes (hidden_state, mask): pass it the token tensor
            # and the mask. The old one-argument call raised a TypeError.
            cls_emb = cls_pooling(outputs[0], attention_mask)

            mean_embeddings.append(mean_emb.cpu().numpy())
            cls_embeddings.append(cls_emb.cpu().numpy())

    return np.vstack(mean_embeddings), np.vstack(cls_embeddings)


tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# Work on a fixed 100-row sample of df (seeded, so the same rows are picked each run)
df_short = df.sample(100, random_state=42)
# Prepare data: two text columns as features, log_salary_from as the target
X = df_short[['description', 'title_company_location_skills_source']]
y = df_short['log_salary_from']
# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# Description embeddings: datasets and loaders (shuffle=False keeps row order)
train_description_dataset = JobDataset(X_train, tokenizer, MAX_LEN, text_column='description')
test_description_dataset = JobDataset(X_test, tokenizer, MAX_LEN, text_column='description')
train_description_loader = DataLoader(train_description_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_description_loader = DataLoader(test_description_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Title+Company+Location+Skills+Source embeddings: same setup for the second text column
train_title_dataset = JobDataset(X_train, tokenizer, MAX_LEN, text_column='title_company_location_skills_source')
test_title_dataset = JobDataset(X_test, tokenizer, MAX_LEN, text_column='title_company_location_skills_source')
train_title_loader = DataLoader(train_title_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_title_loader = DataLoader(test_title_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Compute embeddings: each call returns a (mean-pooled, CLS) pair of arrays
train_mean_desc, train_cls_desc = compute_embeddings(train_description_loader, model, device)
test_mean_desc, test_cls_desc = compute_embeddings(test_description_loader, model, device)
train_mean_title, train_cls_title = compute_embeddings(train_title_loader, model, device)
test_mean_title, test_cls_title = compute_embeddings(test_title_loader, model, device)

# Combine embeddings: concatenate the four feature groups column-wise per sample
train_embeddings = np.hstack([train_mean_desc, train_cls_desc, train_mean_title, train_cls_title])
test_embeddings = np.hstack([test_mean_desc, test_cls_desc, test_mean_title, test_cls_title])

# Save embeddings and targets as .npy files in the working directory
np.save('train_embeddings.npy', train_embeddings)
np.save('test_embeddings.npy', test_embeddings)
np.save('y_train.npy', y_train.values)
np.save('y_test.npy', y_test.values)


# Fine-tuning transformer + MLP head

## Two BERTs

In [None]:
!mkdir data
!mkdir models
!wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1_o4xDSF6j95vAiYdd97VavyWq4EHHPdP&export=download&authuser=1&confirm=t' -O './data/dataset.csv'

import pandas as pd


df = pd.read_csv('./data/dataset.csv')
df.info()

In [None]:
import warnings
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import r2_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import gc

gc.collect()
torch.cuda.empty_cache()

# Suppress all FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Set the seed for reproducibility
def set_seed(seed: int) -> None:
    """Seed NumPy and PyTorch (CPU and CUDA) and make cuDNN deterministic."""
    for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Set the seed (SEED is reused below as random_state for the train/test split)
SEED = 42
set_seed(SEED)

# Hyperparameters
batch_size = 16
seq_length = 512  # max_length passed to the tokenizer; texts are padded/truncated to it
# Width of one encoder output vector. The MLP head takes 2 * hidden_size inputs,
# so this must match the last_hidden_state width of `model_name`.
hidden_size = 768  # BERT base hidden size
mlp_hidden_size = 256  # width of the MLP head's hidden layer
num_epochs = 10
learning_rate = 2e-5  # AdamW learning rate
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "cointegrated/LaBSE-en-ru"  # Hugging Face id for tokenizer and encoders


# Dataset for dual textual features
class DualTextDataset(Dataset):
    """Dataset serving two pre-tokenized text columns plus a float target.

    Both columns are tokenized in full when the dataset is constructed;
    ``__getitem__`` only slices the stored tensors.
    """

    def __init__(self, df, text_col_1, text_col_2, targets, tokenizer, max_len):
        def encode(column):
            # Tokenize the whole column in one call, padded/truncated to max_len.
            return tokenizer(
                df[column].tolist(),
                max_length=max_len,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )

        self.tokenized_texts1 = encode(text_col_1)
        self.tokenized_texts2 = encode(text_col_2)
        self.targets = targets.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        """Return ``(inputs1, inputs2, target)`` for row ``idx``."""
        inputs1, inputs2 = (
            {name: tensor[idx] for name, tensor in encoding.items()}
            for encoding in (self.tokenized_texts1, self.tokenized_texts2)
        )
        return inputs1, inputs2, torch.tensor(self.targets[idx], dtype=torch.float)

# Define the dual BERT model with an MLP head
class DualBERTWithMLP(nn.Module):
    """Regression model: two separately loaded encoders feeding one MLP head.

    Each text input goes through its own encoder (both loaded from the
    module-level ``model_name``). The two first-token vectors are
    concatenated and mapped to a single output value.
    """

    def __init__(self, hidden_size, mlp_hidden_size):
        super().__init__()
        # Two independent copies of the pretrained encoder
        self.bert1 = AutoModel.from_pretrained(model_name)
        self.bert2 = AutoModel.from_pretrained(model_name)

        # Head input is the concatenation of two encoder vectors
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1),  # single regression output
        )

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        """Return the head output for a batch of paired inputs, one value per row."""
        encoded1 = self.bert1(input_ids=input1, attention_mask=attention_mask1)
        encoded2 = self.bert2(input_ids=input2, attention_mask=attention_mask2)

        # First-token (CLS) vector of each encoder, joined on the feature axis
        first_tokens = [
            encoded.last_hidden_state[:, 0, :] for encoded in (encoded1, encoded2)
        ]
        return self.mlp(torch.cat(first_tokens, dim=-1))

# Tokenizer matching the encoder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Dataset and DataLoader
# Prepare data: two text columns as inputs, log_salary_from as the regression target
text_col_1 = 'description_no_numbers_v2'
text_col_2 = 'title_company_location_skills_source'
X = df[[text_col_1, text_col_2]]
y = df['log_salary_from']
# Split 80/20 with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Make datasets (each one tokenizes both of its columns up front)
train_dataset = DualTextDataset(X_train, text_col_1, text_col_2, y_train, tokenizer, seq_length)
test_dataset = DualTextDataset(X_test, text_col_1, text_col_2, y_test, tokenizer, seq_length)
# Make dataloaders: only the training data is shuffled
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model and wrap it in DataParallel before moving it to the device
model = DualBERTWithMLP(hidden_size, mlp_hidden_size)
model = torch.nn.DataParallel(model).to(device)

# Optimizer and Loss Function (all parameters are optimized: both encoders and the head)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()  # For regression

# Per-epoch metrics, plus the best test R2 seen so far and the test
# predictions from that epoch
history = {"train_loss": [],
           "test_loss": [], 
           "train_r2": [],
           "test_r2": [],
           "max_test_r2": float('-inf'),
           "best_preds": []
           }

test_labels = y_test
test_preds = []
for epoch in range(num_epochs):
    # Training Phase
    model.train()
    train_losses = []
    all_preds = []
    all_labels = []
    for batch in train_dataloader:
        inputs1, inputs2, targets = batch
        input1 = inputs1["input_ids"].squeeze(1).to(device)
        attention_mask1 = inputs1["attention_mask"].squeeze(1).to(device)
        input2 = inputs2["input_ids"].squeeze(1).to(device)
        attention_mask2 = inputs2["attention_mask"].squeeze(1).to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(input1, attention_mask1, input2, attention_mask2)
        # Drop only the trailing size-1 output dim. A bare squeeze() would also
        # remove the batch dim when the last batch holds a single sample, making
        # the loss broadcast and extend() fail on a 0-d array.
        preds = outputs.squeeze(-1)
        loss = criterion(preds, targets)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(targets.cpu().numpy())

    # Epoch-level training metrics: mean batch loss and R2 over all training samples
    train_loss = np.mean(train_losses)
    train_r2 = r2_score(all_labels, all_preds)
    history["train_loss"].append(train_loss)
    history["train_r2"].append(train_r2)

    # Evaluation Phase
    model.eval()
    test_losses = []
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            inputs1, inputs2, targets = batch
            input1 = inputs1["input_ids"].squeeze(1).to(device)
            attention_mask1 = inputs1["attention_mask"].squeeze(1).to(device)
            input2 = inputs2["input_ids"].squeeze(1).to(device)
            attention_mask2 = inputs2["attention_mask"].squeeze(1).to(device)
            targets = targets.to(device)

            outputs = model(input1, attention_mask1, input2, attention_mask2)
            # Same single-sample-batch guard as in the training phase
            preds = outputs.squeeze(-1)
            loss = criterion(preds, targets)
            test_losses.append(loss.item())
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    test_loss = np.mean(test_losses)
    test_r2 = r2_score(all_labels, all_preds)

    history["test_loss"].append(test_loss)
    history["test_r2"].append(test_r2)

    # Keep the test predictions of the epoch with the best test R2 so far
    if test_r2 > history["max_test_r2"]:
        history["max_test_r2"] = test_r2
        history["best_preds"] = all_preds

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Train R2: {train_r2:.4f}, Test R2: {test_r2:.4f}")
    # Checkpoint after every epoch
    torch.save(model.state_dict(), f'./models/model_epoch_{epoch + 1}.pth')

# Plot the per-epoch metrics collected in `history` (one figure per metric)
# plt.figure(figsize=(12, 5))

# Plot Loss: train vs. test loss per epoch
# plt.subplot(1, 2, 1)
plt.plot(history["train_loss"], label="Train Loss")
plt.plot(history["test_loss"], label="Test Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Test Loss Over Epochs")
plt.legend()
plt.show()

# Plot R2 Score: train vs. test R2 per epoch
# plt.subplot(1, 2, 2)
plt.plot(history["train_r2"], label="Train R2")
plt.plot(history["test_r2"], label="Test R2")
plt.xlabel("Epoch")
plt.ylabel("R2 Score")
plt.title("Train/Test R2 Score Over Epochs")
plt.legend()
plt.show()

# plt.tight_layout()

# Save the weights as they are after the last epoch. `model` is the
# DataParallel wrapper, so the state dict is the wrapper's.
torch.save(model.state_dict(), "./models/final_model.pth")

# Save the history dict (metrics and best predictions) as a pickle
import pickle

with open('./models/history.pkl', 'wb') as f:
    pickle.dump(history, f)

# last run:
# Epoch 1/10, Train Loss: 0.2170, Test Loss: 0.1120, Test R2: 0.7158
# Epoch 2/10, Train Loss: 0.1062, Test Loss: 0.1018, Test R2: 0.7416

## One BERT

In [None]:
!mkdir data
!mkdir models
!wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1_o4xDSF6j95vAiYdd97VavyWq4EHHPdP&export=download&authuser=1&confirm=t' -O './data/dataset.csv'
import pandas as pd


df = pd.read_csv('./data/dataset.csv')
df.info()

In [None]:
import warnings
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import r2_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import gc

gc.collect()
torch.cuda.empty_cache()

# Suppress all FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Set the seed for reproducibility
def set_seed(seed: int) -> None:
    """Seed NumPy and PyTorch (CPU and CUDA) and make cuDNN deterministic."""
    for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Set the seed (SEED is reused below as random_state for the train/test split)
SEED = 42
set_seed(SEED)

# Hyperparameters
batch_size = 16
seq_length = 512  # max_length passed to the tokenizer; texts are padded/truncated to it
# Width of one encoder output vector. The MLP head takes 2 * hidden_size inputs,
# so this must match the last_hidden_state width of `model_name`.
hidden_size = 768  # BERT base hidden size
mlp_hidden_size = 256  # width of the MLP head's hidden layer
num_epochs = 10
learning_rate = 2e-5  # AdamW learning rate
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "cointegrated/LaBSE-en-ru"  # Hugging Face id for tokenizer and encoder


# Dataset for dual textual features
class DualTextDataset(Dataset):
    """Dataset serving two pre-tokenized text columns plus a float target.

    Both columns are tokenized in full when the dataset is constructed, and
    the number of encoded rows per column is printed. ``__getitem__`` only
    slices the stored tensors.
    """

    def __init__(self, df, text_col_1, text_col_2, targets, tokenizer, max_len):
        def encode(column):
            # Tokenize the whole column in one call, padded/truncated to max_len.
            return tokenizer(
                df[column].tolist(),
                max_length=max_len,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )

        self.tokenized_texts1 = encode(text_col_1)
        self.tokenized_texts2 = encode(text_col_2)
        self.targets = targets.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Debug output: row counts of both encodings
        print(f"len(self.tokenized_texts1['input_ids']): {len(self.tokenized_texts1['input_ids'])}")
        print(f"len(self.tokenized_texts2['input_ids']): {len(self.tokenized_texts2['input_ids'])}")

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        """Return ``(inputs1, inputs2, target)`` for row ``idx``."""
        inputs1, inputs2 = (
            {name: tensor[idx] for name, tensor in encoding.items()}
            for encoding in (self.tokenized_texts1, self.tokenized_texts2)
        )
        return inputs1, inputs2, torch.tensor(self.targets[idx], dtype=torch.float)


class SingleBERTWithMLP(nn.Module):
    """Regression model: one shared encoder applied to both texts, plus an MLP head.

    The encoder is loaded from the module-level ``model_name``. The
    first-token vectors of the two inputs are concatenated and mapped to a
    single output value.
    """

    def __init__(self, hidden_size, mlp_hidden_size):
        super().__init__()
        # One encoder, shared by both text inputs
        self.bert = AutoModel.from_pretrained(model_name)

        # Head input is the concatenation of two encoder vectors
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1),  # single regression output
        )

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        """Return the head output for a batch of paired inputs, one value per row."""
        encoded1 = self.bert(input_ids=input1, attention_mask=attention_mask1)
        encoded2 = self.bert(input_ids=input2, attention_mask=attention_mask2)

        # First-token (CLS) vector of each pass, joined on the feature axis
        first_tokens = [
            encoded.last_hidden_state[:, 0, :] for encoded in (encoded1, encoded2)
        ]
        return self.mlp(torch.cat(first_tokens, dim=-1))


# Tokenizer matching the encoder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Dataset and DataLoader
# Prepare data: two text columns as inputs, log_salary_from as the regression target
text_col_1 = 'description_no_numbers_v2'
text_col_2 = 'title_company_location_skills_source'
X = df[[text_col_1, text_col_2]]
y = df['log_salary_from']
# Split 80/20 with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Make datasets (each one tokenizes both of its columns up front)
train_dataset = DualTextDataset(X_train, text_col_1, text_col_2, y_train, tokenizer, seq_length)
test_dataset = DualTextDataset(X_test, text_col_1, text_col_2, y_test, tokenizer, seq_length)
# Make dataloaders: only the training data is shuffled
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model and wrap it in DataParallel before moving it to the device
model = SingleBERTWithMLP(hidden_size, mlp_hidden_size)
model = torch.nn.DataParallel(model).to(device)

# Optimizer and Loss Function (all parameters are optimized: encoder and head)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()  # For regression

# Per-epoch metrics, plus the best test R2 seen so far and the test
# predictions from that epoch
history = {"train_loss": [],
           "test_loss": [], 
           "train_r2": [],
           "test_r2": [],
           "max_test_r2": float('-inf'),
           "best_preds": []
           }

test_labels = y_test
test_preds = []
for epoch in range(num_epochs):
    # Training Phase
    model.train()
    train_losses = []
    all_preds = []
    all_labels = []
    for batch in train_dataloader:
        inputs1, inputs2, targets = batch
        input1 = inputs1["input_ids"].squeeze(1).to(device)
        attention_mask1 = inputs1["attention_mask"].squeeze(1).to(device)
        input2 = inputs2["input_ids"].squeeze(1).to(device)
        attention_mask2 = inputs2["attention_mask"].squeeze(1).to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(input1, attention_mask1, input2, attention_mask2)
        # Drop only the trailing size-1 output dim. A bare squeeze() would also
        # remove the batch dim when the last batch holds a single sample, making
        # the loss broadcast and extend() fail on a 0-d array.
        preds = outputs.squeeze(-1)
        loss = criterion(preds, targets)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(targets.cpu().numpy())

    # Epoch-level training metrics: mean batch loss and R2 over all training samples
    train_loss = np.mean(train_losses)
    train_r2 = r2_score(all_labels, all_preds)
    history["train_loss"].append(train_loss)
    history["train_r2"].append(train_r2)

    # Evaluation Phase
    model.eval()
    test_losses = []
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            inputs1, inputs2, targets = batch
            input1 = inputs1["input_ids"].squeeze(1).to(device)
            attention_mask1 = inputs1["attention_mask"].squeeze(1).to(device)
            input2 = inputs2["input_ids"].squeeze(1).to(device)
            attention_mask2 = inputs2["attention_mask"].squeeze(1).to(device)
            targets = targets.to(device)

            outputs = model(input1, attention_mask1, input2, attention_mask2)
            # Same single-sample-batch guard as in the training phase
            preds = outputs.squeeze(-1)
            loss = criterion(preds, targets)
            test_losses.append(loss.item())
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    test_loss = np.mean(test_losses)
    test_r2 = r2_score(all_labels, all_preds)

    history["test_loss"].append(test_loss)
    history["test_r2"].append(test_r2)

    # Keep the test predictions of the epoch with the best test R2 so far
    if test_r2 > history["max_test_r2"]:
        history["max_test_r2"] = test_r2
        history["best_preds"] = all_preds

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Train R2: {train_r2:.4f}, Test R2: {test_r2:.4f}")
    # Checkpoint after every epoch
    torch.save(model.state_dict(), f'./models/model_epoch_{epoch + 1}.pth')

# Plot the per-epoch metrics collected in `history` (one figure per metric)
# plt.figure(figsize=(12, 5))

# Plot Loss: train vs. test loss per epoch
# plt.subplot(1, 2, 1)
plt.plot(history["train_loss"], label="Train Loss")
plt.plot(history["test_loss"], label="Test Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Test Loss Over Epochs")
plt.legend()
plt.show()

# Plot R2 Score: train vs. test R2 per epoch
# plt.subplot(1, 2, 2)
plt.plot(history["train_r2"], label="Train R2")
plt.plot(history["test_r2"], label="Test R2")
plt.xlabel("Epoch")
plt.ylabel("R2 Score")
plt.title("Train/Test R2 Score Over Epochs")
plt.legend()
plt.show()

# plt.tight_layout()

# Save the weights as they are after the last epoch. `model` is the
# DataParallel wrapper, so the state dict is the wrapper's.
torch.save(model.state_dict(), "./models/final_model.pth")

# Save the history dict (metrics and best predictions) as a pickle
import pickle

with open('./models/history.pkl', 'wb') as f:
    pickle.dump(history, f)

# last run:
# Epoch 1/10, Train Loss: 0.2170, Test Loss: 0.1120, Test R2: 0.7158
# Epoch 2/10, Train Loss: 0.1062, Test Loss: 0.1018, Test R2: 0.7416

## Two BERTs + Huber loss

In [None]:
!mkdir data
!mkdir models
!wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1_o4xDSF6j95vAiYdd97VavyWq4EHHPdP&export=download&authuser=1&confirm=t' -O './data/dataset.csv'

import pandas as pd


df = pd.read_csv('./data/dataset.csv')
df.info()

In [None]:
import warnings
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import r2_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import gc

gc.collect()
torch.cuda.empty_cache()

# Suppress all FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Set the seed for reproducibility
def set_seed(seed: int) -> None:
    """Seed NumPy and PyTorch (CPU and CUDA) and make cuDNN deterministic."""
    for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Set the seed (SEED is reused below as random_state for the train/test split)
SEED = 42
set_seed(SEED)

# Hyperparameters
batch_size = 16
seq_length = 512  # max_length passed to the tokenizer; texts are padded/truncated to it
# Width of one encoder output vector. The MLP head takes 2 * hidden_size inputs,
# so this must match the last_hidden_state width of `model_name`.
hidden_size = 768  # BERT base hidden size
mlp_hidden_size = 256  # width of the MLP head's hidden layer
num_epochs = 10
learning_rate = 2e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "cointegrated/LaBSE-en-ru"  # Hugging Face id for tokenizer and encoders


# Dataset for dual textual features
class DualTextDataset(Dataset):
    """Dataset serving two pre-tokenized text columns plus a float target.

    Both columns are tokenized in full when the dataset is constructed;
    ``__getitem__`` only slices the stored tensors.
    """

    def __init__(self, df, text_col_1, text_col_2, targets, tokenizer, max_len):
        def encode(column):
            # Tokenize the whole column in one call, padded/truncated to max_len.
            return tokenizer(
                df[column].tolist(),
                max_length=max_len,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )

        self.tokenized_texts1 = encode(text_col_1)
        self.tokenized_texts2 = encode(text_col_2)
        self.targets = targets.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        """Return ``(inputs1, inputs2, target)`` for row ``idx``."""
        inputs1, inputs2 = (
            {name: tensor[idx] for name, tensor in encoding.items()}
            for encoding in (self.tokenized_texts1, self.tokenized_texts2)
        )
        return inputs1, inputs2, torch.tensor(self.targets[idx], dtype=torch.float)

# Define the dual BERT model with an MLP head
class DualBERTWithMLP(nn.Module):
    """Two separate encoders whose first-position ([CLS]) states feed an MLP regressor."""

    def __init__(self, hidden_size, mlp_hidden_size):
        """Load both encoders and build the regression head.

        Args:
            hidden_size: Width of one encoder's output vector; the MLP input
                is twice this because the two vectors are concatenated.
            mlp_hidden_size: Width of the MLP's single hidden layer.
        """
        super().__init__()
        # One encoder per text field, each loaded separately from the
        # module-level `model_name` checkpoint
        self.bert1 = AutoModel.from_pretrained(model_name)
        self.bert2 = AutoModel.from_pretrained(model_name)

        # Regression head: Linear -> ReLU -> Linear with a single output
        head_layers = [
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1),
        ]
        self.mlp = nn.Sequential(*head_layers)

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        """Encode both inputs and regress from their concatenated CLS vectors.

        Args:
            input1: Token ids passed to the first encoder.
            attention_mask1: Attention mask passed to the first encoder.
            input2: Token ids passed to the second encoder.
            attention_mask2: Attention mask passed to the second encoder.

        Returns:
            Output of the MLP head (last dimension has size 1).
        """
        # Run each encoder on its own field and keep the hidden state at position 0
        cls_vectors = [
            encoder(input_ids=ids, attention_mask=mask).last_hidden_state[:, 0, :]
            for encoder, ids, mask in (
                (self.bert1, input1, attention_mask1),
                (self.bert2, input2, attention_mask2),
            )
        ]
        # Concatenate along the feature axis: [batch, 2 * hidden_size]
        return self.mlp(torch.cat(cls_vectors, dim=-1))

# One tokenizer serves both text fields
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Feature columns and regression target taken from `df`
text_col_1 = 'description_no_numbers_v2'
text_col_2 = 'title_company_location_skills_source'
X = df[[text_col_1, text_col_2]]
y = df['log_salary_from']

# Hold out 20% of the rows; SEED makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Build one dataset per split (tokenization happens inside the constructor)
train_dataset, test_dataset = (
    DualTextDataset(features, text_col_1, text_col_2, labels, tokenizer, seq_length)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

# Only the training loader shuffles; the test loader keeps row order
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Dual-encoder model wrapped in DataParallel and moved to the selected device
model = torch.nn.DataParallel(DualBERTWithMLP(hidden_size, mlp_hidden_size)).to(device)

# AdamW over every parameter (both encoders and the MLP head), Huber loss as objective
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.HuberLoss()

import os  # local to this cell: only needed to create the checkpoint folder

# Training and evaluation loop.
# `history` keeps one entry per epoch for each metric, plus the best test R2
# seen so far and the test predictions of the epoch that produced it.
history = {"train_loss": [],
           "test_loss": [],
           "train_r2": [],
           "test_r2": [],
           "max_test_r2": float('-inf'),
           "best_preds": []
           }

test_labels = y_test
test_preds = []

# torch.save does not create missing parent directories; create the folder up
# front so a missing ./models cannot abort the run after a full epoch
os.makedirs('./models', exist_ok=True)

for epoch in range(num_epochs):
    # Training Phase
    model.train()
    train_losses = []
    all_preds = []
    all_labels = []
    for batch in train_dataloader:
        inputs1, inputs2, targets = batch
        # Rows produced by DualTextDataset are 1-D, so the collated batches are
        # already [batch, seq_length] and need no squeeze
        input1 = inputs1["input_ids"].to(device)
        attention_mask1 = inputs1["attention_mask"].to(device)
        input2 = inputs2["input_ids"].to(device)
        attention_mask2 = inputs2["attention_mask"].to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(input1, attention_mask1, input2, attention_mask2)
        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # collapse the batch dimension of a one-sample batch into a 0-d tensor,
        # which extend() below cannot iterate.
        batch_preds = outputs.squeeze(-1)
        loss = criterion(batch_preds, targets)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
        all_preds.extend(batch_preds.cpu().detach().numpy())
        all_labels.extend(targets.cpu().numpy())

    # Train metrics are gathered while the weights are still being updated
    train_loss = np.mean(train_losses)
    train_r2 = r2_score(all_labels, all_preds)
    history["train_loss"].append(train_loss)
    history["train_r2"].append(train_r2)

    # Evaluation Phase
    model.eval()
    test_losses = []
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            inputs1, inputs2, targets = batch
            input1 = inputs1["input_ids"].to(device)
            attention_mask1 = inputs1["attention_mask"].to(device)
            input2 = inputs2["input_ids"].to(device)
            attention_mask2 = inputs2["attention_mask"].to(device)
            targets = targets.to(device)

            outputs = model(input1, attention_mask1, input2, attention_mask2)
            batch_preds = outputs.squeeze(-1)  # keep the batch dimension (see above)
            loss = criterion(batch_preds, targets)
            test_losses.append(loss.item())
            all_preds.extend(batch_preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    test_loss = np.mean(test_losses)
    test_r2 = r2_score(all_labels, all_preds)

    history["test_loss"].append(test_loss)
    history["test_r2"].append(test_r2)

    # Remember the predictions of the best epoch so far (by test R2)
    if test_r2 > history["max_test_r2"]:
        history["max_test_r2"] = test_r2
        history["best_preds"] = all_preds

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Train R2: {train_r2:.4f}, Test R2: {test_r2:.4f}")
    # One checkpoint per epoch
    torch.save(model.state_dict(), f'./models/model_epoch_{epoch + 1}.pth')

# Draw one figure per metric, overlaying the train and test curves across epochs.
# To place both panels in a single figure instead, open it with
# plt.figure(figsize=(12, 5)), select plt.subplot(1, 2, k) before each panel,
# and finish with plt.tight_layout().
for train_key, test_key, train_label, test_label, y_label, title in (
    ("train_loss", "test_loss", "Train Loss", "Test Loss", "Loss", "Training and Test Loss Over Epochs"),
    ("train_r2", "test_r2", "Train R2", "Test R2", "R2 Score", "Train/Test R2 Score Over Epochs"),
):
    plt.plot(history[train_key], label=train_label)
    plt.plot(history[test_key], label=test_label)
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.show()

import pickle

# Write the weights as they stand after the last epoch
final_weights = model.state_dict()
torch.save(final_weights, "./models/final_model.pth")

# Store the collected metric history next to the weights
with open('./models/history.pkl', 'wb') as history_file:
    pickle.dump(history, history_file)

# last run:
# Epoch 1/10, Train Loss: 0.2170, Test Loss: 0.1120, Test R2: 0.7158
# Epoch 2/10, Train Loss: 0.1062, Test Loss: 0.1018, Test R2: 0.7416

## Two BERTs + mean pooling

In [None]:
!mkdir data
!mkdir models
!wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1_o4xDSF6j95vAiYdd97VavyWq4EHHPdP&export=download&authuser=1&confirm=t' -O './data/dataset.csv'

import pandas as pd


df = pd.read_csv('./data/dataset.csv')
df.info()

In [None]:
import warnings
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import r2_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import gc

# Silence FutureWarning messages from here on
warnings.simplefilter('ignore', FutureWarning)

# Collect unreachable Python objects, then release the unused memory held by
# PyTorch's CUDA caching allocator
gc.collect()
torch.cuda.empty_cache()


# Set the seed for reproducibility
def set_seed(seed: int) -> None:
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), and switches cuDNN to deterministic mode with autotuning
    (benchmark) turned off.

    Args:
        seed: Value passed to each generator.
    """
    # Local import: this cell does not import `random` at its top
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers every GPU in multi-GPU runs
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fix the random state before any data or model object is created
SEED = 42
set_seed(SEED)

# --- Batching / tokenization ---
batch_size = 16
seq_length = 512  # handed to the dataset as the tokenizer's max_length

# --- Model ---
model_name = "cointegrated/LaBSE-en-ru"
hidden_size = 768  # BERT base hidden size
mlp_hidden_size = 256

# --- Optimization ---
num_epochs = 10
learning_rate = 2e-5

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, weighting each position by its mask value.

    Args:
        model_output: Indexable whose element 0 holds the token embeddings
            (assumed [batch, seq, dim] — confirm against the encoder used).
        attention_mask: Mask with one value per token position; it is
            expanded to the embeddings' shape and cast to float.

    Returns:
        The mask-weighted sum of the embeddings divided by the mask sum,
        both taken over dimension 1.
    """
    embeddings = model_output[0]
    # Repeat the mask across the feature axis so it can weight every component
    weights = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_total = (embeddings * weights).sum(1)
    # Clamp the denominator so a fully masked row divides by 1e-9, not by zero
    weight_total = weights.sum(1).clamp(min=1e-9)
    return weighted_total / weight_total


# Dataset for dual textual features
class DualTextDataset(Dataset):
    """Dataset yielding two tokenized texts and one float target per row.

    Both text columns are tokenized for the whole frame when the dataset is
    constructed; indexing afterwards only slices the stored encodings.
    """

    def __init__(self, df, text_col_1, text_col_2, targets, tokenizer, max_len):
        """Tokenize both text columns and store the targets.

        Args:
            df: Frame holding the two text columns.
            text_col_1: Name of the first text column in ``df``.
            text_col_2: Name of the second text column in ``df``.
            targets: Regression targets; must provide ``.tolist()``.
            tokenizer: Callable applied to a list of strings with
                ``max_length``, ``padding``, ``truncation`` and
                ``return_tensors`` keyword arguments; its result must
                support ``.items()``.
            max_len: Forwarded to the tokenizer as ``max_length``.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Ask the tokenizer to pad/truncate to max_len and return PyTorch tensors
        encode_options = dict(max_length=max_len, padding="max_length", truncation=True, return_tensors="pt")
        self.tokenized_texts1 = tokenizer(df[text_col_1].tolist(), **encode_options)
        self.tokenized_texts2 = tokenizer(df[text_col_2].tolist(), **encode_options)
        self.targets = targets.tolist()

    def __len__(self):
        """Return the number of stored targets."""
        return len(self.targets)

    @staticmethod
    def _row(encoding, idx):
        """Pick entry ``idx`` out of every field of a batch encoding."""
        return {field: values[idx] for field, values in encoding.items()}

    def __getitem__(self, idx):
        """Return ``(inputs1, inputs2, target)`` for position ``idx``."""
        first = self._row(self.tokenized_texts1, idx)
        second = self._row(self.tokenized_texts2, idx)
        label = torch.tensor(self.targets[idx], dtype=torch.float)
        return first, second, label

# Define the dual BERT model with an MLP head
class DualBERTWithMLP(nn.Module):
    """Two separate encoders whose mean-pooled outputs feed an MLP regressor."""

    def __init__(self, hidden_size, mlp_hidden_size):
        """Load both encoders and build the regression head.

        Args:
            hidden_size: Width of one encoder's pooled vector; the MLP input
                is twice this because the two vectors are concatenated.
            mlp_hidden_size: Width of the MLP's single hidden layer.
        """
        super().__init__()
        # One encoder per text field, each loaded separately from the
        # module-level `model_name` checkpoint
        self.bert1 = AutoModel.from_pretrained(model_name)
        self.bert2 = AutoModel.from_pretrained(model_name)

        # Regression head: Linear -> ReLU -> Linear with a single output
        head_layers = [
            nn.Linear(2 * hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, 1),
        ]
        self.mlp = nn.Sequential(*head_layers)

    def forward(self, input1, attention_mask1, input2, attention_mask2):
        """Encode both inputs, mean-pool each, and regress from the concatenation.

        Args:
            input1: Token ids passed to the first encoder.
            attention_mask1: Attention mask for the first encoder and its pooling.
            input2: Token ids passed to the second encoder.
            attention_mask2: Attention mask for the second encoder and its pooling.

        Returns:
            Output of the MLP head (last dimension has size 1).
        """
        pooled = []
        # First pass uses bert1 on field 1, second pass uses bert2 on field 2;
        # each encoder output is reduced with mean_pooling under its own mask
        for encoder, ids, mask in (
            (self.bert1, input1, attention_mask1),
            (self.bert2, input2, attention_mask2),
        ):
            pooled.append(mean_pooling(encoder(input_ids=ids, attention_mask=mask), mask))

        # Concatenate along the feature axis: [batch, 2 * hidden_size]
        return self.mlp(torch.cat(pooled, dim=-1))

# One tokenizer serves both text fields
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Feature columns and regression target taken from `df`
text_col_1 = 'description_no_numbers_v2'
text_col_2 = 'title_company_location_skills_source'
X = df[[text_col_1, text_col_2]]
y = df['log_salary_from']

# Hold out 20% of the rows; SEED makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Build one dataset per split (tokenization happens inside the constructor)
train_dataset, test_dataset = (
    DualTextDataset(features, text_col_1, text_col_2, labels, tokenizer, seq_length)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

# Only the training loader shuffles; the test loader keeps row order
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Dual-encoder model wrapped in DataParallel and moved to the selected device
model = torch.nn.DataParallel(DualBERTWithMLP(hidden_size, mlp_hidden_size)).to(device)

# AdamW over every parameter (both encoders and the MLP head), MSE as the regression objective
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

import os  # local to this cell: only needed to create the checkpoint folder

# Training and evaluation loop.
# `history` keeps one entry per epoch for each metric, plus the best test R2
# seen so far and the test predictions of the epoch that produced it.
history = {"train_loss": [],
           "test_loss": [],
           "train_r2": [],
           "test_r2": [],
           "max_test_r2": float('-inf'),
           "best_preds": []
           }

test_labels = y_test
test_preds = []

# torch.save does not create missing parent directories; make sure the
# checkpoint folder exists before any training time is spent
os.makedirs('./models', exist_ok=True)

for epoch in range(num_epochs):
    # Training Phase
    model.train()
    train_losses = []
    all_preds = []
    all_labels = []
    for batch in train_dataloader:
        inputs1, inputs2, targets = batch
        # Rows produced by DualTextDataset are 1-D, so the collated batches are
        # already [batch, seq_length] and need no squeeze
        input1 = inputs1["input_ids"].to(device)
        attention_mask1 = inputs1["attention_mask"].to(device)
        input2 = inputs2["input_ids"].to(device)
        attention_mask2 = inputs2["attention_mask"].to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(input1, attention_mask1, input2, attention_mask2)
        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # collapse the batch dimension of a one-sample batch into a 0-d tensor,
        # which extend() below cannot iterate.
        batch_preds = outputs.squeeze(-1)
        loss = criterion(batch_preds, targets)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
        all_preds.extend(batch_preds.cpu().detach().numpy())
        all_labels.extend(targets.cpu().numpy())

    # Train metrics are gathered while the weights are still being updated
    train_loss = np.mean(train_losses)
    train_r2 = r2_score(all_labels, all_preds)
    history["train_loss"].append(train_loss)
    history["train_r2"].append(train_r2)

    # Evaluation Phase
    model.eval()
    test_losses = []
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            inputs1, inputs2, targets = batch
            input1 = inputs1["input_ids"].to(device)
            attention_mask1 = inputs1["attention_mask"].to(device)
            input2 = inputs2["input_ids"].to(device)
            attention_mask2 = inputs2["attention_mask"].to(device)
            targets = targets.to(device)

            outputs = model(input1, attention_mask1, input2, attention_mask2)
            batch_preds = outputs.squeeze(-1)  # keep the batch dimension (see above)
            loss = criterion(batch_preds, targets)
            test_losses.append(loss.item())
            all_preds.extend(batch_preds.cpu().numpy())
            all_labels.extend(targets.cpu().numpy())

    test_loss = np.mean(test_losses)
    test_r2 = r2_score(all_labels, all_preds)

    history["test_loss"].append(test_loss)
    history["test_r2"].append(test_r2)

    # Remember the predictions of the best epoch so far (by test R2)
    if test_r2 > history["max_test_r2"]:
        history["max_test_r2"] = test_r2
        history["best_preds"] = all_preds

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Train R2: {train_r2:.4f}, Test R2: {test_r2:.4f}")
    # One checkpoint per epoch
    torch.save(model.state_dict(), f'./models/model_epoch_{epoch + 1}.pth')

# Draw one figure per metric, overlaying the train and test curves across epochs.
# To place both panels in a single figure instead, open it with
# plt.figure(figsize=(12, 5)), select plt.subplot(1, 2, k) before each panel,
# and finish with plt.tight_layout().
for train_key, test_key, train_label, test_label, y_label, title in (
    ("train_loss", "test_loss", "Train Loss", "Test Loss", "Loss", "Training and Test Loss Over Epochs"),
    ("train_r2", "test_r2", "Train R2", "Test R2", "R2 Score", "Train/Test R2 Score Over Epochs"),
):
    plt.plot(history[train_key], label=train_label)
    plt.plot(history[test_key], label=test_label)
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.show()

import pickle

# Write the weights as they stand after the last epoch
final_weights = model.state_dict()
torch.save(final_weights, "./models/final_model.pth")

# Store the collected metric history next to the weights
with open('./models/history.pkl', 'wb') as history_file:
    pickle.dump(history, history_file)

# last run:
# Epoch 1/10, Train Loss: 0.2170, Test Loss: 0.1120, Test R2: 0.7158
# Epoch 2/10, Train Loss: 0.1062, Test Loss: 0.1018, Test R2: 0.7416

## Regression on the [MASK] token

## TSDAE 