In [1]:
# Mount Google Drive (Colab only) so that files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [53]:
import pandas as pd
import numpy as np
import torch
import optuna
from torch import nn
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from torch.optim import Adam
from torch.utils.data import DataLoader
from sentence_transformers.readers import InputExample
from IPython.display import clear_output

In [3]:
# Read the vacancy/resume pairs and binarise the outcome column:
# 'confirmed' becomes 1, every other value becomes 0.
train_data = pd.read_csv('/content/drive/MyDrive/hhhack24/data/output.csv')
train_data['Target'] = train_data['Target'].eq('confirmed').astype('int64')
train_data.sample(2)

Unnamed: 0,Vacancy UUID,Vacancy Name,Keywords,Description,Comment,Resume UUID,First Name,Last Name,Birth Date,Country,...,Position,Experience Description,Year,Organization,Faculty,Specialty,Result,Education Type,Education Level,Target
118,9d98eba0-13bb-38d3-b742-4fd445954b3d,Product manager,,"- Продактов в компании сейчас порядка 250, вс...",,138b7d7e-cb84-35e7-8dd7-325b010294ed,София,Куликова,1986-01-01,Россия,...,Head of Product / Руководитель продуктового на...,МЭШ (Московская Электронная Школа) - единая об...,,,,,,,,0
622,aecfdaf6-e12c-3309-8f1b-157028ef63d5,Java-разработчик,,Опыт работы с java от 3 лет Уверенные знания ...,,71f5a179-11bd-31c8-b1e6-fcde910885b7,Дарья,Лаврентьева,,Россия,...,Team Lead,Бэкенд-разработка высоконагруженных микросерв...,2024.0,Московский государственный технический универс...,ИУ,10.05.07 - «Противодействие техническим развед...,,Основное,Высшее,0


In [4]:
def _join_present_fields(row, fields):
    """Render ``"<label>: <value>"`` for each non-missing field and join with spaces.

    Args:
        row: Record indexed by column name (here one DataFrame row).
        fields: Iterable of ``(label, column)`` pairs, in output order.

    Returns:
        One string containing only the fields whose value is present.
    """
    parts = []
    for label, column in fields:
        value = row[column]
        # Empty CSV cells are loaded by pandas as NaN, which formats as "nan";
        # comparing the rendered text with 'None' alone therefore never dropped
        # a missing field. The literal 'None' is still skipped for compatibility.
        if pd.isna(value) or str(value) == 'None':
            continue
        parts.append(f"{label}: {value}")
    return " ".join(parts)


# (label, column) pairs describing how a vacancy is rendered as text.
VACANCY_FIELDS = [
    ("Название вакансии", 'Vacancy Name'),
    ("Описание", 'Description'),
]

# (label, column) pairs describing how a resume is rendered as text.
RESUME_FIELDS = [
    ("Дата рождения", 'Birth Date'),
    ("Страна", 'Country'),
    ("Город", 'City'),
    ("Ключевые навыки", 'Key Skills'),
    ("Должность", 'Position'),
    ("Описание опыта", 'Experience Description'),
    ("Организация", 'Organization'),
    ("Факультет", 'Faculty'),
    ("Специальность", 'Specialty'),
    ("Уровень образования", 'Education Level'),
]

# Three parallel lists: vacancy text, resume text and binary target per row.
vacancies_train_examples = []
resumes_train_examples = []
labels = []

for _, example in train_data.iterrows():
    vacancies_train_examples.append(_join_present_fields(example, VACANCY_FIELDS))
    resumes_train_examples.append(_join_present_fields(example, RESUME_FIELDS))
    labels.append(example['Target'])

In [5]:
# Sanity check: the three parallel lists should all have the same length.
len(vacancies_train_examples), len(resumes_train_examples), len(labels)

(656, 656, 656)

In [6]:
# Pair every vacancy with its resume so that a single split keeps both texts
# and the label aligned.
combined_features = list(zip(vacancies_train_examples, resumes_train_examples))

# Hold out 20% of the pairs for validation; the fixed random_state keeps the
# split repeatable.
train_features, valid_features, train_labels, valid_labels = train_test_split(
    combined_features, labels, test_size=0.2, random_state=42
)

# Unzip the (vacancy, resume) pairs back into separate lists.
vacancies_train, resumes_train = map(list, zip(*train_features))
vacancies_valid, resumes_valid = map(list, zip(*valid_features))

print(f"Train set size: {len(vacancies_train)}")
print(f"Valid set size: {len(vacancies_valid)}")

Train set size: 524
Valid set size: 132


In [11]:
class DuoDataset(Dataset):
    """Dataset of aligned text pairs with an optional float label per pair.

    Args:
        text1: Indexable, sized sequence holding the first text of each pair.
        text2: Indexable, sized sequence holding the second text of each pair.
        labels: Optional sequence of numeric targets, one per pair. When it is
            omitted or ``None`` every item carries a placeholder label of 0.0.

    Raises:
        ValueError: If ``text2`` or ``labels`` has a different length than
            ``text1``.
    """

    def __init__(self, text1, text2, labels=None):
        # Fail early on misaligned inputs instead of raising an IndexError (or
        # silently pairing the wrong rows) in the middle of an epoch.
        if len(text1) != len(text2):
            raise ValueError(
                f"text1 and text2 must have the same length, got {len(text1)} and {len(text2)}"
            )
        if labels is not None and len(labels) != len(text1):
            raise ValueError(
                f"labels must have one entry per pair, got {len(labels)} for {len(text1)} pairs"
            )

        self.text1 = text1
        self.text2 = text2
        self.labels = torch.tensor(labels, dtype=torch.float32) if labels is not None else None

    def __len__(self):
        """Return the number of text pairs."""
        return len(self.text1)

    def __getitem__(self, idx):
        """Return ``(text1[idx], text2[idx], label)``; label is 0.0 when unlabeled."""
        if self.labels is not None:
            label = self.labels[idx]
        else:
            # Placeholder so that labeled and unlabeled items share one shape.
            label = torch.tensor(0, dtype=torch.float32)
        return self.text1[idx], self.text2[idx], label

# Resumes are passed as the first text and vacancies as the second, so every
# batch unpacks as (resume, vacancy, label).
train_dataset = DuoDataset(text1=resumes_train, text2=vacancies_train, labels=train_labels)
val_dataset = DuoDataset(text1=resumes_valid, text2=vacancies_valid, labels=valid_labels)

# Only the training batches are shuffled; validation keeps its original order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=4, shuffle=False)

In [134]:
# Seed torch so that batch shuffling and dropout are repeatable across runs.
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on a CPU-only runtime.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# One encoder is shared by resumes and vacancies.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2").to(DEVICE)

EPOCHS = 50  # upper bound; the training loop may stop earlier
optimizer = Adam(model.parameters(), lr=1e-6)

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the attention mask.

    Args:
        model_output: Indexable model output whose element 0 holds the token
            embeddings.
        attention_mask: Mask with one entry per token; it is broadcast to the
            shape of the token embeddings and used as averaging weights.

    Returns:
        Sum of the masked embeddings over dimension 1 divided by the mask sum
        over dimension 1 (the divisor is clamped to at least 1e-9).
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    # Clamp so that a fully masked row does not divide by zero.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

class ContrastiveLoss(nn.Module):
    """Cosine-similarity contrastive loss for labeled embedding pairs.

    Pairs with label 1 are penalised by ``1 - cos_sim``; pairs with label 0 are
    penalised by ``max(0, cos_sim - margin)``. The result is the mean over the
    batch.
    """

    def __init__(self, margin=0.3):
        super().__init__()
        # Similarity above which a label-0 pair starts to incur a penalty.
        self.margin = margin

    def forward(self, y1, y2, label):
        """Return the mean contrastive loss of embeddings ``y1``/``y2`` under ``label``."""
        similarity = nn.functional.cosine_similarity(y1, y2)

        # label == 1: pull the pair together.
        pull_term = (1 - similarity) * label
        # label == 0: push the pair apart only while similarity exceeds the margin.
        push_term = torch.clamp(similarity - self.margin, min=0) * (1 - label)

        return (pull_term + push_term).mean()

# Loss instance used by the training and validation loops; the margin of 0.35
# overrides the class default of 0.3.
criterion = ContrastiveLoss(margin=0.35)

In [None]:
def _batch_loss(resume_texts, vacancy_texts, targets):
    """Encode one batch of (resume, vacancy) texts and return its loss.

    Both sides go through the shared ``model``, are mean-pooled with their
    attention masks and scored by ``criterion`` against ``targets``.
    """
    targets = targets.to(DEVICE)
    resume_input = tokenizer(resume_texts, padding=True, truncation=True, return_tensors="pt").to(DEVICE)
    vacancy_input = tokenizer(vacancy_texts, padding=True, truncation=True, return_tensors="pt").to(DEVICE)

    resume_embeddings = mean_pooling(model(**resume_input), resume_input['attention_mask'])
    vacancy_embeddings = mean_pooling(model(**vacancy_input), vacancy_input['attention_mask'])

    return criterion(resume_embeddings, vacancy_embeddings, targets)


best_valid_loss = float('inf')
epochs_no_improve = 0
early_stop = 10  # stop after this many consecutive epochs without improvement

train_losses, valid_losses = [], []

for epoch in tqdm(range(EPOCHS)):
    # Training step
    train_batch_losses = []
    model.train()
    for resume, vacancy, batch_labels in train_dataloader:
        # Reset gradients first: without this every step would apply the sum of
        # all gradients computed since the start of training.
        optimizer.zero_grad()
        loss = _batch_loss(resume, vacancy, batch_labels)
        loss.backward()
        optimizer.step()
        train_batch_losses.append(loss.item())

    # Validation step
    valid_batch_losses = []
    model.eval()
    with torch.no_grad():
        for resume, vacancy, batch_labels in val_dataloader:
            loss = _batch_loss(resume, vacancy, batch_labels)
            valid_batch_losses.append(loss.item())

    average_train_loss = sum(train_batch_losses) / len(train_batch_losses)
    average_valid_loss = sum(valid_batch_losses) / len(valid_batch_losses)
    train_losses.append(average_train_loss)
    valid_losses.append(average_valid_loss)

    # Early stopping and saving best model
    if average_valid_loss < best_valid_loss:
        best_valid_loss = average_valid_loss
        epochs_no_improve = 0
        # One checkpoint file is written per improving epoch.
        torch.save(model.state_dict(), f'/content/drive/MyDrive/hhhack24/data/epoch_{epoch+1}_model_weights.pth')
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= early_stop:
            print(f'Early stopping at epoch {epoch + 1}, no improvement for {early_stop} epochs')
            break

    print(f"\nEpoch {epoch+1}/{EPOCHS}, Train Loss: {average_train_loss:.4f}, Validation Loss: {average_valid_loss:.4f}")

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Per-batch lists of cosine similarities and of the matching targets.
eval_preds, eval_labels = [], []

# NOTE(review): the training cell saves checkpoints under
# /content/drive/MyDrive/hhhack24/data/, whereas this path points at /content/
# with a fixed epoch number -- confirm the file exists before running.
# map_location lets the checkpoint load on whichever device DEVICE names.
state_dict = torch.load('/content/epoch_9_model_weights.pth', map_location=DEVICE)
model.load_state_dict(state_dict)
model.to(DEVICE)
model.eval()

with torch.no_grad():
    for batch in tqdm(val_dataloader):
        clear_output()
        # Named batch_labels so the global `labels` list that feeds the
        # train/valid split is not overwritten by a batch tensor.
        texts1, texts2, batch_labels = batch
        inp1 = tokenizer(texts1, padding=True, truncation=True,
                         return_tensors='pt').to(DEVICE)
        inp2 = tokenizer(texts2, padding=True, truncation=True,
                         return_tensors='pt').to(DEVICE)

        emb1 = mean_pooling(model(**inp1), inp1['attention_mask'])
        emb2 = mean_pooling(model(**inp2), inp2['attention_mask'])

        # The raw cosine similarity is the score; it is thresholded later.
        cos_sim = nn.functional.cosine_similarity(emb1, emb2, dim=1)
        eval_preds.append(cos_sim.cpu().tolist())
        eval_labels.append(batch_labels.cpu().tolist())

In [None]:
# Collapse the per-batch lists into one flat list of scores and one of targets.
eval_preds_flat = []
for scores_in_batch in eval_preds:
    eval_preds_flat.extend(scores_in_batch)

eval_labels_flat = []
for targets_in_batch in eval_labels:
    eval_labels_flat.extend(targets_in_batch)

In [None]:
def objective(trial):
    """Return the F1 score obtained by thresholding the similarity scores.

    The trial proposes a cut-off ``thresh`` in [0.0, 1.0]; scores strictly above
    it are mapped to 1, all others to 0, and the result is compared with
    ``eval_labels_flat``.
    """
    cutoff = trial.suggest_float('thresh', 0.0, 1.0)
    hard_labels = [1 if score > cutoff else 0 for score in eval_preds_flat]
    return f1_score(eval_labels_flat, hard_labels)

# Seed the sampler so repeated runs propose the same thresholds and report the
# same best value.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=200)

best_thresh = study.best_params['thresh']
print(f'Best Threshold: {best_thresh}')

# NOTE(review): the threshold is tuned and scored on the same predictions, so
# this F1 is an optimistic estimate.
binary_preds_optimized = [int(pred > best_thresh) for pred in eval_preds_flat]
print(f'Optimized F1-Score: {f1_score(eval_labels_flat, binary_preds_optimized):.4f}')

In [None]:
# Report accuracy and the per-class metrics for the thresholded predictions.
accuracy = accuracy_score(
    eval_labels_flat,
    binary_preds_optimized,
)
print(f'Accuracy: {accuracy:.4f}')
print(
    classification_report(
        eval_labels_flat,
        binary_preds_optimized,
        target_names=['Class 0', 'Class 1'],
    )
)