In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cosinesim-ft-talent-v1/COSINESIM_FINETUNE_DATASET_16k_final.csv
/kaggle/input/ml-talent-hack-parsedtrain/output.csv


## **paraphrase-multilingual-mpnet-base-v2 (Multilingual XLM-roBERTa) FINE-TUNING v2**

# Here, we assume our dataset for fine-tuning is fully ready (the text corpora are preprocessed and the cosine similarity is calculated):

In [2]:
# Load the prepared fine-tuning pairs (the preview below shows two text columns
# and a COSINE_SIM column).
df = pd.read_csv('/kaggle/input/cosinesim-ft-talent-v1/COSINESIM_FINETUNE_DATASET_16k_final.csv')
# Unseeded random preview: no random_state is passed, so the rows shown change between runs.
df.sample(3)

Unnamed: 0,CONCATED_x,CONCATED_y,COSINE_SIM
9016,ЗП: 140000 руб.. Ищет работу на должность:: Пр...,title: Директор филиала в Екатеринбург. salary...,0.495071
2780,ЗП: 139999 руб.. Ищет работу на должность:: Пр...,"title: Ведущий маркетолог Шоу бар, Спа. salary...",0.495395
15689,ЗП: 100000 руб.. Ищет работу на должность:: Ру...,title: DevOps инженер. salary: з/п не указана....,0.596939


## Let's import and initialize everything we need:

In [3]:
# !pip install sentence-transformers  -- not needed in our case (everything is written in plain PyTorch)

In [4]:
from transformers import AutoTokenizer, AutoModel
# from sentence_transformers.losses import ContrastiveLoss
# from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn

# The encoder and its tokenizer are both loaded from the same checkpoint name.
MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

#for clean PyTorch this can be used:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model output; element 0 is taken as the
            per-token embeddings (assumed (batch, seq, dim) -- the reduction
            runs over dim 1).
        attention_mask: Mask with one entry per token; it is expanded along a
            new last axis to the embeddings' size and cast to float.

    Returns:
        Tensor of mask-weighted mean embeddings (dim 1 reduced away).
    """
    hidden = model_output[0]
    # Broadcast the mask over the embedding dimension so masked tokens contribute zero.
    weights = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    numerator = (hidden * weights).sum(dim=1)
    # Clamp the token count so an all-masked row divides by 1e-9 instead of zero.
    denominator = weights.sum(dim=1).clamp(min=1e-9)
    return numerator / denominator

#for simplicity let's use one already incl. pooling
# model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

### Case without splitting the texts into separate corpora: we fine-tune on the similarity of full-text embeddings after dropping several columns (see the README for reference):

In [5]:
# Read the parsed validation table and show its column names as a plain list.
val_data = pd.read_csv('/kaggle/input/ml-talent-hack-parsedtrain/output.csv')
cols = val_data.columns.tolist()
print(cols)

['Vacancy UUID', 'Vacancy Name', 'Keywords', 'Description', 'Comment', 'Resume UUID', 'First Name', 'Last Name', 'Birth Date', 'Country', 'City', 'About', 'Key Skills', 'Starts', 'Ends', 'Employer', 'Experience City', 'Position', 'Experience Description', 'Year', 'Organization', 'Faculty', 'Specialty', 'Result', 'Education Type', 'Education Level', 'Target']


In [6]:
# Columns removed from the resume view, picked by position in `cols`:
# 0-8 (vacancy fields, resume UUID, names, birth date), 13-15 (Starts, Ends,
# Employer), 19 (Year) and the last column (Target).
cols_drop1 = [*cols[:9], *cols[13:16], cols[19], cols[-1]]
resumes_val = val_data.drop(columns=cols_drop1)

# Vacancy view: positions 1-4 (Vacancy Name, Keywords, Description, Comment).
vac_val = val_data.iloc[:, 1:5]

# One-hot encode the last column and drop the 'failed' indicator, leaving the
# remaining indicator column(s) as integer labels.
target_dummies = pd.get_dummies(val_data.iloc[:, -1]).astype(int)
labels_val = np.array(target_dummies.drop(columns='failed'))

In [7]:
def _join_row_text(frame):
    """Concatenate the non-missing cells of every row into one string.

    Each kept cell is converted with ``str`` and followed by ". ", in column
    order. A row whose cells are all missing yields an empty string.

    Args:
        frame: DataFrame whose rows are turned into text.

    Returns:
        List of strings, one per row of ``frame``, in row order.
    """
    texts = []
    # itertuples walks each row once instead of a scalar .iloc lookup per cell.
    for row in frame.itertuples(index=False, name=None):
        texts.append("".join(f"{value}. " for value in row if not pd.isna(value)))
    return texts


# Same text construction for both sides of the pair (previously two copies of the loop).
resumes_val_text = _join_row_text(resumes_val)
vac_val_text = _join_row_text(vac_val)

In [8]:
# Sanity check: the two text lists and the label array should have the same length.
len(vac_val_text), len(resumes_val_text), len(labels_val)

(656, 656, 656)

In [9]:
from torch.utils.data import DataLoader, Dataset

class DuoDataset(Dataset):
    """Dataset of aligned text pairs with optional per-pair labels.

    Args:
        text1: Sequence of first-side texts.
        text2: Sequence of second-side texts, aligned with ``text1``.
        labels: Optional per-pair labels. ``None`` or an empty sequence means
            the dataset is unlabelled.

    Raises:
        ValueError: If ``text1`` and ``text2`` differ in length.

    Items are ``(text1, text2, label)`` when labels were given and
    ``(text1, text2)`` otherwise.
    """

    def __init__(self, text1, text2, labels=None):
        self.text1 = np.array(text1)
        self.text2 = np.array(text2)
        if len(self.text1) != len(self.text2):
            raise ValueError(
                f"text1 and text2 must have the same length, "
                f"got {len(self.text1)} and {len(self.text2)}"
            )

        # Always define self.labels so __getitem__ can test it. The previous
        # `len(labels)` check crashed on the default labels=None and left the
        # attribute unset for empty labels.
        if labels is None or len(labels) == 0:
            self.labels = None
        else:
            # Go through numpy so pandas Series are read positionally,
            # independent of their index.
            self.labels = torch.tensor(np.asarray(labels), dtype=torch.float32)

    def __len__(self):
        return len(self.text1)

    def __getitem__(self, idx):
        text1_sample = self.text1[idx]
        text2_sample = self.text2[idx]

        if self.labels is not None:
            return text1_sample, text2_sample, self.labels[idx]
        return text1_sample, text2_sample
    
BATCH_SIZE = 8

# Validation pairs: resume text vs. vacancy text, labelled from the Target column.
val_dataset = DuoDataset(resumes_val_text, vac_val_text, labels=labels_val)
# Training pairs are taken from `df` by position (columns 0 and 1 as texts, column 2 as label).
# NOTE(review): if 'Unnamed: 0' in the preview is a real column rather than the
# index, these positions are shifted by one -- confirm against df.columns.
train_dataset = DuoDataset(df.iloc[:, 0], df.iloc[:, 1], labels=df.iloc[:, 2])

# Only the training loader shuffles; validation keeps the original order.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Testing metrics first:
### With threshold = 0.75 (baseline run; the evaluation cell below was last executed with threshold 0.88 after fine-tuning)

In [45]:
from tqdm import tqdm
from IPython.display import clear_output
# Cosine-similarity cut-off above which a pair is predicted as class 1.
# (~0.7 matches the label distribution without fine-tuning; raised to 0.88 afterwards.)
THRESHOLD = 0.88

eval_preds, eval_labels = [], []

model.to('cuda')
model.eval()
with torch.no_grad():
    for texts1, texts2, labels in tqdm(val_dataloader):
        # .to('cuda') on the tokenizer output already moves every tensor; no second pass needed.
        inp1 = tokenizer(texts1, padding=True, truncation=True,
                         return_tensors='pt').to('cuda')
        inp2 = tokenizer(texts2, padding=True, truncation=True,
                         return_tensors='pt').to('cuda')

        emb1 = mean_pooling(model(**inp1), inp1['attention_mask'])
        emb2 = mean_pooling(model(**inp2), inp2['attention_mask'])

        cos_sim = nn.functional.cosine_similarity(emb1, emb2)
        preds = (cos_sim > THRESHOLD).int()
        # extend (not append) keeps one entry per sample, so a smaller final
        # batch cannot produce a ragged list that fails to reshape later.
        eval_preds.extend(preds.cpu().tolist())
        eval_labels.extend(labels.cpu().tolist())

100%|██████████| 82/82 [00:21<00:00,  3.76it/s]


In [46]:
# Turn the collected predictions and labels into column vectors of shape (N, 1).
# Assumes the collected lists are rectangular (every batch the same length) -- a
# ragged list cannot be converted and reshaped this way.
eval_preds = np.array(eval_preds).reshape(-1, 1)
eval_labels = np.array(eval_labels).reshape(-1, 1)

In [47]:
import matplotlib.pyplot as plt
from collections import Counter

# Class counts of the predictions (first) and of the ground-truth labels (second).
Counter(eval_preds.flatten().tolist()), Counter(eval_labels.flatten().tolist())

(Counter({0: 512, 1: 144}), Counter({0.0: 463, 1.0: 193}))

In [17]:
from sklearn.metrics import classification_report

# Threshold 0.75, model not fine-tuned.
print(classification_report(eval_labels.ravel().tolist(), eval_preds.ravel().tolist()))

              precision    recall  f1-score   support

         0.0       0.71      0.87      0.78       463
         1.0       0.31      0.14      0.19       193

    accuracy                           0.66       656
   macro avg       0.51      0.51      0.49       656
weighted avg       0.59      0.66      0.61       656



In [48]:
from sklearn.metrics import classification_report

# Threshold raised to 0.88, after 1 epoch of fine-tuning at margin = 2.
print(classification_report(eval_labels.ravel().tolist(), eval_preds.ravel().tolist()))

              precision    recall  f1-score   support

         0.0       0.70      0.78      0.74       463
         1.0       0.28      0.21      0.24       193

    accuracy                           0.61       656
   macro avg       0.49      0.49      0.49       656
weighted avg       0.58      0.61      0.59       656



In [None]:
from sklearn.metrics import classification_report

# Threshold raised to 0.88, after 2 epochs of fine-tuning at margin = 3.
print(classification_report(eval_labels.ravel().tolist(), eval_preds.ravel().tolist()))

# 0.66 ACCURACY MAX.

### Acceptable, but we want more.

In [93]:
# # #Cleaning cache
# import gc
# gc.collect()
# torch.cuda.empty_cache()
# gc.collect()

0

# TODO: CUSTOM DATASET CLASS

In [None]:
from transformers import AdamW
from sklearn.metrics import classification_report
from tqdm import tqdm
import time
from IPython.display import clear_output

class ContrastiveLoss_v1(nn.Module):
    """Margin-based contrastive loss on the Euclidean distance of embedding pairs.

    Per pair, with distance ``d`` and label ``y``::

        (1 - y) * d**2 + y * clamp(margin - d, min=0)**2

    so ``y = 0`` pulls the pair together and ``y = 1`` pushes it apart until the
    distance reaches ``margin``. The per-pair losses are averaged.

    NOTE(review): elsewhere in this notebook label 1 appears to mark a matching
    pair (predictions are ``cos_sim > threshold``) and the training labels are
    similarity scores. Under that reading this weighting pushes matching pairs
    apart -- confirm the intended label convention.

    Args:
        margin: Distance beyond which ``y = 1`` pairs stop contributing.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, y1, y2, labels):
        """Return the mean contrastive loss for a batch.

        Args:
            y1: First embeddings, one row per pair.
            y2: Second embeddings, same shape as ``y1``.
            labels: One label per pair; any shape with the same number of
                elements as the distance vector (e.g. ``(B,)`` or ``(B, 1)``).

        Returns:
            Scalar tensor with the batch-mean loss.
        """
        euc_dist = nn.functional.pairwise_distance(y1, y2)

        # Align labels with the distance vector. A (B, 1) label tensor would
        # otherwise broadcast against (B,) into a (B, B) matrix and the mean
        # would mix every label with every pair.
        labels = labels.reshape(euc_dist.shape).to(euc_dist.dtype)

        pull_term = (1 - labels) * torch.pow(euc_dist, 2)
        push_term = labels * torch.pow(torch.clamp(self.margin - euc_dist, min=0.0), 2)
        return torch.mean(pull_term + push_term)


model.to('cuda')

loss_func = ContrastiveLoss_v1()
optimizer = AdamW(model.parameters(), lr=2e-5)

NUM_EPOCHS = 2 #3
# Cosine-similarity cut-offs reported after every epoch.
EVAL_THRESHOLDS = (0.65, 0.7, 0.75)


def _embed_pair(texts1, texts2):
    """Tokenize both text batches, run the encoder and mean-pool each side."""
    pooled = []
    for texts in (texts1, texts2):
        # .to('cuda') on the tokenizer output already moves all of its tensors.
        enc = tokenizer(texts, padding=True, truncation=True,
                        return_tensors='pt').to('cuda')
        pooled.append(mean_pooling(model(**enc), enc['attention_mask']))
    return pooled[0], pooled[1]


for epoch in tqdm(range(NUM_EPOCHS)):
    print(f"EPOCH {epoch+1}")
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch+1}", unit="batch")):
        optimizer.zero_grad()

        clear_output(wait=True)
        texts1, texts2, labels = batch
        emb1, emb2 = _embed_pair(texts1, texts2)

        # NOTE(review): the training labels come from the COSINE_SIM column
        # (continuous scores), not binary targets -- confirm this is the
        # intended input for the contrastive loss.
        # reshape(-1) keeps labels one-per-pair so they cannot broadcast against the distances.
        loss = loss_func(emb1, emb2, labels.to('cuda').reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if step % 100 == 1:
            # step is 0-based, so step + 1 batches have been accumulated so far.
            print(f"Step {step}: Loss - {total_loss/(step + 1)}")

    avg_loss = total_loss / len(train_dataloader)
    print(f"Training loss: {avg_loss:.4f}")

    ###val below

    model.eval()
    val_loss = 0
    val_labels = []
    preds_by_threshold = {thr: [] for thr in EVAL_THRESHOLDS}
    with torch.no_grad():
        for texts1, texts2, labels in val_dataloader:
            emb1, emb2 = _embed_pair(texts1, texts2)

            # Validation labels arrive as (B, 1); flatten them so the loss is
            # computed per pair instead of over a broadcast (B, B) matrix.
            labels = labels.to('cuda').reshape(-1)
            val_loss += loss_func(emb1, emb2, labels).item()

            cos_sim = nn.functional.cosine_similarity(emb1, emb2)
            for thr, bucket in preds_by_threshold.items():
                # extend keeps one entry per sample even if the last batch is smaller.
                bucket.extend((cos_sim > thr).int().cpu().tolist())
            # Collect the labels here instead of relying on `eval_labels` from an earlier cell.
            val_labels.extend(labels.cpu().tolist())

    # Keep the original result names available as (N, 1) arrays.
    eval_preds, eval_preds1, eval_preds2 = (
        np.array(preds_by_threshold[thr]).reshape(-1, 1) for thr in EVAL_THRESHOLDS
    )

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Validation loss: {avg_val_loss:.4f}\n")
    for thr, bucket in preds_by_threshold.items():
        print(f"THRESHOLD {thr}:\n", classification_report(val_labels, bucket))
    # Pause so the report stays visible before the next epoch's clear_output wipes it.
    time.sleep(10)


Epoch 1:   0%|          | 8/1985 [00:07<30:26,  1.08batch/s][A

In [None]:
# Marker cell: reached only after the training cell above has finished.
print("FINE-TUNING DONE")

In [None]:
# Write the fine-tuned encoder to a directory in the current working directory.
# Only the model is saved here; the tokenizer is not written by this call.
# NOTE(review): the directory name says "3ep" while NUM_EPOCHS above is set to 2 -- confirm the name.
model.save_pretrained('MODEL_XLM_v1_3ep_clmrgn1')