In [None]:
import json
import jsonlines
import random
import numpy as np
import torch
import evaluate
import torch.nn as nn
import pandas as pd
from datasets import Dataset
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from hr_research.config import output_path
from hr_research.models.seniority import RobertaForRegression
from os.path import join
from transformers import Trainer, TrainingArguments
from torch.utils.data import random_split

In [None]:
# Number of tokens every title is padded/truncated to by the dataset's tokenizer call.
SEQ_MAX_LEN = 64

# Input files with the junior/senior title pairs.
dataset_path = join(output_path, "seniority_pairs.jsonl")
reallife_dataset_path = join(output_path, "reallife_seniority_pairs.jsonl")

# Checkpoint that both the tokenizer and the model are loaded from.
checkpoint_path = join(output_path, "seniority_model_out/checkpoint-29000")

# Where this run writes its checkpoints and logs.
training_path = join(output_path, "seniority_model_v2/")
training_logs = join(training_path, "logs/")

tokenizer = AutoTokenizer.from_pretrained(
    checkpoint_path,
    use_fast=True,
)

In [None]:
# Seed for the torch generator that is handed to random_split, so the
# train/validation split is the same on every run.
SPLIT_SEED = 42
random_generator = torch.Generator().manual_seed(SPLIT_SEED)

In [None]:
class JobTitleDataset(Dataset):
    """Map-style dataset of (junior, senior) job-title pairs read from a jsonlines file.

    Every record is expected to provide a 'junior' and a 'senior' entry, each a
    non-empty sequence of title strings. On each access one title per side is
    drawn with ``random.choice``, so repeated reads of the same index may return
    different pairs.
    """

    def __init__(self, file_path, tokenizer, max_length):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.file_path = file_path
        # All records are read eagerly and kept in memory.
        self.data = self.load_data()

    def load_data(self):
        """Return every record of ``self.file_path`` as a list."""
        with jsonlines.open(self.file_path, 'r') as reader:
            records = list(reader)
        return records

    def tokenize_data(self, junior_title, senior_title):
        """Tokenize both titles in a single tokenizer call.

        The titles are passed as a 2-tuple with truncation and 'max_length'
        padding, both bounded by ``self.max_length``. The tokenizer's result is
        returned unchanged.
        """
        title_pair = (junior_title, senior_title)
        return self.tokenizer(
            text=title_pair,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return tensors for one randomly drawn junior/senior title pair of record ``idx``."""
        record = self.data[idx]
        # NOTE(review): str.title() also lower-cases the remainder of each word
        # (e.g. "Engineer II" -> "Engineer Ii"); confirm that inference inputs
        # are normalised the same way.
        junior_title = random.choice(record['junior']).title()
        senior_title = random.choice(record['senior']).title()
        encoded = self.tokenize_data(junior_title, senior_title)

        sample = {
            key: torch.tensor(encoded[key])
            for key in ('input_ids', 'attention_mask')
        }
        # Placeholder labels: a constant vector of ones, independent of the record.
        sample['labels'] = torch.ones(2)
        return sample

In [None]:
# A pair contributes to the loss whenever MARGIN + junior_score - senior_score >= 0,
# i.e. unless the senior score exceeds the junior score by more than MARGIN.
MARGIN = 0.01

class PairwiseTrainer(Trainer):
    """Trainer for pairwise seniority ranking.

    Each batch holds (junior, senior) title pairs. The model scores both sides
    separately and the loss only penalises pairs whose senior score does not
    exceed the junior score by more than ``MARGIN``.
    """

    def get_logits(self, model, inputs):
        """Run the model on the junior and on the senior half of a batch.

        ``inputs['input_ids']`` and ``inputs['attention_mask']`` are indexed as
        ``[:, 0, :]`` (junior) and ``[:, 1, :]`` (senior), so both must be 3-D
        with the pair on axis 1.

        Returns:
            Tuple ``(output_junior, output_senior)`` of the two model outputs.
        """
        def select_side(pair_index):
            # Integer indexing already removes the pair axis. No extra squeeze is
            # applied because it would also drop the token axis when the sequence
            # length happens to be 1.
            return {
                'input_ids': inputs['input_ids'][:, pair_index, :],
                'attention_mask': inputs['attention_mask'][:, pair_index, :],
            }

        output_junior = model(**select_side(0))
        output_senior = model(**select_side(1))

        return output_junior, output_senior

    def loss_from_logits(self, model, output_junior, output_senior):
        """Compute the mean margin-gated loss for a batch of scored pairs.

        Where ``MARGIN + output_junior - output_senior >= 0`` the pair is penalised
        with ``model.distance_loss`` of the junior output against zeros plus
        ``model.distance_loss`` of the senior output against ones; elsewhere the
        contribution is zero. ``distance_loss`` is defined on the model class,
        which is not visible in this notebook.
        """
        diff = MARGIN + output_junior - output_senior
        violation_penalty = (
            model.distance_loss(output_junior, torch.zeros_like(output_junior))
            + model.distance_loss(output_senior, torch.ones_like(output_senior))
        )
        # `diff * 0` is a zero tensor with the shape and dtype of `diff`.
        seniority_loss = torch.where(diff >= 0, violation_penalty, diff * 0)
        return torch.mean(seniority_loss)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the pairwise loss for a batch.

        Args:
            model: Model being trained.
            inputs: Batch dictionary as consumed by ``get_logits``.
            return_outputs: When true, return ``(loss, (output_junior, output_senior))``
                instead of the loss alone.
            **kwargs: Accepted and ignored, so that extra keyword arguments passed
                by the base ``Trainer`` (for example ``num_items_in_batch``) do not
                raise a ``TypeError``.
        """
        output_junior, output_senior = self.get_logits(model, inputs)
        loss = self.loss_from_logits(model, output_junior, output_senior)
        if return_outputs:
            return loss, (output_junior, output_senior)
        return loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Evaluate one batch without gradients.

        Returns:
            ``(loss, None, None)`` when ``prediction_loss_only`` is true, otherwise
            ``(loss, logits, labels)`` where ``logits`` stacks the junior and senior
            outputs along axis 1 and ``labels`` is a tensor of ones with the same
            shape as ``logits``.
        """
        with torch.no_grad():
            output_junior, output_senior = self.get_logits(model, inputs)
            loss = self.loss_from_logits(model, output_junior, output_senior)

            if prediction_loss_only:
                return (loss, None, None)

            logits = torch.stack((output_junior, output_senior), dim=1).detach()

            # return loss and other outputs for evaluation
            return (loss, logits, torch.ones_like(logits))

In [None]:
def compute_seniority_metrics(eval_preds):
    """Score pairwise ranking quality from stacked junior/senior predictions.

    A pair counts as correct when its junior score is strictly lower than its
    senior score. The correctness flags are scored against an all-ones reference
    with the GLUE/MRPC metric.

    Side effects:
        Writes the positions and scores of the pairs with junior >= senior to
        ``incorrect_predictions.json`` in the working directory and prints the
        positions.

    Args:
        eval_preds: ``(logits, labels)`` pair; the labels are ignored. ``logits``
            must hold exactly two scores per pair (shape ``(n_pairs, 2)`` or
            ``(n_pairs, 2, 1)``).

    Returns:
        The dictionary produced by ``metric.compute``.
    """
    metric = evaluate.load("glue", "mrpc")
    logits, _ = eval_preds # labels are fake
    # Reshape explicitly to (n_pairs, 2). A bare squeeze() would also remove the
    # batch axis when exactly one pair is evaluated, which breaks the column
    # indexing below.
    logits = np.asarray(logits)
    logits = logits.reshape(logits.shape[0], 2)

    junior_scores = logits[:, 0]
    senior_scores = logits[:, 1]
    correct_preds = (junior_scores < senior_scores).astype(float)
    incorrect_indexes = np.where(junior_scores >= senior_scores)

    with open("incorrect_predictions.json", "w") as f:
        json.dump({
            "indices": incorrect_indexes[0].tolist(),
            "values": logits[incorrect_indexes].tolist()
        }, f)
    print(f"INCORRECT: {incorrect_indexes}")
    return metric.compute(predictions=correct_preds, references=np.ones_like(correct_preds))

In [None]:
# Pair dataset that is split into training and validation sets further down.
dataset = JobTitleDataset(
    file_path=dataset_path,
    tokenizer=tokenizer,
    max_length=SEQ_MAX_LEN,
)
dataset

In [None]:
# Second pair dataset, read from the "reallife" pairs file with the same tokenizer settings.
reallife_set = JobTitleDataset(
    file_path=reallife_dataset_path,
    tokenizer=tokenizer,
    max_length=SEQ_MAX_LEN,
)

In [None]:
# Fractions of `dataset` assigned to the training and validation subsets, in that order.
split_fractions = [0.95, 0.05]
train_set, val_set = random_split(dataset, split_fractions, generator=random_generator)

In [None]:
# Start from the same checkpoint the tokenizer was loaded from.
model = RobertaForRegression.from_pretrained(checkpoint_path)

training_args = TrainingArguments(
    # Output and logging locations.
    output_dir=training_path,
    logging_dir=training_logs,
    report_to='tensorboard',
    # Optimisation schedule.
    num_train_epochs=40,
    learning_rate=1e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=12,
    per_device_eval_batch_size=24,
    # Evaluate and checkpoint on the same step interval.
    evaluation_strategy='steps',
    eval_steps=500,
    save_strategy='steps',
    save_steps=500,
)

trainer = PairwiseTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_seniority_metrics,
)

In [None]:
# Evaluate on the validation subset; compute_seniority_metrics also writes
# incorrect_predictions.json as a side effect of this call.
evaluation_metrics = trainer.evaluate()
evaluation_metrics

In [None]:
# Load the misranked pairs recorded by compute_seniority_metrics during the last evaluation.
with open("incorrect_predictions.json", "r") as f:
    incorrect_preds = json.load(f)

# The stored indices are positions within the evaluated set (val_set, a Subset
# produced by random_split), not positions in the underlying full dataset.
# Index the Subset itself so that each position is mapped to the right record.
# Caveat: JobTitleDataset.__getitem__ draws titles with random.choice, so for
# records with several titles the pair fetched here may differ from the pair
# that was actually scored.
wrong_preds = [
    (val_set[i_dataset], scores)
    for i_dataset, scores in zip(incorrect_preds["indices"], incorrect_preds["values"])
]

def decode_pair(pair):
    """Decode the junior and senior titles of a ``(dataset_item, scores)`` tuple.

    Rows 0 and 1 of the item's ``input_ids`` are decoded as junior and senior
    title respectively, with special tokens removed.
    """
    junior = tokenizer.decode(token_ids=pair[0]["input_ids"][0, :], skip_special_tokens=True)
    senior = tokenizer.decode(token_ids=pair[0]["input_ids"][1, :], skip_special_tokens=True)
    return junior, senior

# Each entry: ((junior, senior), senior_score - junior_score, [junior_score, senior_score]).
decoded_wrong = [(decode_pair(w), w[1][1] - w[1][0], w[1]) for w in wrong_preds]

decoded_wrong

In [None]:
# Tabulate the misranked pairs and show the first 20 rows.
wrong_pairs_table = pd.DataFrame(
    decoded_wrong,
    columns=["Pair", "Difference", "Seniorities"],
)
wrong_pairs_table[:20]

In [None]:
# Token ids of the junior title (row 0) of the first misranked item.
first_wrong_item = wrong_preds[0][0]
first_wrong_item["input_ids"][0, :]

In [None]:
# Sanity check: the saved scores and indices should have the same length.
tuple(len(incorrect_preds[key]) for key in ("values", "indices"))

In [None]:
# Run training with the arguments configured above; the result is shown as the cell output.
train_output = trainer.train()
train_output

In [None]:
# Scratch check of np.where on a column vector: with a single argument it
# returns one index array per axis of the input.
x = np.arange(100, dtype=float)[:, np.newaxis]
np.where(x > 5)

In [None]:
# Score a single title with the current model.
# NOTE(review): training titles pass through str.title() in JobTitleDataset,
# which would turn "Engineer II" into "Engineer Ii"; confirm whether this probe
# should be normalised the same way.
text = "Engineer II"
model.eval()

# Put the inputs on whatever device the model currently lives on instead of
# assuming CUDA, so the cell also works on CPU-only machines.
model_device = next(model.parameters()).device
tokenized = tokenizer(text, return_tensors='pt').to(model_device)

# Inference only: no gradients are needed.
with torch.no_grad():
    prediction = model(**tokenized)
prediction