In [24]:
import os

import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

In [25]:
class ContrastiveModel(nn.Module):
    """Text encoder that maps token ids to L2-normalised sentence embeddings.

    A pretrained transformer produces per-token representations, which are
    mean-pooled over the non-padding tokens, linearly projected, and finally
    scaled to unit L2 norm.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        embedding_dim: Input width of the projection layer; must match the
            encoder's output width (768 by default).
        projection_dim: Output width of the projection layer (512 by default).
    """

    def __init__(self, model_name='distilbert-base-uncased', embedding_dim=768, projection_dim=512):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.proj = nn.Linear(embedding_dim, projection_dim)

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings, ignoring masked-out positions.

        Args:
            model_output: Encoder output whose first element holds the token
                embeddings (assumed shape ``(batch, seq_len, hidden)``).
            attention_mask: Mask of shape ``(batch, seq_len)``; positions with
                value 0 do not contribute to the average.

        Returns:
            Tensor with one pooled vector per batch row.
        """
        token_embeddings = model_output[0]
        # Broadcast the mask over the hidden dimension so it can weight each token.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so a fully masked row does not divide by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return unit-norm projected embeddings.

        Args:
            input_ids: Token ids, forwarded to the encoder.
            attention_mask: Attention mask, forwarded to the encoder and used
                for pooling.

        Returns:
            Tensor of shape ``(batch, projection_dim)`` whose rows have L2 norm 1.
        """
        # Contextual token representations
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)

        # Collapse the token axis into a single sentence-level vector
        pooled = self.mean_pooling(outputs, attention_mask)

        # Map from the encoder width to the projection width
        projected = self.proj(pooled)

        # L2 normalisation. The normalised tensor is what must be returned;
        # previously it was computed and then discarded.
        return F.normalize(projected, p=2, dim=1)

In [26]:
# Smoke test: push a batch of 2 random 128-token sequences through the model.
model = ContrastiveModel()
test_id = torch.randint(low=0, high=1000, size=(2, 128))
test_mask = torch.ones_like(test_id)  # attend to every position
embeddings = model(input_ids=test_id, attention_mask=test_mask)

In [29]:
# Expect torch.Size([2, 512]): one 512-dim projected vector per input row.
embeddings.shape

torch.Size([2, 512])

In [43]:
class ContrastiveLoss(nn.Module):
    """Pairwise loss over (anchor, positive) embedding batches.

    Two modes are supported, selected by ``loss_type``:

    * ``'contrastive'`` - per pair, ``label * d + (1 - label) * relu(margin - d)``
      where ``d`` is the Euclidean distance between the two embeddings.
      Pairs labelled 1 are pulled together; pairs labelled 0 are pushed at
      least ``margin`` apart.
    * ``'cosine'`` - negative mean cosine similarity. ``labels`` are ignored
      in this mode, so every pair is treated as a match.

    Args:
        margin: Minimum distance enforced between pairs labelled 0.
        loss_type: ``'contrastive'`` or ``'cosine'``.

    Raises:
        ValueError: If ``loss_type`` is not one of the supported modes.
    """

    _SUPPORTED_LOSS_TYPES = ('contrastive', 'cosine')

    def __init__(self, margin=0.5, loss_type='contrastive'):
        super().__init__()
        if loss_type not in self._SUPPORTED_LOSS_TYPES:
            raise ValueError(
                f"Unsupported loss_type {loss_type!r}; expected one of {self._SUPPORTED_LOSS_TYPES}"
            )
        self.margin = margin
        self.loss_type = loss_type

    def forward(self, anchor, positive, negative=None, labels=None):
        """Compute the mean loss over the batch.

        Args:
            anchor: Embeddings of shape ``(batch, dim)``.
            positive: Embeddings paired row-wise with ``anchor``.
            negative: Accepted for interface compatibility; currently unused.
            labels: Per-pair targets (1 = match, 0 = non-match). Required when
                ``loss_type == 'contrastive'``.

        Returns:
            Scalar tensor holding the batch-mean loss.

        Raises:
            ValueError: If labels are missing in contrastive mode, or if
                ``loss_type`` was changed to an unsupported value.
        """
        if self.loss_type == 'contrastive':
            if labels is None:
                raise ValueError("labels are required when loss_type='contrastive'")

            distances = torch.norm(anchor - positive, dim=1)
            # Align dtype/device so integer or list labels combine cleanly with distances.
            labels = torch.as_tensor(labels, dtype=distances.dtype, device=distances.device)

            # Matches: minimise distance. Non-matches: penalise only inside the margin.
            losses = labels * distances + (1 - labels) * F.relu(self.margin - distances)
            return losses.mean()

        if self.loss_type == 'cosine':
            cos_sim = F.cosine_similarity(anchor, positive)
            # Maximise similarity by minimising its negative.
            return -cos_sim.mean()

        # Reached only if loss_type was mutated after construction; fail loudly
        # instead of silently returning None.
        raise ValueError(
            f"Unsupported loss_type {self.loss_type!r}; expected one of {self._SUPPORTED_LOSS_TYPES}"
        )
        

In [47]:
# Smoke test: 4 random 512-dim pairs, the first two labelled as matches.
loss_fn = ContrastiveLoss(loss_type='contrastive', margin=0.5)
anchor = torch.randn((4, 512))
positive = torch.randn((4, 512))
labels = torch.tensor([1, 1, 0, 0], dtype=torch.float)
# Unit-normalise both sides before measuring distance.
loss = loss_fn(
    F.normalize(anchor, dim=1),
    F.normalize(positive, dim=1),
    labels=labels,
)
loss.item()

0.7039779424667358

In [48]:
class ResumeJobDataset(Dataset):
    """Map-style dataset yielding tokenised (resume, job) pairs with a label.

    Args:
        pairs_df: DataFrame with ``'resume_id'``, ``'job_id'`` and ``'label'``
            columns; one row per example.
        resume_df: DataFrame with ``'ID'`` and ``'Resume_str'`` columns.
        job_df: DataFrame with ``'Job Id'``, ``'Job Title'`` and
            ``'Job Description'`` columns.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that,
            given ``return_tensors='pt'``, returns ``'input_ids'`` and
            ``'attention_mask'`` tensors with a leading batch axis of size 1.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, pairs_df, resume_df, job_df, tokenizer, max_length=256):
        self.pairs_df = pairs_df
        self.resume_df = resume_df
        self.job_df = job_df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of pairs."""
        return len(self.pairs_df)

    @staticmethod
    def _matching_rows(df, id_column, wanted_id, source_name):
        """Return the rows of ``df`` whose ``id_column`` equals ``wanted_id``.

        Raises a descriptive KeyError when nothing matches. The bare
        IndexError that ``.values[0]`` would raise is avoided on purpose:
        plain ``for item in dataset`` iteration treats IndexError as end of data.
        """
        matches = df.loc[df[id_column] == wanted_id]
        if matches.empty:
            raise KeyError(f"{source_name} has no row with {id_column!r} == {wanted_id!r}")
        return matches

    def _encode(self, text):
        """Tokenise ``text`` and return ``(input_ids, attention_mask)`` as 1-D tensors."""
        # Calling the tokenizer directly replaces the deprecated ``encode_plus``.
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # squeeze(0) drops only the batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        return encoding['input_ids'].squeeze(0), encoding['attention_mask'].squeeze(0)

    def __getitem__(self, idx):
        """Return the tokenised resume/job pair at position ``idx``.

        Returns:
            dict with ``'resume_input_ids'``, ``'resume_attention_mask'``,
            ``'job_input_ids'``, ``'job_attention_mask'`` (each of length
            ``max_length``) and a float ``'label'`` tensor.

        Raises:
            KeyError: If the pair references a resume or job id that is not
                present in the corresponding DataFrame.
        """
        pair = self.pairs_df.iloc[idx]
        resume_id = pair['resume_id']
        job_id = pair['job_id']
        label = pair['label']

        # Resume text: first row matching the id
        resume_rows = self._matching_rows(self.resume_df, 'ID', resume_id, 'resume_df')
        resume_text = resume_rows['Resume_str'].values[0]

        # Job text: filter once, then read both fields from the first match
        job_rows = self._matching_rows(self.job_df, 'Job Id', job_id, 'job_df')
        job_title = job_rows['Job Title'].values[0]
        job_desc = job_rows['Job Description'].values[0]
        job_text = f"{job_title}. {job_desc}"

        resume_input_ids, resume_attention_mask = self._encode(resume_text)
        job_input_ids, job_attention_mask = self._encode(job_text)

        return {
            'resume_input_ids': resume_input_ids,
            'resume_attention_mask': resume_attention_mask,
            'job_input_ids': job_input_ids,
            'job_attention_mask': job_attention_mask,
            'label': torch.tensor(label, dtype=torch.float)
        }


In [53]:
def load_data(pairs_path, resume_path, job_path):
    """Read the three input CSVs.

    Args:
        pairs_path: CSV of resume/job pairs.
        resume_path: CSV of resumes.
        job_path: CSV of job descriptions.

    Returns:
        Tuple ``(pairs_df, resume_df, job_df)`` of DataFrames, in that order.
    """
    csv_paths = (pairs_path, resume_path, job_path)
    return tuple(pd.read_csv(csv_path) for csv_path in csv_paths)

In [54]:
# Root folder holding the CSVs. Defaults to the original local path; set the
# RESUME_JOB_DATA_DIR environment variable to run the notebook elsewhere.
DATA_DIR = os.environ.get(
    'RESUME_JOB_DATA_DIR',
    '/Users/gv/code/school/trustworthy_ai/archive',
)

pairs_df, resume_df, job_df = load_data(
    os.path.join(DATA_DIR, 'training_pairs', 'resume_job_pairs.csv'),  # Path to pairs
    os.path.join(DATA_DIR, 'Resume', 'Resume.csv'),  # Path to resume data
    os.path.join(DATA_DIR, 'Jobs', 'job_descriptions.csv'),  # Path to the job
)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Build a tiny dataset from the first two pairs to inspect a single sample.
dataset = ResumeJobDataset(pairs_df.head(2), resume_df, job_df, tokenizer)
sample = dataset[0]

In [55]:
# Inspect the sample: list the returned fields and confirm the resume ids are padded to max_length.
print("Sample keys:", sample.keys())
print("Resume input shape:", sample['resume_input_ids'].shape)

Sample keys: dict_keys(['resume_input_ids', 'resume_attention_mask', 'job_input_ids', 'job_attention_mask', 'label'])
Resume input shape: torch.Size([256])


In [58]:
# Create train validation split

def create_train_val_dataloaders(pairs_df, resume_df, job_df, tokenizer, batch_size=32, max_length=256,
                                 train_size=0.8, num_workers=2, random_state=42):
    """Split the pairs into train/validation sets and wrap each in a DataLoader.

    The split is stratified on the ``'label'`` column of ``pairs_df``.

    Args:
        pairs_df: DataFrame of labelled resume/job pairs.
        resume_df: DataFrame of resumes, passed through to ``ResumeJobDataset``.
        job_df: DataFrame of jobs, passed through to ``ResumeJobDataset``.
        tokenizer: Tokenizer passed through to ``ResumeJobDataset``.
        batch_size: Batch size for both loaders.
        max_length: Token length passed through to ``ResumeJobDataset``.
        train_size: Fraction (or count) of pairs assigned to the training set.
        num_workers: Worker processes per loader. Use 0 to load in the main
            process, e.g. where worker processes cannot import classes
            defined inside a notebook.
        random_state: Seed for the train/validation split.

    Returns:
        Tuple ``(train_loader, val_loader)``; only the training loader shuffles.
    """
    # Split into train and validation, keeping the label ratio in both parts
    train_df, val_df = train_test_split(
        pairs_df,
        train_size=train_size,
        stratify=pairs_df['label'],
        random_state=random_state
    )

    print(f"Training set: {len(train_df)} pairs")
    print(f"Validation set: {len(val_df)} pairs")

    train_dataset = ResumeJobDataset(
        train_df, resume_df, job_df, tokenizer, max_length=max_length
    )
    val_dataset = ResumeJobDataset(
        val_df, resume_df, job_df, tokenizer, max_length=max_length
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_loader, val_loader

In [59]:
# Build both loaders from the full pairs table using the default settings.
train_loader, val_loader = create_train_val_dataloaders(
    pairs_df=pairs_df,
    resume_df=resume_df,
    job_df=job_df,
    tokenizer=tokenizer,
)

Training set: 21544 pairs
Validation set: 5386 pairs
