In [6]:
import os
import random
import re

import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
def load_data(resume_path, job_path, resume_sample=1000, job_sample=5000):
    """Read the resume and job CSV files, down-sampling each one if it is large.

    Args:
        resume_path: Location of the resume CSV, handed to ``pd.read_csv``.
        job_path: Location of the job CSV, handed to ``pd.read_csv``.
        resume_sample: Row cap for the resume table. A falsy value (``None``
            or ``0``) disables sampling.
        job_sample: Row cap for the job table. A falsy value disables sampling.

    Returns:
        ``(resume_df, job_df)``. A table with more rows than its cap is
        replaced by a random sample of exactly that many rows, drawn with
        ``random_state=42``; otherwise it is returned as read.
    """
    def _read_capped(csv_path, max_rows):
        frame = pd.read_csv(csv_path)
        # Sample only when a cap is given and the table actually exceeds it.
        if max_rows and max_rows < len(frame):
            return frame.sample(n=max_rows, random_state=42)
        return frame

    resumes = _read_capped(resume_path, resume_sample)
    jobs = _read_capped(job_path, job_sample)
    return resumes, jobs

In [22]:
# Clean text
def preprocess(text):
    """Normalize free text into lowercase, single-space-separated tokens.

    The input is converted with ``str()`` and lowercased. Every character
    other than ``a-z``, ``0-9``, ``_``, ``+``, ``#``, ``.`` and the space is
    replaced by a space, so strings such as ``c++``, ``c#`` and ``node.js``
    survive intact. Runs of whitespace are then collapsed and the ends trimmed.

    Args:
        text: Any value; non-strings are stringified first (``None`` becomes
            ``"none"``).

    Returns:
        The cleaned string, possibly empty.
    """
    lowered = str(text).lower()
    # One pass over the text instead of concatenating character by character.
    # The negated class also matches tabs/newlines, which therefore become spaces.
    spaced = re.sub(r'[^a-z0-9_+#. ]', ' ', lowered)
    return ' '.join(spaced.split())

In [23]:
def vectorize(resume_df, job_df, resume_col='Resume_str', job_col='Job Description',
              job_title_col='Job Title'):
    """Turn resumes and jobs into TF-IDF vectors that share one vocabulary.

    Both text columns are cleaned with ``preprocess`` (missing values are
    treated as empty strings). When ``job_title_col`` exists in ``job_df``,
    the cleaned title is prepended to the cleaned description. A single
    ``TfidfVectorizer`` is then fitted on resumes followed by jobs, and the
    resulting matrix is split back into its two parts by row position.

    Args:
        resume_df: DataFrame holding the resume text in ``resume_col``.
        job_df: DataFrame holding the job text in ``job_col``.
        resume_col: Name of the resume text column.
        job_col: Name of the job description column.
        job_title_col: Name of the optional job title column. Pass ``None``
            to never prepend a title; a column that is absent is ignored.

    Returns:
        ``(resume_vectors, job_vectors, vectorizer)`` where the first two are
        row slices of the fitted TF-IDF matrix (row ``i`` corresponds to the
        ``i``-th row, by position, of the respective DataFrame) and
        ``vectorizer`` is the fitted ``TfidfVectorizer``.
    """
    # Preprocess
    resume_text = resume_df[resume_col].fillna('').apply(preprocess)
    job_text = job_df[job_col].fillna('').apply(preprocess)

    if job_title_col is not None and job_title_col in job_df.columns:
        job_title = job_df[job_title_col].fillna('').apply(preprocess)
        # Both Series carry job_df's index, so the concatenation is row-aligned.
        job_text = job_title + ' ' + job_text

    # Resumes first, jobs second: the split below relies on this order.
    all_text = list(resume_text) + list(job_text)

    # Unigrams and bigrams, English stop words removed, at most 5000 features;
    # terms must occur in at least 5 documents and in no more than 70% of them.
    # NOTE(review): TfidfVectorizer's default token pattern keeps only runs of
    # two or more word characters, so the '+', '#' and '.' that preprocess()
    # preserves are presumably dropped again here - confirm whether a custom
    # token_pattern is wanted.
    vectorizer = TfidfVectorizer(
        max_features=5000,
        min_df=5,
        max_df=0.7,
        ngram_range=(1, 2),
        stop_words='english',
    )
    tfidf_matrix = vectorizer.fit_transform(all_text)

    n_resumes = len(resume_df)
    resume_vectors = tfidf_matrix[:n_resumes]
    job_vectors = tfidf_matrix[n_resumes:]

    return resume_vectors, job_vectors, vectorizer

In [24]:
# Calculate cosine similarity for batch
def process_similarity_batch(resume_batch, job_vectors, batch_start_idx, threshold):
    """Score one batch of resumes against every job and keep the strong pairs.

    Args:
        resume_batch: Vectors for a contiguous batch of resumes.
        job_vectors: Vectors for all jobs.
        batch_start_idx: Position of the batch's first resume in the full
            resume set; added to the in-batch row number.
        threshold: Pairs are kept only when their cosine similarity is
            strictly greater than this value.

    Returns:
        A list of ``(resume_idx, job_idx, score)`` tuples, ordered by resume
        and then by job, with ``score`` converted to a built-in ``float``.
    """
    scores = cosine_similarity(resume_batch, job_vectors)
    return [
        (batch_start_idx + row_offset, job_idx, float(score))
        for row_offset, row_scores in enumerate(scores)
        for job_idx, score in enumerate(row_scores)
        if score > threshold
    ]

def similarity_calculation(resume_vectors, job_vectors, batch_size=100, threshold=0.1):
    """Collect all resume/job pairs whose cosine similarity exceeds ``threshold``.

    Resumes are processed ``batch_size`` rows at a time and each batch is
    delegated to ``process_similarity_batch``, so only one batch-by-jobs
    similarity matrix is materialized at any moment.

    Args:
        resume_vectors: Resume vectors; must expose ``.shape`` and support
            row slicing.
        job_vectors: Job vectors, passed unchanged to every batch.
        batch_size: Number of resumes per batch; must be positive.
        threshold: Minimum (exclusive) similarity for a pair to be kept.

    Returns:
        A list of ``(resume_idx, job_idx, score)`` tuples in batch order,
        where ``resume_idx`` is the row position within ``resume_vectors``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # Fail loudly: a zero step would break range() and a negative one would
    # silently yield no batches (and therefore no pairs) at all.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    n_resumes = resume_vectors.shape[0]
    similarity_pairs = []

    for batch_start in range(0, n_resumes, batch_size):
        batch_end = min(batch_start + batch_size, n_resumes)
        resume_batch = resume_vectors[batch_start:batch_end]
        similarity_pairs.extend(
            process_similarity_batch(resume_batch, job_vectors, batch_start, threshold)
        )
    return similarity_pairs

In [25]:
# Create positive and negative training pairs
def creat_training_pairs(similarities, resume_df, job_df, top_k=10, neg_ratio=2):
    """Build labelled-pair candidates from similarity scores.

    For every resume that appears in ``similarities`` the matches are ranked
    by similarity, highest first. The best ``top_k`` become positive pairs.
    Negatives are drawn only from that resume's remaining matches: up to
    ``pos_count * neg_ratio`` of them, half taken from the ranks directly
    below the positives ("hard") and the rest from the bottom of the ranking
    ("soft").

    Args:
        similarities: Iterable of ``(resume_idx, job_idx, score)`` where the
            indices are row positions in ``resume_df`` / ``job_df``.
        resume_df: Resume table; its ``'ID'`` value is used as the resume id,
            falling back to the row position when that label is absent.
        job_df: Job table; its ``'Job Id'`` value is used as the job id,
            falling back to the row position when that label is absent.
        top_k: Maximum number of positive pairs per resume.
        neg_ratio: Target number of negatives per positive.

    Returns:
        ``(positive_pairs, negative_pairs)``, two lists of
        ``(resume_id, job_id, score)`` tuples.
    """
    # Group matches per resume, keeping first-seen resume order.
    resume_to_job = {}
    for resume_idx, job_idx, sim in similarities:
        resume_to_job.setdefault(resume_idx, []).append((job_idx, sim))

    def _job_id(job_idx):
        # Positional row lookup with the row position as fallback identifier.
        return job_df.iloc[job_idx].get('Job Id', job_idx)

    positive_pairs = []
    negative_pairs = []

    for resume_idx, matches in resume_to_job.items():
        resume_id = resume_df.iloc[resume_idx].get('ID', resume_idx)

        # Rank by the similarity score (element 1). The previous key, x[:1],
        # was the tuple (job_idx,) and therefore ranked by job index instead.
        ranked = sorted(matches, key=lambda match: match[1], reverse=True)

        # positive matches
        pos_count = min(top_k, len(ranked))
        pos_job_ids = set()
        for job_idx, sim in ranked[:pos_count]:
            job_id = _job_id(job_idx)
            positive_pairs.append((resume_id, job_id, sim))
            pos_job_ids.add(job_id)

        # negative pairs (middle=hard, bottom=soft)
        neg_count = min(pos_count * neg_ratio, len(ranked) - pos_count)
        if neg_count <= 0:
            continue

        mid_start = pos_count
        mid_end = min(pos_count + neg_count // 2, len(ranked) - neg_count // 2)
        remaining = neg_count - (mid_end - mid_start)
        # Guard the slice: ranked[-0:] would be the whole list, not an empty one.
        soft_negatives = ranked[-remaining:] if remaining > 0 else []

        for job_idx, sim in ranked[mid_start:mid_end] + soft_negatives:
            job_id = _job_id(job_idx)
            # Different rows may share a job id; never label a positive id as negative.
            if job_id not in pos_job_ids:
                negative_pairs.append((resume_id, job_id, sim))

    return positive_pairs, negative_pairs

In [26]:
# Input/output locations. Each path can be overridden with an environment
# variable, so switching machines no longer means commenting code in and out.
# The defaults are the macOS locations used so far.
#
# Overrides for the desktop machine:
#   RJM_RESUME_PATH=/home/gv/school/trustworthy_ai/proj/resume_data/archive/Resume/Resume.csv
#   RJM_JOB_PATH=/home/gv/school/trustworthy_ai/proj/job_data/job_descriptions.csv
#   RJM_OUTPUT_PATH=/home/gv/school/trustworthy_ai/proj/resume_job_matching_trustworthy_ai/resume_job_data/resume_job_pairs.csv
resume_path = os.environ.get(
    'RJM_RESUME_PATH',
    '/Users/gv/code/school/trustworthy_ai/archive/Resume/Resume.csv',
)
job_path = os.environ.get(
    'RJM_JOB_PATH',
    '/Users/gv/code/school/trustworthy_ai/archive/Jobs/job_descriptions.csv',
)
output_path = os.environ.get(
    'RJM_OUTPUT_PATH',
    '/Users/gv/code/school/trustworthy_ai/archive/training_pairs/resume_job_pairs.csv',
)

In [27]:
# Read both CSV files (down-sampled with load_data's default caps).
resume_df, job_df = load_data(resume_path=resume_path, job_path=job_path)

In [28]:
# TF-IDF vectors for resumes and jobs; the fitted vectorizer is not needed here.
resume_vectors, job_vectors, _ = vectorize(resume_df=resume_df, job_df=job_df)

In [29]:
# All (resume_idx, job_idx, score) pairs above the default similarity threshold.
similarities = similarity_calculation(
    resume_vectors=resume_vectors,
    job_vectors=job_vectors,
)

In [30]:
# Split the scored pairs into positive and negative training examples.
pos_pairs, neg_pairs = creat_training_pairs(
    similarities=similarities,
    resume_df=resume_df,
    job_df=job_df,
)

In [34]:
# Tabulate the pairs and attach the class label: 1 = positive, 0 = negative.
pos_df = pd.DataFrame(pos_pairs, columns=['resume_id', 'job_id', 'score']).assign(label=1)
neg_df = pd.DataFrame(neg_pairs, columns=['resume_id', 'job_id', 'score']).assign(label=0)

In [38]:
# Stack positives and negatives into one table with a fresh 0..n-1 index.
training_pairs_df = pd.concat([pos_df, neg_df], ignore_index=True)

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (a bare file name has no directory part to create).
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
training_pairs_df.to_csv(output_path, index=False)