In [20]:
import pandas as pd
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

In [21]:
def load_data(resume_path, job_path, resume_sample=None, job_sample=None):
    # Load resumes and get random sample if too long
    resume_df = pd.read_csv(resume_path)
    if resume_sample and resume_sample < len(resume_df):
        resume_df = resume_df.sample(resume_sample, random_state=42)

    # Load jobs and get random samples if needed
    job_df = pd.read_csv(job_path)
    if job_sample and job_sample < len(job_df):
        job_df = job_df.sample(job_sample, random_state=42)

    return resume_df, job_df

In [22]:
# Clean text
def preprocess(text):
    text = str(text).lower()
    # Remove punctuation
    keep_chars = 'abcdefghijklmnopqrestuvwxyz0123456789_+# .'
    cleaned_text = ''
    for char in text:
        if char in keep_chars:
            cleaned_text += char
        else:
            cleaned_text += ' '
    # Remove whitespace
    words = cleaned_text.split()
    cleaned_text = ' '.join(words)
    return cleaned_text

In [42]:
def vectorize(resume_df, job_df, resume_col='Resume_str', job_col='Job Description'):
    # Preprocess
    resume_text =resume_df[resume_col].fillna('').apply(preprocess)
    job_text = job_df[job_col].fillna('').apply(preprocess)

    if 'Job Title'in job_df.columns:
        job_title = job_df['Job Title'].fillna('').apply(preprocess)
        job_text = job_title + ' ' + job_text

    all_text = list(resume_text) + list(job_text)

    vectorizer = TfidfVectorizer(max_features=5000, min_df=5, max_df=0.7,ngram_range=(1,2), stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(all_text)

    resume_vectors = tfidf_matrix[:len(resume_df)]
    job_vectors = tfidf_matrix[len(resume_df):]

    return resume_vectors, job_vectors, vectorizer

In [55]:
# Calculate cosine similarity for batch
def process_similarity_batch(resume_batch, job_vectors, batch_start_idx, threshold):
    similarity_matrix = cosine_similarity(resume_batch, job_vectors)

    batch_similarities = []

    for i in range(similarity_matrix.shape[0]):
        resume_idx = batch_start_idx + i
        for job_idx, score in enumerate(similarity_matrix[i]):
            if score > threshold:
                batch_similarities.append((resume_idx, job_idx, float(score)))
    return batch_similarities

def similarity_calculation(resume_vectors, job_vectors, batch_size=100, threshold=0.1):
    n_resumes =resume_vectors.shape[0]
    n_jobs = job_vectors.shape[0]
    similarity_pairs = []

    total_batches = (n_resumes + batch_size - 1)//batch_size

    for batch_start in range(0,n_resumes, batch_size):
        batch_end = min(batch_start + batch_size, n_resumes)
        resume_batch = resume_vectors[batch_start:batch_end]

        batch_similarities = process_similarity_batch(resume_batch, job_vectors, batch_start,threshold)

        similarity_pairs.extend(batch_similarities)
    return similarity_pairs

In [25]:
resume_path = '/home/gv/school/trustworthy_ai/proj/resume_data/archive/Resume/Resume.csv'
job_path = '/home/gv/school/trustworthy_ai/proj/job_data/job_descriptions.csv'
output_path = '/home/gv/school/trustworthy_ai/proj/resume_job_matching_trustworthy_ai/resume_job_data/resume_job_pairs.csv'

In [36]:
resume_df, job_df = load_data(resume_path, job_path)

In [48]:
resume_vectors, job_vectors, _ =vectorize(resume_df, job_df)

In [56]:
similarity_calculation(resume_vectors, job_vectors)

[(0, 65, 0.1872081843751535),
 (0, 115, 0.11132151705142068),
 (0, 145, 0.14654563802207854),
 (0, 146, 0.11718408357893115),
 (0, 168, 0.11718408357893115),
 (0, 198, 0.1872081843751535),
 (0, 228, 0.12436883293994831),
 (0, 240, 0.11718408357893115),
 (0, 243, 0.12574369641536257),
 (0, 278, 0.11718408357893115),
 (0, 290, 0.11718408357893115),
 (0, 330, 0.11266407747096005),
 (0, 337, 0.1291090090738947),
 (0, 372, 0.19789950592950956),
 (0, 414, 0.11230577611098667),
 (0, 432, 0.11266407747096005),
 (0, 564, 0.12574369641536257),
 (0, 573, 0.12436883293994831),
 (0, 589, 0.1872081843751535),
 (0, 605, 0.223054138669083),
 (0, 617, 0.11718408357893115),
 (0, 628, 0.12926364691167955),
 (0, 629, 0.11230577611098667),
 (0, 642, 0.11718408357893115),
 (0, 643, 0.12574369641536257),
 (0, 665, 0.19789950592950956),
 (0, 704, 0.1872081843751535),
 (0, 753, 0.11718408357893115),
 (0, 824, 0.11559206263310493),
 (0, 836, 0.11647664040101463),
 (0, 848, 0.12430568791072008),
 (0, 858, 0.1465