In [86]:
# Build and train models
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from typing import List, Tuple
import random

import re
import textwrap

import keras_tuner as kt
import os
from collections import Counter
from transformers import BertTokenizer, TFBertForSequenceClassification
import faiss
from sentence_transformers import SentenceTransformer
import voyageai
from voyageai import Client
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
import warnings

In [81]:
# Load the resume dataset (read from resumes.csv relative to the working directory)
resumes = pd.read_csv('resumes.csv')
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like
# an index that was written into the CSV — consider index_col=0; confirm against
# how the file was produced.
resumes.head()

Unnamed: 0,id,resume_str,category
0,16852973,hr administrator/marketing associate hr admini...,HR
1,22323967,"hr specialist, us hr operations summary versat...",HR
2,33176873,hr director summary over 20 years experience i...,HR
3,27018550,"hr specialist summary dedicated, driven, and d...",HR
4,17812897,hr manager skill highlights hr skills hr depar...,HR


In [82]:
# Load the cleaned job-postings dataset (read from jobs_gb_cleaned.csv relative to the working directory)
jobs_gb_clean = pd.read_csv('jobs_gb_cleaned.csv')
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like
# an index that was written into the CSV — consider index_col=0; confirm against
# how the file was produced.
jobs_gb_clean.head()

Unnamed: 0,id,title,company,location,category,created,description,salary_min,salary_max,contract_type,contract_time,salary_avg
0,5185229320,Early Years Educator Nursery,Partou,"Ware, Hertfordshire",Teaching Jobs,2025-05-07T14:32:36Z,"join partou, where we put children in the lead...",27600.0,27600.0,permanent,part_time,27600.0
1,5172986638,Housekeeping Assistant - Bank - Care Home,Barchester Healthcare,"Melbourn, Royston",Domestic help & Cleaning Jobs,2025-05-01T13:32:39Z,about the role as a bank housekeeping assistan...,29619.0,29619.0,unknown,unknown,29619.0
2,5190346944,Care Home Administrator,Barchester Healthcare,"Harlow Green, Gateshead",Admin Jobs,2025-05-09T13:37:28Z,barchester healthcare are looking for an exper...,28000.0,28000.0,permanent,unknown,28000.0
3,5149584869,Care Assistant - Bank - Care Home,Barchester Healthcare,"Wilton, Salisbury",Healthcare & Nursing Jobs,2025-04-17T13:30:03Z,about the role as a bank care assistant at a b...,33342.0,33342.0,unknown,unknown,33342.0
4,5172986642,Activities Assistant - Care Home,Barchester Healthcare,"Badgeworth, Cheltenham",Hospitality & Catering Jobs,2025-05-01T13:32:39Z,about the role as an activities assistant at a...,26436.0,26436.0,permanent,unknown,26436.0


## Modelling with S-BERT

In [83]:
# Load the pretrained sentence-transformers checkpoint 'all-MiniLM-L6-v2'.
# `model` is read as a notebook-level global by embed_and_normalize below.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed and normalize the datasets
def embed_and_normalize(texts):
    """Encode texts with the notebook-level `model` and L2-normalise each row.

    Args:
        texts: Sequence of strings, passed unchanged to ``model.encode``.

    Returns:
        A 2-D array with one row per text. Every non-zero row has unit L2
        norm, so an inner product between two rows is their cosine
        similarity. A row that comes back all-zero is left as zeros instead
        of turning into NaN.
    """
    embeddings = np.asarray(model.encode(texts, batch_size=32, show_progress_bar=True))
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # 0 / 0 would produce NaN rows, which poison every inner-product score
    # computed against them; dividing a zero row by 1 keeps it at zero.
    norms[norms == 0] = 1.0
    return embeddings / norms

# Convert resumes and job descriptions to lists
resume_texts = resumes['resume_str'].tolist()
job_texts = jobs_gb_clean['description'].tolist()

# Embed and normalize the resumes and job descriptions.
# The matching functions below look up texts by embedding row number, so the
# order of these lists must stay in step with the rows of the matrices.
resume_embedds = embed_and_normalize(resume_texts)
job_embedds = embed_and_normalize(job_texts)

Batches: 100%|██████████| 78/78 [02:49<00:00,  2.18s/it]
Batches: 100%|██████████| 16/16 [00:16<00:00,  1.05s/it]


In [84]:
# Search for the top k job matches to the resume
def match_resume_to_jobs(resume_embedds, job_embedds, job_description, resumes, resume_idx, job_titles, top_k=5):
    """Print a preview of one resume followed by its most similar jobs.

    Jobs are ranked by inner product using a FAISS ``IndexFlatIP`` built from
    ``job_embedds`` on every call. The scores are cosine similarities only if
    both the resume vector and the job rows are L2-normalised by the caller.

    Args:
        resume_embedds: Embedding of the single resume being matched (despite
            the plural name); it is reshaped to ``(1, dim)``.
        job_embedds: 2-D array with one embedding row per job.
        job_description: Sequence of job description strings, indexed by job
            embedding row.
        resumes: Sequence of resume strings; only ``resumes[resume_idx]`` is
            read, for the preview.
        resume_idx: Position of the resume, used for the header and preview.
        job_titles: Sequence of job titles, indexed by job embedding row.
        top_k: Maximum number of matches to print. It is capped at the number
            of jobs so FAISS never has to pad the result.

    Returns:
        None. Everything is written to stdout.
    """
    print("=" * 50)
    print(f"Resume Preview (Index {resume_idx}):")
    resume_preview = re.sub(r'\s+', ' ', resumes[resume_idx]).strip()
    print(textwrap.fill(resume_preview[:500], width=100))  # wrap at 100 chars
    print("=" * 50)

    # FAISS expects C-contiguous float32 input.
    job_matrix = np.ascontiguousarray(job_embedds, dtype="float32")
    query = np.ascontiguousarray(resume_embedds, dtype="float32").reshape(1, -1)

    job_index = faiss.IndexFlatIP(job_matrix.shape[1])
    job_index.add(job_matrix)

    print("Top job matches for resume:")
    k = min(int(top_k), job_matrix.shape[0])
    if k <= 0:
        return
    scores, indices = job_index.search(query, k)

    for rank, (score, idx) in enumerate(zip(scores[0], indices[0]), start=1):
        # FAISS marks missing results with label -1; as a Python index that
        # would silently select the last job, so skip it.
        if idx < 0:
            continue
        job_title = job_titles[idx]
        job_preview = re.sub(r'\s+', ' ', job_description[idx]).strip()

        print(f"\n{rank}. Score: {score:.4f}")
        print(f"Job Title: {job_title}")
        print("Description:")
        print(textwrap.fill(job_preview[:500], width=100))


# Build the title and description lists that the matcher indexes by job row.
# job_texts was already created in the embedding cell; it is re-derived here
# from the same column.
job_titles = jobs_gb_clean['title'].tolist()
job_texts = jobs_gb_clean['description'].tolist()


# Example run: top 5 jobs for the resume at position 2.
resume_idx = 2
match_resume_to_jobs(
    resume_embedds=resume_embedds[resume_idx],  # a single row, not the whole matrix
    job_embedds=job_embedds,
    job_description=job_texts,
    resumes=resume_texts,
    resume_idx=resume_idx,
    top_k=5,
    job_titles=job_titles
)

Resume Preview (Index 2):
hr director summary over 20 years experience in recruiting, 15 plus years in human resources
executive management, 5 years of hris development and maintenance 4 years working in a healthcare
enviroment skills recruiting fmla/eeo/flsa hris development benefit administration policy
development web page development accomplishments kansas health institute -health outcomes for the
state of kansas -1999 memberships and accolades: project management institute member, shrm, chamber
of commerce, 1999 fri
Top job matches for resume:

1. Score: 0.4774
Job Title: Health and Safety Manager
Description:
we are seeking a highly experienced health & safety manager (housing) to join our housing services
team. in this crucial role, you will be responsible for developing, implementing, and overseeing
health and safety policies, ensuring compliance with all relevant legislation, and promoting a
safety-first culture across our housing services. this is a fantastic opportunity to p

In [85]:
# Search for the top k resumes to the jobs
def match_job_to_resumes(job_embedds, resume_embedds, resume_texts, job_descriptions, job_titles, job_idx, top_k=5):
    """Print a preview of one job followed by its most similar resumes.

    Resumes are ranked by inner product using a FAISS ``IndexFlatIP`` built
    from ``resume_embedds`` on every call. The scores are cosine similarities
    only if both sides are L2-normalised by the caller.

    Args:
        job_embedds: Embedding of the single job being matched (despite the
            plural name); it is reshaped to ``(1, dim)``.
        resume_embedds: 2-D array with one embedding row per resume.
        resume_texts: Sequence of resume strings, indexed by resume row.
        job_descriptions: Sequence of job descriptions; only
            ``job_descriptions[job_idx]`` is read, for the preview.
        job_titles: Sequence of job titles; only ``job_titles[job_idx]`` is read.
        job_idx: Position of the job, used for the header and preview.
        top_k: Maximum number of matches to print. It is capped at the number
            of resumes so FAISS never has to pad the result.

    Returns:
        None. Everything is written to stdout.
    """
    print("=" * 50)
    print(f"Job Description Preview (Index {job_idx}):")
    print(f"Job Title: {job_titles[job_idx]}")
    job_preview = re.sub(r'\s+', ' ', job_descriptions[job_idx]).strip()
    print("Description:")
    print(textwrap.fill(job_preview[:500], width=150))
    print("=" * 50)

    # FAISS index setup (FAISS expects C-contiguous float32 input)
    resume_matrix = np.ascontiguousarray(resume_embedds, dtype="float32")
    query = np.ascontiguousarray(job_embedds, dtype="float32").reshape(1, -1)

    resume_index = faiss.IndexFlatIP(resume_matrix.shape[1])
    resume_index.add(resume_matrix)

    print("Top resume matches for this job:")
    k = min(int(top_k), resume_matrix.shape[0])
    if k <= 0:
        return
    scores, indices = resume_index.search(query, k)

    for rank, (score, idx) in enumerate(zip(scores[0], indices[0]), start=1):
        # FAISS marks missing results with label -1; as a Python index that
        # would silently select the last resume, so skip it.
        if idx < 0:
            continue
        resume_preview = re.sub(r'\s+', ' ', resume_texts[idx]).strip()

        print(f"\n{rank}. Score: {score:.4f}")
        print("Resume Preview:")
        print(textwrap.fill(resume_preview[:500], width=150))


# Re-derive the text lists from the dataframes; the matcher indexes
# resume_texts by resume embedding row.
job_texts = jobs_gb_clean['description'].tolist()
job_titles = jobs_gb_clean['title'].tolist()
resume_texts = resumes['resume_str'].tolist()

# Example run: top 5 resumes for the job at position 120.
job_idx = 120
match_job_to_resumes(
    job_embedds=job_embedds[job_idx],  # a single row, not the whole matrix
    resume_embedds=resume_embedds,
    resume_texts=resume_texts,
    job_descriptions=job_texts,
    job_titles=job_titles,
    job_idx=job_idx,
    top_k=5
)

Job Description Preview (Index 120):
Job Title: Field Sales Representative - Self Employed
Description:
field sales representative- self employed – chester be at the forefront of innovation within payments technology join the 5 rated, 1 ranked payment
provider in the uk with uncapped commission and recurring revenue share benefits earnings between £40-£120k ote (commission only) residual income paid
monthly multiple acquirers & software partners latest hardware and technology continual training and support print & digital marketing materials
supplied apps approved in hours, paid same week support…
Top resume matches for this job:

1. Score: 0.5173
Resume Preview:
revolving credit support specialist professional background energetic, dedicated support specialist with strong interpersonal skills and 8+ years of
contact center and customer service experience. proven ability to work effectively with people of various ages, cultural backgrounds, and socio-
economic statuses. financial and c

## Modelling with voyage embedding

In [None]:
# Set up VoyageAI client
# The API key is read from the environment so it never appears in the notebook.
# A missing (or empty) key stops the cell here with an explanatory KeyError
# instead of a bare lookup whose only effect was an unexplained KeyError.
voyage_api_key = os.environ.get("VOYAGE_API_KEY")
if not voyage_api_key:
    raise KeyError("VOYAGE_API_KEY is not set; export it in the environment before running this cell.")
client = Client(api_key=voyage_api_key)

# Embed and normalize function using VoyageAI
# NOTE: this reuses the name of the SentenceTransformer-based helper defined
# earlier in the notebook; once this cell runs, `embed_and_normalize` refers
# to this Voyage version.
def embed_and_normalize(texts, batch_size=64, model_name="voyage-2", input_type="document"):
    """Embed texts through the notebook-level Voyage `client` and L2-normalise each row.

    Args:
        texts: Iterable of strings to embed.
        batch_size: Number of texts sent per ``client.embed`` call.
        model_name: Voyage model identifier passed as ``model=``.
        input_type: Value passed as ``input_type=`` to ``client.embed``.

    Returns:
        A float32 array with one row per text, stacked in input order. Every
        non-zero row has unit L2 norm; all-zero rows are left as zeros instead
        of becoming NaN. Empty input returns an array of shape ``(0, 0)``
        without calling the API.
    """
    texts = list(texts)
    if not texts:
        # np.vstack([]) raises, and there is nothing to send to the API anyway.
        return np.empty((0, 0), dtype="float32")

    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = client.embed(batch, model=model_name, input_type=input_type)
        emb = np.asarray(response.embeddings, dtype="float32")
        norms = np.linalg.norm(emb, axis=1, keepdims=True)
        # Avoid 0 / 0 -> NaN for a degenerate all-zero embedding.
        norms[norms == 0] = 1.0
        all_embeddings.append(emb / norms)
    return np.vstack(all_embeddings)


# Embed resumes and job descriptions with Voyage
# (each call sends the texts to the Voyage API in batches via embed_and_normalize)
resume_embedds_voy = embed_and_normalize(resume_texts)
job_embedds_voy = embed_and_normalize(job_texts)

# Save the embeddings to files so the next cell can load them from disk
np.save("resume_embeds_voyage.npy", resume_embedds_voy)
np.save("job_embeds_voyage.npy", job_embedds_voy)


In [87]:
# Load the saved embeddings (the .npy files written by the previous cell)
resume_embedds_voy = np.load("resume_embeds_voyage.npy")
job_embedds_voy = np.load("job_embeds_voyage.npy")

# Check their shapes to verify loading: one row per text
print(resume_embedds_voy.shape)
print(job_embedds_voy.shape)


(2484, 1024)
(500, 1024)


In [88]:
# Match resume to top k jobs
def match_resume_to_jobs_voy(resume_embedds_voy, job_embedds_voy, job_description, resumes, resume_idx, job_titles, top_k=5):
    """Print a preview of one resume followed by its most similar jobs.

    Unlike ``match_resume_to_jobs``, this takes the full resume embedding
    matrix and selects row ``resume_idx`` itself. Jobs are ranked by inner
    product using a FAISS ``IndexFlatIP`` built on every call; the scores are
    cosine similarities only if both matrices are L2-normalised.

    Args:
        resume_embedds_voy: 2-D array with one embedding row per resume.
        job_embedds_voy: 2-D array with one embedding row per job.
        job_description: Sequence of job description strings, indexed by job row.
        resumes: Sequence of resume strings; only ``resumes[resume_idx]`` is read.
        resume_idx: Row of the resume to match, also used for the preview.
        job_titles: Sequence of job titles, indexed by job row.
        top_k: Maximum number of matches to print. It is capped at the number
            of jobs so FAISS never has to pad the result.

    Returns:
        None. Everything is written to stdout.
    """
    print("=" * 50)
    print(f"Resume Preview (Index {resume_idx}):")
    resume_preview = re.sub(r'\s+', ' ', resumes[resume_idx]).strip()
    print(textwrap.fill(resume_preview[:500], width=150))
    print("=" * 50)

    # Set up FAISS index with Voyage job embeddings.
    # np.load may return any dtype; FAISS expects C-contiguous float32.
    job_matrix = np.ascontiguousarray(job_embedds_voy, dtype="float32")
    job_index = faiss.IndexFlatIP(job_matrix.shape[1])
    job_index.add(job_matrix)

    # Use the Voyage resume embedding
    resume_embedd = np.ascontiguousarray(resume_embedds_voy[resume_idx], dtype="float32").reshape(1, -1)

    print("Top job matches for resume:")
    k = min(int(top_k), job_matrix.shape[0])
    if k <= 0:
        return
    scores, indices = job_index.search(resume_embedd, k)

    for rank, (score, idx) in enumerate(zip(scores[0], indices[0]), start=1):
        # FAISS marks missing results with label -1; as a Python index that
        # would silently select the last job, so skip it.
        if idx < 0:
            continue
        job_title = job_titles[idx]
        job_preview = re.sub(r'\s+', ' ', job_description[idx]).strip()

        print(f"\n{rank}. Score: {score:.4f}")
        print(f"Job Title: {job_title}")
        print("Description:")
        print(textwrap.fill(job_preview[:500], width=150))

# Re-derive the text lists from the dataframes; the matcher indexes
# job_titles / job_texts by job embedding row.
job_titles = jobs_gb_clean['title'].tolist()
job_texts = jobs_gb_clean['description'].tolist()
resume_texts = resumes['resume_str'].tolist()

# Choose the resume index to evaluate
resume_idx = 2

# The full embedding matrices are passed here; the function selects row resume_idx itself.
match_resume_to_jobs_voy(
    resume_embedds_voy=resume_embedds_voy,
    job_embedds_voy=job_embedds_voy,
    job_description=job_texts,
    resumes=resume_texts,
    resume_idx=resume_idx,
    job_titles=job_titles,
    top_k=5
)


Resume Preview (Index 2):
hr director summary over 20 years experience in recruiting, 15 plus years in human resources executive management, 5 years of hris development and
maintenance 4 years working in a healthcare enviroment skills recruiting fmla/eeo/flsa hris development benefit administration policy development web
page development accomplishments kansas health institute -health outcomes for the state of kansas -1999 memberships and accolades: project management
institute member, shrm, chamber of commerce, 1999 fri
Top job matches for resume:

1. Score: 0.8325
Job Title: HR Administrator
Description:
no experience necessary are you looking to build a career working in an office environment? does a future in hr sound exciting? if so, this
opportunity could be for you due to a severe skills shortage in the marketplace, hr personnel are in high demand. we have a pool of employers who are
seeking to employ newly trained individuals who are motivated to pursue a career in human resour

In [89]:
# Match job to top k resumes
def match_job_to_resumes_voy(job_embedds_voy, resume_embedds_voy, resume_texts, job_descriptions, job_titles, job_idx, top_k=5):
    """Print a preview of one job followed by its most similar resumes.

    Unlike ``match_job_to_resumes``, this takes the full job embedding matrix
    and selects row ``job_idx`` itself. Resumes are ranked by inner product
    using a FAISS ``IndexFlatIP`` built on every call; the scores are cosine
    similarities only if both matrices are L2-normalised.

    Args:
        job_embedds_voy: 2-D array with one embedding row per job.
        resume_embedds_voy: 2-D array with one embedding row per resume.
        resume_texts: Sequence of resume strings, indexed by resume row.
        job_descriptions: Sequence of job descriptions; only
            ``job_descriptions[job_idx]`` is read, for the preview.
        job_titles: Sequence of job titles; only ``job_titles[job_idx]`` is read.
        job_idx: Row of the job to match, also used for the preview.
        top_k: Maximum number of matches to print. It is capped at the number
            of resumes so FAISS never has to pad the result.

    Returns:
        None. Everything is written to stdout.
    """
    print("=" * 50)
    print(f"Job Description Preview (Index {job_idx}):")
    print(f"Job Title: {job_titles[job_idx]}")
    job_preview = re.sub(r'\s+', ' ', job_descriptions[job_idx]).strip()
    print("Description:")
    print(textwrap.fill(job_preview[:500], width=150))
    print("=" * 50)

    # FAISS index setup.
    # np.load may return any dtype; FAISS expects C-contiguous float32.
    resume_matrix = np.ascontiguousarray(resume_embedds_voy, dtype="float32")
    resume_index = faiss.IndexFlatIP(resume_matrix.shape[1])
    resume_index.add(resume_matrix)

    job_embedd = np.ascontiguousarray(job_embedds_voy[job_idx], dtype="float32").reshape(1, -1)

    print("Top resume matches for this job:")
    k = min(int(top_k), resume_matrix.shape[0])
    if k <= 0:
        return
    scores, indices = resume_index.search(job_embedd, k)

    for rank, (score, idx) in enumerate(zip(scores[0], indices[0]), start=1):
        # FAISS marks missing results with label -1; as a Python index that
        # would silently select the last resume, so skip it.
        if idx < 0:
            continue
        resume_preview = re.sub(r'\s+', ' ', resume_texts[idx]).strip()

        print(f"\n{rank}. Score: {score:.4f}")
        print("Resume Preview:")
        print(textwrap.fill(resume_preview[:500], width=150))


# Re-derive the text lists from the dataframes; the matcher indexes
# resume_texts by resume embedding row.
job_texts = jobs_gb_clean['description'].tolist()
job_titles = jobs_gb_clean['title'].tolist()
resume_texts = resumes['resume_str'].tolist()

# Choose the job index to evaluate
job_idx = 120

# The full embedding matrices are passed here; the function selects row job_idx itself.
match_job_to_resumes_voy(
    job_embedds_voy=job_embedds_voy,
    resume_embedds_voy=resume_embedds_voy,
    resume_texts=resume_texts,
    job_descriptions=job_texts,
    job_titles=job_titles,
    job_idx=job_idx,
    top_k=5
)

Job Description Preview (Index 120):
Job Title: Field Sales Representative - Self Employed
Description:
field sales representative- self employed – chester be at the forefront of innovation within payments technology join the 5 rated, 1 ranked payment
provider in the uk with uncapped commission and recurring revenue share benefits earnings between £40-£120k ote (commission only) residual income paid
monthly multiple acquirers & software partners latest hardware and technology continual training and support print & digital marketing materials
supplied apps approved in hours, paid same week support…
Top resume matches for this job:

1. Score: 0.8384
Resume Preview:
sales associate professional summary i am talented individual who will bring my sales talent, fashion sense, and passion for clothing to your company.
i have a proven track record of success in sales .i am looking for suitable position with a company that offers there staff superb career
opportunities, job enrichment and a sup

## Model Evaluation

In [55]:
# Extract category labels.
# In the evaluation cell these are only used to count distinct categories,
# which sets the number of KMeans clusters for each dataset.
resume_cats = resumes["category"].tolist()
job_cats = jobs_gb_clean["category"].tolist()

In [None]:
# Compute the silhouette score of the embeddings
def compute_silhouette(embeddings: np.ndarray, n_clusters: int):
    """Cluster the embeddings with KMeans and score that clustering.

    Args:
        embeddings: Array of vectors to cluster; passed to both KMeans and
            ``silhouette_score``.
        n_clusters: Number of KMeans clusters to fit.

    Returns:
        The cosine-metric silhouette score of the KMeans labels
        (``random_state=42``), or ``None`` when ``n_clusters`` is below 2.
    """
    if n_clusters < 2:
        # Nothing to score with fewer than two clusters.
        return None

    cluster_labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(embeddings)
    return silhouette_score(embeddings, cluster_labels, metric="cosine")

# Use the number of distinct category labels as the KMeans cluster count,
# separately for resumes and jobs. The labels themselves are not scored:
# compute_silhouette scores the KMeans assignments.
n_clusters_resumes = len(set(resume_cats))  # distinct resume categories
n_clusters_jobs = len(set(job_cats))  # distinct job categories

# Compute silhouette scores for resumes and jobs 
sil_resumes_sbert = compute_silhouette(resume_embedds, n_clusters_resumes)
sil_jobs_sbert = compute_silhouette(job_embedds, n_clusters_jobs)

sil_resumes_voyage = compute_silhouette(resume_embedds_voy, n_clusters_resumes)
sil_jobs_voyage = compute_silhouette(job_embedds_voy, n_clusters_jobs)

# NOTE(review): compute_silhouette returns None when n_clusters < 2, and the
# :.4f formatting below would then raise — confirm both datasets have at least
# two categories.
print("Silhouette Scores:")
print(f"SBERT Resumes: {sil_resumes_sbert:.4f}")
print(f"SBERT Jobs: {sil_jobs_sbert:.4f}")
print(f"VoyageAI Resumes: {sil_resumes_voyage:.4f}")
print(f"VoyageAI Jobs: {sil_jobs_voyage:.4f}")


Silhouette Scores:
SBERT Resumes: 0.1184
SBERT Jobs: 0.0898
VoyageAI Resumes: 0.1070
VoyageAI Jobs: 0.1024


- Although all of the silhouette scores are relatively low, SBERT scores slightly higher than VoyageAI on the resume embeddings (0.1184 vs. 0.1070), whereas VoyageAI scores higher on the job embeddings (0.1024 vs. 0.0898).