# 0. Setup and Load Cleaned Dataset

In [1]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.1.2-py3-none-any.whl (488 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-5.1.2


In [7]:
# from google.colab import drive
# drive.mount('/content/drive')

import json
import os

import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

In [15]:
BASE_DIR = "../Data"

job_path    = f"{BASE_DIR}/cleaned_data.json"  # use the deduplicated job data
resume_path = f"{BASE_DIR}/cleaned_resume.csv"

# Fail fast with the offending path: without this, a missing file would leave
# job_df unbound and surface later as an unrelated NameError.
if not os.path.exists(job_path):
    raise FileNotFoundError(f"Job data file not found: {job_path}")

# Load the job postings from cleaned_data.json
with open(job_path, 'r') as f:
    data = json.load(f)
job_df = pd.DataFrame(data)

resume_df = pd.read_csv(resume_path)  # column: 'cleaned_text'

# Normalize column names so downstream cells can rely on 'job_text' / 'resume_text'
job_df    = job_df.rename(columns={"job_text_cleaned": "job_text"})
resume_df = resume_df.rename(columns={"cleaned_text": "resume_text"})

print(job_df.head(2))
print(resume_df.head(2))

   job_id                                           job_text
0       0  Digital Marketing Specialist\nManage and grow ...
1       1  Web Developer\nDesign and code user interfaces...
                                         resume_text
0  SKILLS\n - Programming & Analytics: Python, R ...


# 1. Load SBERT Model (with GPU if available)

In [16]:
# Prefer CUDA when torch can see a GPU; otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# Instantiate the Sentence-BERT checkpoint, then move it onto the selected device
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model = model.to(device)

Using device: cpu


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# 2. Encode Job Descriptions & Resumes into Embeddings

In [None]:
# Pull the text columns out as plain Python lists of str
job_texts    = job_df["job_text"].astype(str).tolist()
resume_texts = resume_df["resume_text"].astype(str).tolist()

# Both corpora are encoded with the same SBERT options
encode_options = {
    "batch_size": 64,
    "show_progress_bar": True,
    "convert_to_tensor": True,
    "device": device,
}

job_embeddings = model.encode(job_texts, **encode_options)
resume_embeddings = model.encode(resume_texts, **encode_options)

print("Job embeddings shape:", job_embeddings.shape)
print("Resume embeddings shape:", resume_embeddings.shape)

Batches:   0%|          | 0/231 [00:00<?, ?it/s]

# 3. Rank Jobs for a Given Resume (Semantic Score with Cosine Similarity)

In [None]:
def rank_jobs_for_resume_sbert(resume_idx, job_df, resume_df,
                               job_embeds, resume_embeds,
                               top_k=5):
    """
    Rank jobs for a single resume using SBERT cosine similarity.

    Parameters
    ----------
    resume_idx : int
        Row position of the resume inside ``resume_embeds`` (0-based).
    job_df : pandas.DataFrame
        Job postings; row ``i`` (by position) must correspond to ``job_embeds[i]``.
    resume_df : pandas.DataFrame
        Kept for interface compatibility; this function does not read it.
    job_embeds : torch.Tensor of shape (num_jobs, dim)
    resume_embeds : torch.Tensor of shape (num_resumes, dim)
    top_k : int or None
        Number of top jobs to return; ``None`` returns every job.

    Returns
    -------
    pandas.DataFrame
        A copy of ``job_df`` with a fresh 0-based index and an added
        ``semantic_score`` column, sorted by descending score and truncated
        to ``top_k`` rows. ``job_df`` itself is not modified.

    Raises
    ------
    ValueError
        If ``job_df`` and ``job_embeds`` do not have the same number of rows.
    """
    # Scores are attached by position, so a length mismatch would mislabel jobs;
    # reject it up front with a message naming both sizes.
    if len(job_df) != len(job_embeds):
        raise ValueError(
            f"job_df has {len(job_df)} rows but job_embeds has {len(job_embeds)}"
        )

    # 1. Get the resume embedding as a (1, dim) batch
    resume_vec = resume_embeds[resume_idx].unsqueeze(0)

    # 2. Cosine similarity between this resume and all job embeddings -> (num_jobs,)
    cos_scores = util.cos_sim(resume_vec, job_embeds)[0]

    # 3. reset_index returns a new frame, so the caller's job_df stays untouched
    #    and the positional index lines up with the score vector.
    result = job_df.reset_index(drop=True)

    # 4. detach() so tensors that track gradients can still be converted to numpy
    result["semantic_score"] = cos_scores.detach().cpu().numpy()

    # 5. Sort by descending similarity
    result = result.sort_values("semantic_score", ascending=False)

    if top_k is not None:
        result = result.head(top_k)

    return result

In [None]:
# Score every job against the first resume and keep the five closest matches
top_jobs_for_resume0 = rank_jobs_for_resume_sbert(
    0,
    job_df,
    resume_df,
    job_embeddings,
    resume_embeddings,
    top_k=5,
)

# Display just the posting text next to its similarity score
# (extend the column list here if more columns are added later)
top_jobs_for_resume0.loc[:, ["job_text", "semantic_score"]].head(5)

Unnamed: 0,job_text,semantic_score
9424,Senior FP&A Analyst\nAbout the job Opportunity...,0.682754
3567,Senior Data Management Professional - Data Eng...,0.677219
7446,Data Product Owner / Data Consultant with veri...,0.677138
11471,Python Developer\nAbout the job Job Title: Pyt...,0.661784
1843,Frontend Engineer\nAbout the job Why Numeric E...,0.654277
