# 0. Setup and Load Cleaned Dataset

In [None]:
!pip install gensim
!pip install scikit-learn

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m86.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')

# Data handling.
import pandas as pd
import numpy as np

# Word embedding model and the similarity metric used for ranking.
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

Mounted at /content/drive


In [None]:
# Base folder where the cleaned CSV files live (under the mounted Google Drive)
BASE_DIR = "/content/drive/MyDrive/NLP project/Data_Cleaning"

job_path    = f"{BASE_DIR}/cleaned_job_data_dedup.csv"  # de-duplicated job postings
resume_path = f"{BASE_DIR}/cleaned_resume.csv"

# Load cleaned job postings (text column: 'job_text_cleaned')
job_df = pd.read_csv(job_path)

# Load cleaned resumes (text column: 'cleaned_text')
resume_df = pd.read_csv(resume_path)

# Normalize column names to something easier to remember
job_df    = job_df.rename(columns={"job_text_cleaned": "job_text"})
resume_df = resume_df.rename(columns={"cleaned_text": "resume_text"})

# rename() silently ignores columns that do not exist, so verify the text
# columns are really there instead of failing later with an unrelated KeyError.
if "job_text" not in job_df.columns:
    raise KeyError(
        f"'job_text' column missing after rename; found columns: {list(job_df.columns)}"
    )
if "resume_text" not in resume_df.columns:
    raise KeyError(
        f"'resume_text' column missing after rename; found columns: {list(resume_df.columns)}"
    )

print(job_df.head(2))
print(resume_df.head(2))
print("Job columns:", job_df.columns)
print("Resume columns:", resume_df.columns)

   job_id                                           job_text
0       0  Digital Marketing Specialist\nManage and grow ...
1       1  Web Developer\nDesign and code user interfaces...
                                         resume_text
0  SKILLS\n - Programming & Analytics: Python, R ...
Job columns: Index(['job_id', 'job_text'], dtype='object')
Resume columns: Index(['resume_text'], dtype='object')


# 1. Tokenization

In [None]:
# Punctuation removed from the edges of a token. A leading "." is kept so that
# terms such as ".net" survive; inner characters ("c++", "node.js") are untouched.
_LEADING_PUNCT  = "\"'`([{<,;:!?"
_TRAILING_PUNCT = "\"'`)]}>.,;:!?"


def simple_tokenize(text, lowercase=True, strip_punct=True):
    """
    Whitespace tokenizer with light normalization.

    The input text is not guaranteed to be lowercased or free of punctuation,
    so by default tokens are lowercased and edge punctuation is stripped.
    Otherwise "Python", "python" and "python," become three separate
    vocabulary entries.

    Parameters
    ----------
    text : str or missing value
        Document text. None / NaN yields an empty list; other non-string
        values are converted with str().
    lowercase : bool, default True
        Lowercase the text before splitting.
    strip_punct : bool, default True
        Strip punctuation from both ends of each token and drop tokens that
        contain no alphanumeric character (e.g. "-", "&").

    Returns
    -------
    list of str
        The tokens, in document order.

    Pass lowercase=False, strip_punct=False for a plain `str.split()`.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return []
        text = str(text)

    if lowercase:
        text = text.lower()

    tokens = text.split()
    if not strip_punct:
        return tokens

    cleaned = []
    for token in tokens:
        token = token.lstrip(_LEADING_PUNCT).rstrip(_TRAILING_PUNCT)
        # Skip tokens that were pure punctuation (bullets, ampersands, dashes).
        if any(ch.isalnum() for ch in token):
            cleaned.append(token)
    return cleaned

# Tokenize every document; each row gets a list of tokens in a new "tokens" column.
job_df["tokens"] = job_df["job_text"].map(simple_tokenize)
resume_df["tokens"] = resume_df["resume_text"].map(simple_tokenize)

# Quick look at the first rows of each frame.
print(job_df["tokens"].head(2))
print(resume_df["tokens"].head(2))

0    [Digital, Marketing, Specialist, Manage, and, ...
1    [Web, Developer, Design, and, code, user, inte...
Name: tokens, dtype: object
0    [SKILLS, -, Programming, &, Analytics:, Python...
Name: tokens, dtype: object


# 2. Train Word2Vec on Jobs + Resumes

In [None]:
# --------- 1) COMBINE ALL TOKENS INTO ONE TRAINING CORPUS ----------
# Every document (job posting or resume) is passed to Word2Vec as one "sentence".
training_corpus = list(job_df["tokens"]) + list(resume_df["tokens"])

print("Example sentence:", training_corpus[0][:20])

# --------- 2) WORD2VEC SETTINGS ----------
EMBEDDING_DIM = 300   # vector size; lower values train faster
WINDOW_SIZE   = 5     # max distance between the current and the predicted word
MIN_COUNT     = 10    # ignore very rare words, mentioned less than 10 times
SG            = 1     # 0 = CBOW (faster), 1 = Skip-gram (slower but better for this case)
EPOCHS        = 10    # passes over the training corpus
WORKERS       = 4     # training threads
SEED          = 1     # gensim's default seed, made explicit here

# Reproducibility note: per the gensim documentation, a fixed seed alone does not
# make training deterministic when WORKERS > 1; fully repeatable runs additionally
# need a single worker thread and a fixed PYTHONHASHSEED.
w2v_model = Word2Vec(
    sentences=training_corpus,
    vector_size=EMBEDDING_DIM,
    window=WINDOW_SIZE,
    min_count=MIN_COUNT,
    workers=WORKERS,
    sg=SG,
    epochs=EPOCHS,
    seed=SEED,
)

# Note (from the authors): the hyperparameters above were tested and are the
# best-performing configuration found during tuning.

print("Vocabulary size:", len(w2v_model.wv))

Example sentence: ['Digital', 'Marketing', 'Specialist', 'Manage', 'and', 'grow', 'social', 'media', 'accounts,', 'create', 'engaging', 'content,', 'and', 'interact', 'with', 'the', 'online', 'community.', 'Develop', 'social']
Vocabulary size: 32451


# 3. Build Document Embeddings for All Jobs & Resumes

In [None]:
import numpy as np

def get_doc_embedding(tokens, model, vector_size=None):
    """
    Turn a list of tokens into a single document vector
    by averaging all in-vocabulary word vectors.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    model : object exposing `.wv`
        Trained embedding model; `model.wv` must support `word in wv`,
        `wv[word]` and `wv.vector_size`.
    vector_size : int, optional
        Length of the zero vector returned when no token is in the
        vocabulary. Defaults to `model.wv.vector_size`, so the fallback
        always matches the model that was actually passed in.

    Returns
    -------
    numpy.ndarray
        1-D document vector; all zeros if no token is in the vocabulary.
    """
    keyed_vectors = model.wv
    if vector_size is None:
        vector_size = keyed_vectors.vector_size

    vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]

    # No known words (empty or fully out-of-vocabulary document): return a
    # zero vector so downstream stacking still gets a consistent shape.
    if not vectors:
        return np.zeros(vector_size, dtype="float32")

    return np.mean(vectors, axis=0)

# Embed every job posting: one averaged word-vector per row of tokens.
job_df["embedding"] = job_df["tokens"].apply(get_doc_embedding, model=w2v_model)

# Same treatment for the resumes.
resume_df["embedding"] = resume_df["tokens"].apply(get_doc_embedding, model=w2v_model)

# Sanity check: both should report the embedding dimensionality.
print("One job embedding shape:", job_df["embedding"].iloc[0].shape)
print("One resume embedding shape:", resume_df["embedding"].iloc[0].shape)

One job embedding shape: (300,)
One resume embedding shape: (300,)


# 4. Cosine Similarity: Rank Jobs for a Given Resume

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def rank_jobs_for_resume(resume_idx, job_df, resume_df, top_k=20):
    """
    Compute cosine similarity between a single resume and all job descriptions.

    Parameters
    ----------
    resume_idx : int
        Position of the resume in `resume_df` (after its index is reset).
    job_df : pandas.DataFrame
        Jobs; must contain an 'embedding' column of equal-length 1-D vectors.
    resume_df : pandas.DataFrame
        Resumes; must contain an 'embedding' column.
    top_k : int or None, default 20
        Number of best-scoring jobs to keep; None returns all jobs.

    Returns
    -------
    pandas.DataFrame
        A new frame (the inputs are not modified) holding the job rows with
        an added 'semantic_score' column, sorted descending. Its index is the
        job's position in `job_df`. If `job_df` has no rows, an empty frame
        with a 'semantic_score' column is returned.
    """
    # reset_index returns new frames, so positions are 0..n-1 and the
    # caller's data is never mutated below.
    result = job_df.reset_index(drop=True)
    resumes = resume_df.reset_index(drop=True)

    # 1. Get the resume embedding as a (1, dim) row vector
    resume_vec = np.asarray(resumes.loc[resume_idx, "embedding"]).reshape(1, -1)

    # Nothing to rank: np.stack would raise on an empty sequence.
    if len(result) == 0:
        result["semantic_score"] = np.empty(0, dtype="float64")
        return result

    # 2. Stack all job embeddings into an (n_jobs, dim) matrix
    job_matrix = np.stack(result["embedding"].values)

    # 3. Cosine similarity of the resume against every job -> (n_jobs,)
    result["semantic_score"] = cosine_similarity(resume_vec, job_matrix)[0]

    # 4. Sort & slice; a stable sort keeps tied jobs in their original order
    #    so the ranking is reproducible between runs.
    result = result.sort_values("semantic_score", ascending=False, kind="mergesort")

    if top_k is not None:
        result = result.head(top_k)

    return result

# Example: top 5 jobs for the first resume
TOP_K = 5  # how many top-ranked jobs to retrieve and display

top_jobs_for_first_resume = rank_jobs_for_resume(
    resume_idx=0,
    job_df=job_df,
    resume_df=resume_df,
    top_k=TOP_K
)

# Adjust the columns you want to see here; change TOP_K above to see more or fewer jobs
display_cols = [col for col in ["job_text", "semantic_score"] if col in top_jobs_for_first_resume.columns]
top_jobs_for_first_resume[display_cols].head(TOP_K)

Unnamed: 0,job_text,semantic_score
735,E-Commerce Analyst (Hospitality Investments) w...,0.958193
8243,Operations Data Analyst - W2 Only with verific...,0.953139
14648,"Professional, Business Intelligence Analyst - ...",0.951397
4191,Senior Business Intelligence Analyst\nAbout th...,0.951236
1528,Data Analyst\nAbout the job About Us Sports Re...,0.950588
