# Job Recommendation

In [1]:

import spacy
from spacy.matcher import PhraseMatcher
from skillNer.general_params import SKILL_DB
from skillNer.skill_extractor_class import SkillExtractor
import docx2txt
import PyPDF2
import os
import fitz  # PyMuPDF
import json
import warnings
import pandas as pd
import torch
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


warnings.filterwarnings("ignore", message="\[W008\]")


Text Extraction Functions

In [2]:
def extract_text_from_pdf_fitz(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text("text") + "\n"
    return text

def extract_text_from_file(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    
    if ext == ".pdf":
        try:
            # Try PyPDF2 first
            with open(file_path, "rb") as file:
                reader = PyPDF2.PdfReader(file)
                text = ""
                for page in reader.pages:
                    text += page.extract_text() or ""
                if text.strip():
                    return text
                else:
                    # fallback to PyMuPDF if empty
                    return extract_text_from_pdf_fitz(file_path)
        except Exception:
            return extract_text_from_pdf_fitz(file_path)

    elif ext == ".docx":
        try:
            return docx2txt.process(file_path)
        except Exception as e:
            raise ValueError(f"Failed to read DOCX: {e}")
    
    else:
        raise ValueError("Unsupported file type (.pdf/.docx only)")
    


print CV_Text

Custom Skill Loading and Extraction

In [3]:
def load_skills_from_json(file_path="skills.json"):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
            return data.get("skills", [])
    except Exception as e:
        print(f" Could not load custom skills from JSON: {e}")
        return []

def extract_skills_from_text(text, skill_keywords):
    return [skill for skill in skill_keywords if skill.lower() in text.lower()]


Initialize NLP and SkillExtractor

In [4]:
# Load SpaCy model
nlp = spacy.load("en_core_web_lg")

# Initialize SkillExtractor
try:
    skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)
except Exception as e:
    print(f" Warning: Could not initialize SkillExtractor: {e}")
    skill_extractor = None


loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


Load CV and Extract Text

In [5]:
file_path = input("Input path of your CV here (pdf or docx): ").strip()
if not os.path.exists(file_path):
    raise FileNotFoundError(f" File not found: {file_path}")

try:
    text = extract_text_from_file(file_path)
except Exception as e:
    print(f" Error extracting text: {e}")
    exit()


Extract Skills Using SkillExtractor (NLP) and Custom Keywords

In [6]:
nlp_skills = set()

if skill_extractor:
    try:
        annotations = skill_extractor.annotate(text)
        full_matches = annotations["results"].get("full_matches", [])
        ngram_matches = annotations["results"].get("ngram_scored", [])

        for match in full_matches + ngram_matches:
            if "doc_node_value" in match:
                nlp_skills.add(match["doc_node_value"])

    except Exception as e:
        print(f" Error using SkillExtractor: {e}")

custom_skills_list = load_skills_from_json()
keyword_skills = set(extract_skills_from_text(text, custom_skills_list))

all_skills = nlp_skills.union(keyword_skills)


Print Extracted Skills Summary

In [7]:
print(f"\n Total Skills Extracted: {len(all_skills)}")
print(" NLP-based:", list(nlp_skills))
print(" Keyword-based:", list(keyword_skills))
# print(" Combined:", list(all_skills))
# Normalize and deduplicate
normalized_skills = set(skill.strip() for skill in all_skills)

# Optionally convert back to list and sort
unique_skills = sorted(normalized_skills)
ignore = ["c", "d","m", "a", "b", "e", "f", "g", "h", "i", "j", "k", "l", "n","r", "o", "p", "q","s", "t", "u", "v", "w", "x", "y", "z"]

# Remove ignored words from unique_skills
unique_skills = [s for s in unique_skills if s not in ignore]
print("Combined:", list(unique_skills))




 Total Skills Extracted: 21
 NLP-based: ['teamwork', 'communication', 'languages khmer', 'm', 'microsoft office', 'english', 'com', 'creativity']
 Keyword-based: ['Teamwork', 'PowerPoint', 'English', 'R', 'AI', 'Word', 'Excel', 'Go', 'C', 'Khmer', 'D', 'Communication', 'Creativity']
Combined: ['AI', 'C', 'Communication', 'Creativity', 'D', 'English', 'Excel', 'Go', 'Khmer', 'PowerPoint', 'R', 'Teamwork', 'Word', 'com', 'communication', 'creativity', 'english', 'languages khmer', 'microsoft office', 'teamwork']


Load Job Dataset and Preprocess

In [8]:
df = pd.read_csv("camhr_cleaned_data.csv")  
df['job_text_lower'] = df['job_text'].fillna('').str.lower()


Prepare CV Skills Text for TF-IDF

In [9]:
cv_skills_set = set(skill.lower() for skill in unique_skills)
cv_text = ' '.join(cv_skills_set)


 TF-IDF Vectorization and Cosine Similarity Calculation

In [10]:
# Load a small, fast BERT model

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [11]:
#Generate embeddings
cv_embedding = model.encode(cv_text, convert_to_tensor=True)
# job_embeddings = model.encode(df['job_text'].tolist(), convert_to_tensor=True)
try:
    job_embeddings = torch.tensor(np.load("job_embeddings.npy"))
except FileNotFoundError:
    job_embeddings = model.encode(df['job_text'].tolist(), convert_to_tensor=True, batch_size=32, show_progress_bar=True)
    np.save("job_embeddings.npy", job_embeddings.cpu().numpy())


In [12]:
#Compute cosine similarity
cosine_scores = util.cos_sim(cv_embedding, job_embeddings)[0]
df['bert_match_score'] = cosine_scores.cpu().numpy()
print(df['bert_match_score'])

0        0.321903
1        0.243452
2        0.263744
3        0.360261
4        0.343817
           ...   
20131    0.263744
20132    0.210190
20133    0.304808
20134    0.362656
20135    0.218359
Name: bert_match_score, Length: 20136, dtype: float32


In [13]:
#Compute TF-IDF similarity
texts = df['job_text'].tolist() + [cv_text]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
cv_vector = tfidf_matrix[-1]  # Last vector is for CV
job_vectors = tfidf_matrix[:-1]  # All job vectors

cosine_scores_tfidf = cosine_similarity(cv_vector, job_vectors).flatten()
df['tfidf_match_score'] = cosine_scores_tfidf
print(df['tfidf_match_score'])

0        0.020195
1        0.049239
2        0.014848
3        0.015733
4        0.050476
           ...   
20131    0.014848
20132    0.006251
20133    0.000000
20134    0.014540
20135    0.016296
Name: tfidf_match_score, Length: 20136, dtype: float64


In [14]:
#Combine BERT + TF-IDF
# You can tune the weights (e.g., 0.6 for BERT, 0.4 for TF-IDF)
df['final_score'] = 0.6 * df['bert_match_score'] + 0.4 * df['tfidf_match_score']
print(df['final_score'])

0        0.201219
1        0.165767
2        0.164186
3        0.222449
4        0.226481
           ...   
20131    0.164186
20132    0.128614
20133    0.182885
20134    0.223410
20135    0.137534
Name: final_score, Length: 20136, dtype: float64


In [15]:
#Compute matched skills

def match_skills(job_text):
    return {skill for skill in cv_skills_set if skill in job_text.lower()}

df['matched_skills'] = df['job_text'].apply(match_skills)
print(df['matched_skills'])

0              {d, excel, c, ai, microsoft office, com, r}
1        {d, excel, c, ai, microsoft office, com, word, r}
2                              {d, c, english, com, r, go}
3                              {d, c, english, com, r, go}
4        {d, excel, teamwork, c, communication, english...
                               ...                        
20131                          {d, c, english, com, r, go}
20132             {d, excel, c, communication, ai, com, r}
20133                                   {d, c, com, r, go}
20134         {d, excel, c, ai, english, com, word, r, go}
20135                          {d, c, ai, english, com, r}
Name: matched_skills, Length: 20136, dtype: object


In [16]:
#Sort and filter matches
df_sorted = df.sort_values(by='final_score', ascending=False)
top_matches = df_sorted[df_sorted['final_score'] > 0.3]
top_matches = top_matches.drop_duplicates(subset=['Company Name', 'Job Title'], keep='first')
  # You can adjust threshold


# Display Top 5 Job For User

In [17]:
#Show top recommendations with scores and skills

display(top_matches[['Job Title', 'Company Name', 'final_score', 'bert_match_score', 'tfidf_match_score', 'matched_skills', 'Link URL']].head())


Unnamed: 0,Job Title,Company Name,final_score,bert_match_score,tfidf_match_score,matched_skills,Link URL
5799,Coordinator,"Suosdey Works Pte., Ltd",0.445273,0.621728,0.180591,"{d, excel, c, english, com, khmer, word, r}",https://www.camhr.com/a/job/10591233
13788,Academic Coordinator (Khmer Academic Program),Westline School,0.431231,0.627288,0.137144,"{d, excel, c, ai, khmer, word, r}",https://www.camhr.com/a/job/10587817
443,Sales & Marketing Admin,"AQUALIFE CO., LTD.",0.419976,0.542577,0.236074,"{d, excel, c, communication, microsoft office,...",https://www.camhr.com/a/job/10600546
11791,Vice Principal (Khmer Academic Program) at Wes...,Westline School,0.416121,0.618974,0.11184,"{d, excel, c, ai, khmer, word, r}",https://www.camhr.com/a/job/10583765
17136,គ្រូភាសាខ្មែរ (មធ្យមសិក្សាបឋមភូមិ និងទុតិយភូមិ...,ក្រុមហ៊ុន ម៉េងលី ជេ. គួច អេឌ្យូខេសិន (Mengly J...,0.415006,0.643891,0.071678,"{d, excel, c, ai, khmer, word, r}",https://www.camhr.com/a/job/10596672


In [18]:
# import pandas as pd
# import numpy as np

# embeddings = np.load("job_embeddings.npy")
# df_embed = pd.DataFrame(embeddings)
# df_embed.to_csv("job_embeddings_preview.csv", index=False)


In [19]:
# texts = df['job_text_lower'].tolist() + [cv_text]
# vectorizer = TfidfVectorizer()
# tfidf_matrix = vectorizer.fit_transform(texts)

# cv_vector = tfidf_matrix[-1]
# job_vectors = tfidf_matrix[:-1]

# cosine_similarities = cosine_similarity(cv_vector, job_vectors).flatten()
# df['match_score'] = cosine_similarities

# print(df['match_score'])





In [20]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 6))
# plt.xlabel('Feature Index')
# plt.ylabel('TF-IDF Value')

# # # Plot the first job vector and the CV vector
# # plt.plot(job_vectors.toarray()[0], label='First Job Vector')
# # plt.plot(cv_vector.toarray()[0], label='CV Vector')

# plt.title('TF-IDF Comparison: Job vs CV')
# plt.scatter(range(len(job_vectors.toarray()[0])), job_vectors.toarray()[0], label='First Job Vector', alpha=0.7)
# plt.scatter(range(len(cv_vector.toarray()[0])), cv_vector.toarray()[0], label='CV Vector', alpha=0.7)
# plt.legend()
# plt.grid(True)
# plt.tight_layout()
# plt.show()


 Matched Skills (Keyword Overlap) per Job

In [21]:
# def match_skills(text):
#     return {skill for skill in cv_skills_set if skill in text}

# df['matched_skills'] = df['job_text_lower'].apply(match_skills)


# print(df['matched_skills'])


filter by experience

In [22]:
# import re
# from dateutil import parser
# from datetime import datetime

# def extract_experience_ranges(file_path):
#     # Match common date range formats
#     patterns = [
#         r'([A-Za-z]{3,9} \d{4})\s*[-–]\s*(Present|Current|[A-Za-z]{3,9} \d{4})',
#         r'(\d{4}-\d{2})\s*[-–]\s*(Present|Current|\d{4}-\d{2})'
#     ]

#     total_months = 0
#     for pattern in patterns:
#         matches = re.findall(pattern, file_path)
#         for start_str, end_str in matches:
#             try:
#                 start_date = parser.parse(start_str)
#                 end_date = datetime.today() if end_str.lower() in ['present', 'current'] else parser.parse(end_str)
#                 months = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month)
#                 total_months += max(months, 0)
#             except Exception as e:
#                 continue

#     return round(total_months / 12, 1)


In [23]:
# # cv_text = "Feb 2020 – Present\nMar 2018 – Dec 2019"
# years_exp = extract_experience_ranges(cv_text)
# print("Estimated total years of experience:", years_exp)


In [24]:
# from sklearn.tree import DecisionTreeClassifier

# # Updated dataset with new categories
# data = [
#     (0, 'Entry-Level'),
#     (1, 'Entry-Level'),
#     (2, 'Junior'),
#     (3, 'Junior'),
#     (4, 'Mid-Level'),
#     (5, 'Mid-Level'),
#     (6, 'Mid-Level'),
#     (7, 'Senior'),
#     (8, 'Senior'),
#     (10, 'Senior'),
#     (15, 'Senior'),
#     (20, 'Senior'),
# ]

# # Split into features and labels
# X = [[years] for years, label in data]
# y = [label for years, label in data]

# # Train the model
# model = DecisionTreeClassifier()
# model.fit(X, y)

# # Get user input
# y_ex = years_exp

# # Predict category
# prediction = model.predict([[y_ex]])
# print("You are categorized as:", prediction[0])


# Display Top 5 Job For User

Sort and Deduplicate by Company, Show Top Matches

In [25]:
# # Convert 'Year of Exp.' to numeric, coerce errors to NaN, then fill NaN with 0 for comparison
# df['Year of Exp.'] = pd.to_numeric(df['Year of Exp.'], errors='coerce').fillna(0)

# df_sorted = df.sort_values(by='match_score', ascending=False)

# df_sorted = df_sorted[df_sorted['match_score'] > 0.1]  # Filter out low match scores
# #df_sorted = df_sorted[(df_sorted['Year of Exp.'] <= y_ex)] #| (df_sorted['Year of Exp.'] < y_ex - 2)]
# df_unique_companies = df_sorted.drop_duplicates(subset='Company Name', keep='first')
# display(df_unique_companies[['Job Title', 'Company Name', 'Year of Exp.', 'Salary', 'matched_skills', 'match_score', 'Link URL']].head(5))

