In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer #to tun text into numbers
from sklearn.metrics.pairwise import cosine_similarity #to help mesure how similar tow texts are 


In [2]:
# Load the candidate list from the CSV file into a DataFrame.
df = pd.read_csv("potential_talents.csv")
# Preview the first five rows (head() returns rows, not columns).
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [3]:
df.isnull().sum()  # count the missing values in each column

id              0
job_title       0
location        0
connection      0
fit           104
dtype: int64

In [4]:
# Summarise the frame (index range, per-column non-null counts and dtypes),
# then display the number of rows as the cell's result.
df.info()
len(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          104 non-null    int64  
 1   job_title   104 non-null    object 
 2   location    104 non-null    object 
 3   connection  104 non-null    object 
 4   fit         0 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.2+ KB


104

In [5]:
# List the distinct job titles to see what the column contains.
df["job_title"].unique()

array(['2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional',
       'Native English Teacher at EPIK (English Program in Korea)',
       'Aspiring Human Resources Professional',
       'People Development Coordinator at Ryan',
       'Advisory Board Member at Celal Bayar University',
       'Aspiring Human Resources Specialist',
       'Student at Humber College and Aspiring Human Resources Generalist',
       'HR Senior Specialist',
       'Seeking Human Resources HRIS and Generalist Positions',
       'Student at Chapman University',
       'SVP, CHRO, Marketing & Communications, CSR Officer | ENGIE | Houston | The Woodlands | Energy | GPHR | SPHR',
       'Human Resources Coordinator at InterContinental Buckhead Atlanta',
       'Aspiring Human Resources Management student seeking an internship',
       'Seeking Human Resources Opportunities',
       'Experienced Retail Manager and aspiring Human Resources Professional',
       'H

In [6]:
# Count how many candidates share each distinct job title.
df['job_title'].value_counts()

job_title
2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional                 7
Aspiring Human Resources Professional                                                                                    7
Student at Humber College and Aspiring Human Resources Generalist                                                        7
People Development Coordinator at Ryan                                                                                   6
Native English Teacher at EPIK (English Program in Korea)                                                                5
Aspiring Human Resources Specialist                                                                                      5
HR Senior Specialist                                                                                                     5
Advisory Board Member at Celal Bayar University                                                                          4
Seekin

Now this is the feedback part.
I will star a candidate by id, and then we will check whether the overall top-K becomes more HR-like.

In [55]:


# Clean: replace missing job titles with an empty string and force every title to str

df["job_title"] = (
    df["job_title"]
    .fillna("")
    .astype(str)
)


# 1) Role keywords: the query text every job title is compared against

role_keywords = (
    "aspiring human resources "
    "seeking human resources "
    "hr"
)


# 2) HR terms for the baseline ranking and the evaluation proxy
#    (the phrases we look for in each candidate's job title).
#    "hr " and " hr" carry a space so the two letters only match at a word edge.

hr_terms = [
    "human resources", "hr ", " hr", "recruit", "recruitment",
    "talent acquisition", "talent", "hrbp", "people operations", "people ops",
    "payroll", "benefits", "employee relations", "staffing"
]

def has_hr_terms(title: str) -> int:
    """Return 1 if ``title`` contains any phrase from ``hr_terms``, else 0.

    The title is lower-cased, every non-alphanumeric character is turned into a
    space, runs of whitespace are collapsed, and the result is padded with one
    space on each side before the substring test. This makes the space-anchored
    terms ("hr " / " hr") match when HR sits at the start or end of the title or
    next to punctuation, e.g. "HR", "HR-Manager", "Director (HR)". Every title
    that matched on the raw lower-cased text still matches.

    Matching remains substring based, so "hr " also hits words that merely end
    in "hr" (for example "GPHR").

    Parameters
    ----------
    title : str
        Job title text. Non-string values (None, NaN) are treated as no match.

    Returns
    -------
    int
        1 when at least one HR term is found, otherwise 0.
    """
    if not isinstance(title, str):
        return 0

    lowered = title.lower()
    # Punctuation becomes a word separator so "(hr)" or "hr-manager" expose "hr" as a word.
    separated = "".join(ch if ch.isalnum() else " " for ch in lowered)
    # Collapse whitespace and pad both ends so edge-of-string words have a space on each side.
    padded = " " + " ".join(separated.split()) + " "
    return int(any(term in padded for term in hr_terms))


# 3) Light filter for roles that are clearly unrelated to HR (this step is optional)
Non_hr_terms = [
    "software engineer",
    "developer",
    "backend",
    "frontend",
    "full stack",
    "data engineer",
    "devops",
    "machine learning",
    "cloud engineer",
]

def is_obvious_non_hr(title: str) -> int:
    """Return 1 if the lower-cased ``title`` contains any phrase in ``Non_hr_terms``, else 0."""
    lowered = title.lower()
    for phrase in Non_hr_terms:
        if phrase in lowered:
            return 1
    return 0

# Keep only the candidates whose title is not flagged as an obvious non-HR role.
df_filtered = df[df["job_title"].apply(is_obvious_non_hr) == 0].copy()


# 4) Baseline ranking for a fair comparison: titles that contain an HR term come first

df_filtered["baseline_relevant"] = df_filtered["job_title"].apply(has_hr_terms)
baseline_ranked = df_filtered.sort_values("baseline_relevant", ascending=False).reset_index(drop=True)


# 5) TF-IDF (term frequency - inverse document frequency) ranking on the filtered data
#    The role keywords are appended as the last document, so X[-1] is the query
#    vector and X[:-1] are the candidate vectors.

vec = TfidfVectorizer(stop_words="english", ngram_range=(1,2))
X = vec.fit_transform(df_filtered["job_title"].tolist() + [role_keywords])

df_filtered["tfidf_score"] = cosine_similarity(X[:-1], X[-1]).ravel()
tfidf_ranked = df_filtered.sort_values("tfidf_score", ascending=False).reset_index(drop=True)


# 6) Star one candidate from the TF-IDF list (the 7th by default; change STAR_POSITION to pick another)

STAR_POSITION = 6      # 0-based row in tfidf_ranked, i.e. the 7th candidate
KEYWORD_WEIGHT = 0.8   # share of the updated score that comes from the role keywords
STAR_WEIGHT = 0.2      # share that comes from similarity to the starred title

if len(tfidf_ranked) <= STAR_POSITION:
    raise ValueError(
        f"Cannot star position {STAR_POSITION}: only {len(tfidf_ranked)} candidates are ranked."
    )

# tfidf_ranked has a fresh 0..n-1 index, so the label STAR_POSITION is also the row position.
# NOTE(review): rows with equal tfidf_score are ordered by sort_values' default algorithm;
# with many duplicate titles, pass kind="stable" above if a reproducible tie order matters.
starred_id = tfidf_ranked.loc[STAR_POSITION, "id"]
starred_title = tfidf_ranked.loc[STAR_POSITION, "job_title"]


# 7) Re-rank after the star: refit TF-IDF with the role keywords and the starred title
#    appended as the last two documents, then blend the two similarities.

text_to_numbers_2 = TfidfVectorizer(stop_words="english", ngram_range=(1,2))
vectors_2 = text_to_numbers_2.fit_transform(tfidf_ranked["job_title"].tolist() + [role_keywords, starred_title])

candidate_vectors_2 = vectors_2[:-2]
keyword_vector_2 = vectors_2[-2]
star_vector_2 = vectors_2[-1]

keyword_scores = cosine_similarity(candidate_vectors_2, keyword_vector_2).ravel()
star_scores = cosine_similarity(candidate_vectors_2, star_vector_2).ravel()

# Both score arrays follow tfidf_ranked's row order, so they can be stored as columns
# before sorting; the re-ranked list then carries them too.
tfidf_ranked["keyword_score"] = keyword_scores
tfidf_ranked["star_similarity"] = star_scores
tfidf_ranked["fit_score_updated"] = KEYWORD_WEIGHT * keyword_scores + STAR_WEIGHT * star_scores
updated_ranked_list = tfidf_ranked.sort_values("fit_score_updated", ascending=False).reset_index(drop=True)


# 8) Evaluation metric (custom helper)

def hr_hit_rate_at_k(ranked_df: pd.DataFrame, k: int) -> float:
    """Fraction of the first ``k`` rows of ``ranked_df`` whose job title contains an HR term.

    Each title in the ``job_title`` column is scored with ``has_hr_terms`` (0 or 1)
    and the mean of those flags is returned.
    """
    leading_titles = ranked_df["job_title"].head(k)
    hr_flags = leading_titles.apply(has_hr_terms)
    return hr_flags.mean()

# Report the same metric at K=50 and K=10 (the smaller K is more sensitive to changes at the top).
# The baseline is sorted by the very flag this metric counts, so its hit-rate is 1.0
# whenever at least K titles contain an HR term.
rankings = (
    ("Baseline HR hit-rate", baseline_ranked),
    ("TF-IDF   HR hit-rate", tfidf_ranked),
    ("AFTER starr  HR hit-rate", updated_ranked_list),
)

for suffix, k in (("@50", 50), ("_at10", 10)):
    for label, ranking in rankings:
        print(f"{label}{suffix}:", hr_hit_rate_at_k(ranking, k))


Baseline HR hit-rate@50: 1.0
TF-IDF   HR hit-rate@50: 0.98
AFTER starr  HR hit-rate@50: 0.98
Baseline HR hit-rate_at10: 1.0
TF-IDF   HR hit-rate_at10: 1.0
AFTER starr  HR hit-rate_at10: 1.0


In [56]:
# Attach each candidate's similarity to the starred title while the rows are still in
# TF-IDF order (star_scores was computed from tfidf_ranked's titles in that order).
tfidf_ranked["star_similarity"] = star_scores

# Rebuild the re-ranked list so that it carries the new column.
updated_ranked_list = (
    tfidf_ranked
    .sort_values(by="fit_score_updated", ascending=False)
    .reset_index(drop=True)
)

# Compare the top 10 before (TF-IDF order) and after (updated score order) the star.
print("Mean star similarity_at10 BEFORE:", tfidf_ranked["star_similarity"].head(10).mean())
print("Mean star similarity_at10 AFTER :", updated_ranked_list["star_similarity"].head(10).mean())


Mean star similarity_at10 BEFORE: 0.7488848661977843
Mean star similarity_at10 AFTER : 0.8752768586945964


In [57]:

# Store both component scores on the TF-IDF ranking (the arrays follow its row order).

tfidf_ranked["keyword_score"] = keyword_scores
tfidf_ranked["star_similarity"] = star_scores

# Updated score: 80% similarity to the role keywords, 20% similarity to the starred title.
tfidf_ranked["fit_score_updated"] = (
    0.8 * tfidf_ranked["keyword_score"] +
    0.2 * tfidf_ranked["star_similarity"]
)

# Re-rank after the star.
updated_ranked_list = tfidf_ranked.sort_values(
    "fit_score_updated", ascending=False
).reset_index(drop=True)


# Evidence: compare the top of the list before (TF-IDF order) and after (updated score order).
# The labels are built from top_k and threshold so the printed text always matches the values used.

top_k = 10
threshold = 0.70

before_top = tfidf_ranked.head(top_k)
after_top = updated_ranked_list.head(top_k)

for column, label in (("keyword_score", "keyword score"), ("star_similarity", "star similarity")):
    print(f"Mean {label}_at{top_k} BEFORE:", before_top[column].mean())
    print(f"Mean {label}_at{top_k} AFTER :", after_top[column].mean())

# Cluster effect: did MORE candidates similar to the starred one rise into the top K?
before_count = (before_top["star_similarity"] >= threshold).sum()
after_count  = (after_top["star_similarity"] >= threshold).sum()

print(f"Count of star-similar >= {threshold:.2f} in top {top_k} BEFORE:", before_count)
print(f"Count of star-similar >= {threshold:.2f} in top {top_k} AFTER :", after_count)


Mean keyword score_at10 BEFORE: 0.3805534743635245
Mean keyword score_at10 AFTER : 0.35951762114789937
Mean star similarity_at10 BEFORE: 0.7488848661977843
Mean star similarity_at10 AFTER : 0.8752768586945964
Count of star-similar >= 0.70 in top 10 BEFORE: 7
Count of star-similar >= 0.70 in top 10 AFTER : 8


#  Candidate Ranking Assignment

## Purpose
This project builds a system to **rank candidates** for a role and **update the ranking** when a recruiter provides feedback by starring a candidate.

---

## Data
Each row represents one candidate and includes:
- `id` – unique identifier  
- `job_title` – candidate’s role (text)  
- `location` – candidate location  
- `connection` – number of professional connections  
- `fit` – desired target (not provided in the dataset)

 Since the `fit` column has no labels, a supervised model cannot be trained.  
Instead, a **fit score** is created using text similarity and recruiter feedback.

---

## Method

### 1. Initial Ranking
- Compare each candidate’s job title with role keywords  
  (“Aspiring human resources”, “Seeking human resources”, “HR”)  
- Use TF-IDF and cosine similarity  
- Rank candidates by similarity score (0–1)

---

### 2. Light Filtering
- Remove obvious non-HR technical roles  
- Keep unclear cases to avoid losing strong candidates

---

### 3. Recruiter Feedback 
- A recruiter stars a candidate they consider a strong fit  
- The starred candidate is treated as an ideal example

---

### 4. Re-Ranking After Feedback
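Updated fit score:

$$\text{fit\_score\_updated} = 0.8 \times \text{keyword\_score} + 0.2 \times \text{star\_similarity}$$
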
This keeps role relevance dominant while learning from feedback.

---

## Evaluation
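Because true labels are not available, improvement is measured with proxy metrics on the top 10 results:
- Increased similarity of top candidates to the starred profile (mean star similarity rose from 0.75 to 0.88)  
- Roughly stable keyword relevance after re-ranking (mean keyword score moved from 0.38 to 0.36)  
- More candidates similar to the starred one appearing in the top results (7 → 8 with similarity ≥ 0.70)  

This suggests the ranking shifts toward the starred profile while keyword relevance stays dominant, i.e. without overfitting to the single starred example.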

---

## Cut-Off Strategy
- Use a **Top-K approach (K = 50)**  
- Works across roles and avoids hiding high-potential candidates

---

## Summary
This notebook demonstrates a candidate ranking system that:

- Ranks candidates using text similarity

- Learns from recruiter feedback

- Improves ranking over time

- Remains simple, transparent, and explainable

The system supports recruiters rather than replacing them.