# EDA (Basic)

In [2]:
!pip install datasets
!pip install contractions
!pip install transformers

import numpy as np
import pandas as pd

import torch
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity

import re
import string
import contractions
from tqdm import tqdm
tqdm.pandas(desc="Progress Bar")


Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.17.2-py3-none-a

# loading and extracting JD data

In [None]:
df = load_dataset('jacob-hugging-face/job-descriptions', split="train")
df

In [4]:
df1 = pd.DataFrame(df)
df1.head()

Unnamed: 0,company_name,job_description,position_title,description_length,model_response
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ..."
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus..."
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs..."
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo..."


In [5]:
print(df1['job_description'][2])

its an amazing time to be joining netflix as we continue to transform entertainment globally netflix is the worlds leading internet entertainment service with over  million paid memberships in over  countries enjoying tv series documentaries and feature films across a wide variety of genres and languages members can watch as much as they want anytime anywhere on any internetconnected screen members can play pause and resume watching all without commercials or commitments

the consumer products team aspires to connect members to our content and each other by bringing their favorite stories to real life our products and campaigns should entertain delight and bring joy to our fans all over the world 

due to the expansion of licensees and categories we are in need of additional support with active and prospective partnerships for current and upcoming titles we are looking for a licensing coordinator who will work across the entire slate of netflix content to bring our content to life via 

In [6]:
df2 = pd.read_csv('pdf1.csv')
df2.head()

Unnamed: 0,Skills,Education,ID,Category
0,Accounting; General Accounting; Accounts Payab...,Northern Maine Community College 1994 Associat...,10554236,ACCOUNTANT
1,"accounting, accounts payable, Accounts Receiva...","Bachelor of Science : Accounting , May 2010 Un...",10674770,ACCOUNTANT
2,"accounts payables, accounts receivables, Accou...",Computer Applications Specialist Certificate P...,11163645,ACCOUNTANT
3,"accounting, balance sheet, budgets, client, cl...","EMORY UNIVERSITY, Goizueta Business School 5 2...",11759079,ACCOUNTANT
4,Aderant/CMS Financial reporting,Bachelor of Business Administration : Accounti...,12065211,ACCOUNTANT


# Text Cleaning

In [8]:
import pandas as pd
import re
import string
from contractions import fix

def text_cleaning(text: str) -> str:

    # Check for empty or null input
    if not text or pd.isnull(text):
        return ""

    # Lowercase everything
    text = text.lower().strip()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Expand contractions
    text = fix(text)

    # Remove URLs, emails, and phone numbers
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\b\d{1,3}[-./]?\d{1,3}[-./]?\d{1,4}\b', '', text)

    # Remove other non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    return text.strip()

# Drop rows where Skills and Education are both null
df_x = df2[~(df2['Skills'].isna() & df2['Education'].isna())].reset_index(drop=True)

# Fill null values in Skills and Education with empty string
df_x = df_x.fillna(value='')

# Concatenate Skills and Education
df_x['CV'] = df_x['Skills'] + ' ' + df_x['Education']

# Apply text cleaning to CV
df_x['CV'] = df_x['CV'].progress_apply(text_cleaning)


Progress Bar: 100%|██████████| 2460/2460 [00:00<00:00, 10446.70it/s]


In [10]:
df_x.shape

(2460, 5)

In [11]:
job_desc = df1['job_description'].apply(text_cleaning)[:13].to_list()

resumes = df_x['CV'].to_list()

# Creating Embedding 

In [None]:
import time

def tokenize_and_embed(tokenizer, model, texts):
  embeddings = []
  for text in texts:
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
      output = model(**tokens)
    embeddings.append(output.last_hidden_state.mean(dim=1).numpy()[0])  # Flatten the embeddings to 1D

  return embeddings


if __name__ == '__main__':
  # Initialize the DistilBERT tokenizer and model
  tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
  model = DistilBertModel.from_pretrained('distilbert-base-uncased')

  # Tokenize and embed job descriptions
  start_time = time.time()
  job_description_embeddings = tokenize_and_embed(tokenizer, model, job_desc)
  end_time = time.time()
  print(f'Tokenized and embedded job descriptions in {end_time - start_time:.2f} seconds.')

  # Tokenize and embed resumes
  start_time = time.time()
  resume_embeddings = tokenize_and_embed(tokenizer, model, resumes)
  end_time = time.time()
  print(f'Tokenized and embedded resumes in {end_time - start_time:.2f} seconds.')


In [13]:
job_description_embeddings[0].shape, resume_embeddings[0].shape

((768,), (768,))

In [14]:
len(job_description_embeddings), len(resume_embeddings)

(13, 2460)

# Calculating Similarity Score & Getting Top 5 Candidates

In [15]:
similarity_scores = cosine_similarity(job_description_embeddings, resume_embeddings)
similarity_scores

array([[0.8208877 , 0.7767726 , 0.78489166, ..., 0.85628176, 0.7705457 ,
        0.60554785],
       [0.78442526, 0.7171359 , 0.74813074, ..., 0.82919896, 0.7884388 ,
        0.6784491 ],
       [0.8139858 , 0.7895926 , 0.78363687, ..., 0.847719  , 0.75802994,
        0.6202629 ],
       ...,
       [0.84438765, 0.8200938 , 0.79944974, ..., 0.88014156, 0.77435607,
        0.6381687 ],
       [0.8395775 , 0.7994318 , 0.8025913 , ..., 0.8699881 , 0.7893372 ,
        0.62778986],
       [0.81677645, 0.77088434, 0.7748317 , ..., 0.8817086 , 0.7708576 ,
        0.6345934 ]], dtype=float32)

In [16]:
# Rank candidates for each job description based on similarity scores
def rank_candidates(similarity_scores, job_descriptions, jd_df, cv_df, num_top_candidates=5):

  top_candidates = []
  for i, job_description in enumerate(job_descriptions):
    candidates_with_scores = list(enumerate(similarity_scores[i]))
    candidates_with_scores.sort(key=lambda x: x[1], reverse=True)
    top_candidates_for_job = candidates_with_scores[:num_top_candidates]
    top_candidates.append(top_candidates_for_job)

  return top_candidates

num_top_candidates=5
# Get the top candidates for each job description
top_candidates = rank_candidates(similarity_scores, job_desc, df1, df_x, num_top_candidates)

# Print the top candidates for each job description
for i, job_description in enumerate(job_desc):
    print(f"Top candidates for JD {i+1} - Postition: {df1['position_title'][i]}")
    for candidate_index, score in top_candidates[i]:
        print(f"  Candidate {candidate_index + 1} - Similarity Score: {score:.4f} - {df_x['Category'][candidate_index]}/{df_x['ID'][candidate_index]}.pdf")
    print()

Top candidates for JD 1 - Postition: Sales Specialist
  Candidate 1942 - Similarity Score: 0.9415 - HR/18827609.pdf
  Candidate 291 - Similarity Score: 0.9388 - AGRICULTURE/62994611.pdf
  Candidate 28 - Similarity Score: 0.9377 - ACCOUNTANT/16237710.pdf
  Candidate 1796 - Similarity Score: 0.9313 - HEALTHCARE/10466208.pdf
  Candidate 2145 - Similarity Score: 0.9296 - PUBLIC-RELATIONS/12237267.pdf

Top candidates for JD 2 - Postition: Apple Solutions Consultant
  Candidate 168 - Similarity Score: 0.9245 - ADVOCATE/22391901.pdf
  Candidate 1724 - Similarity Score: 0.9159 - FITNESS/21238396.pdf
  Candidate 950 - Similarity Score: 0.9155 - CHEF/21869994.pdf
  Candidate 482 - Similarity Score: 0.9146 - ARTS/54100393.pdf
  Candidate 904 - Similarity Score: 0.9126 - BUSINESS-DEVELOPMENT/95382114.pdf

Top candidates for JD 3 - Postition: Licensing Coordinator - Consumer Products
  Candidate 2145 - Similarity Score: 0.9489 - PUBLIC-RELATIONS/12237267.pdf
  Candidate 1186 - Similarity Score: 0.9