In [8]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from PyPDF2 import PdfReader
from nltk.corpus import stopwords

In [9]:
# Load the spaCy English model once; preprocess_text() calls `nlp` to tokenize.
nlp = spacy.load('en_core_web_sm')

In [10]:
# Job posting that every resume is scored against. Written as adjacent string
# literals (joined at compile time) so the text stays readable in the cell;
# the resulting value, including the trailing newline, is unchanged.
job_description = (
    "We are looking for a skilled software engineer to join our team. "
    "The ideal candidate should have proficiency in programming languages "
    "such as Python, Java, and C++. "
    "Experience with data structures, algorithms, and software development "
    "best practices is essential. "
    "Familiarity with web development frameworks such as React and Django "
    "is a plus.\n"
)

In [11]:
job_description

'We are looking for a skilled software engineer to join our team. The ideal candidate should have proficiency in programming languages such as Python, Java, and C++. Experience with data structures, algorithms, and software development best practices is essential. Familiarity with web development frameworks such as React and Django is a plus.\n'

In [12]:
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in a PDF.

    Parameters
    ----------
    file_path
        Path (or other source accepted by ``PdfReader``) of the PDF to read.

    Returns
    -------
    str
        The text of all pages joined in page order, with no separator added
        between pages. Empty string if no page yields any text.
    """
    # `PdfReader` is the class imported from PyPDF2 at the top of the
    # notebook; the previous spelling `Pdfreader` raised NameError.
    reader = PdfReader(file_path)
    # `or ""` guards against a page whose extraction yields None (e.g. a
    # page with no text layer), which would otherwise break concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)

In [21]:
_STOP_WORDS = None  # lazily-built cache of NLTK's English stop words


def _english_stop_words():
    """Return NLTK's English stop words as a set, building it only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize free text into a space-separated string of keyword tokens.

    The text is lower-cased, tokenized with the notebook-level spaCy
    pipeline ``nlp``, and reduced to tokens that are purely alphabetic and
    are not English stop words.

    Parameters
    ----------
    text
        Raw text. Anything that is not a non-empty ``str`` (``None``, ``""``,
        or the float NaN pandas uses for missing cells) is treated as empty.

    Returns
    -------
    str
        The kept tokens joined by single spaces, or ``""`` for empty input.

    Notes
    -----
    The ``isalpha`` filter discards tokens containing digits or punctuation,
    so skill names such as ``c++`` and ``node.js`` do not survive cleaning.
    """
    # Missing CSV cells arrive as float NaN, which is truthy and has no
    # .lower(); reject every non-string up front instead of crashing.
    if not isinstance(text, str) or not text:
        return ""
    stop_words = _english_stop_words()
    doc = nlp(text.lower())
    tokens = [
        token.text
        for token in doc
        if token.text.isalpha() and token.text not in stop_words
    ]
    return " ".join(tokens)

In [22]:
# Candidate resumes, loaded from a CSV in the notebook's working directory.
resume_a = pd.read_csv('resumes_500.csv')

In [23]:
resume_a

Unnamed: 0,name,resume
0,Krista Wade,HTML C++ Database management React Node.js Web...
1,David Hicks,CSS Algorithms Python Backend technologies Nod...
2,Kathleen Molina,JavaScript Database management API development...
3,Steven Griffin,CSS Python HTML Java Data structures Backend t...
4,Alexander Wilkinson,Backend technologies Machine learning JavaScri...
...,...,...
495,Daniel Fitzgerald,Node.js HTML CSS Data structures Machine learning
496,Benjamin Stephens,API development Java C++ Backend technologies ...
497,Jesse Booth,CSS HTML Data structures Machine learning Algo...
498,Katelyn Sanchez,React Node.js C++ Machine learning CSS


In [24]:
# Run the job description and every resume through the same cleaning step so
# both sides are tokenized identically before vectorization.
job_description_newcl = preprocess_text(job_description)
# Read the source column from `resume_a` itself; the previous `resume[...]`
# referred to a name that is never defined in this notebook and raised
# NameError on a fresh kernel.
resume_a['new_resume'] = resume_a['resume'].apply(preprocess_text)

In [25]:
resume_a

Unnamed: 0,name,resume,new_resume
0,Krista Wade,HTML C++ Database management React Node.js Web...,html database management react web development...
1,David Hicks,CSS Algorithms Python Backend technologies Nod...,css algorithms python backend technologies
2,Kathleen Molina,JavaScript Database management API development...,javascript database management api development...
3,Steven Griffin,CSS Python HTML Java Data structures Backend t...,css python html java data structures backend t...
4,Alexander Wilkinson,Backend technologies Machine learning JavaScri...,backend technologies machine learning javascri...
...,...,...,...
495,Daniel Fitzgerald,Node.js HTML CSS Data structures Machine learning,html css data structures machine learning
496,Benjamin Stephens,API development Java C++ Backend technologies ...,api development java backend technologies data...
497,Jesse Booth,CSS HTML Data structures Machine learning Algo...,css html data structures machine learning algo...
498,Katelyn Sanchez,React Node.js C++ Machine learning CSS,react machine learning css


In [28]:
# Fit one TF-IDF vocabulary over the job description and all resumes together.
# Row 0 of the resulting matrix is the job description; the remaining rows are
# the resumes in `resume_a` order.
vectorizer = TfidfVectorizer()
all_texts = [job_description_newcl, *resume_a['new_resume'].tolist()]
tfidf_mat = vectorizer.fit_transform(all_texts)

In [33]:
# Compare the job-description row (kept 2-D by slicing) against every resume
# row; the result has a single row holding one similarity per resume.
cosine_similarities = cosine_similarity(
    tfidf_mat[0:1],
    tfidf_mat[1:],
)
# Store the similarities scaled by 100 as each resume's match score.
resume_a['matched_score'] = cosine_similarities[0] * 100

In [34]:
resume_a

Unnamed: 0,name,resume,new_resume,matched_score
0,Krista Wade,HTML C++ Database management React Node.js Web...,html database management react web development...,8.923274
1,David Hicks,CSS Algorithms Python Backend technologies Nod...,css algorithms python backend technologies,5.526661
2,Kathleen Molina,JavaScript Database management API development...,javascript database management api development...,11.918845
3,Steven Griffin,CSS Python HTML Java Data structures Backend t...,css python html java data structures backend t...,7.544155
4,Alexander Wilkinson,Backend technologies Machine learning JavaScri...,backend technologies machine learning javascri...,6.118369
...,...,...,...,...
495,Daniel Fitzgerald,Node.js HTML CSS Data structures Machine learning,html css data structures machine learning,4.722098
496,Benjamin Stephens,API development Java C++ Backend technologies ...,api development java backend technologies data...,8.741468
497,Jesse Booth,CSS HTML Data structures Machine learning Algo...,css html data structures machine learning algo...,10.552406
498,Katelyn Sanchez,React Node.js C++ Machine learning CSS,react machine learning css,3.136372


In [39]:
# Rank candidates from best to worst match; the original row labels are kept
# so each row can still be traced back to `resume_a`.
resumes_sorted = resume_a.sort_values(
    by='matched_score',
    ascending=False,
)
resumes_sorted

Unnamed: 0,name,resume,new_resume,matched_score
404,Christine Patterson,Data structures Web development Algorithms Pyt...,data structures web development algorithms pyt...,16.682520
170,Mikayla Garcia,C++ CSS Algorithms React Web development Pytho...,css algorithms react web development python ap...,16.605389
207,Andrew Smith,Web development Java Node.js Python C++ Django...,web development java python django data struct...,16.572619
35,Wendy Kim,Algorithms Data structures React Django Web de...,algorithms data structures react django web de...,15.880838
264,Jesus Brown,Problem-solving Django Python Web development ...,problem solving django python web development ...,15.779941
...,...,...,...,...
461,Erin Bridges,Database management CSS JavaScript Problem-sol...,database management css javascript problem sol...,2.008450
148,Amanda Martinez,API development Machine learning Problem-solvi...,api development machine learning problem solvi...,1.982180
270,Jonathan Burns PhD,Problem-solving Machine learning CSS Backend t...,problem solving machine learning css backend t...,0.000000
137,Julie Welch,Database management JavaScript Problem-solving...,database management javascript problem solving...,0.000000


In [41]:
# Plain-text view of just the candidate name and score, in ranked order.
# NOTE(review): this prints the whole ranked frame, yet the recorded output
# below shows only ten rows — the saved output may be stale; confirm on re-run.
print(resumes_sorted[['name', 'matched_score']]) 

                     name  matched_score
404   Christine Patterson      16.682520
170        Mikayla Garcia      16.605389
207          Andrew Smith      16.572619
35              Wendy Kim      15.880838
264           Jesus Brown      15.779941
304         Matthew Perry      15.566897
79          Rachel Howard      15.176138
369      Miranda Castillo      15.047985
222      Brendan Schwartz      15.009859
117  Alexander Fitzgerald      14.932438


### Let's find the top 10 candidates matching this profile

In [42]:
# Show the ten highest-scoring candidates (the frame is already sorted by
# matched_score in descending order).
print(resumes_sorted[['name', 'matched_score']].head(10)) 

                     name  matched_score
404   Christine Patterson      16.682520
170        Mikayla Garcia      16.605389
207          Andrew Smith      16.572619
35              Wendy Kim      15.880838
264           Jesus Brown      15.779941
304         Matthew Perry      15.566897
79          Rachel Howard      15.176138
369      Miranda Castillo      15.047985
222      Brendan Schwartz      15.009859
117  Alexander Fitzgerald      14.932438
