In [5]:
import PyPDF2
import pandas as pd
import numpy as np

from textblob import TextBlob
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import os
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Root folder of the resume PDFs; the collection loop further down expects
# one sub-folder per category inside it.
dataset_root_folder = './Resumespdf/data/data'

# CSV holding the job postings; the "job_description" and "company_name"
# columns are read from it further down.
job_desc_csv_path = './jobdesc_data.csv'
dataset = pd.read_csv(job_desc_csv_path)

def text_cleaning(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order: strip HTML tags, drop URLs, replace every character
    that is not an ASCII letter with a space, collapse whitespace,
    lowercase, tokenize, remove English stopwords and Porter-stem what
    is left.

    Args:
        text: Raw text to clean. Any value that is not a ``str`` (for
            example ``None`` or a NaN coming from an empty CSV cell) is
            treated as empty text.

    Returns:
        The surviving stemmed tokens joined by single spaces, or an empty
        string when nothing survives (or when ``text`` is not a string).
    """
    # Non-string input would make re.sub raise TypeError; treat it as empty.
    if not isinstance(text, str):
        return ''

    # Remove HTML tags
    cleaned_text = re.sub(r'<.*?>', ' ', text)

    # Remove URLs while "://" and the other punctuation are still present.
    # Once non-letters are stripped a URL can no longer be told apart from
    # ordinary words, so this has to happen before the next substitution.
    cleaned_text = re.sub(r'https?://\S+', ' ', cleaned_text, flags=re.IGNORECASE)

    # Replace everything except ASCII letters with a space. This one pass
    # covers digits, punctuation, underscores and non-English characters.
    cleaned_text = re.sub(r'[^a-zA-Z]', ' ', cleaned_text)

    # Collapse runs of whitespace and trim the ends
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    # Convert to lowercase (the stopword comparison below is exact-match)
    cleaned_text = cleaned_text.lower()

    # Tokenize once, then drop stopwords and stem in a single pass
    words = word_tokenize(cleaned_text)
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words if word not in stop_words]

    return ' '.join(stemmed_words)

def extract_specific_lines(pdf_file_path):
    """Return the cleaned sentences of a PDF that mention education or skills.

    Every page is read with PyPDF2, passed through ``text_cleaning`` and
    then split into sentences with TextBlob. A sentence is kept when one
    of its words is exactly "educ" or "skill" (presumably the stemmed
    forms of "education" and "skills" produced by ``text_cleaning``).

    NOTE(review): ``text_cleaning`` replaces every non-letter character
    before the sentence split runs, so no sentence punctuation is left at
    that point and each page is likely returned as a single "sentence" --
    confirm whether per-page granularity is intended.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        A list of strings, one per matching sentence, in page order.
    """
    page_sentences = []
    with open(pdf_file_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        for page in reader.pages:
            # Clean the page text first, then let TextBlob split it
            cleaned_page = text_cleaning(page.extract_text())
            page_sentences.extend(TextBlob(cleaned_page).sentences)

    matching = []
    for sentence in map(str, page_sentences):
        # Whole-word membership test, not a substring search
        tokens = TextBlob(sentence.lower()).words
        if "educ" in tokens or "skill" in tokens:
            matching.append(sentence)

    return matching

# Cap on the number of usable resumes taken from each category folder.
max_resumes_per_category = 2

data = []          # rows of [category, file name, keyword lines] for df1
resumes_list = []  # one text per kept resume, later fed to the embedding model

# os.listdir returns entries in arbitrary order; sorting makes the resumes
# selected under the per-category cap the same on every run.
for category_folder in sorted(os.listdir(dataset_root_folder)):
    category_folder_path = os.path.join(dataset_root_folder, category_folder)
    if not os.path.isdir(category_folder_path):
        continue

    count = 0
    for filename in sorted(os.listdir(category_folder_path)):
        if not filename.endswith(".pdf"):
            continue

        pdf_path = os.path.join(category_folder_path, filename)
        # For every resume extract only the lines that contain the keywords
        resume_lines = extract_specific_lines(pdf_path)
        if not resume_lines:
            # Nothing matched in this resume: it does not count towards the cap
            continue

        # Only the first extracted line is used as this resume's text
        resumes_list.append(resume_lines[0])
        # Keep category, file name and every extracted line for the DataFrame;
        # appending to both lists together keeps their positions aligned.
        data.append([category_folder, filename, resume_lines])

        count += 1
        if count == max_resumes_per_category:
            break

# Create a Pandas DataFrame using the data list
df1 = pd.DataFrame(data, columns=["Category", "File", "Line"])

# Take the first 10 job descriptions together with their company names
job_descriptions = dataset["job_description"].head(10)
companies = dataset["company_name"].head(10)

# One-column DataFrame of the descriptions, indexed by company name
df = pd.DataFrame({'Job Description': job_descriptions})
df.index = companies

# job_description keeps the raw strings; job_description_cleaned holds the
# same entries, in the same order, after text_cleaning
job_description = list(df['Job Description'])
job_description_cleaned = [text_cleaning(raw_job) for raw_job in job_description]


# Load the embedding model.
# NOTE(review): the captured output of this cell reports that no
# sentence-transformers model was found under this name and that a new one
# was created with MEAN pooling -- confirm this fallback is intended.
model = SentenceTransformer("bert-base-uncased")

# Encode both text collections with the same model
embeddingsSearched = model.encode(job_description_cleaned)
embeddings = model.encode(resumes_list)

# Pairwise cosine similarity: one row per job description, one column per resume
similarity = cosine_similarity(X=embeddingsSearched, Y=embeddings)

# Print, for every job description, the best-matching resumes.
# The loop only needs the position i; it deliberately does not bind a loop
# variable named job_description, which would overwrite the module-level
# list of raw job descriptions that has the same name.
for i in range(len(job_description_cleaned)):
    # df is indexed by company name, so the description is fetched by
    # position with .iloc; an integer key passed to [] would be a label lookup.
    print(f"Job Description {i + 1}:\n {df.index[i]}:\n{df['Job Description'].iloc[i]}\n")

    # Get similarity scores for this job description
    similarity_scores = similarity[i]

    # Sort CVs by similarity score (highest to lowest)
    sorted_indices = similarity_scores.argsort()[::-1]

    # Keep at most the 5 best CVs (fewer when fewer resumes were collected)
    top_cv_indices = sorted_indices[:5]

    print("Top 5 CVs:")
    for j, cv_index in enumerate(top_cv_indices, start=1):
        # cv_index is a position in the similarity row, so df1 is read by position too
        print("cv_index= ",cv_index)
        print(f"CV {j} (Similarity Score: {similarity_scores[cv_index]}):")
        print(f"Category: {df1['Category'].iloc[cv_index]}")
        print(f"File Name: {df1['File'].iloc[cv_index]}")
        print(f"Resume Text: {df1['Line'].iloc[cv_index]}\n")


# Now, resumes listed are top 5 resumes (as text) for each job description,
# and the code prints them alongside the job descriptions.



[nltk_data] Downloading package stopwords to C:\Users\Harini
[nltk_data]     Balaji\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Harini
[nltk_data]     Balaji\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
No sentence-transformers model found with name C:\Users\Harini Balaji/.cache\torch\sentence_transformers\bert-base-uncased. Creating a new one with MEAN pooling.


Job Description 1:
 Googleminimum qualifications
bachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managing enterprise accounts with sales cycles
preferred qualifications
 years of experience building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent account management writtenverbal communication strategic and analyticalthinking skills
about the job
as a member of the google cloud team you inspire leading companies schools and government agencies to work smarter with google tools like google workspace search and chrome you advocate the innovative power of our products to make 