In [10]:

import spacy
import re
import pandas as pd
from tika import parser
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
# Resume Parsing
# Retained so that any code still referring to this module-level name keeps working.
# tparser() does not write to it: sharing one dict across calls let sections found
# in one resume survive into the result of the next resume.
parsed_content = {}

# Lower-case heading keywords; the first occurrence of each one in the text is
# treated as the start of a resume section.
_SECTION_KEYWORDS = (
    "education", "summary", "accomplishments", "executive profile", "professional profile",
    "personal profile", "work background", "academic profile", "other activities", "qualifications",
    "experience", "interests", "skills", "achievements", "publications", "publication", "certifications",
    "workshops", "projects", "internships", "trainings", "hobbies", "overview", "objective",
    "position of responsibility", "jobs",
)


def _split_sections(text):
    """Return {keyword: section_text} for every section keyword occurring in `text`."""
    hits = []
    for key in _SECTION_KEYWORDS:
        pos = text.find(key)  # first occurrence anywhere, as a plain substring
        if pos != -1:
            hits.append((pos, key))
    hits.sort()

    sections = {}
    for n, (start, key) in enumerate(hits):
        # A section runs from its keyword up to the next keyword hit (or end of text).
        end = hits[n + 1][0] if n + 1 < len(hits) else len(text)
        sections[key] = text[start:end]
    return sections


def tparser(text):
    """Split raw resume text into keyword-delimited sections.

    Parameters
    ----------
    text : str or None
        Raw text of one resume. ``None`` or an empty string is treated as
        empty text instead of raising.

    Returns
    -------
    dict
        A new dict on every call containing:

        * ``'Category'`` - text of the first pair of adjacent tokens that spaCy
          tags as proper nouns, or ``None`` when there is no such pair.
        * one entry per keyword of ``_SECTION_KEYWORDS`` found in the text
          (newlines turned into spaces, lower-cased). The value starts at the
          keyword itself and ends where the next found keyword starts.

    Notes
    -----
    * Keywords are located by plain substring search, so a keyword appearing
      inside a sentence is treated exactly like a heading.
    * Text without any keyword yields just the ``'Category'`` entry (unpacking
      an empty match list used to raise ``ValueError``).
    * Two cleaning statements that had no effect were removed: a ``str.replace``
      whose search string was a regex-looking literal, and a ``re.sub`` whose
      result was discarded. The section text is therefore unchanged.
    * The spaCy pipeline is loaded once and cached on the function object
      rather than being reloaded for every resume.
    """
    from spacy.matcher import Matcher

    text = text or ""

    nlp = getattr(tparser, "_nlp", None)
    if nlp is None:
        nlp = tparser._nlp = spacy.load('en_core_web_sm')

    # Two consecutive proper-noun tokens; only the first match is used.
    matcher = Matcher(nlp.vocab)
    matcher.add('NAME', [[{'POS': 'PROPN'}, {'POS': 'PROPN'}]])
    doc = nlp(text)
    first_match = next(iter(matcher(doc)), None)
    name = doc[first_match[1]:first_match[2]].text if first_match else None

    result = {'Category': name}
    cleaned = text.replace("\n", " ").lower()
    result.update(_split_sections(cleaned))
    return result

In [12]:
folder_path = 'data/data/data/INFORMATION-TECHNOLOGY'
# NOTE(review): every column starts with one empty placeholder entry, so a table built
# from `data` has a blank first row. Kept as-is in case an already exported CSV was
# produced with that row - confirm before removing.
data = {'Category': [''], 'Skills': [''], 'Education': [''], 'Experience': ['']}
# os.listdir() returns entries in arbitrary order; sort so rows are reproducible.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):
        continue
    file_data = parser.from_file(file_path)
    text = file_data.get('content')
    if not text:
        # Nothing was extracted from this file; parsing it would fail.
        print(f"Skipping {filename}: no text content extracted")
        continue
    parsed_content = tparser(text)
    data['Category'].append(parsed_content.get('Category', ''))
    data['Skills'].append(parsed_content.get('skills', ''))
    data['Education'].append(parsed_content.get('education', ''))
    data['Experience'].append(parsed_content.get('experience', ''))


In [13]:
# One-off export of the parsed resumes; uncomment to regenerate 'parsed_res_it.csv'.
#df_res = pd.DataFrame(data)
#df_res.to_csv('parsed_res_it.csv')
#print('Resume Parsing Done!')
# Extracting Job Descriptions
df_res = pd.read_csv('parsed_res_it.csv')
raw_df = load_dataset('jacob-hugging-face/job-descriptions', split='train')
df_jds = pd.DataFrame(raw_df)
# Shuffle reproducibly and keep 15 postings. The index is renumbered 0..14 so that
# label-based lookups such as df_jds['position_title'][0] address the first kept row
# instead of whichever original row happened to carry label 0 (if any).
df_jds = df_jds.sample(frac=1, random_state=42).head(15).reset_index(drop=True)
print('Job Descriptions Extracted!')

Job Descriptions Extracted!


In [14]:
def cleanJD(text):
    """Normalise free text to lower-case words separated by single spaces.

    Every character that is neither a word character nor whitespace, and every
    underscore, is replaced by a space. The text is then lower-cased, runs of
    whitespace (newlines included) are collapsed to one space, and leading and
    trailing whitespace is stripped.

    Two statements without effect were removed: a ``str.replace`` whose search
    string was a regex-looking literal, and a ``re.sub`` whose result was
    discarded (its non-raw ``'\\W+'`` pattern also triggered an invalid-escape
    warning on recent Python versions).

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text; an empty string if nothing but punctuation and
        whitespace was supplied.
    """
    without_punctuation = re.sub(r'[^\w\s]|_', ' ', text)
    return re.sub(r'\s+', ' ', without_punctuation.lower()).strip()

# Build one cleaned "skills" string per job description.
# The second comma-separated segment of `model_response` is used as the skills text
# (presumably that is where the skills field sits - TODO confirm against the dataset).
skills = []
# Iterate over the rows actually present rather than assuming there are exactly 15.
for parts in df_jds['model_response'].str.split(','):
    if not isinstance(parts, list):
        # Missing / non-string response: keep an empty placeholder so that the
        # list stays aligned with the rows of df_jds.
        skills.append('')
    elif len(parts) > 1:
        skills.append(cleanJD(parts[1]))
    else:
        # No comma in the response: fall back to the whole text instead of raising IndexError.
        skills.append(cleanJD(parts[0]))
df_jds['skills'] = skills
df_jds = df_jds.drop(columns=['job_description', 'model_response', 'description_length'])
df_jds.to_csv('jds.csv', index=False)
print('Job Descriptions Cleaned!')

Job Descriptions Cleaned!


In [15]:

# Similarity Calculation
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# Shuffle reproducibly, then keep the rows at positions 1..21 of the shuffled frame.
df_res = df_res.sample(frac=1, random_state=42).reset_index(drop=True)
df_res = df_res.iloc[1:22]
# A bare `df_res.drop(7)` used to follow here. Its result was never assigned, so it
# removed nothing; it is left out on purpose because really dropping the row would
# change which resumes get scored.
# Only string entries can be embedded (missing skills are not strings); at most 20 resumes are kept.
sentences_res = [s for s in df_res["Skills"].to_list() if isinstance(s, str)][:20]
sentences_jds = [s for s in df_jds["skills"].to_list() if isinstance(s, str)]
print('Similarity Calculation Started!')

Similarity Calculation Started!


In [16]:
def tokenize_sentence(sentences, max_length=512):
    """Tokenize sentences one by one with the module-level `tokenizer` and batch the result.

    Each sentence is truncated / padded to `max_length` tokens. A sentence whose
    tokenization raises is reported on stdout and skipped, so the returned batch
    can be shorter than `sentences`; callers that map rows back to their inputs
    should compare the lengths.

    Parameters
    ----------
    sentences : iterable of str
        Texts to tokenize.
    max_length : int, optional
        Fixed token length of every row (default 512).

    Returns
    -------
    dict
        ``{'input_ids': Tensor, 'attention_mask': Tensor}``, both stacked along
        a new first dimension. When no sentence could be tokenized, both are
        empty integer tensors of shape ``(0, max_length)`` (``torch.stack`` on
        an empty list would raise).
    """
    input_ids = []
    attention_masks = []
    for sentence in sentences:
        try:
            encoded = tokenizer(sentence, max_length=max_length, truncation=True,
                                padding='max_length', return_tensors='pt')
            # Read both tensors before appending so the two lists cannot get out of step.
            ids = encoded['input_ids'][0]
            mask = encoded['attention_mask'][0]
        except Exception as e:
            print(f"Error processing sentence: {sentence}")
            print(f"Error message: {str(e)}")
            continue
        input_ids.append(ids)
        attention_masks.append(mask)

    if not input_ids:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return {'input_ids': empty, 'attention_mask': empty.clone()}

    return {
        'input_ids': torch.stack(input_ids),
        'attention_mask': torch.stack(attention_masks),
    }


: 

In [17]:

tokens_res = tokenize_sentence(sentences_res)
tokens_jds = tokenize_sentence(sentences_jds)

# Inference only: without no_grad() the forward passes record an autograd graph,
# which costs memory and is never used.
with torch.no_grad():
    outputs_res = model(**tokens_res)
    outputs_jds = model(**tokens_jds)
print('Similarity Calculation Done!')

In [None]:
def _mean_pool(last_hidden_state, attention_mask):
    """Average token embeddings over the positions where `attention_mask` is 1.

    The mask gets a trailing dimension and is expanded to the size of
    `last_hidden_state`, so masked-out positions contribute zero to the sum
    over dimension 1. The divisor is clamped to 1e-9 to avoid dividing by zero
    for a row whose mask is all zeros.

    Returns the pooled embeddings as a NumPy array with dimension 1 removed.
    """
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    summed = torch.sum(last_hidden_state * mask, 1)
    counts = torch.clamp(mask.sum(1), min=1e-9)
    # .cpu() is a no-op for CPU tensors and keeps .numpy() working if the model ran elsewhere.
    return (summed / counts).detach().cpu().numpy()


embeddings_res = outputs_res.last_hidden_state
embeddings_jds = outputs_jds.last_hidden_state

attention_mask_res = tokens_res['attention_mask']
attention_mask_jds = tokens_jds['attention_mask']

# One pooled vector per resume / per job description.
mean_pooled_res = _mean_pool(embeddings_res, attention_mask_res)
mean_pooled_jds = _mean_pool(embeddings_jds, attention_mask_jds)

In [None]:
# Score every embedded resume against the first job description.
res = cosine_similarity([mean_pooled_jds[0]], mean_pooled_res)
# .iloc[0] is positional: it names the row whose embedding is mean_pooled_jds[0]
# even when df_jds carries a shuffled (non 0-based) index.
print(f"Job: {df_jds['position_title'].iloc[0]}")

# Rebuild the category list with the same filter that produced sentences_res
# (rows whose Skills value is a string, in frame order) so that score i is
# reported next to the resume it was computed from.
has_skills = df_res['Skills'].map(lambda s: isinstance(s, str))
categs = df_res.loc[has_skills, 'Category'].head(len(sentences_res)).to_list()
if len(categs) != len(res[0]):
    # Happens when tokenization skipped a sentence: rows can no longer be matched safely.
    raise ValueError(
        f"{len(res[0])} similarity scores but {len(categs)} resumes with skills; "
        "cannot align scores with resumes"
    )

data = {'Category': categs, 'Scores': res[0]}
df_scores = pd.DataFrame(data)
df_scores = df_scores.sort_values(by='Scores', ascending=False)
print(df_scores[:5])
df_scores.to_csv('scores.csv', index=False)