# Resume Parsing

### Task 1. Implementation Foundation:

1). The resume parser is extended based on the Resume Parsing.ipynb as shown below.   
2). Additional features including education, languages are implemented. 

### Task 2. Resume Parsing Features - Web Application Development:

1). The web application has function to upload a pdf file.   
2). The web application can extract other informations such as education and languages in addition to skills.  
3). The information can be extracted in structured format and can be downloaded as excel file from web page. 

In [1]:
# Importing necessary libraries
import spacy
import pandas as pd
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
import shutil
import os
from pathlib import Path

In [2]:
# Reading the resume data from a CSV file
df_resume = pd.read_csv("data/resume.csv")

# Filtering the resume data to include only entries in the 'ENGINEERING' category
df_resume = df_resume[df_resume.Category=='ENGINEERING']

In [3]:
# Loading the English language model with spaCy, excluding the named entity recognition component
nlp = spacy.load('en_core_web_sm', exclude = ['ner'])

# Path to the file containing patterns for entity recognition
skill_path = 'data/skills.jsonl'

In [4]:
# Adding an entity ruler to the spaCy pipeline and loading patterns from a file
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_path)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'entity_ruler']

In [5]:
# Defining patterns for recognizing education-related entities and adding them to the entity ruler
patterns = [
    {"label": 'EDUCATION', "pattern": "phd"},
    {"label": 'EDUCATION', "pattern": "chalmers"},
    {"label": 'EDUCATION', "pattern": "university"},
    {"label": 'EDUCATION', "pattern": "msc"},
    {"label": 'EDUCATION', "pattern": "master"},
    {"label": 'EDUCATION', "pattern": "institute"},
    {"label": 'EDUCATION', "pattern": "college"},
]

ruler.add_patterns(patterns)

In [6]:
# Defining patterns for recognizing language-related entities and adding them to the entity ruler
patterns = [
    {"label": 'LANGUAGES', "pattern": "english"},
    {"label": 'LANGUAGES', "pattern": "chinese"},
    {"label": 'LANGUAGES', "pattern": "thai"},
    {"label": 'LANGUAGES', "pattern": "german"},
    {"label": 'LANGUAGES', "pattern": "swedish"},
    {"label": 'LANGUAGES', "pattern": "japanese"},
]

ruler.add_patterns(patterns)

In [7]:
# Define the output directory
output_dir = "./pipeline"
# Ensure the output directory exists and clean it if it does
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
    
# Create the output directory
os.makedirs(output_dir)

# Save the updated pipeline to the directory
nlp.to_disk(output_dir)

In [8]:
def preprocessing(sentence):
    stopwords    = list(STOP_WORDS)
    doc          = nlp(sentence)
    clean_tokens = []
    
    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SYM' and \
            token.pos_ != 'SPACE':
                clean_tokens.append(token.lemma_.lower().strip())
                
    return " ".join(clean_tokens)

In [9]:
random_resume = df_resume.Resume_str.iloc[5]
random_resume[:300]

'         TEST ENGINEERING       Profile    I am seeking the challenging position with a technology company that will allow me to apply and enhance my skills of being Test Engineer to their fullest potential. Possessing over 10 years of experience in the electronics manufacturing and test engineering'

In [10]:
preprocessing(random_resume[:300])

'test engineering profile i seek challenging position technology company allow apply enhance skill test engineer full potential possess 10 year experience electronic manufacturing test engineering'

In [11]:
for i, row in df_resume.iterrows():
    clean_text = preprocessing(row.Resume_str)
    df_resume.at[i, 'Clean_resume'] = clean_text

In [12]:
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category,Clean_resume
1690,14206561,ENGINEERING TECHNICIAN High...,"<div class=""fontsize fontface vmargins hmargin...",ENGINEERING,engineering technician highlights pc operating...
1691,15139979,ENGINEERING ASSISTANT Summary ...,"<div class=""RNA skn-cnt4 fontsize fontface vma...",ENGINEERING,engineering assistant summary knowledgeable en...
1692,44624796,ENGINEERING MANAGER Profile ...,"<div class=""fontsize fontface vmargins hmargin...",ENGINEERING,engineering manager profile dedicated engineer...
1693,54227873,ENGINEERING INTERN Profession...,"<div class=""fontsize fontface vmargins hmargin...",ENGINEERING,engineering intern professional summary a comp...
1694,28005884,ENGINEERING TECHNICIAN Summar...,"<div class=""fontsize fontface vmargins hmargin...",ENGINEERING,engineering technician summary to obtain posit...


In [13]:
spacy_load = spacy.load(output_dir)

In [14]:
# Function for getting information
def get_info(text):
    doc = spacy_load(text)
    skills = []
    education = []
    languages = []
    
    for ent in doc.ents:
        if ent.label_ == 'SKILL':
            skills.append(ent.text)
        elif ent.label_ == 'EDUCATION':
            education.append(ent.text)
        elif ent.label_ == 'LANGUAGES':
            languages.append(ent.text)    
 
    skills = list(set(skills))
    education = list(set(education))
    languages = list(set(languages))

    skills = ", ".join(skills)
    education = ", ".join(education)
    languages = ", ".join(languages)

    return [skills], [education], [languages]

In [16]:
from PyPDF2 import PdfReader

reader = PdfReader("CV_Pepe.pdf")
text = ""
# Iterate over all pages
for page in reader.pages:
    text += page.extract_text()

In [17]:
text = preprocessing(text)

In [18]:
doc = nlp(text)

In [19]:
# Highlight key information

colors = {"SKILL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
colors1 = {"EDUCATION": "linear-gradient(90deg, #0a9cfc, #fc9cc7)"}
colors2 = {"LANGUAGES": "linear-gradient(90deg, #0a3cfc, #fe9cc7)"}

# Merge both color dictionaries
merged_colors = {**colors, **colors1, **colors2}

options = {"colors": merged_colors}

displacy.render(doc, style='ent', options=options)

In [20]:
skill, edu, lang = get_info(text)

In [21]:
extract = pd.DataFrame()

In [22]:
extract = pd.DataFrame()
extract['skills'] = skill
extract['education'] = edu
extract['languages'] = lang


In [23]:
# Save excel file
extract.to_excel("download.xlsx", index=False)  