In [1]:
# This project was run on an Apple Silicon (M1) MacBook, using miniforge3 with this setup:
# conda install seaborn numpy scipy matplotlib scikit-learn xgboost pandas nltk unidecode gensim spacy PyPDF2 spacy-model-en_core_web_lg spacy-model-en_core_web_md
# You only need a few of these packages to run this project; check the imports if you want to save some space.

# Import PyPDF2's PdfReader, used to extract the text of the PDF

In [2]:
from PyPDF2 import PdfReader

# Convert pdf to text

In [3]:
# Open the sample resume and concatenate the extracted text of every page.
pdf_reader = PdfReader("resume-sample.pdf")

# `pages` / `extract_text()` replace the camelCase `numPages` / `getPage()` /
# `extractText()` API, which was deprecated in PyPDF2 2.x and removed in 3.0.
text_resume = "".join(page.extract_text() for page in pdf_reader.pages)

In [4]:
text_resume

' \nRevision: June 2015  \nRESUME SAMPLES  \nPreparing an effective resume is a difficult and time -consuming task.  This handout \ncontains resume examples  that will help you get started.  Different formats and styles \nare used to illustrate the various suggestions and tips contained in the handout, \n"Preparing Your Resume," also avai lable through the Bellevue University Career Services  \nCenter . \n \nRemember, thes e are intended to serve only as examples.  You should modify or change \nas appropriate to customize your resume according to your skills, experience, education, \nand the job you’re applying.    \n \nFor additional guidance or assistance, contact the Career Servic es Center at  \n(402) 557 -7423, (800) 7 56-7920 ext. 7423 or careerservices @bellevue.edu.  \n \n \n \nA Word of Caution: Please don’t be tempted to use one of the \nResume Wizards or Templates that are available online or \nincluded in many word processing programs.  They can be  difficult \nto work with

# Without any preprocessing, let's try to extract some information using spaCy

In [5]:
import spacy
print(spacy.__version__)
# maybe you need to run:
#!python -m spacy download en_core_web_lg

3.4.1


In [6]:
nlp = spacy.load("en_core_web_lg")
doc = nlp(text_resume)

In [7]:
# tokens
print([token for token in doc])

[ 
, Revision, :, June, 2015,  
, RESUME, SAMPLES,  
, Preparing, an, effective, resume, is, a, difficult, and, time, -consuming, task, .,  , This, handout, 
, contains, resume, examples,  , that, will, help, you, get, started, .,  , Different, formats, and, styles, 
, are, used, to, illustrate, the, various, suggestions, and, tips, contained, in, the, handout, ,, 
, ", Preparing, Your, Resume, ,, ", also, avai, lable, through, the, Bellevue, University, Career, Services,  
, Center, ., 
 
, Remember, ,, thes, e, are, intended, to, serve, only, as, examples, .,  , You, should, modify, or, change, 
, as, appropriate, to, customize, your, resume, according, to, your, skills, ,, experience, ,, education, ,, 
, and, the, job, you, ’re, applying, .,    
 
, For, additional, guidance, or, assistance, ,, contact, the, Career, Servic, es, Center, at,  
, (, 402, ), 557, -7423, ,, (, 800, ), 7, 56, -, 7920, ext, ., 7423, or, careerservices, @bellevue.edu, .,  
 
 
 
, A, Word, of, Caution, :, P

In [8]:
# Put the entities spaCy detected into a pandas DataFrame so they are easier to inspect:
# one row per entity, with its text and its predicted label.
import pandas as pd
entities_text = [span.text for span in doc.ents]
entities_label = [span.label_ for span in doc.ents]
dataframe = pd.DataFrame(
    data={'text': entities_text, 'entity': entities_label},
)

In [9]:
dataframe.head(10)

Unnamed: 0,text,entity
0,June 2015,DATE
1,RESUME SAMPLES,ORG
2,Preparing Your Resume,WORK_OF_ART
3,the Bellevue University Career Services,ORG
4,402,CARDINAL
5,557,CARDINAL
6,800,CARDINAL
7,7,CARDINAL
8,Resume Wizards,ORG
9,exa mples,PERSON


In [10]:
# List every entity label that was found, once each, in order of first appearance
entities_label_uniq = list({label: None for label in entities_label})
print(entities_label_uniq)

['DATE', 'ORG', 'WORK_OF_ART', 'CARDINAL', 'PERSON', 'FAC', 'GPE', 'LAW', 'PRODUCT', 'MONEY', 'NORP', 'LANGUAGE', 'PERCENT', 'ORDINAL', 'TIME', 'LOC']


# just looking around...

In [11]:
dataframe.loc[dataframe['entity'] == 'ORG'].head(50)

Unnamed: 0,text,entity
1,RESUME SAMPLES,ORG
3,the Bellevue University Career Services,ORG
8,Resume Wizards,ORG
17,Bachelor of Science,ORG
18,Bellevue University,ORG
21,Computer Information Systems,ORG
23,GPA,ORG
32,Information Systems Analysis and Problem Solving,ORG
35,Researched,ORG
36,Student Intern,ORG


In [12]:
dataframe.loc[dataframe['entity'] == 'LANGUAGE']

Unnamed: 0,text,entity
83,English,LANGUAGE
84,Spanish,LANGUAGE
248,English,LANGUAGE


In [13]:
dataframe.loc[dataframe['entity'] == 'WORK_OF_ART']

Unnamed: 0,text,entity
2,Preparing Your Resume,WORK_OF_ART
161,Training and Development,WORK_OF_ART
358,Mental Illness,WORK_OF_ART


# As even this superficial look shows, the pretrained model is not good at parsing resume information.
# So it is a good idea to apply some preprocessing steps to remove odd symbols and then train a new vocabulary using spaCy.
# To create that vocabulary, we will need a lot of resume examples in which the entities relevant to resumes are labeled manually.
# Let's start with the basics: removing symbols that carry no meaning for this training.

In [14]:
type(text_resume)

str

In [15]:
# let's preprocess the text, as we did before
import re
import string
# Compiled once instead of on every call.
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")
_RE_NON_PRINTABLE = re.compile(f'[^{re.escape(string.printable)}]')


def clean_text(text):
    """Flatten a text to a single line of printable ASCII characters.

    Args:
        text: the raw string (e.g. text extracted from a PDF).

    Returns:
        The text with every run of whitespace (including new lines) replaced
        by a single space, every character outside ``string.printable``
        removed, and no leading or trailing whitespace.
    """
    # Turn new lines and every other whitespace run into a single space first,
    # so that Unicode spaces (e.g. a non-breaking space) still separate words
    # instead of being deleted by the next step.
    text_cleaned = _RE_COMBINE_WHITESPACE.sub(" ", text)

    # remove non printable chars
    text_cleaned = _RE_NON_PRINTABLE.sub('', text_cleaned)

    # Removing a symbol that stood between two spaces (e.g. a bullet) leaves a
    # double space behind, and a leading/trailing symbol leaves a stray space,
    # so collapse and strip again after the removal.
    return _RE_COMBINE_WHITESPACE.sub(" ", text_cleaned).strip()

In [16]:
text_resume_preprocessed = clean_text(text_resume)
doc = nlp(text_resume_preprocessed)

In [17]:
print([ token for token in doc])

[Revision, :, June, 2015, RESUME, SAMPLES, Preparing, an, effective, resume, is, a, difficult, and, time, -consuming, task, ., This, handout, contains, resume, examples, that, will, help, you, get, started, ., Different, formats, and, styles, are, used, to, illustrate, the, various, suggestions, and, tips, contained, in, the, handout, ,, ", Preparing, Your, Resume, ,, ", also, avai, lable, through, the, Bellevue, University, Career, Services, Center, ., Remember, ,, thes, e, are, intended, to, serve, only, as, examples, ., You, should, modify, or, change, as, appropriate, to, customize, your, resume, according, to, your, skills, ,, experience, ,, education, ,, and, the, job, you, re, applying, ., For, additional, guidance, or, assistance, ,, contact, the, Career, Servic, es, Center, at, (, 402, ), 557, -7423, ,, (, 800, ), 7, 56, -, 7920, ext, ., 7423, or, careerservices, @bellevue.edu, ., A, Word, of, Caution, :, Please, do, nt, be, tempted, to, use, one, of, the, Resume, Wizards, or,

# This looks a little better. We can always improve it, of course.
# The next step is to get resume examples and train on that data.
# To annotate the entities, I intend to use Doccano (https://github.com/doccano/doccano).

In [18]:
# But before we train on our own data, let's try the train_data.json from this GitHub repository: https://github.com/laxmimerit/CV-Parsing-using-Spacy-3/tree/master/data/training
# By the way, I recommend watching his video: https://www.youtube.com/watch?v=HJy11kOlgvk

In [3]:
import json

def load_train_data(file):
    """Read the UTF-8 encoded JSON file at ``file`` and return its parsed content."""
    with open(file, mode="r", encoding="utf-8") as handle:
        raw_json = handle.read()
    return json.loads(raw_json)

train_data = load_train_data("./train_data_1.json")

In [20]:
# we have 200 documents for training and evaluation
len(train_data)

200

In [21]:
# Split train_data into 80% training and 20% evaluation examples.
# A fixed random_state makes the shuffle, and therefore the files and model
# built from this split, reproducible across kernel restarts.
from sklearn.model_selection import train_test_split
train, test = train_test_split(train_data, test_size=0.2, random_state=42)

In [22]:
#for item in test:
#    print(item)

["Ashish Indoriya Sr. Systems Engineer at Infosys Limited  Hyderabad, Telangana - Email me on Indeed: indeed.com/r/Ashish- Indoriya/84f99c99ebe940be  • Master of Computer Application (MCA) from Bhilai Institute of Technology, Durg, 2014. • Having 3.3 years of Experience on Software Development at Infosys limited. • Extensive working experience on Java, Spring, Hibernate and SQL • Knowledge of design patterns such as Singleton, Factory, Façade, Observer and MVC. • Knowledge of Front-end web development using JavaScript, JQuery, CSS &amp; HTML. • Having knowledge of Oracle SQL Database. • Reliable as a fully contributing, responsible and accountable member of task/ project teams with highly honed creative, logical and analytical approach. • Automated some of HRMS processes like Hiring, transfer, termination to help speed up the QA process. • Hands on knowledge of C, C++ including advanced concepts such as pointers and Dynamic Memory Management. • Learning Hadoop and Big data analysis usi

In [None]:
# creating python function
from spacy.tokens import DocBin

def save_spacy_file(data,file):
    """Convert annotated examples to a spaCy DocBin and write it to disk.

    Args:
        data: iterable of ``(text, annotations)`` pairs, where
            ``annotations['entities']`` is a list of ``(start, end, label)``
            character offsets into ``text``.
        file: path of the ``.spacy`` file to write.
    """
    nlp = spacy.blank("en")
    db = DocBin()

    for text, annotations in data:
        doc = nlp(text)
        ents = []
        for start, end, label in annotations['entities']:
            span = doc.char_span(start, end, label=label)
            # char_span gives None for some annotations in train_data_1.json
            # (the offsets cannot be turned into a span); skip those entities.
            if span is not None:
                ents.append(span)
        # NOTE(review): spaCy rejects overlapping entity spans on this
        # assignment; if that is the failure seen with this data,
        # spacy.util.filter_spans(ents) is the usual remedy - confirm before use.
        doc.ents = ents
        db.add(doc)
    db.to_disk(file)

In [None]:
save_spacy_file(train, "./train.spacy")
save_spacy_file(test, "./dev.spacy")

In [25]:
# Since my goal here is to study, I will leave this error in place.
# It happens because the text was not preprocessed correctly.
# And we cannot simply clean the text, because the span indexes (start:end) would change.
# So now we have an opportunity to preprocess the text and use Doccano (free and open source) to annotate the entities again.
# I will proceed with the preprocessing.

# Let's create a text file with one document per line to import into Doccano

In [26]:
import json

# Write one cleaned resume per line for Doccano.
# The cleaned text is written directly instead of being assigned back to
# item[0]: the entity offsets stored next to each text refer to the original
# characters, so overwriting the text in train_data would leave those offsets
# pointing at the wrong positions if earlier cells are run again.
with open('train_data_doccano.txt', 'w', encoding='utf-8') as f:
    for item in train_data:
        f.write(clean_text(item[0]) + "\n")

In [None]:
!python -m spacy convert ./train_data_1.json .

# train our model

In [None]:
# We will need a base_config.cfg. I generated one here (it is in the git repository too): https://spacy.io/usage/training#config
# Then fill in the remaining settings to produce config.cfg
!python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
# and finally train the model
#!python -m spacy train config.cfg --paths.train ./train_data_1.spacy --paths.dev ./dev.spacy

In [None]:
#!python -m spacy debug data config.cfg