# Automated Resume Screening Algorithm

### Progress: Complete

#### by Gingoyon, Arvic Micah B.

Specific Progress Notes:
#### 2023/07/26 2:58PM

Now, I will be integrating the results of the following Jupyter Notebooks: 
- Soft Skills
- Tech Skills
- JobExperience

The full algorithm will need the following dependencies for their respective purposes:

#### Data Manipulation
- numpy
- pandas

#### Text Extraction
- re
- fitz (install pymupdf)
- string
- nltk (install nltk)

#### AI
- pickle

#### Synonym Lookup
- wordhoard (install wordhoard)

#### Date Computation
- datetime

The following models need to be available:
- JobIdentifier
- DateIdentifier
- IndustryClassifier
- SoftSkillIdentifier
- SoftSkillClusters

The following datasets need to be available:
- SoftSkills
- TechnologySkills
- HardSkills

# Models and Datasets

In [106]:
#AI Models
def _load_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file afterwards."""
    # SECURITY: unpickling can execute arbitrary code. Only load model files
    # that come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


ji = _load_pickle('JobIdentifier.pkl')  #JobIdentifier
# NOTE(review): this file name uses a lowercase "i" ("Dateidentifier") unlike
# the other model files -- confirm it matches the file on disk.
di = _load_pickle('Dateidentifier.pkl')  #DateIdentifier
ic = _load_pickle('IndustryClassifier.pkl')  #IndustryClassifier
si = _load_pickle('SoftSkillIdentifier.pkl')  #SoftSkillIdentifier
sc = _load_pickle('SoftSkillClusters.pkl')  #SoftSkillClusters

#Datasets
softskills = pd.read_csv('SoftSkills.csv')
str_tolist(softskills) ## Converts the string-type lists when loading the CSV back into lists.
techskills = pd.read_csv('TechnologySkills.csv')
hardskills = pd.read_csv('HardSkills.csv')

# Helper Functions

#### str_tolist(DataFrame):
This can only be used on a DataFrame whose first column holds lists of words that were saved to a CSV file. Converting a DataFrame with lists into a CSV file turns each list into a single string, so this function splits each string on commas and replaces each row's value in that column with the resulting list of words, instead of the original string that is read from the CSV.

*Made specifically during development of Soft Skill Extraction. For Soft Skills dataset only*

In [12]:
def str_tolist(df):
    """Turn the stringified lists in the first column of *df* back into lists.

    Each cell of column 0 is split on commas, and every piece has leading and
    trailing brackets, commas, periods, single quotes and spaces stripped.
    The DataFrame is modified in place; nothing is returned.
    """
    wrapper_chars = "[],.' "
    for row in range(df.shape[0]):
        pieces = df.iloc[row, 0].split(',')
        df.iloc[row, 0] = [piece.strip(wrapper_chars) for piece in pieces]

# Main Functions

In [38]:
def extractApplicantData(filepath):
    """Return the text of the PDF at *filepath* as a single punctuation-free line.

    The text of every page is concatenated, newlines are replaced by spaces
    and every character in ``string.punctuation`` is removed.
    """
    # The context manager closes the document even if a page fails to extract.
    with fitz.open(filepath) as doc:
        text = "".join(page.get_text() for page in doc)

    # Newlines become spaces so words on adjacent lines do not fuse together.
    text = text.replace('\n', ' ')
    text = text.translate(str.maketrans('', '', string.punctuation))
    # A former final step replaced the literal sequence '\n\r-'; it could never
    # match because newlines and '-' are already gone at this point, so it was
    # dropped without changing the result.
    return text

# Job Experience

#### extractJobExperience(text, JobIdentifier, DateIdentifier, IndustryClassifier):

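This will extract the applicant's most likely industries (from the job titles predicted in the text) and an estimate of their total years of experience (from the dates found in the text).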

In [112]:
def extractJobExperience(text, ji, di, ic):
    """Estimate an applicant's top industries and total years of experience.

    Parameters
    ----------
    text : str
        Resume text; it is lowercased before being given to the models.
    ji : object
        Job identifier; ``ji.predict(trigrams)`` must yield 1 for every
        three-word phrase that is a job title.
    di : object
        Date identifier; ``di.predict(words)`` must yield 1 for every word
        that is a date.
    ic : object
        Industry classifier; ``ic.predict(titles)`` must yield one industry
        label per job title.

    Returns
    -------
    list
        A one-element list ``[(top, summary)]``. ``top`` holds up to two
        industry labels, most frequent first (fewer when fewer were found).
        ``summary`` is ``'<n> year/s'`` or ``'Less than a year experience'``.
    """
    # The models were built for lowercase text.
    words = text.lower().split()

    ##Extracting predicted job titles using trigrams
    # range(len - 2) so that the final three words also form a trigram.
    trigrams = [' '.join(words[i:i + 3]) for i in range(len(words) - 2)]
    titles = []
    if trigrams:  # do not hand the model an empty batch
        title_flags = ji.predict(trigrams)
        titles = [gram for gram, flag in zip(trigrams, title_flags) if flag == 1]

    ##Classifying them to industries
    top = []
    if titles:
        preds = pd.DataFrame(ic.predict(titles))
        preds.columns = ['Labels']
        preds['Counts'] = 1
        industries = (
            preds.groupby('Labels')
            .count()
            .sort_values('Counts', ascending=False)
            .reset_index()
        )
        # head(2) tolerates resumes that yield fewer than two industries.
        top = industries['Labels'].head(2).tolist()

    ##Identifying dates within the text
    dates = []
    if words:
        for word, flag in zip(words, di.predict(words)):
            if flag != 1:
                continue
            if word in ('present', 'current'):
                dates.append(datetime.date.today().year)
                continue
            found = re.findall(r'\d{4}', word)
            # 0 marks a date word without a four-digit year. Exactly one
            # entry is recorded per date word.
            dates.append(int(found[0]) if found else 0)

    ##Computing total years from gathered dates
    # Each increasing pair of neighbouring dates counts as one employment span.
    years = 0
    for start, end in zip(dates, dates[1:]):
        if start != 0 and start < end:
            years += end - start

    ##Outputting results
    if years >= 1:
        return [(top, f'{years} year/s')]
    return [(top, 'Less than a year experience')]

# Soft Skills

#### extractSoftSkills(text, SoftSkillCorpus, SoftSkillIdentifier, SoftSkillClusters):

In [47]:
# Words dropped before soft-skill classification.
_SOFT_SKILL_STOP_WORDS = frozenset([
    'i', 'me', 'skills', 'ability', 'skill', 'abilities', 'my', 'myself',
    'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself',
    'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
    'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs',
    'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these',
    'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have',
    'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the',
    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
    'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
    'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
    't', 'can', 'will', 'just', 'don', 'should', 'now',
])

# Cluster id returned by the cluster model -> soft-skill category name.
_SOFT_SKILL_CLUSTER_NAMES = {
    0: 'Communication/Teamwork',
    1: 'Decision-Making',
    2: 'Creativity/Innovation',
    3: 'Manners/Courtesy',
    4: 'Adaptability/Versatility',
    5: 'Confidence/Optimism',
    6: 'Efficiency/Well-Organized',
    7: 'Leadership/Accountability',
    8: 'Control/Discipline',
}


def extractSoftSkills(text, scorpus, si, sc):
    """Return the soft-skill categories detected in *text*.

    Parameters
    ----------
    text : str
        Resume text; it is lowercased before processing.
    scorpus : pandas.DataFrame
        Soft-skill corpus whose first column holds one list of words per row.
    si : object
        Soft-skill identifier; ``si.predict(words)`` must yield 1 for every
        word that is a soft skill.
    sc : object
        Cluster model; ``sc.predict(strings)`` must yield one cluster id
        (0-8) per string of stemmed synonyms.

    Returns
    -------
    list
        Category names without duplicates, in the order their clusters first
        appear in the prediction. Empty when no soft skill from the corpus is
        found in the text.
    """
    non_letters = re.compile("[^A-Z]", re.IGNORECASE)

    ##Extract soft skills from text
    # Every word is considered, including the last one. Words that are empty
    # after removing non-letters, or that are stop words, are skipped.
    grams = []
    for token in text.lower().split():
        word = non_letters.sub("", token)
        if word and word not in _SOFT_SKILL_STOP_WORDS:
            grams.append(word)
    if not grams:
        return []

    slist = []
    for word, flag in zip(grams, si.predict(grams)):
        if flag == 1 and word not in slist:
            slist.append(word)

    ##Getting Synonyms from Corpus
    # A row whose first word equals the skill is preferred; otherwise any row
    # containing the skill is used. When several rows qualify the last one
    # wins, which the dict overwrites reproduce.
    by_first_word = {}
    by_any_word = {}
    for row in scorpus.iloc[:, 0].tolist():
        if not row:
            continue
        by_first_word[row[0]] = row
        for member in row:
            by_any_word[member] = row

    skills = []
    for s in slist:
        # Looked up independently for every skill, so one skill's synonyms
        # can never be reused for the next one.
        synonyms = by_first_word.get(s) or by_any_word.get(s)
        if synonyms:  # skills missing from the corpus give the model nothing
            skills.append(synonyms)
    if not skills:
        return []

    ##Stemming
    # The exact string layout (leading space, substring de-duplication) is
    # kept unchanged because these strings are the cluster model's input.
    # NOTE(review): presumably the model was fitted on strings built the same
    # way -- confirm before altering this step.
    sno = nltk.stem.SnowballStemmer('english')
    stemmed = []
    for synonyms in skills:
        stems = ''
        for member in synonyms:
            stem = sno.stem(member)
            if stem not in stems:
                stems += ' ' + stem
        if stems not in stemmed:
            stemmed.append(stems)

    ##Predicting with Cluster model and interpreting the results
    output = []
    for cluster in sc.predict(stemmed):
        name = _SOFT_SKILL_CLUSTER_NAMES.get(cluster)
        if name is not None and name not in output:
            output.append(name)

    return output

# Hard Skills and Tech Skills

#### extractTechSkills(text, HardSkills, TechSkills, SoftSkillIdentifier):

In [97]:
def _collect_skill_mentions(names, text, accept):
    """Return the entries of *names* that occur in *text* and pass *accept*.

    An entry that contains the previously kept entry replaces it, and an
    entry contained in the previously kept entry is skipped. Only the most
    recently kept entry is compared.
    """
    found = []
    for name in names:
        # Non-string cells (e.g. NaN from an empty CSV field) cannot be
        # searched for in the text.
        if not isinstance(name, str) or name not in text or not accept(name):
            continue
        if found and found[-1] in name:
            found[-1] = name
        elif found and name in found[-1]:
            continue
        else:
            found.append(name)
    return found


def extractTechSkills(text, hs, ts, si):
    """Return the hard skills and technology skills mentioned in *text*.

    Parameters
    ----------
    text : str
        Resume text; matching is a case-sensitive substring search.
    hs : pandas.DataFrame
        Hard-skill names in the first column. Only names of more than one
        word that are not also listed in *ts* are considered.
    ts : pandas.DataFrame
        Technology-skill names in the first column.
    si : object
        Soft-skill identifier; names for which ``si.predict`` yields anything
        other than 0 are removed from the result.

    Returns
    -------
    tuple
        ``(hard, tech)``. Each item is a list of names, or the string
        ``'No Specific Hard Skills.'`` / ``'No Specific Software Skills.'``
        when the respective list is empty.
    """
    #I added the awkward list of tech skills here because they almost always pop up even though they're not actually there.
    #C++ can be detected but C will be difficult. This may be the reason why LinkedIn also cannot input C as a skill.
    false_positives = ('R', 'C', 'J', 'SKILL', 'Dig')

    tech_names = ts.iloc[:, 0].tolist()
    # Built once, instead of once per hard-skill row.
    tech_lookup = set(tech_names)

    ##Extracting Hard Skills
    hskills = _collect_skill_mentions(
        hs.iloc[:, 0].tolist(),
        text,
        lambda name: bool(name) and name not in tech_lookup and len(name.split()) > 1,
    )

    ##Extracting Tech Skills
    tskills = _collect_skill_mentions(
        tech_names,
        text,
        lambda name: name not in false_positives,
    )

    ##Removing Soft Skills detected.
    # The identifier is only called for non-empty batches.
    outh = []
    if hskills:
        outh = [name for name, label in zip(hskills, si.predict(hskills)) if label == 0]
    outt = []
    if tskills:
        outt = [name for name, label in zip(tskills, si.predict(tskills)) if label == 0]

    return (
        outh or 'No Specific Hard Skills.',
        outt or 'No Specific Software Skills.',
    )

# Testing the Functions

In [107]:
import numpy as np
import pandas as pd
import re
import fitz
import string
import nltk
import pickle
from wordhoard import Synonyms
import datetime

In [108]:
# Extract the text of the sample resume once; the calls in the following cells reuse it.
test1 = extractApplicantData('resume7.pdf')

In [109]:
# Job-experience extraction on the sample resume text, using the loaded job, date and industry models.
extractJobExperience(test1, ji, di, ic)

[(['CUSTOMER SERVICE', 'ACCOUNTING & FINANCE'], 'Less than a year experience')]

In [110]:
# Soft-skill extraction on the sample resume text, using the soft-skill corpus, identifier and cluster model.
extractSoftSkills(test1, softskills, si, sc)

['Efficiency/Well-Organized',
 'Creativity/Innovation',
 'Confidence/Optimism',
 'Adaptability/Versatility',
 'Communication/Teamwork',
 'Leadership/Accountability']

In [111]:
# Hard-skill and tech-skill extraction on the sample resume text; the result is a (hard skills, tech skills) pair.
extractTechSkills(test1, hardskills, techskills, si)

(['Customer Service'],
 ['Cin7',
  'Mi9 Merchant',
  'Netsuite',
  'ProsperWorks CRM',
  'Freshsales',
  'Pipedrive'])