In [1]:
# Export requirements.txt for only the packages used in this notebook
# pip install pipreqs
# !pipreqs --force .
from sklearnex import patch_sklearn
patch_sklearn()
import pandas as pd, glob, os, sys, win32com.client, pythoncom, numpy as np, re, seaborn as sns, matplotlib.pyplot as plt, warnings, time, pickle
from filesplit.split import Split
from filesplit.merge import Merge

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, HalvingRandomSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier, VotingClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
from lightgbm import LGBMClassifier

warnings.filterwarnings("ignore")
set_config(display='diagram')
# dir_path = os.path.dirname(os.path.realpath(sys.argv[0])) + "\\"
dir_path = os.getcwd() + "\\"

def set_options(xl, option):
    try:
        xl.Visible = option
        xl.ScreenUpdating = option
        xl.DisplayAlerts = option
        xl.EnableEvents = option
    except: 
        pass

def convert(file):
    file = os.path.splitext(file)[0]
    print('Converting {}...'.format(file))
    xl=win32com.client.Dispatch("Excel.Application",pythoncom.CoInitialize())
    set_options(xl, False)
    
    # check if file with .csv exists
    if not os.path.isfile(dir_path + file + '.csv'):
        try:
            wb = xl.Workbooks.Open(Filename= dir_path + file + '.xlsx',ReadOnly=1)
        except:
            wb = xl.Workbooks.Open(Filename= dir_path + file + '.xlsb',ReadOnly=1)  
        xl.ActiveWorkbook == wb
        
        #6 means csv
        wb.SaveAs(Filename= dir_path + file + '.csv', FileFormat='6') 
        set_options(xl, True)
        wb.Close(True)
        xl.Application.Quit()
        wb=xl=None
    
    df = pd.read_csv(dir_path + file + '.csv', low_memory = False, encoding='ISO-8859-1')
    df.to_pickle(dir_path + file + '.pkl')
    
    return file + '.pkl'

def read(i):
    try: 
        pkl = glob.glob(os.path.join('*{0}*.pkl'.format(i)))[0]
    except: 
        pkl = convert(glob.glob(os.path.join('*{0}*'.format(i)))[0])
    df = pd.read_pickle(dir_path + pkl)
    return df

# Custom function to store models
def pickle_split(filename, model):
    file = filename + '.mgz'
    pickle.dump(model, open(file, 'wb'))

    # Check file size
    print('File Size: {:.2f} {}'.format(os.path.getsize(file)/1000000, 'MB'))

    # Split stack.mgz into 99 MB parts using filesplit
    split = Split(inputfile=file, outputdir='.')
    split.manfilename = filename
    split.bysize(size=100000000)
    os.remove(file)

def merge_pickle(filename):
    # Merge the files back together
    filepath = filename + '.mgz'
    merge = Merge(inputdir='.', outputdir='.', outputfilename=filepath)
    merge.manfilename = filename
    merge.merge()

    # Load the model
    file = pickle.load(open(filepath, 'rb'))
    os.remove(filepath)
    return file

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chinj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\chinj\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
import string
import spacy
from spacy import displacy
# Set up the environment to display the graphical outputs
import pyLDAvis
import pyLDAvis.sklearn

  from imp import reload


In [3]:
# Pickle dataset, delete resume.pkl if you are facing issues
i = "Resume"
df = read(i)

df.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [4]:
# Remove Resume_html and ID
df.drop(['Resume_html', 'ID'], axis=1, inplace=True)
df.columns = ['text', 'label']

df.head()

Unnamed: 0,text,label
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR
1,"HR SPECIALIST, US HR OPERATIONS ...",HR
2,HR DIRECTOR Summary Over 2...,HR
3,HR SPECIALIST Summary Dedica...,HR
4,HR MANAGER Skill Highlights ...,HR


In [5]:
# Clean text
df['text'] = [re.sub('</?([a-z]+)>', ' ', x) for x in df['text']] 
df['text'] = [re.sub('<[^>]*>', '', x) for x in df['text']] 
df['text'] = [re.sub('\s+', ' ', x) for x in df['text']] 
df['text'] = [x.lower() for x in df['text']] # lowercase
df['text'] = [re.sub('\/', ' ', x) for x in df['text']] # I added this to remove slashes because I see "administrator/marketing"
df.head()

  df['text'] = [re.sub('\s+', ' ', x) for x in df['text']]
  df['text'] = [re.sub('\/', ' ', x) for x in df['text']] # I added this to remove slashes because I see "administrator/marketing"


Unnamed: 0,text,label
0,hr administrator marketing associate hr admin...,HR
1,"hr specialist, us hr operations summary versa...",HR
2,hr director summary over 20 years experience ...,HR
3,"hr specialist summary dedicated, driven, and ...",HR
4,hr manager skill highlights hr skills hr depa...,HR


In [6]:
#  trained pipeline for the English language. 
# It is optimized for the CPU and contains components like ok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer. 
# Its file size is 741 MB as compared to en_core_web_md whose size is only 13 MB.  
# It is the largest English model in size provided by Spacy.
spacy.cli.download("en_core_web_lg")

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [7]:
nlp = spacy.load("en_core_web_lg")
skill_pattern_path = "convertjson.jsonl"

In [8]:
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)
nlp.remove_pipe('lemmatizer') 
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'ner', 'entity_ruler']

In [9]:
# unique job categories
patterns= df.label.unique()
for a in patterns:
    ruler.add_patterns([{"label":"Job-Category", "pattern": a}])

In [10]:
ruler.add_patterns([{"label": "Job-Category", "pattern": a}])
# options=[{"ents": "Job-Category", "colors": "#ff3232"},{"ents": "SKILL", "colors": "#56c426"}]
colors = {
    "Job-Category": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "SKILL": "linear-gradient(90deg, #9BE15D, #00E3AE)",
    "ORG": "#ffd966",
    "PERSON": "#e06666",
    "GPE": "#9fc5e8",
    "DATE": "#c27ba0",
    "ORDINAL": "#674ea7",
    "PRODUCT": "#f9cb9c",
}
options = {
    "ents": [
        "Job-Category",
        "SKILL",
        "ORG",
        "PERSON",
        "GPE",
        "DATE",
        "ORDINAL",
        "PRODUCT",
    ],
    "colors": colors,
}
sent = nlp(df["text"].iloc[5])
displacy.render(sent, style="ent", jupyter=True, options=options)

  from IPython.core.display import display, HTML


In [11]:
def get_skills(text):
    doc = nlp(text)
    myset = []
    subset = []
    for ent in doc.ents:
        if ent.label_ == "SKILL":
            subset.append(ent.text)
    myset.append(subset)
    return subset


def unique_skills(x):
    return list(set(x))

In [12]:
# add unique skills column for each resume
df["skills"] = df["text"].str.lower().apply(get_skills)
df["skills"] = df["skills"].apply(unique_skills)
df.head()

Unnamed: 0,text,label,skills
0,hr administrator marketing associate hr admin...,HR,"[advertising, documentation, data analysis, se..."
1,"hr specialist, us hr operations summary versa...",HR,"[adobe photoshop, material, software, advertis..."
2,hr director summary over 20 years experience ...,HR,"[box, database, advertising, monitoring, secur..."
3,"hr specialist summary dedicated, driven, and ...",HR,"[database, documentation, monitoring, process ..."
4,hr manager skill highlights hr skills hr depa...,HR,"[box, data center, business, support, business..."


In [13]:
# for each job category, get skills, count frequency, and get the top 10 skills
top10skills = pd.DataFrame(df['label'].unique(), columns= ['Label'])
top10skills['Top 10 Skills'] = ''
top10skills['Top 10 Skills'] = top10skills['Top 10 Skills'].apply(list)
row= 0

for label in df['label'].unique():
    skills= []
    df2= pd.DataFrame(df['skills'][df['label']==label]).reset_index(drop=True)
    print(label)
    
    for i in range(0, len(df2)):
        skills += df2['skills'][i]
        freq_skills = nltk.FreqDist(skills)

    print(freq_skills.most_common(10))  

    for j in range(0, len(freq_skills.most_common(10))):
        top10skills.at[row, 'Top 10 Skills'].append(freq_skills.most_common(10)[j][0])
      
    row += 1
top10skills

HR
[('business', 82), ('support', 71), ('database', 42), ('documentation', 37), ('project management', 32), ('marketing', 28), ('software', 28), ('business administration', 27), ('design', 24), ('communications', 23)]
DESIGNER
[('design', 95), ('business', 49), ('software', 44), ('support', 40), ('marketing', 38), ('engineering', 26), ('graphic design', 26), ('material', 23), ('advertising', 23), ('testing', 23)]
INFORMATION-TECHNOLOGY
[('support', 108), ('software', 103), ('business', 84), ('security', 82), ('windows', 71), ('server', 71), ('design', 69), ('project management', 58), ('database', 57), ('documentation', 49)]
TEACHER
[('support', 50), ('business', 32), ('material', 21), ('schedule', 20), ('design', 19), ('play', 19), ('testing', 19), ('certificate', 16), ('collaboration', 15), ('documentation', 15)]
ADVOCATE
[('support', 72), ('business', 50), ('database', 35), ('documentation', 33), ('monitoring', 28), ('schedule', 28), ('payments', 27), ('software', 27), ('marketing', 

Unnamed: 0,Label,Top 10 Skills
0,HR,"[business, support, database, documentation, p..."
1,DESIGNER,"[design, business, software, support, marketin..."
2,INFORMATION-TECHNOLOGY,"[support, software, business, security, window..."
3,TEACHER,"[support, business, material, schedule, design..."
4,ADVOCATE,"[support, business, database, documentation, m..."
5,BUSINESS-DEVELOPMENT,"[business, marketing, support, software, desig..."
6,HEALTHCARE,"[support, business, marketing, documentation, ..."
7,FITNESS,"[business, support, marketing, schedule, desig..."
8,AGRICULTURE,"[business, support, design, monitoring, softwa..."
9,BPO,"[support, business, monitoring, documentation,..."


In [None]:
print(list(top10skills['Top 10 Skills'][top10skills['Label']=='HR']))