In [509]:
import  pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
import spacy

# to download spacy, go to terminal and enter these two lines:
# pip install spacy
# spacy download en

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

import time # measures time elapsed in order to run a function or loop

In [510]:
df = pd.read_csv('resume_dataset.csv')

In [511]:
############################################
# Reducing the size of the dataset to make it more manageable (reduce computation times while developing)
# Comment code when model is ready.
df = df[:200]
############################################

In [512]:
# df.head()

In [513]:
categories = ['HR', 'Designing', 'Managment', 'Information Technology',
       'Education', 'Advocate', 'Business Development',
       'Health & Fitness', 'Agricultural', 'BPO', 'Sales', 'Consultant',
       'Digital Media', 'Automobile', 'Food & Beverages', 'Finance',
       'Apparel', 'Engineering', 'Accountant', 'Building & Construction',
       'Architects', 'Public Relations', 'Banking', 'Arts', 'Aviation']

In [514]:
# bar graph showing how many resumes are in each category
# df.groupby('Category').count()['ID'].plot.bar()

In [515]:
cleaned_df = df.drop('ID', axis=1)

In [516]:
# cleaned_df.head()

In [517]:
cleaned_df['Resume'] = df['Resume'].apply(lambda row: row[2:-1])

In [518]:
# cleaned_df.head()

In [519]:
# \\x --> \x (so that regex doesn't count it as an escape sequence)
# \w --> any non whitespace character

# \\x\w\w matches \x--, where - is not a whitespace character
cleaned_df['split_resume'] = cleaned_df['Resume'].apply(
    lambda x: [re.sub(r'\\x\w\w', '', i) for i in x.split('\\n') if len(re.sub(r'\\x\w\w', '', i).strip()) > 0])

In [520]:
# change everything to lowercase in split_resume column
for i in range(len(cleaned_df['split_resume'])):
    cleaned_df['split_resume'][i] = [x.lower() for x in cleaned_df['split_resume'][i]]

In [521]:
# cleaned_df.head()

In [522]:
# cleaned_df['split_resume'][20]

In [523]:
# EMPLOYMENT HISTORY
# CAREER HISTORY
# Professional Experience:
# PROFESSIONAL EXPERIENCE
# CAREER HISTORY
# Project Experience:
# PROFESSIONAL CAREER SERVICES EXPERIENCE

In [524]:
# \\x --> \x (so that regex doesn't count it as an escape sequence)
# \w --> any non whitespace character

# \\x\w\w matches \x--, where - is not a whitespace character
markers = re.compile('professional|career|project|employment|work? employment|experience|history', re.IGNORECASE)

cleaned_df['better_split'] = cleaned_df['Resume'].apply(
    lambda x: [[m for m in markers.split(re.sub(r'\\x\w\w', '', i).strip()) if len(m) > 0] for i in x.split('\\n') if len(re.sub(r'\\x\w\w', '', i).strip()) > 0])

In [525]:
# print(cleaned_df['better_split'][20])

In [526]:
# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
# NOTE: this loop takes A LONG TIME to run. Be patient!
nlp = spacy.load('en', disable=['parser', 'ner'])



#initializes new column 'lemmatized'
cleaned_df['lemmatized'] = cleaned_df['split_resume']

#starts measuring time for loop (this isn't necessary for code function, just to see how long it takes)
start = time.time()



for res in range(len(cleaned_df['split_resume'])):
    for i in range(len(cleaned_df['split_resume'][res])):
        doc = nlp(cleaned_df['split_resume'][res][i])
        lemmatized = " ".join([token.lemma_ for token in doc])
        cleaned_df['lemmatized'][res][i] = lemmatized

end = time.time() #ends time
print('this loop took:',end - start,'seconds to run')

#let's take a look at an example of one row that was lemmatized:
# cleaned_df['lemmatized'][20]

this loop took: 54.913658142089844 seconds to run


In [527]:
# remove stopped words. basically removing irrelevant words such as "and" or "to"
# NOTE: this loop takes about 8 seconds to run
from nltk.corpus import stopwords 
cachedStopWords = stopwords.words("english")
start = time.time()
cleaned_df['lemmatized+stopped'] = cleaned_df['lemmatized']
for res in range(len(cleaned_df['split_resume'])):
    for i in range(len(cleaned_df['lemmatized'][res])):
        text = cleaned_df['lemmatized'][res][i]
        cleaned_df['lemmatized+stopped'][res][i] = " ".join([word for word in text.split() if word not in cachedStopWords and word!='-PRON-'])

end = time.time()
print('this loop took:',end - start,'seconds to run')
# cleaned_df['lemmatized+stopped'][20]


this loop took: 1.2614350318908691 seconds to run


In [528]:
# sentences are group of words that symbolize leadership words
'''
sentences = [    
"Lead",
"Pioneer",
"Spearhead",
"Head",
"Pilot",
"Transform",
"Revitalize",
"Optimize",
"Modernize",
"Negotiate",
"Convince",
"Ignite",
"Gained",
"Prompt",
"Mobilize",
"Spur",
"Propel",
"Coach",
"Focus",
"Support",
"Shape",
"Train",
"Motivate",
"Uplift",
"Advocate",
"Unite",
"Galvanize",
"Organize",
"Energize",
"Focus",
"Accelerate",
"Collaborate",
"Administer",
"Built",
"Direct",
"Delegate",
"Empower",
"Drive",
"Engineer",
"Guide",
"Implement",
"Initiate",
"Execute",
"Instruct",
"Launch",
"Mediate",
"Facilitate",
"Formulate",
"Enhance",
"Consolidate",
"Conduct",
"Oversaw",
"Budget",
"Listen",
"Coalesce",
"Diversify",
"Merge",
"Embrace",
"Partner",
"Volunteer",
"Enable",
"Inspire",
"Foster",
"Win",
"Orchestrate",
"Host",
"Cultivate",
"Innovate",
"Manage",
"Handle",
"Develop",
"Raise",
"Persuade",
"Influence",
"Correspond",
"Field",
"Convey",
"Promote",
"Consult",
"Clarify",
"Apologize",
"Inform",
"Author",
"Compose",
"Advance",
"Enact",
"Expedite",
"Generate",
"Improve",
"Lift",
"Maximize",
"Produce",
"Stimulate",
"Boost",
"Deliver",
"Expand",
"Discipline",
"Outpace",
"Amplify",
"Complete",
]
'''

# i made a quick short list of leadership words for simplification. we can change this later
words_original = ['Lead','Led','Instruct','Develop','Conduct','Motivate',
         'Influence','Organize','Spearhead','Innovate','Deliver',
         'Plan','Analyze','Manage','Approve','Administer','Attain',
        'Coordinate']

words = [x.lower() for x in words_original] #make all words in words_original lowercase
words

['lead',
 'led',
 'instruct',
 'develop',
 'conduct',
 'motivate',
 'influence',
 'organize',
 'spearhead',
 'innovate',
 'deliver',
 'plan',
 'analyze',
 'manage',
 'approve',
 'administer',
 'attain',
 'coordinate']

In [529]:
#textexample = ['lead rate','hello','purchase bought 11 boost Organization profit','innovate random','junk',
# '10','nothing','tried Coordinate','20 Develop']
textexample = cleaned_df['lemmatized+stopped'][0]
cleaned_df.head()

Unnamed: 0,Category,Resume,split_resume,better_split,lemmatized,lemmatized+stopped
0,HR,"John H. Smith, P.H.R.\n800-991-5187 | PO Box 1...","[john h. smith , p.h.r ., 800 - 991 - 5187 | p...","[[John H. Smith, P.H.R.], [800-991-5187 | PO B...","[john h. smith , p.h.r ., 800 - 991 - 5187 | p...","[john h. smith , p.h.r ., 800 - 991 - 5187 | p..."
1,HR,Name Surname\nAddress\nMobile No/Email\nPERSON...,"[name surname, address, mobile / email, person...","[[Name Surname], [Address], [Mobile No/Email],...","[name surname, address, mobile / email, person...","[name surname, address, mobile / email, person..."
2,HR,Anthony Brown\nHR Assistant\nAREAS OF EXPERTIS...,"[anthony brown, hr assistant, area expertise, ...","[[Anthony Brown], [HR Assistant], [AREAS OF EX...","[anthony brown, hr assistant, area expertise, ...","[anthony brown, hr assistant, area expertise, ..."
3,HR,www.downloadmela.com\nSatheesh\nEMAIL ID:\nCar...,"[www.downloadmela.com, satheesh, email would :...","[[www.downloadmela.com], [Satheesh], [EMAIL ID...","[www.downloadmela.com, satheesh, email would :...","[www.downloadmela.com, satheesh, email would :..."
4,HR,HUMAN RESOURCES DIRECTOR\n\xef\x82\xb7Expert i...,"[human resources director, expert organization...","[[HUMAN RESOURCES DIRECTOR], [Expert in organi...","[human resources director, expert organization...","[human resources director, expert organization..."


In [530]:
# function assigns y val as 1 or 0 depending on if any of the leadership words are in that resume line 
# creates a new dataframe with a y column with array of binary numbers in each row
cleaned_df['y'] = cleaned_df['lemmatized+stopped']
for res in range(len(cleaned_df['lemmatized+stopped'])):
    cleaned_df['y'][res] = [int(any(ele in x for ele in words)) for x in cleaned_df['lemmatized+stopped'][res]]

In [531]:
#from sklearn.model_selection import train_test_split
#X_train_raw, X_test_raw, y_train, y_test = train_test_split(cleaned_df['lemmatized+stopped'], cleaned_df['y'], test_size=0.30)
#X_train_raw

In [532]:
cleaned_df['mask'] = [np.random.uniform(0,1)  for k in df.index]
train = cleaned_df[cleaned_df['mask'] < 0.7]
test = cleaned_df[cleaned_df['mask']>= 0.7]
train

Unnamed: 0,Category,Resume,split_resume,better_split,lemmatized,lemmatized+stopped,y,mask
0,HR,"John H. Smith, P.H.R.\n800-991-5187 | PO Box 1...","[john h. smith , p.h.r ., 800 - 991 - 5187 | p...","[[John H. Smith, P.H.R.], [800-991-5187 | PO B...","[john h. smith , p.h.r ., 800 - 991 - 5187 | p...","[john h. smith , p.h.r ., 800 - 991 - 5187 | p...","[0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, ...",0.446376
1,HR,Name Surname\nAddress\nMobile No/Email\nPERSON...,"[name surname, address, mobile / email, person...","[[Name Surname], [Address], [Mobile No/Email],...","[name surname, address, mobile / email, person...","[name surname, address, mobile / email, person...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",0.628605
3,HR,www.downloadmela.com\nSatheesh\nEMAIL ID:\nCar...,"[www.downloadmela.com, satheesh, email would :...","[[www.downloadmela.com], [Satheesh], [EMAIL ID...","[www.downloadmela.com, satheesh, email would :...","[www.downloadmela.com, satheesh, email would :...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.223760
5,HR,"John H. Smith, P.H.R.\n800-991-5187 | PO Box 1...","[john h. smith , p.h.r ., 800 - 991 - 5187 | p...","[[John H. Smith, P.H.R.], [800-991-5187 | PO B...","[john h. smith , p.h.r ., 800 - 991 - 5187 | p...","[john h. smith , p.h.r ., 800 - 991 - 5187 | p...","[0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, ...",0.464949
6,HR,Resume of Satheesh\n\nwww.downlo\nSatheesh\n\n...,"[resume satheesh, www.downlo, satheesh, experi...","[[Resume of Satheesh], [www.downlo], [Satheesh...","[resume satheesh, www.downlo, satheesh, experi...","[resume satheesh, www.downlo, satheesh, experi...","[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, ...",0.350070
...,...,...,...,...,...,...,...,...
193,Information Technology,"JOHN H. SMITH\nP.O. Box 1673\x01 Callahan, FL ...","[john h. smith, p.o . box 1673 callahan , fl 3...","[[JOHN H. SMITH], [P.O. Box 1673 Callahan, FL ...","[john h. smith, p.o . box 1673 callahan , fl 3...","[john h. smith, p.o . box 1673 callahan , fl 3...","[0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, ...",0.698802
195,Information Technology,Steve Pomeroy\nResume\nEducation\nFall 2000\xe...,"[steve pomeroy, resume, education, fall 2000fa...","[[Steve Pomeroy], [Resume], [Education], [Fall...","[steve pomeroy, resume, education, fall 2000fa...","[steve pomeroy, resume, education, fall 2000fa...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, ...",0.687473
196,Information Technology,RAMI XXXXX\nrrXXXXXXr@gmail.com\nCell: +91-9XX...,"[rami xxxxx, rrxxxxxxr@gmail.com, cell : +91 -...","[[RAMI XXXXX], [rrXXXXXXr@gmail.com], [Cell: +...","[rami xxxxx, rrxxxxxxr@gmail.com, cell : +91 -...","[rami xxxxx, rrxxxxxxr@gmail.com, cell : +91 -...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, ...",0.061995
197,Information Technology,Create a Resume in minutes with this template!...,"[create resume minute template !, build resume...",[[Create a Resume in minutes with this templat...,"[create resume minute template !, build resume...","[create resume minute template !, build resume...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...",0.041747


In [533]:
train.reset_index(drop=True, inplace=True)
#X_train_preprocessed = train['lemmatized+stopped'][0]
X_train_preprocess = []
for i in range(len(train)):
    X_train_preprocess = X_train_preprocess+train['lemmatized+stopped'][i]

In [534]:
test.reset_index(drop=True, inplace=True)
#X_train_preprocessed = train['lemmatized+stopped'][0]
X_test_preprocess = []
for i in range(len(test)):
    X_test_preprocess = X_test_preprocess+test['lemmatized+stopped'][i]

In [535]:
y_train = []
for i in range(len(train)):
    y_train = y_train + train['y'][i]

In [536]:
y_test = []
for i in range(len(test)):
    y_test = y_test + test['y'][i]

In [545]:
'''
def label_sentence(sentence):
    for x in sentence:
        if x in words:
            return 1
    return 0
'''

cv = CountVectorizer(
    min_df = 0.01,
    max_df = 0.95,
)
X_train = cv.fit_transform(X_train_preprocess)
X_test = cv.fit_transform(X_test_preprocess)

#y = label_sentence(lemmatizedandstopped)
#y_train = [int(any(ele in x for ele in words)) for x in textexample]
#func assigns y val as t/f depend

In [546]:
lr_bow_classifier = LogisticRegression()
lr_bow_classifier.fit(X_train,y_train)

sorted(list(zip(cv.get_feature_names(), lr_bow_classifier.coef_[0])), key=lambda x: -x[1])



[('new', 6.803431630737171),
 ('office', 6.35779645703125),
 ('development', 6.139544898569746),
 ('develop', 6.116127594373932),
 ('manager', 5.9814715408475685),
 ('management', 5.496303423653753),
 ('university', 1.1926404105274806),
 ('sale', 0.9490487233060095),
 ('web', 0.9237376980171101),
 ('job', 0.9201800191078372),
 ('support', 0.9063941504204588),
 ('business', 0.8896479250689718),
 ('provide', 0.8153974562138289),
 ('professional', 0.765239040483174),
 ('lead', 0.7305241839287399),
 ('project', 0.6240864128057914),
 ('create', 0.6094805984314642),
 ('system', 0.5469683777138505),
 ('resume', 0.5212800189877468),
 ('software', 0.4413400144218755),
 ('skill', 0.39169007246766996),
 ('employee', 0.3874321513643979),
 ('training', 0.3280528785919076),
 ('plan', 0.3269103305712909),
 ('application', 0.2765666609448813),
 ('include', 0.22157663525093657),
 ('product', 0.19405155442569605),
 ('year', 0.1914740397288261),
 ('program', 0.1192460538662398),
 ('customer', 0.065139925

In [547]:
pd.DataFrame(list(zip(cv.get_feature_names(), X_train.sum(axis=0).tolist()[0])), columns=['token', 'records']).sort_values('records', ascending=False).head(50)

Unnamed: 0,token,records
8,design,689
21,new,671
30,provide,644
13,implement,437
11,employee,399
1,business,376
39,training,375
10,development,364
24,present,362
36,support,347


In [552]:
#X_train, X_test, y_train, y_test = train_test_split(df.review, 
#                                                    df.sentiment.apply(lambda x: 1 if x == 'positive' else 0 ),
#                                                    test_size=0.30)

pipeline = Pipeline(steps=[
    ('Vectorizer', CountVectorizer(min_df=0.01, max_df=0.95,)),
    ('Classifier', LogisticRegression(solver='newton-cg'))]).fit(X_train_preprocess, y_train)
pipeline.score(X_test_preprocess, y_test)

0.9385267620561738

In [553]:
pd.DataFrame(sorted(list(zip(pipeline.steps[0][1].get_feature_names(), pipeline.steps[1][1].coef_[0])), key=lambda x: x[1]), columns=['token','coef'])

Unnamed: 0,token,coef
0,com,-1.753159
1,human,-0.656106
2,resume,-0.635486
3,university,-0.570923
4,company,-0.522906
5,technology,-0.474237
6,engineering,-0.436425
7,computer,-0.399057
8,present,-0.287158
9,information,-0.114477


In [None]:
#idk what everything below does and idk if it's relevant:

In [550]:
cv = CountVectorizer()
'''
    The .fit() method of the CountVectorizer is building the token dictionary from training data.
'''
#instead of sentences pass in resumes
cv.fit(textexample)

'''
    The .transform() method of the CountVectorizer is converting sentences into vectors based
    on the dictionary.
'''
word_sentence_matrix = cv.transform(textexample)

sentences_df = pd.DataFrame(word_sentence_matrix.toarray(), columns=cv.get_feature_names())
sentences_df

Unnamed: 0,10,100,13,1673,200,2002,20022004,20042006,20072008,20082010,...,washington,web,webinar,welfare,well,without,work,worldwide,year,york
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
78,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
79,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
80,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [149]:
print('tokens:', cv.get_feature_names())
#token = cv.get_feature_names()
word_sentence_matrix.toarray()

tokens: ['10', '100', '13', '1673', '200', '2002', '20022004', '20042006', '20072008', '20082010', '2010', '2010present', '2014', '20k', '300', '30k', '32011', '401', '403', '457', '500', '5187', '5500', '60', '75', '800', '991', 'aap', 'abc', 'account', 'achieve', 'acme', 'acquire', 'acquisition', 'action', 'adaptability', 'admin', 'administer', 'administration', 'administrator', 'adp', 'affiliation', 'affirmative', 'ahead', 'alternative', 'analysis', 'analytical', 'analyze', 'annually', 'approachable', 'area', 'assistant', 'atlanta', 'audit', 'auditing', 'bachelor', 'back', 'balance', 'base', 'behavioral', 'benefit', 'box', 'business', 'call', 'callahan', 'capable', 'capital', 'care', 'cebs', 'certificate', 'certification', 'change', 'choice', 'claim', 'clearly', 'client', 'closely', 'cognos', 'collaborate', 'collaboration', 'college', 'com', 'communicate', 'communication', 'community', 'comp', 'company', 'comparable', 'compensation', 'competency', 'competitive', 'compliance', 'compr

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [151]:
print('X:\n', X.toarray())
print('y:\n', y)

X:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
y:
 [0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0]
