In [19]:
import glob

Get filenames

In [20]:
filenames = glob.glob('resumes/*.txt')
fname1 = filenames[0]
filenames

['resumes/resume_1.txt',
 'resumes/resume_2.txt',
 'resumes/resume_11.txt',
 'resumes/resume_10.txt']

read text file

In [32]:
fname1
with open(fname1, 'r') as f:
    text = f.read()
text[:500]

'REDACTED 1\nREDACTED\nWell-rounded full-stack engineer with strong communication and people skills. Software Engineering Intern at \nCapital One for summer 2019. \nE\nDUCATION\nStanford University (Class of 2020)\nB.S. Symbolic Systems (Concentration in Human\n-\nComputer Interaction) | Minor in Data Science\nGPA: 3.8\nStanford, CA\n   September 2016 \n-\n Present\nR\nELEVANT \nE\nXPERIENCES\nSoftware Engineering Intern \n       Stanford, CA\nCapital One\nJune 2019 \n–\n Present\n●\nFull\n-\nstack engineer in the Retail an'

### Python string methods

In [11]:
text.split('\n')[1]

'REDACTED'

In [14]:
type(text.split('\n')[1])

str

In [18]:
text.lower().split()[:10]

['redacted',
 '1',
 'redacted',
 'well-rounded',
 'full-stack',
 'engineer',
 'with',
 'strong',
 'communication',
 'and']

## Intro to NLTK

In [76]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/utopia3/dc326/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/utopia3/dc326/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [35]:
words = nltk.tokenize.word_tokenize(text.lower())
words[:10]

['redacted',
 '1',
 'redacted',
 'well-rounded',
 'full-stack',
 'engineer',
 'with',
 'strong',
 'communication',
 'and']

In [36]:
sents = nltk.tokenize.sent_tokenize(text.lower())
sents[:2]

['redacted 1\nredacted\nwell-rounded full-stack engineer with strong communication and people skills.',
 'software engineering intern at \ncapital one for summer 2019. \ne\nducation\nstanford university (class of 2020)\nb.s.']

### Work with multiple files

In [37]:
filenames = glob.glob('resumes/*.txt')
filenames

['resumes/resume_1.txt',
 'resumes/resume_2.txt',
 'resumes/resume_11.txt',
 'resumes/resume_10.txt']

In [41]:
docs = list()
for fname in filenames:
    with open(fname, 'r') as f:
        text = f.read()
    words = nltk.tokenize.word_tokenize(text.lower())
    docs.append(words)
docs[0][:5]

['redacted', '1', 'redacted', 'well-rounded', 'full-stack']

In [47]:
from collections import Counter

In [53]:
for doc in docs:
    cts = Counter(doc)
    print(cts['leadership'], ':', doc[:3])

0 : ['redacted', '1', 'redacted']
1 : ['redacted', '2', 'education']
1 : ['redacted', '11', 'redacted']
2 : ['redacted', '10', 'redacted']


In [58]:
keywords = ['leader', 'leadership', 'president', 'chair']
for doc in docs:
    cts = Counter(doc)
    num = sum([cts[kw] for kw in keywords])
    print(doc[:2], ':', num)

['redacted', '1'] : 3
['redacted', '2'] : 1
['redacted', '11'] : 1
['redacted', '10'] : 2


In [71]:
# most common words
for doc in docs:
    cts = Counter(doc)
    topwords = list(sorted(cts, reverse=True))
    print(doc[:2], ':', topwords[:5])

['redacted', '1'] : ['●', '”', '“', '’', '‘']
['redacted', '2'] : ['•', '’', '|', 'workplace', 'working']
['redacted', '11'] : ['•', '”', '“', '’', 'young']
['redacted', '10'] : ['●', '’', 'york', 'years', 'xcode']


In [83]:
# part-of-speech tag filtering
KEEP = ['NN', 'VBN']
for doc in docs:
    pos = nltk.pos_tag(doc)
    use_tok = [tok for tok,tag in pos if tag in KEEP]
    print(pos[:5])

[('redacted', 'VBN'), ('1', 'CD'), ('redacted', 'JJ'), ('well-rounded', 'JJ'), ('full-stack', 'NN')]
[('redacted', 'VBN'), ('2', 'CD'), ('education', 'NN'), ('univ', 'JJ'), ('ersity', 'NN')]
[('redacted', 'VBN'), ('11', 'CD'), ('redacted', 'VBN'), ('computer', 'NN'), ('skills', 'NNS')]
[('redacted', 'VBN'), ('10', 'CD'), ('redacted', 'JJ'), ('education', 'NN'), ('may', 'MD')]


In [87]:
# words surrounding a given target word
for doc in docs:
    for i in range(len(doc)):
        if doc[i] == 'python':
            print(doc[i-5:i+5])

['introductory', 'computer', 'science', 'course', 'in', 'python', '(', 'cs106a', ')', 'march']
['projects', ':', '●', 'chatbot', '(', 'python', ')', '-', 'implemented', 'nlp']
[':', 'java', ',', 'javascript', ',', 'python', ',', 'html', ',', 'css']
['analytics', 'gpa', ':', '3.94', 'skills', 'python', ',', 'java', ',', 'sql']
['novel', 'anomaly', 'detection', 'model', 'in', 'python', 'that', 'flags', 'unhealthy', 'sensors']
['for', 'energy', 'access', 'knowledge', 'using', 'python', '•', 'created', 'algorithm', 'that']
['proficient', 'in', ':', 'java', ',', 'python', ',', 'c', '#', ',']
['●', 'taught', 'myself', 'react', 'and', 'python', 'to', 'build', 'a', 'personal']
['skills', 'languages', ':', 'c/c++', ',', 'python', ',', 'matlab', ',', 'r']


In [97]:
# search for GPA of these documents
for doc in docs:
    for tok in doc:
        isint, isfloat = False, False
        try:
            num = float(tok)
            isfloat = True
        except:
            pass
        try:
            num = int(tok)
            isint = True
        except:
            pass
        if isfloat and not isint and float(tok) > 2.0 and float(tok) < 5.0:
            print(' '.join(doc[:2]), ':', float(tok))

redacted 1 : 3.8
redacted 2 : 3.88
redacted 2 : 3.92
redacted 2 : 3.94
