# Job Description Keyword Extraction

NOTE: You will need to download the spaCy model `en_core_web_sm` before running this notebook. See https://spacy.io/usage/models

Example: `python -m spacy download en_core_web_sm`

In [29]:
import spacy
import pytextrank

In [30]:
def top_ranked_phrases(filename, limit_phrases=20):
    '''
    Examine the top-ranked phrases in the document

    Reads `filename` as UTF-8 text, runs it through the spaCy
    `en_core_web_sm` pipeline with a pytextrank component appended as the
    last stage, and prints one line per phrase: rank, count and text.

    filename: path of the text file to analyse.
    limit_phrases: maximum number of phrases to print; a value of zero or
        less prints nothing.

    Returns the parsed spaCy doc so that it can be reused, e.g. by
    `summarize`.
    '''
    with open(filename, encoding='utf-8') as f:
        text = f.read()

    # NOTE: the model is loaded on every call, which keeps the function
    # self-contained at the price of repeated start-up cost.
    nlp = spacy.load('en_core_web_sm')

    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)

    doc = nlp(text)

    # Phrases are printed in the order pytextrank yields them.
    for num_phrases, p in enumerate(doc._.phrases):
        # Test the limit before printing so that a limit of zero really
        # prints nothing instead of falling through to every phrase.
        if num_phrases >= limit_phrases:
            break

        print('{:.4f} {:5d}  {}'.format(p.rank, p.count, p.text))

    return doc

In [31]:
from math import sqrt
from operator import itemgetter

def summarize(doc, limit_sentences=4, limit_phrases=4):
    '''
    Print the sentences that best cover the top-ranked phrases of the document

    The first `limit_phrases` phrases of `doc._.phrases` form a vector of
    ranks, normalised to sum to one. Each sentence is scored by the
    Euclidean length of the part of that vector it does NOT mention, so a
    lower score means better coverage. Sentences are printed in ascending
    score order as "(sentence index) sentence text"; ties keep document
    order.

    doc: a parsed document exposing `sents` (items with `start`, `end` and
        `text`) and `_.phrases` (items with `rank` and `chunks`, each chunk
        having a `start`).
    limit_sentences: maximum number of sentences to print; zero or less
        prints nothing.
    limit_phrases: number of leading phrases used to score the sentences.

    Returns None; the summary is written to standard output.
    '''
    # Materialise once: the sentences are needed for both scoring and printing.
    sentences = list(doc.sents)

    # covered[i] holds the ids of the selected phrases found in sentence i.
    covered = [set() for _ in sentences]
    unit_vector = []

    for phrase_id, phrase in enumerate(doc._.phrases):
        if phrase_id >= limit_phrases:
            break

        unit_vector.append(phrase.rank)

        for chunk in phrase.chunks:
            for sent, phrase_ids in zip(sentences, covered):
                # Span ends are exclusive, so a chunk starting exactly at
                # `sent.end` belongs to the following sentence.
                if sent.start <= chunk.start < sent.end:
                    phrase_ids.add(phrase_id)
                    break

    sum_ranks = sum(unit_vector)

    # Skip normalisation when every selected rank is zero; dividing would
    # raise ZeroDivisionError and the vector is already all zeros.
    if sum_ranks:
        unit_vector = [rank / sum_ranks for rank in unit_vector]

    sent_rank = {}

    for sent_id, phrase_ids in enumerate(covered):
        sum_sq = sum(
            weight ** 2.0
            for phrase_id, weight in enumerate(unit_vector)
            if phrase_id not in phrase_ids
        )
        sent_rank[sent_id] = sqrt(sum_sq)

    # sorted() is stable, so equally scored sentences stay in document order.
    closest = sorted(sent_rank.items(), key=itemgetter(1))

    for sent_id, _ in closest[:max(limit_sentences, 0)]:
        print(f"({sent_id}) {sentences[sent_id].text}")

In [35]:
# Print up to 30 phrases for the sample job description and keep the parsed
# doc so that it can be passed to summarize().
doc = top_ranked_phrases('../data/nonprofit-data-analyst.txt', 30)

0.0755     1  modern data tools
0.0733     1  key business strategy questions
0.0716     1  new data sources
0.0709     1  data trends
0.0706     3  data
0.0683     1  appropriate data
0.0681     1  data cleansing
0.0661     1  most data storage
0.0654     2  key questions
0.0650     1  data literacy
0.0620     1  good questions
0.0597     1  senior data staff
0.0593     2  questions
0.0587     1  incoming questions
0.0561     1  new tools
0.0547     1  business owners
0.0545     1  various distributed business functions
0.0528     1  business metrics
0.0526     1  other tools
0.0522     1  new opportunities
0.0520     1  business models
0.0508     1  business leaders
0.0496     1  nonprofit organizations
0.0484     1  statistical analysis techniques
0.0482     1  experiment design
0.0478     3  tools
0.0464     1  non-technical audiences
0.0455     3  experiments
0.0441     3  analysis
0.0433     1  the team core data skills


In [34]:
# Print up to four sentences of the parsed job description, best-scoring first.
summarize(doc, 4)

(32) A solid background in modern data tools and techniques.
(17) Proactively surface, highlight, and explore key business strategy questions with channel owners.

(21) Work closely with others on the tech team to make new data sources available and ensure that existing ones are delivering on the data access needs of the organization.

(28) Always Learning: Keep up-to-date with technology and data trends and experiment with new tools and products to maintain competitive edge.

