In [1]:
import re
import json
from collections import Counter
import numpy as np
import itertools

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('omw-1.4')

from gensim import corpora
from gensim.models.ldamodel import LdaModel

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danpasse/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/danpasse/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Step 1

- build dataset

In [2]:
documents = []
with open('../../data/disney/job_descriptions.json', 'r') as reader:
  for description in json.loads(reader.read()):
    items = [
      item['text']
      for item in description['sections']
      if item['section'] == 'Basic Qualifications' or item['section'] == 'Preferred Qualifications'
    ]

    if len(items) == 0:
      continue

    qualification = {
      'cat_id': description['cat_id'],
      'job_id': description['job_id'],
      'step_1': items
    }

    documents.append(qualification)

documents[0]

{'cat_id': 391,
 'job_id': 19757793040,
 'step_1': ['* You are passionate about your area of expertise, deeply inquisitive and open minded, informed, but not limited, by your domain of expertise.\n* You enjoy intellectual debate.\n* You are comfortable guiding other team members but willing to get your hands dirty and help build research systems when needed\n* You are driven to perform research that is not simply novel, but deeply impactful to the company and society have a deep desire to reduce the research to practice in the form of prototypes and technology demonstrators.\n* You are excited by the platform provided by Disney to connect with children and guests of all ages to have a positive impact on the world.\n* You have a deep sensitivity for ethical use of technology and data.\n* You thrive in a fast-paced collaborative environment.\n* You are self-directed and independent, but open to constructive feedback.\n* You are a team player, able to work in collaborative interdisciplina

### Step 2

- text to sentences

In [3]:
def transform_text(text):
  def clean_sentence(sentence):
    sentence = re.sub(r'[`\']', '', sentence)
    sentence = re.sub(r'[,.!?:;"]', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence.strip()

  text = re.sub(r'(e\.g\.|\/)', ' ', text)
  text = re.sub(r'etc\.', 'etc', text)
  text = re.sub(r'[*â€¢"()]', ' ', text)
  text = re.sub(r'&', ' and ', text)

  return [ clean_sentence(sentence) for sentence in sent_tokenize(text.lower()) ]

for document in documents:
  document['step_2'] = []
  for section_text in document['step_1']:
    document['step_2'].extend(transform_text(section_text))

print(documents[0]['step_1'][0])
print()
print(documents[0]['step_2'][0])

* You are passionate about your area of expertise, deeply inquisitive and open minded, informed, but not limited, by your domain of expertise.
* You enjoy intellectual debate.
* You are comfortable guiding other team members but willing to get your hands dirty and help build research systems when needed
* You are driven to perform research that is not simply novel, but deeply impactful to the company and society have a deep desire to reduce the research to practice in the form of prototypes and technology demonstrators.
* You are excited by the platform provided by Disney to connect with children and guests of all ages to have a positive impact on the world.
* You have a deep sensitivity for ethical use of technology and data.
* You thrive in a fast-paced collaborative environment.
* You are self-directed and independent, but open to constructive feedback.
* You are a team player, able to work in collaborative interdisciplinary groups
* You are willing to take the time necessary to und

### Step 3

- sentences to individual tokens

In [4]:
for document in documents:
  document['step_3'] = []
  for sentence in document['step_2']:
    document['step_3'].append(word_tokenize(sentence))
  
print(documents[0]['step_2'][0])
print()
print(documents[0]['step_3'][0])

you are passionate about your area of expertise deeply inquisitive and open minded informed but not limited by your domain of expertise

['you', 'are', 'passionate', 'about', 'your', 'area', 'of', 'expertise', 'deeply', 'inquisitive', 'and', 'open', 'minded', 'informed', 'but', 'not', 'limited', 'by', 'your', 'domain', 'of', 'expertise']


### Step 4

- build ngrams

In [5]:
n_gram_ranges = [2, 3, 4, 5]

def format(tup):
  return '_'.join(tup)

for document in documents:
  document['step_4'] = []
  for tokens in document['step_3']:
    sentence_grams = []
    for n in n_gram_ranges:
      sentence_grams.extend([ format(tup) for tup in nltk.ngrams(tokens, n)])
    
    document['step_4'].append(sentence_grams)

print(documents[0]['step_3'][0])
print()
print(documents[0]['step_4'][0])

['you', 'are', 'passionate', 'about', 'your', 'area', 'of', 'expertise', 'deeply', 'inquisitive', 'and', 'open', 'minded', 'informed', 'but', 'not', 'limited', 'by', 'your', 'domain', 'of', 'expertise']

['you_are', 'are_passionate', 'passionate_about', 'about_your', 'your_area', 'area_of', 'of_expertise', 'expertise_deeply', 'deeply_inquisitive', 'inquisitive_and', 'and_open', 'open_minded', 'minded_informed', 'informed_but', 'but_not', 'not_limited', 'limited_by', 'by_your', 'your_domain', 'domain_of', 'of_expertise', 'you_are_passionate', 'are_passionate_about', 'passionate_about_your', 'about_your_area', 'your_area_of', 'area_of_expertise', 'of_expertise_deeply', 'expertise_deeply_inquisitive', 'deeply_inquisitive_and', 'inquisitive_and_open', 'and_open_minded', 'open_minded_informed', 'minded_informed_but', 'informed_but_not', 'but_not_limited', 'not_limited_by', 'limited_by_your', 'by_your_domain', 'your_domain_of', 'domain_of_expertise', 'you_are_passionate_about', 'are_passiona

### Step 5

- final document

In [6]:
for document in documents:
  document['step_5'] = []
  for tokens in document['step_4']:
    document['step_5'].extend(tokens)
  
print(documents[0]['step_4'][0])
print()
print(documents[0]['step_5'][:5])

['you_are', 'are_passionate', 'passionate_about', 'about_your', 'your_area', 'area_of', 'of_expertise', 'expertise_deeply', 'deeply_inquisitive', 'inquisitive_and', 'and_open', 'open_minded', 'minded_informed', 'informed_but', 'but_not', 'not_limited', 'limited_by', 'by_your', 'your_domain', 'domain_of', 'of_expertise', 'you_are_passionate', 'are_passionate_about', 'passionate_about_your', 'about_your_area', 'your_area_of', 'area_of_expertise', 'of_expertise_deeply', 'expertise_deeply_inquisitive', 'deeply_inquisitive_and', 'inquisitive_and_open', 'and_open_minded', 'open_minded_informed', 'minded_informed_but', 'informed_but_not', 'but_not_limited', 'not_limited_by', 'limited_by_your', 'by_your_domain', 'your_domain_of', 'domain_of_expertise', 'you_are_passionate_about', 'are_passionate_about_your', 'passionate_about_your_area', 'about_your_area_of', 'your_area_of_expertise', 'area_of_expertise_deeply', 'of_expertise_deeply_inquisitive', 'expertise_deeply_inquisitive_and', 'deeply_inq

### Step 6

- filter out ngrams

In [7]:
qualifications = [ document['step_5'] for document in documents ]

In [8]:
bag_words = Counter(
  itertools.chain(*qualifications)
)

n = 5
print(np.array(bag_words.most_common(n)))
print()
print(np.array(bag_words.most_common()[-(n+1):-1]))

[['ability_to' '264']
 ['experience_in' '128']
 ['in_a' '110']
 ['knowledge_of' '98']
 ['experience_with' '96']]

[['respects_a_variety_of_voices' '1']
 ['a_variety_of_voices_identities' '1']
 ['variety_of_voices_identities_backgrounds' '1']
 ['of_voices_identities_backgrounds_experiences' '1']
 ['voices_identities_backgrounds_experiences_and' '1']]


In [9]:
## todo: tfidf

In [10]:
### todo: build list of stop words
stop_words = []

In [11]:
for document in documents:
  document['final'] = [ token for token in document['step_5'] if token not in stop_words ]
  
print(documents[0]['step_5'][:5])
print()
print(documents[0]['final'][:5])

['you_are', 'are_passionate', 'passionate_about', 'about_your', 'your_area']

['you_are', 'are_passionate', 'passionate_about', 'about_your', 'your_area']


### Step 7

- lda

In [12]:
documents_to_process = [ document['final'] for document in documents ]
dictionary = corpora.Dictionary(documents_to_process)
document_term_matrix = [dictionary.doc2bow(doc) for doc in documents_to_process]

model = LdaModel(
  document_term_matrix,
  num_topics=5,
  id2word=dictionary,
  passes=5,
)

In [13]:
print(np.array(model.print_topics(num_topics=5, num_words=10)))

[['0'
  '0.003*"ability_to" + 0.001*"to_work" + 0.001*"experience_in" + 0.001*"in_a" + 0.001*"knowledge_of" + 0.001*"able_to" + 0.001*"experience_with" + 0.001*"understanding_of" + 0.001*"communication_skills" + 0.001*"years_of"']
 ['1'
  '0.002*"ability_to" + 0.001*"familiarity_with" + 0.001*"experience_with" + 0.001*"knowledge_of" + 0.001*"in_a" + 0.001*"experience_in" + 0.001*"understanding_of" + 0.001*"to_work" + 0.001*"such_as" + 0.000*"you_are"']
 ['2'
  '0.001*"ability_to" + 0.001*"you_have" + 0.001*"experience_in" + 0.001*"you_are" + 0.001*"understanding_of" + 0.001*"able_to" + 0.001*"communication_skills" + 0.001*"to_work" + 0.001*"in_a" + 0.001*"knowledge_of"']
 ['3'
  '0.002*"ability_to" + 0.001*"experience_in" + 0.001*"in_a" + 0.001*"understanding_of" + 0.001*"experience_with" + 0.001*"knowledge_of" + 0.001*"you_have" + 0.001*"with_a" + 0.000*"years_of" + 0.000*"in_the"']
 ['4'
  '0.002*"ability_to" + 0.001*"in_a" + 0.001*"knowledge_of" + 0.001*"experience_in" + 0.001*"expe

In [14]:
for document in documents:
    job_id = document['job_id']
    bow = dictionary.doc2bow(document['final'])
    topic = model.get_document_topics(bow)

    print(job_id, '->', *topic)

19757793040 -> (1, 0.99905574)
26800277248 -> (0, 0.9989317)
25793337456 -> (2, 0.997526)
26601275040 -> (4, 0.9989188)
24704474768 -> (0, 0.9989419)
24704474480 -> (4, 0.9980505)
24429981920 -> (1, 0.9992941)
20042573200 -> (3, 0.9975338)
27441961376 -> (3, 0.9974231)
26304165456 -> (0, 0.99875724)
24259248272 -> (0, 0.99342006)
27629588832 -> (1, 0.99887717)
27272804544 -> (4, 0.99707323)
27266426816 -> (4, 0.99758)
27506594224 -> (3, 0.9986497)
26644660320 -> (0, 0.99641603)
27699308208 -> (3, 0.998499)
25947875248 -> (1, 0.99822646)
27155015808 -> (3, 0.9978878)
24573292192 -> (3, 0.9959056)
24200527168 -> (1, 0.9986803)
24170652720 -> (0, 0.9983058)
23025374752 -> (4, 0.9975372)
13560672608 -> (3, 0.99862653)
27088782608 -> (0, 0.9963291)
27077496880 -> (3, 0.99807096)
27032698496 -> (1, 0.9979899)
26606093104 -> (0, 0.99648935)
26435839936 -> (2, 0.9987411)
26435816656 -> (3, 0.99672234)
26421625344 -> (1, 0.997339)
26421612896 -> (0, 0.9965378)
25804673536 -> (2, 0.9981198)
2555