In [242]:
import re
import json
from collections import Counter
import numpy as np
import itertools

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('omw-1.4')

from gensim import corpora
from gensim.models.ldamodel import LdaModel

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danpasse/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/danpasse/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Step 1

In [259]:
documents = []
with open('../../data/disney/job_descriptions.json', 'r') as reader:
  for description in json.loads(reader.read()):
    items = [
      item['text']
      for item in description['sections']
      if item['section'] == 'Basic Qualifications' or item['section'] == 'Preferred Qualifications'
    ]

    if len(items) == 0:
      continue

    qualification = {
      'cat_id': description['cat_id'],
      'job_id': description['job_id'],
      'step_1': items
    }

    documents.append(qualification)

documents[0]

{'cat_id': 391,
 'job_id': 19757793040,
 'step_1': ['* You are passionate about your area of expertise, deeply inquisitive and open minded, informed, but not limited, by your domain of expertise.\n* You enjoy intellectual debate.\n* You are comfortable guiding other team members but willing to get your hands dirty and help build research systems when needed\n* You are driven to perform research that is not simply novel, but deeply impactful to the company and society have a deep desire to reduce the research to practice in the form of prototypes and technology demonstrators.\n* You are excited by the platform provided by Disney to connect with children and guests of all ages to have a positive impact on the world.\n* You have a deep sensitivity for ethical use of technology and data.\n* You thrive in a fast-paced collaborative environment.\n* You are self-directed and independent, but open to constructive feedback.\n* You are a team player, able to work in collaborative interdisciplina

### Step 2

In [278]:
def transform_text(text):
  def clean_sentence(sentence):
    sentence = re.sub(r'[`\']', '', sentence)
    sentence = re.sub(r'[,.!?:;"]', ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence.strip()

  text = re.sub(r'(e\.g\.|\/)', ' ', text)
  text = re.sub(r'etc\.', 'etc', text)
  text = re.sub(r'[*•"()]', ' ', text)
  text = re.sub(r'&', ' and ', text)

  return [ clean_sentence(sentence) for sentence in sent_tokenize(text.lower()) ]

for document in documents:
  document['step_2'] = []
  for section_text in document['step_1']:
    document['step_2'].extend(transform_text(section_text))

print(documents[0]['step_1'][0])
print()
print(documents[0]['step_2'][0])

* You are passionate about your area of expertise, deeply inquisitive and open minded, informed, but not limited, by your domain of expertise.
* You enjoy intellectual debate.
* You are comfortable guiding other team members but willing to get your hands dirty and help build research systems when needed
* You are driven to perform research that is not simply novel, but deeply impactful to the company and society have a deep desire to reduce the research to practice in the form of prototypes and technology demonstrators.
* You are excited by the platform provided by Disney to connect with children and guests of all ages to have a positive impact on the world.
* You have a deep sensitivity for ethical use of technology and data.
* You thrive in a fast-paced collaborative environment.
* You are self-directed and independent, but open to constructive feedback.
* You are a team player, able to work in collaborative interdisciplinary groups
* You are willing to take the time necessary to und

### Step 3

In [279]:
for document in documents:
  document['step_3'] = []
  for sentence in document['step_2']:
    document['step_3'].append(word_tokenize(sentence))
  
print(documents[0]['step_2'][0])
print()
print(documents[0]['step_3'][0])

you are passionate about your area of expertise deeply inquisitive and open minded informed but not limited by your domain of expertise

['you', 'are', 'passionate', 'about', 'your', 'area', 'of', 'expertise', 'deeply', 'inquisitive', 'and', 'open', 'minded', 'informed', 'but', 'not', 'limited', 'by', 'your', 'domain', 'of', 'expertise']


### Step 4

In [290]:
n_gram_ranges = [2, 3, 4, 5]

def format(tup):
  return '_'.join(tup)

for document in documents:
  document['step_4'] = []
  for tokens in document['step_3']:
    sentence_grams = []
    for n in n_gram_ranges:
      sentence_grams.extend([ format(tup) for tup in nltk.ngrams(tokens, n)])
    
    document['step_4'].append(sentence_grams)

print(documents[0]['step_3'][0])
print()
print(documents[0]['step_4'][0])

['you', 'are', 'passionate', 'about', 'your', 'area', 'of', 'expertise', 'deeply', 'inquisitive', 'and', 'open', 'minded', 'informed', 'but', 'not', 'limited', 'by', 'your', 'domain', 'of', 'expertise']

['you_are', 'are_passionate', 'passionate_about', 'about_your', 'your_area', 'area_of', 'of_expertise', 'expertise_deeply', 'deeply_inquisitive', 'inquisitive_and', 'and_open', 'open_minded', 'minded_informed', 'informed_but', 'but_not', 'not_limited', 'limited_by', 'by_your', 'your_domain', 'domain_of', 'of_expertise', 'you_are_passionate', 'are_passionate_about', 'passionate_about_your', 'about_your_area', 'your_area_of', 'area_of_expertise', 'of_expertise_deeply', 'expertise_deeply_inquisitive', 'deeply_inquisitive_and', 'inquisitive_and_open', 'and_open_minded', 'open_minded_informed', 'minded_informed_but', 'informed_but_not', 'but_not_limited', 'not_limited_by', 'limited_by_your', 'by_your_domain', 'your_domain_of', 'domain_of_expertise', 'you_are_passionate_about', 'are_passiona

### Step 5

In [293]:
for document in documents:
  document['step_5'] = []
  for tokens in document['step_4']:
    document['step_5'].extend(tokens)
  
print(documents[0]['step_4'][0])
print()
print(len(documents[0]['step_5']))

['you_are', 'are_passionate', 'passionate_about', 'about_your', 'your_area', 'area_of', 'of_expertise', 'expertise_deeply', 'deeply_inquisitive', 'inquisitive_and', 'and_open', 'open_minded', 'minded_informed', 'informed_but', 'but_not', 'not_limited', 'limited_by', 'by_your', 'your_domain', 'domain_of', 'of_expertise', 'you_are_passionate', 'are_passionate_about', 'passionate_about_your', 'about_your_area', 'your_area_of', 'area_of_expertise', 'of_expertise_deeply', 'expertise_deeply_inquisitive', 'deeply_inquisitive_and', 'inquisitive_and_open', 'and_open_minded', 'open_minded_informed', 'minded_informed_but', 'informed_but_not', 'but_not_limited', 'not_limited_by', 'limited_by_your', 'by_your_domain', 'your_domain_of', 'domain_of_expertise', 'you_are_passionate_about', 'are_passionate_about_your', 'passionate_about_your_area', 'about_your_area_of', 'your_area_of_expertise', 'area_of_expertise_deeply', 'of_expertise_deeply_inquisitive', 'expertise_deeply_inquisitive_and', 'deeply_inq

### Step 6

In [294]:
qualifications = [ document['step_5'] for document in documents ]

## switch to tf-idf ... build up stop words...

bag_words = Counter(itertools.chain(*qualifications))
print(bag_words.most_common(20))

[('ability_to', 264), ('experience_in', 128), ('in_a', 110), ('knowledge_of', 98), ('experience_with', 96), ('to_work', 94), ('understanding_of', 92), ('communication_skills', 67), ('you_have', 65), ('years_of', 65), ('able_to', 64), ('familiarity_with', 59), ('you_are', 51), ('in_the', 49), ('with_a', 48), ('skills_and', 45), ('experience_working', 44), ('the_ability', 41), ('the_ability_to', 41), ('such_as', 38)]


In [244]:
def clean_text(text):
  text = re.sub(r'e\.g\.', '', text)
  text = re.sub(r'etc\.', 'etc', text)
  text = re.sub(r"'", '', text)

  text = re.sub(r'[*•"]', ' ', text)
  text = re.sub(r'\/', ' or ', text)
  text = re.sub(r'\s+', ' ', text)

  text = text.lower()

  return sent_tokenize(text.strip())

stop_words = stopwords.words('english')

more_stop_words = [
  '.',
  ';',
  ')',
  '(',
  ',',
  ':',
  '&',
  '•',
  "’",
  '-',
]

bad_words = [
  'ability',
  'experience',
  'knowledge',
  'proven',
  'skill',
  'year',
  'disney',
  'strong'
]

stop_words.extend(bad_words)

lemmatizer = WordNetLemmatizer()

qualifications = []
for description in descriptions:
  items = []
  for item in description['sections']:
    if item['section'] == 'Basic Qualifications' or item['section'] == 'Preferred Qualifications':
      for sentence in clean_text(item['text']):
        tokens = [tk for tk in word_tokenize(sentence) if re.search('\w', tk) != None]
        tokens = [tk for tk in word_tokenize(sentence) if not tk.isdigit()]
        tokens = [lemmatizer.lemmatize(tk) for tk in tokens]
        tokens = [tk for tk in tokens if tk not in more_stop_words]

        items.extend([ f'{t1}_{t2}' for t1, t2 in nltk.ngrams(tokens, n=2) if not (t1 in stop_words or t2 in stop_words)])

  if len(items) == 0:
    continue

  qualification = {
    'cat_id': description['cat_id'],
    'job_id': description['job_id'],
    'document': items
  }

  qualifications.append(qualification)


In [245]:
documents = [ item['document'] for item in qualifications ]
documents[0][:5]

['expertise_deeply',
 'deeply_inquisitive',
 'open_minded',
 'minded_informed',
 'enjoy_intellectual']

In [246]:
bag_words = Counter(itertools.chain(*documents))
print(bag_words.most_common(20))

[('project_management', 24), ('team_player', 21), ('verbal_communication', 18), ('team_environment', 18), ('microsoft_office', 17), ('per_week', 17), ('night_weekend', 16), ('related_field', 15), ('graphic_design', 14), ('adobe_creative', 13), ('creative_suite', 13), ('written_communication', 13), ('handle_multiple', 13), ('including_night', 13), ('full_availability', 13), ('photoshop_illustrator', 12), ('manage_multiple', 12), ('cast_member', 12), ('day_per', 11), ('team_member', 10)]


In [252]:
dictionary = corpora.Dictionary(documents)
document_term_matrix = [dictionary.doc2bow(doc) for doc in documents]

model = LdaModel(
  document_term_matrix,
  num_topics=5,
  id2word=dictionary,
  passes=10,
)

print(
  np.array(model.print_topics(num_topics=5, num_words=10))
)

[['0'
  '0.002*"best_practice" + 0.002*"theme_park" + 0.002*"team_player" + 0.002*"team_environment" + 0.002*"project_management" + 0.002*"work_ethic" + 0.002*"scripting_language" + 0.002*"handle_multiple" + 0.002*"scheduling_method" + 0.002*"written_communication"']
 ['1'
  '0.004*"night_weekend" + 0.004*"full_availability" + 0.004*"including_night" + 0.003*"per_week" + 0.003*"day_per" + 0.003*"guest_service" + 0.002*"microsoft_office" + 0.002*"work_well" + 0.002*"verbal_communication" + 0.002*"week_including"']
 ['2'
  '0.003*"project_management" + 0.003*"related_field" + 0.002*"graphic_design" + 0.002*"computer_science" + 0.002*"verbal_communication" + 0.002*"programming_language" + 0.002*"mathematics_statistic" + 0.002*"economics_engineering" + 0.002*"statistic_economics" + 0.002*"environmental_graphic"']
 ['3'
  '0.002*"machine_learning" + 0.002*"microsoft_office" + 0.002*"adobe_creative" + 0.002*"team_player" + 0.002*"photoshop_illustrator" + 0.002*"data_management" + 0.002*"mech

In [255]:
for document in qualifications:
    job_id = document['job_id']
    bow = dictionary.doc2bow(document['document'])
    topic = model.get_document_topics(bow)

    print(job_id, *topic)

19757793040 (3, 0.97833526)
26800277248 (3, 0.9897175)
25793337456 (4, 0.97315794)
26601275040 (0, 0.9886906)
24704474768 (2, 0.98882234)
24704474480 (2, 0.9842789)
24429981920 (3, 0.9902122)
20042573200 (3, 0.9740526)
27441961376 (3, 0.9702436)
26304165456 (2, 0.9888663)
24259248272 (0, 0.0175085) (1, 0.9320049) (2, 0.016907346) (3, 0.01677804) (4, 0.01680117)
27629588832 (4, 0.989157)
27272804544 (2, 0.974919)
27266426816 (2, 0.9787574)
27506594224 (0, 0.9894405)
26644660320 (3, 0.9664998)
27699308208 (1, 0.9864001)
25947875248 (2, 0.9821542)
27155015808 (0, 0.9857028)
24573292192 (0, 0.015492004) (1, 0.015553833) (2, 0.01543027) (3, 0.015502573) (4, 0.93802136)
24200527168 (2, 0.9894666)
24170652720 (2, 0.9808334)
23025374752 (3, 0.9722748)
13560672608 (0, 0.978839)
27088782608 (0, 0.01003276) (1, 0.010044117) (2, 0.010034749) (3, 0.010025941) (4, 0.9598625)
27077496880 (2, 0.98042506)
27032698496 (0, 0.97706246)
26606093104 (0, 0.010032748) (1, 0.0100442385) (2, 0.010034758) (3, 0.