In [19]:
import re
import os
import json
from collections import Counter
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')

import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danpasse/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/danpasse/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to /Users/danpasse/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/danpasse/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/danpasse/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/danpasse/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


In [20]:
documents = []
with open('../../data/disney/imagineering_job_descriptions.json', 'r') as reader:
  for description in json.loads(reader.read()):
    items = [
      item['text']
      for item in description['sections']
      if item['section'] == 'Basic Qualifications' or item['section'] == 'Preferred Qualifications'
    ]

    if len(items) == 0:
      continue

    qualification = {
      'cat_id': description['cat_id'],
      'job_id': description['job_id'],
      'step_1': items
    }

    documents.append(qualification)

documents[0]

{'cat_id': 391,
 'job_id': 27150045808,
 'step_1': ['* Familiar with maintaining and creating rigs for CG character animation.\n* Able to provide technical support for computer graphic animation applications.\n* Desired software knowledge/familiarity among: Maya and/or Motion Builder, Adobe Creative Suite, and Shotgun\n* Desired scripting knowledge with languages such as: Python, Mel, C, C++\n* Able to communicate (verbally and in writing) clearly and concisely, regularly keeping stakeholders informed of status, risks, and issues.\n* Able to function in a dynamic work environment, managing multiple project activities.\n* Able to work with limited supervision and to integrate inputs from many individual team members.\n* Have a high tolerance for change and changing requirements.\n* Be a creative problem solver and demonstrate comfort with inventive solutions.\n* Have a valid passport and willing to travel domestically or internationally.',
  '* Have basic knowledge of mechanical systems

### Step 1

In [21]:
def transform_text(text):
    text = re.sub(r'(e\.g\.|\/)', ' ', text)
    text = re.sub(r'etc\.', 'etc', text)
    text = re.sub(r'[*•●"()]', ' ', text)
    text = re.sub(r'&', ' and ', text)

    return text

for document in documents:

    document['step_2'] = []
    for section_text in document['step_1']:
        sentences = sent_tokenize(transform_text(section_text))
        tokens = [
            nltk.pos_tag(word_tokenize(sentence))
            for sentence
            in sentences
        ]

        document['step_2'].extend(tokens)

documents[0]['step_2']

[[('Familiar', 'NNP'),
  ('with', 'IN'),
  ('maintaining', 'VBG'),
  ('and', 'CC'),
  ('creating', 'VBG'),
  ('rigs', 'NNS'),
  ('for', 'IN'),
  ('CG', 'NNP'),
  ('character', 'NN'),
  ('animation', 'NN'),
  ('.', '.')],
 [('Able', 'JJ'),
  ('to', 'TO'),
  ('provide', 'VB'),
  ('technical', 'JJ'),
  ('support', 'NN'),
  ('for', 'IN'),
  ('computer', 'NN'),
  ('graphic', 'JJ'),
  ('animation', 'NN'),
  ('applications', 'NNS'),
  ('.', '.')],
 [('Desired', 'NNP'),
  ('software', 'NN'),
  ('knowledge', 'NN'),
  ('familiarity', 'NN'),
  ('among', 'IN'),
  (':', ':'),
  ('Maya', 'NNP'),
  ('and', 'CC'),
  ('or', 'CC'),
  ('Motion', 'NNP'),
  ('Builder', 'NNP'),
  (',', ','),
  ('Adobe', 'NNP'),
  ('Creative', 'NNP'),
  ('Suite', 'NNP'),
  (',', ','),
  ('and', 'CC'),
  ('Shotgun', 'NNP'),
  ('Desired', 'NNP'),
  ('scripting', 'VBG'),
  ('knowledge', 'NN'),
  ('with', 'IN'),
  ('languages', 'NNS'),
  ('such', 'JJ'),
  ('as', 'IN'),
  (':', ':'),
  ('Python', 'NNP'),
  (',', ','),
  ('Mel', '

### Step 2

In [50]:
## tag strings

grammar = r"""
  CLAUSE: {<DT>?<JJ.*>*<NN.*>}           # Chunk NP, VP
"""

cp = nltk.RegexpParser(grammar)

for document in documents:

    document['step_3'] = []
    for sentence in document['step_2']:
        chunks = cp.parse(sentence)
        for subtree in chunks.subtrees():
            if subtree.label() == 'CLAUSE':
                print(subtree)

documents[0]['step_3']

(CLAUSE character/NN animation/NN)
(CLAUSE technical/JJ support/NN)
(CLAUSE computer/NN)
(CLAUSE graphic/JJ animation/NN)
(CLAUSE software/NN knowledge/NN familiarity/NN)
(CLAUSE knowledge/NN)
(CLAUSE status/NN)
(CLAUSE dynamic/JJ work/NN environment/NN)
(CLAUSE multiple/JJ project/NN)
(CLAUSE limited/JJ supervision/NN)
(CLAUSE many/JJ individual/JJ team/NN)
(CLAUSE high/JJ tolerance/NN)
(CLAUSE change/NN)
(CLAUSE changing/NN)
(CLAUSE creative/JJ problem/NN solver/NN)
(CLAUSE comfort/NN)
(CLAUSE valid/JJ passport/NN)
(CLAUSE basic/JJ knowledge/NN)
(CLAUSE control/NN)
(CLAUSE Experience/NN)
(CLAUSE knowledge/NN)
(CLAUSE software/NN)
(CLAUSE Experience/NN)
(CLAUSE motion-capture/NN)
(CLAUSE Familiarity/NN)
(CLAUSE year/NN)
(CLAUSE experience/NN)
(CLAUSE interior/JJ design/NN practice/NN)
(CLAUSE knowledge/NN)
(CLAUSE experience/NN)
(CLAUSE practice/NN)
(CLAUSE interior/JJ design/NN)
(CLAUSE verbal/JJ communication/NN)
(CLAUSE thoughtful/JJ design/NN)
(CLAUSE ability/NN)
(CLAUSE problem/N

[]