In [1]:
import re
import json
import os
import requests
import time
import numpy as np
import pandas as pd
from datetime import date

from bs4 import BeautifulSoup

In [2]:
def save(df, cache_path):
    """Merge freshly scraped job postings into the CSV cache at ``cache_path``.

    When the cache file does not exist yet, ``df`` is written as-is and
    returned. Otherwise postings are matched on ``(cat_id, job_id)``:

    * postings only in ``df`` are appended to the cache;
    * postings only in the cache that are still open (``close == '-'``) get
      today's date written to ``close``;
    * postings present in both are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Scraped postings; must contain ``cat_id`` and ``job_id`` and share the
        cache's columns.
    cache_path : str
        Path of the CSV cache to create or update.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself on first save, otherwise the merged frame that was
        written to disk (with a fresh 0..n-1 index).
    """
    if not os.path.exists(cache_path):
        df.to_csv(cache_path, index=False)
        return df

    key_cols = ['cat_id', 'job_id']

    df_gospel = pd.read_csv(cache_path)
    df_gospel['cat_id'] = df_gospel['cat_id'].astype(int)
    df_gospel['job_id'] = df_gospel['job_id'].astype(int)

    # The indicator column tells us which side each (cat_id, job_id) came from.
    merged = df.merge(df_gospel[key_cols], on=key_cols, indicator='i', how='outer')
    closed_jobs = merged.loc[merged['i'] == 'right_only', key_cols]
    df_scrapped_new = merged.loc[merged['i'] == 'left_only'].drop(columns=['i'])

    if not df_scrapped_new.empty:
        # Cached dates are plain strings after read_csv. Appending Timestamps
        # to them yields an object column that is written with a time part
        # ("2022-04-24 00:00:00") next to the older "2022-04-24" rows, so the
        # new rows are rendered to the date-only form first.
        if 'date' in df_scrapped_new.columns and \
                pd.api.types.is_datetime64_any_dtype(df_scrapped_new['date']):
            df_scrapped_new = df_scrapped_new.assign(
                date=df_scrapped_new['date'].dt.strftime('%Y-%m-%d')
            )
        # ignore_index avoids duplicate index labels in the returned frame.
        df_new = pd.concat([df_gospel, df_scrapped_new], axis=0, ignore_index=True)
    else:
        df_new = df_gospel.copy()

    if not closed_jobs.empty:
        # One vectorised membership test instead of rescanning per closed job.
        closed_keys = pd.MultiIndex.from_frame(closed_jobs)
        row_keys = pd.MultiIndex.from_frame(df_new[key_cols])
        still_open = (df_new['close'] == '-').to_numpy()
        # Stored as an ISO string: identical on disk, and the returned frame
        # then holds the same values a later read of the CSV would produce.
        df_new.loc[row_keys.isin(closed_keys) & still_open, 'close'] = \
            date.today().isoformat()

    df_new.to_csv(cache_path, index=False)
    return df_new

In [3]:
def _squash_whitespace(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends."""
    return re.sub(r'\s+', ' ', text).strip()


def parse(cache_path):
    """Scrape the "imagineering" job search results and merge them into the cache.

    Result pages are fetched one at a time, with a 5 second pause between
    requests, until the page count reported by the site is reached. Each result
    row becomes a record with ``cat_id``, ``job_id``, ``title``, ``date``,
    ``brand``, ``location``, ``url`` and ``close`` (always ``'-'`` here). The
    records are handed to ``save`` together with ``cache_path``.

    Nothing is saved when the scrape stops early (non-200 response or a page
    without rows) or when no postings were collected. ``save`` marks every
    cached posting missing from the scraped set as closed, so persisting a
    partial scrape would close postings that are still live.

    Parameters
    ----------
    cache_path : str
        CSV cache path forwarded to ``save``.
    """
    p = 1
    total = 20  # placeholder; overwritten with the site-reported page count
    complete = True

    jobs = []
    while p <= total:
        url = f'https://jobs.disneycareers.com/search-jobs?k=imagineering&p={p}'
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=30)

        if response.status_code != 200:
            print(f'** ran into an issue {response.status_code}')
            complete = False
            break

        bs = BeautifulSoup(
            response.content,
            features='html.parser'
        )

        total = int(
            re.sub(r'of\s+', '', bs.select('.pagination-total-pages')[0].text)
        )

        rows = bs.select('#search-results-list tr')
        if not rows:
            print('no rows to parse, stopping')
            complete = False
            break

        for row in rows:
            cols = row.select('td')
            # Four cells are read below (title, date, brand, location).
            if len(cols) < 4:
                continue

            anchors = cols[0].select('a')
            if not anchors:
                continue

            link = anchors[0].attrs['href']
            # The link ends in ".../<cat_id>/<job_id>".
            ids = re.search(r'/(\d+)/(\d+)$', link)
            if ids is None:
                continue

            jobs.append({
                'cat_id': ids.group(1),
                'job_id': ids.group(2),
                'title': _squash_whitespace(cols[0].text),
                'date': _squash_whitespace(cols[1].text),
                'brand': _squash_whitespace(cols[2].text),
                'location': _squash_whitespace(cols[3].text),
                'url': f'https://jobs.disneycareers.com{link}',
                'close': '-'
            })

        p += 1
        time.sleep(5)

    if not complete:
        print('scrape did not finish; cache left untouched')
        return

    if not jobs:
        print('no jobs parsed; cache left untouched')
        return

    df = pd.DataFrame(jobs)
    df['cat_id'] = df['cat_id'].astype(int)
    df['job_id'] = df['job_id'].astype(int)
    df['date'] = pd.to_datetime(df.date)

    save(df, cache_path)

# Scrape the current postings and merge them into the on-disk CSV cache.
parse(cache_path='../../data/disney/jobs.csv')

In [6]:
# Load the cached postings, newest posting date first, and preview the top rows.
df = (
    pd.read_csv('../../data/disney/jobs.csv')
    .sort_values(by=['date'], ascending=False)
)
df.head(10)

Unnamed: 0,cat_id,job_id,title,date,brand,location,url,close
96,391,27851185408,Staff Mechanical Engineer-Utility and Sustaining,2022-04-24,"Parks, Experiences and Products","Anaheim, California, United States",https://jobs.disneycareers.com/job/anaheim/sta...,-
19,391,25947875248,Show Writer,2022-04-24,"Parks, Experiences and Products","Chiba, Japan",https://jobs.disneycareers.com/job/chiba/show-...,-
95,391,27851186288,Senior Mechanical Engineer-Utility and Sustaining,2022-04-24,"Parks, Experiences and Products","Anaheim, California, United States",https://jobs.disneycareers.com/job/anaheim/sen...,-
94,391,27855101056,Senior Manager Research & Development F&B M/F ...,2022-04-24,Disneyland Paris,"Marne-la-Vallée, France",https://jobs.disneycareers.com/job/marne-la-va...,-
135,391,27780781936,"Stage Technician, Entertainment (Repair and Ma...",2022-04-23,Shanghai Disney Resort,"Shanghai, Mainland China",https://jobs.disneycareers.com/job/shanghai/st...,-
49,391,7255959040,Project Interpreter/Translator (English/Japanese),2022-04-23,"Parks, Experiences and Products","Chiba, Japan",https://jobs.disneycareers.com/job/chiba/proje...,-
154,391,26439104800,Stage Technician (Lighting/Audio/Video/Rigging...,2022-04-23,Shanghai Disney Resort,"Shanghai, Mainland China",https://jobs.disneycareers.com/job/shanghai/st...,-
152,391,26469543424,"Food Prep - Part Time - $1,000 Hiring Bonus",2022-04-22,Disneyland Resort,"Anaheim, California, United States",https://jobs.disneycareers.com/job/anaheim/foo...,-
78,391,26288058352,Senior Manager-Packaging Design,2022-04-22,"Parks, Experiences and Products","Kissimmee, Florida, United States",https://jobs.disneycareers.com/job/kissimmee/s...,-
171,391,27716762704,"Disney Central Orlando Job Fair - $1,500 New H...",2022-04-22,Walt Disney World Resort,"Orlando, Florida, United States",https://jobs.disneycareers.com/job/orlando/dis...,-


In [4]:
# Preview the `date` column converted to datetimes (df itself is not modified).
pd.to_datetime(df['date'])

0     2022-03-29
1     2022-04-07
2     2022-03-21
3     2022-03-27
4     2022-04-03
         ...    
171   2022-04-19
172   2022-04-08
173   2022-04-21
174   2022-03-22
175   2022-04-14
Name: date, Length: 176, dtype: datetime64[ns]

In [13]:
# Postings whose lower-cased title contains "software".
df.loc[df['title'].str.lower().str.contains('software')]

Unnamed: 0,cat_id,job_id,title,date,brand,location,url,close
6,391,24704474480,Associate R&D Imagineer-Tools Software Engineer,"Mar. 03, 2022",Walt Disney Imagineering,"Orlando, Florida, United States / Lake Buena V...",https://jobs.disneycareers.com/job/orlando/ass...,-
22,391,24200527168,Control Software Dev Principal,"Feb. 23, 2022",Walt Disney Imagineering,"Glendale, California, United States",https://jobs.disneycareers.com/job/glendale/co...,-
46,391,21407266064,Software Dev Lead,"Feb. 22, 2022",Walt Disney Imagineering,"Glendale, California, United States",https://jobs.disneycareers.com/job/glendale/so...,-
171,391,27550252720,Sr Software Engineer,"Apr. 19, 2022",Disney Media & Entertainment Distribution,"Bristol, Connecticut, United States",https://jobs.disneycareers.com/job/bristol/sr-...,-


In [19]:
# Postings whose lower-cased title contains "data".
df.loc[df['title'].str.lower().str.contains('data')]

Unnamed: 0,cat_id,job_id,title,date,brand,location,url,close
47,391,17617622720,Sr. (Data) Marketing Scientist,"Nov. 04, 2021",The Walt Disney Studios,"Burbank, California, United States",https://jobs.disneycareers.com/job/burbank/sr-...,-
54,391,25922527760,"Sr Analyst, Data Visualization","Mar. 23, 2022",The Walt Disney Company (Corporate),"California, United States",https://jobs.disneycareers.com/job/california/...,-
79,391,26229767216,HR Data & Process Analyst (IA),"Mar. 28, 2022",The Walt Disney Studios,"Burbank, California, United States / United St...",https://jobs.disneycareers.com/job/burbank/hr-...,-
85,391,25137368128,Data Integration Analyst (PH),"Mar. 10, 2022",The Walt Disney Company (Corporate),"Burbank, California, United States / Orlando, ...",https://jobs.disneycareers.com/job/burbank/dat...,-
86,391,25137360720,Data Integration Analyst-Replatform (PH),"Mar. 10, 2022",The Walt Disney Company (Corporate),"Burbank, California, United States / Orlando, ...",https://jobs.disneycareers.com/job/burbank/dat...,-
92,391,16360548592,Data Integration Engineer,"Dec. 27, 2021",The Walt Disney Company (Corporate),"Orlando, Florida, United States / Lake Buena V...",https://jobs.disneycareers.com/job/orlando/dat...,-
117,391,25561189520,Data Analyst,"Mar. 17, 2022",Disney Streaming,"Santa Monica, California, United States",https://jobs.disneycareers.com/job/santa-monic...,-
118,391,25561188240,Sr Data Analyst,"Mar. 17, 2022",Disney Streaming,"Santa Monica, California, United States",https://jobs.disneycareers.com/job/santa-monic...,-
120,391,25137365136,"Manager, Data Governance","Mar. 10, 2022",The Walt Disney Company (Corporate),United States,https://jobs.disneycareers.com/job/united-stat...,-
127,391,21392955472,"Manager, Data Integration","Jan. 06, 2022",The Walt Disney Company (Corporate),"Orlando, Florida, United States / Lake Buena V...",https://jobs.disneycareers.com/job/orlando/man...,-


In [16]:
# Postings whose lower-cased title contains "ml" anywhere (a plain substring
# match, so a title containing e.g. "HTML" would match too).
df.loc[df['title'].str.lower().str.contains('ml')]

Unnamed: 0,cat_id,job_id,title,date,brand,location,url,close
7,391,24429981920,"Senior AI/ML Research Scientist, Disney Resear...","Feb. 27, 2022",Walt Disney Imagineering,"Glendale, California, United States",https://jobs.disneycareers.com/job/glendale/se...,-


In [35]:
# Fetch the description HTML of still-open postings and cache it as JSON.
# Each cached entry is {'cat_id', 'job_id', 'description'}.
gosbel_json = []
if os.path.exists('../../data/disney/job_descriptions.json'):
    with open('../../data/disney/job_descriptions.json', 'r') as reader:
        gosbel_json = json.loads(reader.read())

# Keys already cached, so each posting is checked without rescanning the list.
already_pulled = {(item['cat_id'], item['job_id']) for item in gosbel_json}

not_closed = df[df['close'] == '-']
try:
    # NOTE(review): `.loc[:1]` is a label slice (everything up to index label
    # 1), not "the first rows"; on this date-sorted frame it selects a
    # data-dependent number of rows and raises KeyError when label 1 is
    # absent. Looks like a leftover debugging limit; confirm the intent.
    for _, row in not_closed.loc[:1].iterrows():
        url = row['url']
        # Plain ints keep the keys hashable and JSON-serialisable.
        cat_id = int(row['cat_id'])
        job_id = int(row['job_id'])

        if (cat_id, job_id) in already_pulled:
            print(f'already pulled {cat_id}/{job_id}')
            continue

        response = requests.get(url)
        if response.status_code != 200:
            print(f'** could not fetch {cat_id}/{job_id}: {response.status_code}')
            continue

        bs = BeautifulSoup(
            response.content,
            features='html.parser'
        )

        found = bs.select('.ats-description')
        if found:
            description = found[0]
            gosbel_json.append({
                'cat_id': cat_id,
                'job_id': job_id,
                'description': re.sub(r'\s+', ' ', f'<html><body>{description}</body></html>')
            })
            already_pulled.add((cat_id, job_id))
        else:
            # Previously an IndexError here aborted the cell before the write
            # below, discarding every description fetched in this run.
            print(f'no description found for {cat_id}/{job_id}')

        time.sleep(5)
finally:
    # Persist whatever was collected, even if a request raised part-way.
    with open('../../data/disney/job_descriptions.json', 'w') as writer:
        writer.write(
            json.dumps(gosbel_json, indent=3)
        )
    

In [96]:
from IPython.display import display, HTML

# Reload the cached descriptions from disk and render the first one as HTML.
gosbel_json = []
with open('../../data/disney/job_descriptions.json', 'r') as reader:
    gosbel_json = json.load(reader)

display(HTML(data=gosbel_json[0]['description']))

In [91]:
def break_into_sections(obj):
    """Split a job description into its ``<h4>``-delimited sections.

    Every ``<h4>`` heading in ``obj['description']`` starts a section that runs
    to the next heading, or to the end of the document for the last one. Each
    section's HTML is reduced to plain text: list items become ``* `` bullet
    lines, and a list holding a single item loses its bullet.

    Parameters
    ----------
    obj : dict
        Must provide the description HTML under the ``'description'`` key.

    Returns
    -------
    list of dict
        One ``{'section': heading, 'text': body}`` entry per section, in
        document order. The heading has its trailing colon removed. Sections
        named ``'Additional Information:'`` are skipped, and so is any section
        whose ``<h4>heading</h4>`` markup cannot be found verbatim in the HTML
        (for instance when the tag has attributes or nested markup).
    """
    html = obj['description']

    bs = BeautifulSoup(
        html,
        features='html.parser'
    )

    ignore_sections = ['Additional Information:']
    job_description_sections = []
    sections = [tag.text for tag in bs.select('h4')]
    # Pair each heading with its successor; None marks "runs to the end", so
    # no real heading text can be mistaken for the sentinel.
    for start, end in zip(sections, [*sections[1:], None]):
        if start in ignore_sections:
            continue

        # Headings are literal text, so escape them: "(", "+", "?" and friends
        # would otherwise be read as regex syntax.
        q = f'(?<=<h4>{re.escape(start)}</h4>)'
        if end is not None:
            q += f'(.+?)(<h4>{re.escape(end)}</h4>)'
        else:
            q += '(.+)'

        # DOTALL lets a section body span line breaks in the HTML.
        found = re.search(q, html, flags=re.DOTALL)
        if found is None:
            continue

        match_html = found.group(1)
        # Add newlines and bullet markers around list markup so the structure
        # survives tag stripping.
        match_html = match_html.replace('<ul>', '\n\n<ul>')
        match_html = match_html.replace('</ul>', '</ul>\n\n')
        match_html = match_html.replace('<li>', '<li>* ')
        match_html = match_html.replace('</li>', '\n</li>')

        bs2 = BeautifulSoup(
            match_html,
            features='html.parser'
        )

        text = bs2.get_text()
        # A lone bullet between blank lines is a one-item list: drop the "* ".
        text = re.sub(r'(\n\n)(\* )([^\n]+)(\n\n)', r'\1\3\n', text)
        job_description_sections.append({
            'section': re.sub(r':\s*$', '', start).strip(),
            'text': text.strip()
        })

    return job_description_sections

[{'section': 'Job Summary',
  'text': '“Any sufficiently complex technology is indistinguishable from magic.” - Arthur C ClarkeThe TeamIn a time where children carry devices with AI in their pockets, robots deliver fast food meals, and other science fictions have become mundane reality, we need new magic: New ways to connect with our guests, to surprise them, make them smile, and open their minds to magic - not just in our parks and resorts, but everywhere in the world.At Disney Research Imagineering this emerges from the application of bleeding edge science to everything from enabling new guest-facing experiences in the parks to assisting cast members and transforming critical business processes.At Disney Research, we focus on new ideas and ways of thinking, working to solve the biggest challenges that are otherwise unsolvable without deep understanding of both the complexities of the Disney business, and the opportunities afforded by bleeding edge scientific exploration and applicati

In [97]:
# Collect the distinct section headings seen across all cached descriptions.
headers = []
for item in gosbel_json:
    # No visible cell stores a 'sections' key on the cached entries, so derive
    # the sections from the description when the key is absent.
    item_sections = item['sections'] if 'sections' in item else break_into_sections(item)
    headers.extend(section['section'] for section in item_sections)

set(headers)

{'Basic Qualifications',
 'Job Summary',
 'Preferred Education',
 'Preferred Qualifications',
 'Required Education',
 'Responsibilities'}