In [2]:
import re
import json
import os
import requests
import time
import numpy as np
import pandas as pd
from datetime import date

from bs4 import BeautifulSoup

### scrape job postings

In [2]:
def save(df, cache_path):
    """Merge freshly scraped postings into the CSV cache and return the result.

    When no file exists at ``cache_path`` the scraped frame is written as-is.
    Otherwise the cached file is loaded and reconciled with ``df``, matching
    postings on the ``(cat_id, job_id)`` pair:

    * postings present in ``df`` but not in the cache are appended;
    * postings present in the cache but missing from ``df`` whose ``close``
      is still ``'-'`` get today's date (ISO ``YYYY-MM-DD`` string) written
      into ``close``.

    Args:
        df: Scraped postings. Must contain ``cat_id`` and ``job_id``; the
            cache must contain those columns plus ``close``.
        cache_path: Path of the CSV cache file.

    Returns:
        The DataFrame that was written to ``cache_path`` (``df`` itself on
        the first run).
    """
    key_cols = ['cat_id', 'job_id']

    if not os.path.exists(cache_path):
        df.to_csv(cache_path, index=False)
        return df

    df_gospel = pd.read_csv(cache_path)
    # Explicit 64-bit width: job ids are larger than a 32-bit integer can hold.
    df_gospel = df_gospel.astype({'cat_id': 'int64', 'job_id': 'int64'})

    # Outer-join on the key columns only; the indicator column records which
    # side(s) each posting was found on.
    merged = df.merge(df_gospel[key_cols], on=key_cols, how='outer', indicator='i')

    closed_jobs = merged.loc[merged['i'] == 'right_only', key_cols]
    df_scrapped_new = merged.loc[merged['i'] == 'left_only'].drop(columns=['i'])

    if df_scrapped_new.empty:
        df_new = df_gospel.copy()
    else:
        # ignore_index keeps the row labels of the combined frame unique.
        df_new = pd.concat([df_gospel, df_scrapped_new], axis=0, ignore_index=True)

    # Flag every cached row whose key disappeared from the scrape in one pass
    # instead of re-scanning the whole frame once per closed posting.
    closed_keys = set(zip(closed_jobs['cat_id'], closed_jobs['job_id']))
    vanished = np.array(
        [key in closed_keys for key in zip(df_new['cat_id'], df_new['job_id'])],
        dtype=bool,
    )
    still_open = (df_new['close'] == '-').to_numpy()

    # Store the date as text so the in-memory column matches what is read
    # back from the CSV on the next run.
    df_new.loc[vanished & still_open, 'close'] = date.today().isoformat()

    df_new.to_csv(cache_path, index=False)
    return df_new

In [3]:
def parse(cache_path):
    """Scrape the Disney careers search results and merge them into the cache.

    Requests the paginated ``search-jobs`` listing for the keyword
    ``imagineering``, reads one posting per table row of
    ``#search-results-list`` and hands the collected postings to ``save``.

    The cache is only updated after a complete walk through the pages. If a
    request fails or returns a non-200 status the function stops without
    saving, because ``save`` marks every cached posting that is absent from
    the scraped frame as closed, so a partial scrape would close postings
    that were simply never reached.

    Args:
        cache_path: Path of the CSV cache file passed on to ``save``.

    Returns:
        None.
    """
    request_timeout = 30  # seconds to wait for the server before giving up
    page_delay = 5        # seconds to pause between result pages

    p = 1
    total = 20  # provisional page count, replaced by the value the page reports
    complete = True

    jobs = []
    while p <= total:
        url = f'https://jobs.disneycareers.com/search-jobs?k=imagineering&p={p}'
        try:
            response = requests.get(url, timeout=request_timeout)
        except requests.RequestException as error:
            print(f'** ran into an issue {error}')
            complete = False
            break

        if response.status_code != 200:
            print(f'** ran into an issue {response.status_code}')
            complete = False
            break

        bs = BeautifulSoup(
            response.content,
            features='html.parser'
        )

        total = int(
            re.sub(r'of\s+', '', bs.select('.pagination-total-pages')[0].text)
        )

        rows = bs.select('#search-results-list tr')
        if len(rows) == 0:
            print('no rows to parse, stopping')
            break

        for row in rows:
            cols = row.select('td')
            # Four cells are read below (title, date, brand, location).
            if len(cols) < 4:
                continue

            link = cols[0].select('a')[0].attrs['href']
            # The link ends in ".../<cat_id>/<job_id>".
            cat_id = re.search(r'(?<=\/)(\d+)(?=\/\d+$)', link)
            job_id = re.search(r'(?<=\/)(\d+)$', link)

            jobs.append({
                'cat_id': cat_id.group(0),
                'job_id': job_id.group(0),
                'title': re.sub(r'\s+', ' ', cols[0].text).strip(),
                'date': re.sub(r'\s+', ' ', cols[1].text).strip(),
                'brand': re.sub(r'\s+', ' ', cols[2].text).strip(),
                'location': re.sub(r'\s+', ' ', cols[3].text).strip(),
                'url': f'https://jobs.disneycareers.com{link}',
                'close': '-'
            })

        p += 1
        time.sleep(page_delay)

    if not complete:
        print('** scrape incomplete, cache left untouched')
        return

    if not jobs:
        # An empty frame has no cat_id/job_id columns to convert or merge on.
        print('no jobs scraped, cache left untouched')
        return

    # A posting can show up on two pages if the listing shifts while it is
    # being paged through; keep one row per key.
    df = pd.DataFrame(jobs).drop_duplicates(subset=['cat_id', 'job_id'])
    # Explicit 64-bit width: job ids are larger than a 32-bit integer can hold.
    df = df.astype({'cat_id': 'int64', 'job_id': 'int64'})
    df['date'] = pd.to_datetime(df['date'])

    save(df, cache_path)

# Scrape the search results and reconcile them with the CSV cache at this path.
parse('../../data/disney/jobs.csv')

In [20]:
# Load the cached postings and order them by the `date` column, latest first.
df = pd.read_csv('../../data/disney/jobs.csv')
df = df.sort_values(by='date', ascending=False)
df.head(5)

Unnamed: 0,cat_id,job_id,title,date,brand,location,url,close
196,391,28088851264,Senior Manager – Ad Sales Automation,2022-04-29,Star,"Mumbai, India",https://jobs.disneycareers.com/job/mumbai/seni...,-
194,391,28089159280,Alternance - Assistant(e) Graphiste en Signalé...,2022-04-29,Disneyland Paris,"Chessy, France",https://jobs.disneycareers.com/job/chessy/alte...,-
195,391,28087427296,Software Engineer,2022-04-28,Disney Media & Entertainment Distribution,"Richmond, Australia",https://jobs.disneycareers.com/job/richmond/so...,-
192,391,28036542592,"Assistant Manager - Content Development, Star ...",2022-04-28,Star,"Mumbai, India",https://jobs.disneycareers.com/job/mumbai/assi...,-
191,391,28031261712,"Manager - Content Development, Star Plus",2022-04-28,Star,"Mumbai, India",https://jobs.disneycareers.com/job/mumbai/mana...,-


### query

In [21]:
# First five postings whose lower-cased title contains "software".
software_titles = df['title'].str.lower().str.contains('software')
df.loc[software_titles].head(5)

Unnamed: 0,cat_id,job_id,title,date,brand,location,url,close
195,391,28087427296,Software Engineer,2022-04-28,Disney Media & Entertainment Distribution,"Richmond, Australia",https://jobs.disneycareers.com/job/richmond/so...,-
172,391,27550252720,Sr Software Engineer,2022-04-19,Disney Media & Entertainment Distribution,"Bristol, Connecticut, United States",https://jobs.disneycareers.com/job/bristol/sr-...,2022-04-27
6,391,24704474480,Associate R&D Imagineer-Tools Software Engineer,2022-03-03,Walt Disney Imagineering,"Orlando, Florida, United States / Lake Buena V...",https://jobs.disneycareers.com/job/orlando/ass...,-
23,391,24200527168,Control Software Dev Principal,2022-02-23,Walt Disney Imagineering,"Glendale, California, United States",https://jobs.disneycareers.com/job/glendale/co...,2022-04-27
47,391,21407266064,Software Dev Lead,2022-02-22,Walt Disney Imagineering,"Glendale, California, United States",https://jobs.disneycareers.com/job/glendale/so...,-


In [22]:
# First five postings whose lower-cased title contains "data".
data_titles = df['title'].str.lower().str.contains('data')
df.loc[data_titles].head(5)

Unnamed: 0,cat_id,job_id,title,date,brand,location,url,close
157,391,26192258656,"Data Analyst, Disney+",2022-03-28,Disney Streaming,"Melbourne, Australia",https://jobs.disneycareers.com/job/melbourne/d...,-
79,391,26229767216,HR Data & Process Analyst (IA),2022-03-28,The Walt Disney Studios,"Burbank, California, United States / United St...",https://jobs.disneycareers.com/job/burbank/hr-...,2022-04-29
54,391,25922527760,"Sr Analyst, Data Visualization",2022-03-23,The Walt Disney Company (Corporate),"California, United States",https://jobs.disneycareers.com/job/california/...,-
118,391,25561189520,Data Analyst,2022-03-17,Disney Streaming,"Santa Monica, California, United States",https://jobs.disneycareers.com/job/santa-monic...,-
119,391,25561188240,Sr Data Analyst,2022-03-17,Disney Streaming,"Santa Monica, California, United States",https://jobs.disneycareers.com/job/santa-monic...,-


In [23]:
# First five postings whose lower-cased title contains "ml".
ml_titles = df['title'].str.lower().str.contains('ml')
df.loc[ml_titles].head(5)

Unnamed: 0,cat_id,job_id,title,date,brand,location,url,close
7,391,24429981920,"Senior AI/ML Research Scientist, Disney Resear...",2022-02-27,Walt Disney Imagineering,"Glendale, California, United States",https://jobs.disneycareers.com/job/glendale/se...,-


### scrape job descriptions

In [24]:
# Fetch the description page of every open posting that has not been stored
# yet and persist the collection as JSON.
descriptions_path = '../../data/disney/job_descriptions.json'
request_timeout = 30  # seconds to wait for the server before giving up

gosbel_json = []
if os.path.exists(descriptions_path):
    with open(descriptions_path, 'r') as reader:
        gosbel_json = json.load(reader)

# Keys that are already stored; a set keeps the per-row check constant-time.
already_pulled = {(item['cat_id'], item['job_id']) for item in gosbel_json}

not_closed = df[df['close'] == '-']
try:
    # Walk every open posting. `df` was re-ordered by sort_values, so a
    # label-based slice such as `.loc[:1]` would cut at wherever label 1
    # happens to sit (and raise KeyError when that posting is closed).
    for _, row in not_closed.iterrows():
        url = row['url']
        # Plain ints so the ids serialise to JSON and compare with stored ones.
        cat_id = int(row['cat_id'])
        job_id = int(row['job_id'])

        if (cat_id, job_id) in already_pulled:
            print(f'already pulled {cat_id}/{job_id}')
            continue

        response = requests.get(url, timeout=request_timeout)
        if response.status_code != 200:
            print(f'** ran into an issue {response.status_code} for {cat_id}/{job_id}')
            continue

        bs = BeautifulSoup(
            response.content,
            features='html.parser'
        )

        found = bs.select('.ats-description')
        if found:
            description = found[0]

            gosbel_json.append({
                'cat_id': cat_id,
                'job_id': job_id,
                'description': re.sub(r'\s+', ' ', f'<html><body>{description}</body></html>')
            })
            already_pulled.add((cat_id, job_id))
        else:
            print(f'no description found for {cat_id}/{job_id}')

        time.sleep(5)
finally:
    # Written even when the loop is interrupted, so descriptions fetched so
    # far are not lost. Serialise first so a failure there cannot leave a
    # truncated file behind.
    payload = json.dumps(gosbel_json, indent=3)
    with open(descriptions_path, 'w') as writer:
        writer.write(payload)
    

already pulled 391/28088851264
already pulled 391/28089159280
already pulled 391/28087427296
already pulled 391/28036542592
already pulled 391/28031261712
already pulled 391/28047612016
already pulled 391/27996730304
already pulled 391/27991931456
already pulled 391/27991924976
already pulled 391/28013695616
already pulled 391/28018869648
already pulled 391/27930208640
already pulled 391/27934916224
already pulled 391/27930209104
already pulled 391/27873373664
already pulled 391/27855101056
already pulled 391/27851186288
already pulled 391/27851185408
already pulled 391/27873373936
already pulled 391/27861261296
already pulled 391/27868816304
already pulled 391/27880153024
already pulled 391/25947875248
already pulled 391/27873375440
already pulled 391/7255959040
already pulled 391/26439104800
already pulled 391/27780781936
already pulled 391/27716762704
already pulled 391/26469543424
already pulled 391/26288058352
already pulled 391/27699308208
already pulled 391/26606093104
already p

In [25]:
from IPython.display import display, HTML

# Re-read the stored descriptions from disk and render the first one as HTML.
gosbel_json = []
with open('../../data/disney/job_descriptions.json', 'r') as fh:
    gosbel_json = json.load(fh)

first_description = gosbel_json[0]['description']
display(HTML(first_description))

### parse job descriptions

In [26]:
def break_into_sections(obj):
    """Split a stored job description into its ``<h4>``-headed sections.

    Each section is the markup between one ``<h4>`` header and the next (or
    the end of the document for the last header), converted to plain text.
    List items are prefixed with ``* `` and placed on their own lines; a list
    consisting of a single item loses its bullet.

    Args:
        obj: Mapping whose ``description`` entry holds the description HTML.

    Returns:
        A list of ``{'section': ..., 'text': ...}`` dicts in document order.
        ``section`` is the header text without its trailing colon. Headers
        listed in ``ignore_sections`` are left out, as are headers whose
        ``<h4>text</h4>`` markup cannot be found verbatim in the HTML.
    """
    html = obj['description']

    bs = BeautifulSoup(
        html,
        features='html.parser'
    )

    ignore_sections = ['Additional Information:']
    job_description_sections = []
    sections = [header.text for header in bs.select('h4')]
    # Pair each header with its successor; None marks "no following header".
    for start, end in zip(sections, [*sections[1:], None]):
        if start in ignore_sections:
            continue

        # Header text is data, not a pattern: escape it so characters such as
        # "(" or "+" match literally and the look-behind stays fixed-width.
        q = f'(?<=<h4>{re.escape(start)}</h4>)'
        if end is not None:
            q += f'(.+?)(<h4>{re.escape(end)}</h4>)'
        else:
            q += '(.+)'

        # DOTALL lets a section span line breaks.
        match = re.search(q, html, flags=re.DOTALL)
        if match is None:
            continue

        # Insert line breaks and bullets around list markup before the tags
        # are stripped, so the list structure survives in the plain text.
        match_html = match.group(1)
        match_html = match_html.replace('<ul>', '\n\n<ul>')
        match_html = match_html.replace('</ul>', '</ul>\n\n')
        match_html = match_html.replace('<li>', '<li>* ')
        match_html = match_html.replace('</li>', '\n</li>')

        bs2 = BeautifulSoup(
            match_html,
            features='html.parser'
        )

        text = bs2.get_text()
        # A lone bullet between blank lines is a one-item list: drop the "* ".
        text = re.sub(r'(\n\n)(\* )([^\n]+)(\n\n)', r'\1\3\n', text)
        job_description_sections.append({
            'section': re.sub(r':\s*$', '', start).strip(),
            'text': text.strip()
        })

    return job_description_sections

In [27]:
# Collect the distinct section titles found across all stored descriptions.
headers = []
for item in gosbel_json:
    # The records loaded from disk carry cat_id, job_id and description only,
    # so derive `sections` here when it has not been attached yet.
    if 'sections' not in item:
        item['sections'] = break_into_sections(item)
    headers.extend(section['section'] for section in item['sections'])

set(headers)

{'Basic Qualifications',
 'Job Summary',
 'Preferred Education',
 'Preferred Qualifications',
 'Required Education',
 'Responsibilities'}