In [1]:
import re
import os
import requests
import time
import numpy as np
import pandas as pd
from datetime import date

from bs4 import BeautifulSoup

In [2]:
def save(df, cache_path):
    """Merge freshly scraped jobs into the CSV cache stored at ``cache_path``.

    Jobs are identified by the pair ``(cat_id, job_id)``.

    * If the cache file does not exist yet, ``df`` is written as the first
      snapshot.
    * Otherwise jobs present in ``df`` but not in the cache are appended, and
      cached jobs that are missing from ``df`` and still marked open
      (``close == '-'``) get today's date written into ``close``.

    Parameters
    ----------
    df : pandas.DataFrame
        Scraped jobs. Must contain ``cat_id`` and ``job_id`` columns holding
        integer-like values, plus whatever other columns the cache stores.
    cache_path : str
        Location of the CSV cache; created on the first call and rewritten on
        every later call.

    Returns
    -------
    pandas.DataFrame
        The frame that was written to ``cache_path``.
    """
    keys = ['cat_id', 'job_id']

    # Job ids in this data exceed 2**31, so ask for 64-bit integers explicitly
    # instead of the platform-dependent default integer width.
    # A posting can be listed on two result pages when the listing shifts
    # between requests; keep a single row per job so the cache never gains
    # duplicate entries.
    df = df \
        .astype({key: 'int64' for key in keys}) \
        .drop_duplicates(subset=keys)

    if not os.path.exists(cache_path):
        df.to_csv(cache_path, index=False)
        return df

    df_gospel = pd.read_csv(cache_path) \
        .astype({key: 'int64' for key in keys})

    # The indicator column tells where each key was seen: only in the new
    # scrape (left_only), only in the cache (right_only), or in both.
    merged = df.merge(df_gospel[keys], on=keys, how='outer', indicator='i')
    closed_jobs = merged.loc[merged['i'] == 'right_only', keys]
    new_jobs = merged.loc[merged['i'] == 'left_only'].drop(columns='i')

    if new_jobs.empty:
        df_new = df_gospel.copy()
    else:
        df_new = pd.concat([df_gospel, new_jobs], axis=0, ignore_index=True)

    if not closed_jobs.empty:
        # One vectorized membership test instead of a scan per closed job.
        closed_keys = pd.MultiIndex.from_frame(closed_jobs)
        is_closed = pd.MultiIndex.from_frame(df_new[keys]).isin(closed_keys)
        # Only stamp jobs that are still open so an earlier close date is kept.
        still_open = df_new['close'] == '-'
        # Stored as an ISO string: that is what the CSV round-trips to, and it
        # keeps the column free of mixed date/str values.
        df_new.loc[still_open & is_closed, 'close'] = date.today().isoformat()

    df_new.to_csv(cache_path, index=False)
    return df_new

In [3]:
def _clean(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends."""
    return re.sub(r'\s+', ' ', text).strip()


def _job_from_row(row):
    """Build a job record from one results-table row, or return ``None`` to skip it."""
    cols = row.select('td')
    # Four cells are read below (title, date, brand, location), so a row with
    # fewer of them cannot be turned into a job.
    if len(cols) < 4:
        return None

    anchors = cols[0].select('a')
    if not anchors or 'href' not in anchors[0].attrs:
        return None
    link = anchors[0].attrs['href']

    # The two trailing numeric path segments are taken as <cat_id>/<job_id>.
    ids = re.search(r'/(\d+)/(\d+)$', link)
    if ids is None:
        return None

    return {
        'cat_id': ids.group(1),
        'job_id': ids.group(2),
        'title': _clean(cols[0].text),
        'date': _clean(cols[1].text),
        'brand': _clean(cols[2].text),
        'location': _clean(cols[3].text),
        'url': f'https://jobs.disneycareers.com{link}',
        'close': '-'
    }


def parse(cache_path, keyword='imagineering', delay=5, timeout=30):
    """Scrape the Disney careers search results and fold them into the cache.

    Walks the paginated results for ``keyword``, collects one record per job
    row and hands the resulting frame to ``save``. The cache is only updated
    when every page was fetched and at least one job was parsed: ``save``
    marks cached jobs that are missing from the scrape as closed, so a partial
    or empty scrape must never reach it.

    Parameters
    ----------
    cache_path : str
        CSV cache location, passed through to ``save``.
    keyword : str, optional
        Search term sent as the ``k`` query parameter.
    delay : float, optional
        Seconds to wait between two page requests.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    None
    """
    p = 1
    total = 20  # provisional; replaced by the page count the site reports

    jobs = []
    while p <= total:
        try:
            response = requests.get(
                'https://jobs.disneycareers.com/search-jobs',
                params={'k': keyword, 'p': p},
                timeout=timeout
            )
        except requests.RequestException as err:
            print(f'** ran into an issue {err}')
            break

        if response.status_code != 200:
            print(f'** ran into an issue {response.status_code}')
            break

        bs = BeautifulSoup(
            response.content,
            features='html.parser'
        )

        pager = bs.select('.pagination-total-pages')
        if not pager:
            print('** page count not found, stopping')
            break
        total = int(re.sub(r'of\s+', '', pager[0].text))

        rows = bs.select('#search-results-list tr')
        if len(rows) == 0:
            print('no rows to parse, stopping')
            break

        for row in rows:
            job = _job_from_row(row)
            if job is not None:
                jobs.append(job)

        p += 1
        if p <= total:
            # Be polite to the site, but do not wait after the last page.
            time.sleep(delay)
    else:
        # Reached only when the loop ran out of pages, i.e. no ``break`` fired.
        if not jobs:
            print('** no jobs parsed, cache left untouched')
            return

        # Job ids exceed 2**31, so request 64-bit integers explicitly rather
        # than the platform-dependent default integer width.
        df = pd.DataFrame(jobs).astype({'cat_id': 'int64', 'job_id': 'int64'})
        save(df, cache_path)
        return

    print('** scrape incomplete, cache left untouched')

# Refresh the cached job list from the live search results.
parse(cache_path='../../data/disney/jobs.csv')

In [5]:
# Load the cached postings and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='../../data/disney/jobs.csv')
df.head(n=5)

Unnamed: 0,cat_id,job_id,title,date,brand,location,url,close
0,391,19757793040,"Researcher, Disney Research Imagineering","Mar. 29, 2022",Disney Research,"Glendale, California, United States / Orlando,...",https://jobs.disneycareers.com/job/glendale/re...,-
1,391,26800277248,Associate R&D Imagineer - Creative Technologist,"Apr. 07, 2022",Walt Disney Imagineering,"Glendale, California, United States",https://jobs.disneycareers.com/job/glendale/as...,-
2,391,25793337456,R&D Imagineer Principal - Business Manager,"Mar. 21, 2022",Walt Disney Imagineering,"Glendale, California, United States",https://jobs.disneycareers.com/job/glendale/r-...,-
3,391,26150444112,"Exec Asst, Walt Disney Imagineering","Mar. 27, 2022",Walt Disney Imagineering,"Orlando, Florida, United States / Lake Buena V...",https://jobs.disneycareers.com/job/orlando/exe...,-
4,391,26601275040,R&D Imagineer Sr. - Mechanical Design Engineer,"Apr. 03, 2022",Walt Disney Imagineering,"Glendale, California, United States",https://jobs.disneycareers.com/job/glendale/r-...,-


In [13]:
# Postings whose title mentions "software", ignoring case.
df.loc[df['title'].str.lower().str.contains('software')]

Unnamed: 0,cat_id,job_id,title,date,brand,location,url,close
6,391,24704474480,Associate R&D Imagineer-Tools Software Engineer,"Mar. 03, 2022",Walt Disney Imagineering,"Orlando, Florida, United States / Lake Buena V...",https://jobs.disneycareers.com/job/orlando/ass...,-
22,391,24200527168,Control Software Dev Principal,"Feb. 23, 2022",Walt Disney Imagineering,"Glendale, California, United States",https://jobs.disneycareers.com/job/glendale/co...,-
46,391,21407266064,Software Dev Lead,"Feb. 22, 2022",Walt Disney Imagineering,"Glendale, California, United States",https://jobs.disneycareers.com/job/glendale/so...,-
171,391,27550252720,Sr Software Engineer,"Apr. 19, 2022",Disney Media & Entertainment Distribution,"Bristol, Connecticut, United States",https://jobs.disneycareers.com/job/bristol/sr-...,-


In [19]:
# Postings whose title mentions "data", ignoring case.
df.loc[df['title'].str.lower().str.contains('data')]

Unnamed: 0,cat_id,job_id,title,date,brand,location,url,close
47,391,17617622720,Sr. (Data) Marketing Scientist,"Nov. 04, 2021",The Walt Disney Studios,"Burbank, California, United States",https://jobs.disneycareers.com/job/burbank/sr-...,-
54,391,25922527760,"Sr Analyst, Data Visualization","Mar. 23, 2022",The Walt Disney Company (Corporate),"California, United States",https://jobs.disneycareers.com/job/california/...,-
79,391,26229767216,HR Data & Process Analyst (IA),"Mar. 28, 2022",The Walt Disney Studios,"Burbank, California, United States / United St...",https://jobs.disneycareers.com/job/burbank/hr-...,-
85,391,25137368128,Data Integration Analyst (PH),"Mar. 10, 2022",The Walt Disney Company (Corporate),"Burbank, California, United States / Orlando, ...",https://jobs.disneycareers.com/job/burbank/dat...,-
86,391,25137360720,Data Integration Analyst-Replatform (PH),"Mar. 10, 2022",The Walt Disney Company (Corporate),"Burbank, California, United States / Orlando, ...",https://jobs.disneycareers.com/job/burbank/dat...,-
92,391,16360548592,Data Integration Engineer,"Dec. 27, 2021",The Walt Disney Company (Corporate),"Orlando, Florida, United States / Lake Buena V...",https://jobs.disneycareers.com/job/orlando/dat...,-
117,391,25561189520,Data Analyst,"Mar. 17, 2022",Disney Streaming,"Santa Monica, California, United States",https://jobs.disneycareers.com/job/santa-monic...,-
118,391,25561188240,Sr Data Analyst,"Mar. 17, 2022",Disney Streaming,"Santa Monica, California, United States",https://jobs.disneycareers.com/job/santa-monic...,-
120,391,25137365136,"Manager, Data Governance","Mar. 10, 2022",The Walt Disney Company (Corporate),United States,https://jobs.disneycareers.com/job/united-stat...,-
127,391,21392955472,"Manager, Data Integration","Jan. 06, 2022",The Walt Disney Company (Corporate),"Orlando, Florida, United States / Lake Buena V...",https://jobs.disneycareers.com/job/orlando/man...,-


In [16]:
# Postings whose title contains the substring "ml", ignoring case.
df.loc[df['title'].str.lower().str.contains('ml')]

Unnamed: 0,cat_id,job_id,title,date,brand,location,url,close
7,391,24429981920,"Senior AI/ML Research Scientist, Disney Resear...","Feb. 27, 2022",Walt Disney Imagineering,"Glendale, California, United States",https://jobs.disneycareers.com/job/glendale/se...,-
