In [21]:
import requests
from bs4 import BeautifulSoup
import shutil
import pandas as pd
import time

In [None]:
companies = {
    'github': 'GitHub', 
    'gitlab': 'GitLab', 
    'invision': 'InVision', 
    'blockchain': 'Blockchain', 
    'automatticcareers': 'Automattic', 
    'monzo': 'Monzo', 
    'mozilla': 'Mozila', 
    'autoscout24': 'Autoscout24',
    'zapiercareers': 'Zapier' 
}

In [27]:
url_base = 'https://boards.greenhouse.io'
jobs = pd.DataFrame()

for company in companies.keys():
    
    company_url = url_base + '/' + company
    response = requests.get(company_url)
    time.sleep(5)
    
    response_html = BeautifulSoup(response.content, 'html.parser')
    
    
    #company name
    company_name = companies[company]
    
    
    # saving company logo
    image_url = response_html.find('div', id='logo').img['src']
    response_image = requests.get(image_url, stream=True)

    # Check if the image was retrieved successfully
    if response_image.status_code == 200:
        # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
        response_image.raw.decode_content = True

        # Open a local file with wb ( write binary ) permission.
        with open(f'data/logo/{company_name}.png', 'wb') as f:
            shutil.copyfileobj(response_image.raw, f)
    
    
    # openings
    results = response_html.find_all('div', class_='opening')
    for result in results:
        job = result.find('a')
        job_url = job['href']
        job_name = job.text
        job_location = result.find('span', class_='location').text
        
        # job description
        job_description_url = 'https://boards.greenhouse.io' + job_url
        job_description_response = requests.get(job_description_url)
        job_description_html = BeautifulSoup(job_description_response.content, 'html.parser')
        job_description = job_description_html.find('div', id='content')
        
        
        jobs_temp = pd.DataFrame({
            'company': [company_name],
            'job_name': [job_name],
            'location': [job_location],
            'job_url': [job_url],
            'job_description': [job_description]
        })
        
        jobs = pd.concat([jobs, jobs_temp], ignore_index=True)
        
        time.sleep(5)

In [28]:
jobs.head()

Unnamed: 0,company,job_name,location,job_url,job_description
0,GitHub,IT Software Engineer,Remote - US West,/github/jobs/4016529,"[\n, [[The IT Engineering team at GitHub is gr..."
1,GitHub,"Design Director – Education, Communities, and ...",Remote - US East,/github/jobs/4177775,"[\n, [GitHub is changing the way the world bui..."
2,GitHub,"Design Director – Education, Communities, and ...",Remote - US,/github/jobs/3992208,"[\n, [GitHub is changing the way the world bui..."
3,GitHub,"Design Director – Education, Communities, and ...",Remote - Canada,/github/jobs/3992209,"[\n, [GitHub is changing the way the world bui..."
4,GitHub,"Senior Engineering Manager, Primer Brand & Bra...",Remote - Europe,/github/jobs/4020252,"[\n, [GitHub is looking for an experienced Eng..."


In [29]:
len(jobs)

748

In [30]:
len(jobs.job_description.unique())

687

In [31]:
jobs.company.unique()

array(['GitHub', 'GitLab', 'InVision',
       'Current Job Openings at Blockchain.com',
       'Current Job Openings at Automattic Careers',
       'Current Job Openings at Monzo', 'Mozilla', 'AutoScout24',
       'Zapier Job Board'], dtype=object)

In [35]:
jobs['swe'] = jobs.job_name.apply(
    lambda x: True if 'engineer' in x.lower().replace(' ', '') else False)

jobs['ds'] = jobs.job_name.apply(
    lambda x: True if 'datascientist' in x.lower().replace(' ', '') \
                        or 'dataanalyst' in x.lower().replace(' ', '') \
                        or 'productanalyst' in x.lower().replace(' ', '') else False)

jobs['de'] = jobs.job_name.apply(
    lambda x: True if 'dataengineer' in x.lower().replace(' ', '') else False)

jobs['em'] = jobs.job_name.apply(
    lambda x: True if 'engineeringmanager' in x.lower().replace(' ', '') else False)

jobs['pm'] = jobs.job_name.apply(
    lambda x: True if 'productmanager' in x.lower().replace(' ', '') else False)

jobs['dis'] = jobs.job_name.apply(
    lambda x: True if 'designer' in x.lower().replace(' ', '') else False)

jobs['mr'] = jobs.job_name.apply(
    lambda x: True if 'marketing' in x.lower().replace(' ', '') else False)

In [41]:
jobs[jobs.dis]

Unnamed: 0,company,job_name,location,job_url,job_description,swe,ds,de,em,pm,dis,mr
5,GitHub,Lead Interaction Designer,Remote - US,/github/jobs/4098600,"[\n, [[Github is looking for a Lead Interactio...",False,False,False,False,False,True,False
173,GitLab,"Product Designer, Compliance","Remote, Americas",/gitlab/jobs/6266828002,"[\n, [[The GitLab , <a class=""editor-rtfLink"" ...",False,False,False,False,False,True,False
432,Current Job Openings at Blockchain.com,Senior Product Designer,"London, San Francisco, Miami, Remote",/blockchain/jobs/3308243,"[\n, [[Blockchain is the world's leading softw...",False,False,False,False,False,True,False
476,Current Job Openings at Automattic Careers,Senior Product Designer,Remote,/automatticcareers/jobs/2127794,"[\n, [At Automattic, we believe in making the ...",False,False,False,False,False,True,False
477,Current Job Openings at Automattic Careers,"Senior Product Designer, Mobile",Remote,/automatticcareers/jobs/2181754,"[\n, [[At Automattic, we believe in making the...",False,False,False,False,False,True,False
492,Current Job Openings at Monzo,Product Designer,London,/monzo/jobs/3369069,"[\n, [We’re looking for Senior/Lead Product De...",False,False,False,False,False,True,False
545,Mozilla,Senior Product Designer,"Remote, New York, Chicago, Portland, San Franc...",/mozilla/jobs/4122693,"[\n, [\n, [ ], \n, [<span style=""font-weight: ...",False,False,False,False,False,True,False
554,Mozilla,Staff Content Designer,"Remote US, Remote Canada",/mozilla/jobs/4452793,"[\n, [The Firefox UX team works on products an...",False,False,False,False,False,True,False
557,Mozilla,"Staff Product/UX Designer, Design Systems","Remote US, Remote Canada, Remote France, Remot...",/mozilla/jobs/4030606,"[\n, [\n, [\n, <p><strong>The Company</strong>...",False,False,False,False,False,True,False
563,Mozilla,Staff Product/UX Designer,"Remote US, Remote San Francisco Bay Area, Remo...",/mozilla/jobs/4247856,"[\n, [[Come join Mozilla's Future Products tea...",False,False,False,False,False,True,False


In [44]:
jobs.company.unique()

array(['GitHub', 'GitLab', 'InVision',
       'Current Job Openings at Blockchain.com',
       'Current Job Openings at Automattic Careers',
       'Current Job Openings at Monzo', 'Mozilla', 'AutoScout24',
       'Zapier Job Board'], dtype=object)

In [45]:
def replace_name(x):
    if x == 'Current Job Openings at Blockchain.com':
        return 'Blockchain'
    elif x == 'Current Job Openings at Automattic Careers':
        return 'Automattic'
    elif x == 'Current Job Openings at Monzo':
        return 'Monzo'
    elif x == 'Zapier Job Board':
        return 'Zapier'
    else:
        return x
    

jobs['company'] = jobs['company'].apply(lambda x: replace_name(x))

In [46]:
jobs.to_csv('data/jobs_list.csv', index=False)