In [1]:
import requests
from bs4 import BeautifulSoup
import shutil
import pandas as pd
import time
import datetime

In [3]:
companies = {
    'github': 'GitHub', 
    'gitlab': 'GitLab', 
    'invision': 'InVision', 
    'blockchain': 'Blockchain', 
    'automatticcareers': 'Automattic', 
    'monzo': 'Monzo', 
    'mozilla': 'Mozilla', 
    'autoscout24': 'Autoscout24',
    'zapiercareers': 'Zapier' 
}

In [4]:
url_base = 'https://boards.greenhouse.io'
jobs = pd.DataFrame()

for company in companies.keys():
    
    company_url = url_base + '/' + company
    response = requests.get(company_url)
    time.sleep(5)
    
    response_html = BeautifulSoup(response.content, 'html.parser')
    
    
    #company name
    company_name = companies[company]
    
    
    """
    # saving company logo
    image_url = response_html.find('div', id='logo').img['src']
    response_image = requests.get(image_url, stream=True)

    # Check if the image was retrieved successfully
    if response_image.status_code == 200:
        # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
        response_image.raw.decode_content = True

        # Open a local file with wb ( write binary ) permission.
        with open(f'https://storage.googleapis.com/findremote/{company_name}.jpg', 'wb') as f:
            shutil.copyfileobj(response_image.raw, f)
    """
    
    # openings
    results = response_html.find_all('div', class_='opening')
    for result in results:
        job = result.find('a')
        job_url = job['href']
        job_name = job.text
        job_location = result.find('span', class_='location').text
        
        # job description
        job_description_url = 'https://boards.greenhouse.io' + job_url
        job_description_response = requests.get(job_description_url)
        job_description_html = BeautifulSoup(job_description_response.content, 'html.parser')
        job_description = job_description_html.find('div', id='content')
        
        
        jobs_temp = pd.DataFrame({
            'company': [company_name],
            'job_id': [job_url.split('/')[-1]],
            'job_name': [job_name],
            'location': [job_location],
            'job_url': [job_url],
            'img_url': ['https://storage.googleapis.com/findremote/' + company_name.lower() + '.jpg'],
            'datetime': pd.to_datetime(datetime.datetime.utcnow()),
            'job_description': [job_description]
        })
        
        jobs = pd.concat([jobs, jobs_temp], ignore_index=True)
        
        time.sleep(5)

In [5]:
jobs.head()

Unnamed: 0,company,job_id,job_name,location,job_url,img_url,datetime,job_description
0,GitHub,4177775,"Design Director – Education, Communities, and ...",Remote - US East,/github/jobs/4177775,logo/GitHub.jpg,2022-08-12 14:57:05.277160,"[\n, [GitHub is changing the way the world bui..."
1,GitHub,3992208,"Design Director – Education, Communities, and ...",Remote - US,/github/jobs/3992208,logo/GitHub.jpg,2022-08-12 14:57:10.878391,"[\n, [GitHub is changing the way the world bui..."
2,GitHub,3992209,"Design Director – Education, Communities, and ...",Remote - Canada,/github/jobs/3992209,logo/GitHub.jpg,2022-08-12 14:57:16.365304,"[\n, [GitHub is changing the way the world bui..."
3,GitHub,4020252,"Senior Engineering Manager, Primer Brand & Bra...",Remote - Europe,/github/jobs/4020252,logo/GitHub.jpg,2022-08-12 14:57:22.055988,"[\n, [GitHub is looking for an experienced Eng..."
4,GitHub,4098600,Lead Interaction Designer,Remote - US,/github/jobs/4098600,logo/GitHub.jpg,2022-08-12 14:57:27.657470,"[\n, [[Github is looking for a Lead Interactio..."


In [6]:
len(jobs)

706

In [7]:
len(jobs.job_description.unique())

651

### Adding Job type filter

In [13]:
jobs['swe'] = jobs.job_name.apply(
    lambda x: True if 'engineer' in x.lower().replace(' ', '') else False)

jobs['ds'] = jobs.job_name.apply(
    lambda x: True if 'datascientist' in x.lower().replace(' ', '') \
                        or 'dataanalyst' in x.lower().replace(' ', '') \
                        or 'productanalyst' in x.lower().replace(' ', '') else False)

jobs['de'] = jobs.job_name.apply(
    lambda x: True if 'dataengineer' in x.lower().replace(' ', '') else False)

jobs['em'] = jobs.job_name.apply(
    lambda x: True if 'engineeringmanager' in x.lower().replace(' ', '') else False)

jobs['pm'] = jobs.job_name.apply(
    lambda x: True if 'productmanager' in x.lower().replace(' ', '') else False)

jobs['dis'] = jobs.job_name.apply(
    lambda x: True if 'design' in x.lower().replace(' ', '') else False)

jobs['mr'] = jobs.job_name.apply(
    lambda x: True if 'marketing' in x.lower().replace(' ', '') else False)

In [14]:
jobs[jobs.swe].head(5)

Unnamed: 0,company,job_id,job_name,location,job_url,img_url,datetime,job_description,swe,ds,de,em,pm,dis,mr,tags,tags_short
3,GitHub,4020252,"Senior Engineering Manager, Primer Brand & Bra...",Remote - Europe,/github/jobs/4020252,logo/GitHub.jpg,2022-08-12 14:57:22.055988,"[\n, [GitHub is looking for an experienced Eng...",True,False,False,True,False,False,False,"[javascript, java, css, ruby, git, api, shell,...","[javascript, java, css, ruby, git]"
5,GitHub,3906561,Senior Engineering Manager: OSPO,Remote - Canada,/github/jobs/3906561,logo/GitHub.jpg,2022-08-12 14:57:33.216009,"[\n, [[GitHub is changing the way the world bu...",True,False,False,True,False,False,False,"[git, api, shell, express, security, github, t...","[git, api, shell, express, security]"
6,GitHub,3906560,Senior Engineering Manager: OSPO,Remote - US,/github/jobs/3906560,logo/GitHub.jpg,2022-08-12 14:57:38.890185,"[\n, [[GitHub is changing the way the world bu...",True,False,False,True,False,False,False,"[git, api, shell, express, security, github, t...","[git, api, shell, express, security]"
33,GitHub,4048890,CodeQL Analysis Engineer,Remote - US,/github/jobs/4048890,logo/GitHub.jpg,2022-08-12 15:00:10.460081,"[\n, [GitHub is seeking a CodeQL Analysis Engi...",True,False,False,False,False,False,False,"[javascript, python, java, c#, c++, swift, rub...","[javascript, python, java, c#, c++]"
34,GitHub,4048891,CodeQL Analysis Engineer,Remote - Europe,/github/jobs/4048891,logo/GitHub.jpg,2022-08-12 15:00:15.938126,"[\n, [GitHub is seeking a CodeQL Analysis Engi...",True,False,False,False,False,False,False,"[javascript, python, java, c#, c++, swift, rub...","[javascript, python, java, c#, c++]"


### Adding Job Tags

In [10]:
# tags
tags_list = ['javascript',	'python',	'java',	'c#',	'php',	'android',	'html',	'jquery',	'c++',	'css',	'mysql',	'sql',	'nodejs',	'reactjs',	'asp.net',	'json',	'.net',	'sql-server',	'swift',	'django',	'objective-c',	'angular',	'pandas',	'regex',	'ruby',	'ajax',	'linux',	'xml',	'vba',	'spring',	'typescript',	'database',	'wordpress',	'wpf',	'mongodb',	'windows',	'postgresql',	'xcode',	'bash',	'oracle',	'git',	'aws',	'vb.net',	'multithreading',	'flutter',	'firebase',	'dataframe',	'eclipse',	'azure',	'react-native',	'docker',	'algorithm',	'visual-studio',	'scala',	'powershell',	'numpy',	'api',	'selenium',	'performance',	'winforms',	'vuejs',	'matlab',	'sqlite',	'shell',	'express',	'android-studio',	'csv',	'linq',	'maven',	'unit-testing',	'swing',	'tensorflow',	'kotlin',	'spark',	'dart',	'symfony',	'tsql',	'codeigniter',	'opencv',	'perl',	'unity3d',	'matplotlib',	'sockets',	'golang',	'cordova',	'xaml',	'oop',	'ubuntu',	'ms-access',	'parsing',	'elasticsearch',	'security',	'jsp',	'github',	'nginx',	'flask',	'machine-learning',	'delphi',	'kubernetes',	'haskell',	'xamarin',	'ssl',	'ggplot2',	'jenkins',	'gradle',	'visual-studio-code',	'google-apps-script',	'testing',	'tkinter',	'unix',	'google-app-engine',	's3',	'google-sheets',	'web-scraping',	'hadoop',	'mongo',	'heroku',	'animation',	'curl',	'math',	'actionscript',	'assembly',	'image-processing',	'keras',	'gcp',	'd3.js',	'magento',	'networking',	'javafx',	'optimization',	'google-cloud-firestore',	'facebook-graph-api',	'cocoa-touch',	'amazon-ec2',	'pyspark',	'xamarin.forms',	'jdbc',	'data-structures',	'dplyr',	'cakephp',	'awk',	'design-patterns',	'visual-c++',	'rust',	'beautifulsoup',	'ssh',	'kafka',	'sharepoint',	'bootstrap',	'vim',	'graph',	'silverlight',	'plsql',	'aws-lambda',	'scikit-learn',	'websocket',	'shiny',	'sass',	'vuejs2',	'deep-learning',	'extjs',	'apache-flex']

def get_tags(x, tags_list=tags_list):
    x1 = x.lower().replace('-', '').replace(' ', '')
    tags = []
    for tag in tags_list:
        if tag.lower().replace('-', '').replace(' ', '') in x1:
            tags.append(tag)
    
    return tags


jobs['tags'] = jobs['job_description'].apply(lambda x: get_tags(str(x), tags_list=tags_list))

jobs['tags_short'] = jobs['tags'].apply(lambda x: x[0:5] if len(x)>=5 else x[0:len(x)])

In [15]:
jobs.head()

Unnamed: 0,company,job_id,job_name,location,job_url,img_url,datetime,job_description,swe,ds,de,em,pm,dis,mr,tags,tags_short
0,GitHub,4177775,"Design Director – Education, Communities, and ...",Remote - US East,/github/jobs/4177775,logo/GitHub.jpg,2022-08-12 14:57:05.277160,"[\n, [GitHub is changing the way the world bui...",False,False,False,False,False,True,False,"[git, api, shell, express, github, rust]","[git, api, shell, express, github]"
1,GitHub,3992208,"Design Director – Education, Communities, and ...",Remote - US,/github/jobs/3992208,logo/GitHub.jpg,2022-08-12 14:57:10.878391,"[\n, [GitHub is changing the way the world bui...",False,False,False,False,False,True,False,"[git, api, shell, express, github, rust]","[git, api, shell, express, github]"
2,GitHub,3992209,"Design Director – Education, Communities, and ...",Remote - Canada,/github/jobs/3992209,logo/GitHub.jpg,2022-08-12 14:57:16.365304,"[\n, [GitHub is changing the way the world bui...",False,False,False,False,False,True,False,"[git, api, shell, express, github, rust]","[git, api, shell, express, github]"
3,GitHub,4020252,"Senior Engineering Manager, Primer Brand & Bra...",Remote - Europe,/github/jobs/4020252,logo/GitHub.jpg,2022-08-12 14:57:22.055988,"[\n, [GitHub is looking for an experienced Eng...",True,False,False,True,False,False,False,"[javascript, java, css, ruby, git, api, shell,...","[javascript, java, css, ruby, git]"
4,GitHub,4098600,Lead Interaction Designer,Remote - US,/github/jobs/4098600,logo/GitHub.jpg,2022-08-12 14:57:27.657470,"[\n, [[Github is looking for a Lead Interactio...",False,False,False,False,False,True,False,"[c#, git, azure, shell, express, security, git...","[c#, git, azure, shell, express]"


In [16]:
jobs.to_csv('data/jobs_list.csv', index=False)