In [1]:
import datetime
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

import greenhouse

### 0. Initialize Firestore

In [2]:
# Initializing Firestore.
# The service-account key path can be supplied through the standard
# GOOGLE_APPLICATION_CREDENTIALS environment variable; the original local path
# is kept as the fallback so an existing setup keeps working unchanged.
cred_path = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/Users/dzianis/github/findremote-firebase-adminsdk-p9cw7-633a39d4a9.json',
)
cred = credentials.Certificate(cred_path)

# initialize_app() raises ValueError when the default app already exists, which
# made this cell fail every time it was re-run in a live kernel. get_app()
# raises ValueError only when no default app has been created yet.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)

# connect to db
db = firestore.client()

### 1. Read all available records in Firestore

In [3]:
# Stream every document of the `listings_light` collection and index the
# decoded documents by their Firestore document id.
listings_db = db.collection(u'listings_light').stream()

listings = {doc.id: doc.to_dict() for doc in listings_db}

### 2. Scrape Greenhouse

In [46]:
def get_roles(title):
    """Classify a job title into role buckets by keyword matching.

    The title is lower-cased and stripped of all whitespace, then tested for
    a fixed keyword fragment per role. Buckets are not mutually exclusive:
    for example any title containing 'engineer' sets ``swe``, so
    'Engineering Manager' sets both ``swe`` and ``em``.

    Args:
        title: Job title text, e.g. ``'Senior Data Engineer'``.

    Returns:
        dict mapping each role key to a bool:
        ``swe`` (software engineer), ``ds`` (data scientist / analyst),
        ``de`` (data engineer), ``em`` (engineering manager),
        ``pm`` (product manager), ``dis`` (design), ``mr`` (marketing).
    """
    # Normalise once instead of once per role. split()/join drops *every*
    # whitespace character (tabs, newlines, non-breaking spaces), not just
    # ' ', so titles scraped from HTML still match.
    key = ''.join(title.lower().split())

    return {
        'swe': 'engineer' in key,
        'ds': any(fragment in key
                  for fragment in ('datascien', 'dataanaly', 'productanaly')),
        'de': 'dataengineer' in key,
        'em': 'engineeringmanag' in key,
        'pm': 'productmanag' in key,
        'dis': 'design' in key,
        'mr': 'marketing' in key,
    }



def get_tags(description):
    """Return the known technology tags mentioned in a job description.

    Matching is case-insensitive and done on token boundaries, so a tag is
    only reported when it appears as its own word: 'java' is not reported
    for 'javascript', nor 'git' for 'digital' or 'api' for 'rapid'. HTML
    markup is removed first so element names and attribute values are never
    matched. Hyphenated tags also match when written with spaces or with no
    separator ('machine-learning' matches 'machine learning'), and a
    trailing plural 's' is tolerated ('api' matches 'APIs').

    Args:
        description: Job description as text or an HTML fragment.

    Returns:
        list of matching tags, in the order they appear in ``tags_list``.
    """
    # tags
    tags_list = [
        'javascript', 'python', 'java', 'c#', 'php', 'android', 'html',
        'jquery', 'c++', 'css', 'mysql', 'sql', 'nodejs', 'reactjs',
        'asp.net', 'json', '.net', 'sql-server', 'swift', 'django',
        'objective-c', 'angular', 'pandas', 'regex', 'ruby', 'ajax', 'linux',
        'xml', 'vba', 'spring', 'typescript', 'database', 'wordpress', 'wpf',
        'mongodb', 'windows', 'postgresql', 'xcode', 'bash', 'oracle', 'git',
        'aws', 'vb.net', 'multithreading', 'flutter', 'firebase', 'dataframe',
        'eclipse', 'azure', 'react-native', 'docker', 'algorithm',
        'visual-studio', 'scala', 'powershell', 'numpy', 'api', 'selenium',
        'performance', 'winforms', 'vuejs', 'matlab', 'sqlite', 'shell',
        'express', 'android-studio', 'csv', 'linq', 'maven', 'unit-testing',
        'swing', 'tensorflow', 'kotlin', 'spark', 'dart', 'symfony', 'tsql',
        'codeigniter', 'opencv', 'perl', 'unity3d', 'matplotlib', 'sockets',
        'golang', 'cordova', 'xaml', 'oop', 'ubuntu', 'ms-access', 'parsing',
        'elasticsearch', 'security', 'jsp', 'github', 'nginx', 'flask',
        'machine-learning', 'delphi', 'kubernetes', 'haskell', 'xamarin',
        'ssl', 'ggplot2', 'jenkins', 'gradle', 'visual-studio-code',
        'google-apps-script', 'testing', 'tkinter', 'unix',
        'google-app-engine', 's3', 'google-sheets', 'web-scraping', 'hadoop',
        'mongo', 'heroku', 'animation', 'curl', 'math', 'actionscript',
        'assembly', 'image-processing', 'keras', 'gcp', 'd3.js', 'magento',
        'networking', 'javafx', 'optimization', 'google-cloud-firestore',
        'facebook-graph-api', 'cocoa-touch', 'amazon-ec2', 'pyspark',
        'xamarin.forms', 'jdbc', 'data-structures', 'dplyr', 'cakephp', 'awk',
        'design-patterns', 'visual-c++', 'rust', 'beautifulsoup', 'ssh',
        'kafka', 'sharepoint', 'bootstrap', 'vim', 'graph', 'silverlight',
        'plsql', 'aws-lambda', 'scikit-learn', 'websocket', 'shiny', 'sass',
        'vuejs2', 'deep-learning', 'extjs', 'apache-flex',
    ]

    # Replace markup with spaces so neither tag names nor attribute values
    # (class="...", href="...") can produce a match.
    text = re.sub(r'<[^>]*>', ' ', description.lower())

    tags = []
    for tag in tags_list:
        # The parts of a hyphenated tag may be joined by hyphens, whitespace
        # or nothing at all.
        parts = [re.escape(part) for part in tag.lower().split('-')]
        # Lookarounds are used instead of \b because several tags start or
        # end with a non-word character ('.net', 'c++', 'c#').
        pattern = (r'(?<![a-z0-9])'
                   + r'[-\s]*'.join(parts)
                   + r's?(?![a-z0-9])')
        if re.search(pattern, text):
            tags.append(tag)

    return tags





# Heading level remap applied to scraped descriptions: h1/h2 -> h4,
# h3 -> h5, h4/h5 -> h6.
_HEADING_REMAP = {'1': '4', '2': '4', '3': '5', '4': '6', '5': '6'}


def _unify_headings(html):
    """Demote h1-h5 heading tags in an HTML string according to _HEADING_REMAP."""
    # Only rewrite real opening/closing heading tags. A plain
    # str.replace('h2', 'h4') also rewrites text content such as 'OAuth2'.
    return re.sub(
        r'(</?h)([1-5])(?=[\s/>])',
        lambda match: match.group(1) + _HEADING_REMAP[match.group(2)],
        html,
    )


def get_job_description(job_url, timeout=30):
    """Download a Greenhouse job page and return its description HTML.

    The page at ``https://boards.greenhouse.io`` + ``job_url`` is fetched,
    the ``<div id="content">`` element is extracted, and its heading tags
    are demoted (h1/h2 -> h4, h3 -> h5, h4/h5 -> h6).

    Args:
        job_url: Path of the job posting, appended to the Greenhouse host.
        timeout: Seconds to wait for the HTTP response; without a timeout a
            stalled connection would block the scraper forever.

    Returns:
        The description as an HTML string, or ``''`` when the page has no
        ``div#content`` element (previously the literal string ``'None'``).

    Raises:
        requests.exceptions.RequestException: on connection errors/timeouts.
    """
    # scrape job description
    job_description_url = 'https://boards.greenhouse.io' + job_url
    job_description_response = requests.get(job_description_url, timeout=timeout)
    job_description_html = BeautifulSoup(job_description_response.content, 'html.parser')
    content = job_description_html.find('div', id='content')
    if content is None:
        return ''

    # unify job description
    return _unify_headings(str(content))
    
    
    


def get_job_data(raw):
    """Assemble the listing record for a single job opening element.

    Args:
        raw: Parsed HTML element of one opening; it must contain an ``<a>``
            link to the posting and a ``<span class="location">``.

    Returns:
        dict with ``job_name``, ``job_url``, ``job_id`` (last path segment of
        the URL), one boolean per role from ``get_roles``, ``location``,
        ``location_simp``, ``job_description``, ``tags`` and ``tags_short``
        (at most the first five tags).
    """
    # name, link and id all come from the opening's anchor
    anchor = raw.find('a')
    name = anchor.text
    href = anchor['href']
    job_data = {
        'job_name': name,
        'job_url': href,
        'job_id': href.split('/')[-1],
    }

    # role flags used for filtering
    job_data.update(get_roles(name))

    # location; anything longer than 30 characters is shown as 'Remote'
    location = raw.find('span', class_='location').text
    job_data['location'] = location
    job_data['location_simp'] = 'Remote' if len(location) > 30 else location

    # full description, fetched from the posting page
    description = get_job_description(href)
    job_data['job_description'] = description

    # technology tags plus a shortened list of at most five
    tags = get_tags(description)
    job_data['tags'] = tags
    job_data['tags_short'] = tags if len(tags) < 5 else tags[0:5]

    return job_data

In [47]:
# Greenhouse boards to scrape: board identifier (appended to the Greenhouse
# base URL by the scraping loop) -> company display name.
companies = dict(
    github='GitHub',
    gitlab='GitLab',
    invision='InVision',
    blockchain='Blockchain',
    automatticcareers='Automattic',
    monzo='Monzo',
    mozilla='Mozilla',
    autoscout24='Autoscout24',
    zapiercareers='Zapier',
)

In [4]:
# NOTE(review): this reassignment limits the run to a single board and shadows
# the full `companies` mapping defined in the previous cell -- confirm whether
# it is a debugging leftover before a production run.
companies = {
    'autoscout24': 'Autoscout24'
}


url_base = 'https://boards.greenhouse.io'
for company, company_name in companies.items():

    # fetch the company's board page; the timeout keeps a stalled connection
    # from blocking the whole run
    company_url = url_base + '/' + company
    response = requests.get(company_url, timeout=30)
    time.sleep(5)  # throttle: wait 5 s between HTTP requests

    response_html = BeautifulSoup(response.content, 'html.parser')

    # openings
    results = response_html.find_all('div', class_='opening')
    for result in results:
        job_data = greenhouse.get_job_data(result)
        # additional data
        job_data['company_name'] = company_name
        job_data['img_url'] = 'https://storage.googleapis.com/findremote/' + company_name.lower() + '.jpg'

        # add timestamp if job is not in db yet
        # NOTE(review): `listings` is read from the `listings_light` collection
        # while documents are written to `listings` -- confirm both share ids.
        if job_data['job_id'] not in listings:
            job_data['datetime'] = pd.to_datetime(datetime.datetime.utcnow())
            print('new job', job_data['job_name'], job_data['company_name'])

        # write to Firestore (content).
        # merge=True updates the scraped fields in place. A plain set()
        # replaces the whole document, which erased the stored `datetime`
        # of every job that was already known (it is only set for new jobs).
        doc_ref = db.collection(u'listings').document(job_data['job_id'])
        doc_ref.set(job_data, merge=True)

        time.sleep(5)

In [9]:
# Path of the service-account key file, three directories above the current
# working directory. os.path.join replaces manual '/' concatenation.
os.path.join(os.path.abspath('../../../'), 'findremote-firebase-adminsdk-p9cw7-633a39d4a9.json')

'/Users/dzianis/github'

In [None]:
'/' + '/'.join(