In [44]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import datetime
import unicodedata
import os

import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

import lever

import tags
import re

In [55]:
# initializing firestore
# The service-account key path can be overridden through the
# FINDREMOTE_FIREBASE_CERT environment variable; when it is not set the
# original local path is used.
cert_path = os.environ.get(
    'FINDREMOTE_FIREBASE_CERT',
    '/Users/dzianis/github/' + 'findremote-firebase-adminsdk-p9cw7-633a39d4a9.json',
)
cred = credentials.Certificate(cert_path)

# initialize_app() raises ValueError when the default app already exists,
# which happens every time this cell is re-run in a live kernel. get_app()
# raises ValueError only when no app exists yet, so initialize only then.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)

# connect to db
db = firestore.client()

# get data from FireStore: document id -> document fields
listings_db = db.collection(u'new_listings').stream()
listings = {listing.id: listing.to_dict() for listing in listings_db}

In [69]:
# Lever job boards to scrape, visited in the order listed here.
#   key   - company slug appended to the Lever base URL to form the board address
#   value - company name stored on every listing; its lower-cased form is also
#           used as the logo file name when img_url is built
companies = {
    #'spotify': 'Spotify',
    'binance': 'Binance',
    'kraken': 'Kraken',
    'lime': 'Lime',
    'gantry': 'Gantry',
    'dbtlabs': 'dbt Labs',
    'clearbit': 'Clearbit',
    'sonatype': 'Sonatype',
    'stackadapt': 'StackAdapt',
    'benchsci': 'BenchSci',
    'koho': 'KOHO',
    'gettyimages': 'gettyimages',
    'scribd': 'Scribd',
    'canva': 'Canva',
    'xero': 'Xero'
}


In [70]:
def get_job_description(job_url, timeout=30):
    """Download a posting page and return its description as one HTML string.

    Every ``div`` with class ``section page-centered`` found on the page is
    NFKD-normalized and concatenated. Heading tags are then demoted to unify
    the description: h1/h2 -> h4, h3 -> h5, h4/h5 -> h6.

    Args:
        job_url: URL of the posting page to fetch.
        timeout: Seconds passed to ``requests.get`` so a stalled server makes
            the request fail instead of blocking forever.

    Returns:
        str: The normalized description HTML, or an empty string when the
        page contains no matching section.
    """
    # scrap job description
    response = requests.get(job_url, timeout=timeout)
    page = BeautifulSoup(response.content, 'html.parser')
    sections = page.find_all('div', attrs={'class': 'section page-centered'})

    # Join the sections that were just scraped (previously this read an
    # undefined name and raised NameError).
    html = ''.join(unicodedata.normalize("NFKD", str(section)) for section in sections)

    # unify job description
    # A single pass over opening/closing heading tags only: text or attribute
    # values that merely contain "h1".."h5" are left alone, and no heading is
    # remapped twice.
    heading_levels = {'1': '4', '2': '4', '3': '5', '4': '6', '5': '6'}
    return re.sub(
        r'(</?h)([1-5])(?=[\s>/])',
        lambda match: match.group(1) + heading_levels[match.group(2)],
        html,
    )



def get_roles(title):
    """Flag which role categories a job title belongs to.

    The title is lower-cased and stripped of spaces, then checked for
    keyword fragments by plain substring search. A title may match several
    roles at once.

    Args:
        title: Job title text.

    Returns:
        dict: Boolean flags keyed by ``swe`` (software engineer), ``ds``
        (data scientist / analyst), ``de`` (data engineer), ``em``
        (engineering manager), ``pm`` (product manager), ``dis`` (design)
        and ``mr`` (marketing), in that order.
    """
    compact = title.lower().replace(' ', '')

    def mentions(*fragments):
        # True when at least one fragment occurs in the compacted title
        return any(fragment in compact for fragment in fragments)

    return {
        'swe': mentions('engineer'),
        'ds': mentions('datascien', 'dataanaly', 'productanaly'),
        'de': mentions('dataengineer'),
        'em': mentions('engineeringmanag'),
        'pm': mentions('productmanag'),
        'dis': mentions('design'),
        'mr': mentions('marketing'),
    }



def get_tags_lists(tags_list, description):
    """Return whatever ``tags.get_list`` produces for the given tags and description.

    Args:
        tags_list: Candidate tags, forwarded unchanged.
        description: Job description text, forwarded unchanged.
    """
    return tags.get_list(tags_list, description)



def get_job_data(raw):
    """Build the listing record for a single posting element.

    Args:
        raw: Parsed posting element. It must contain the ``a.posting-title``
            link (with a ``data-qa="posting-name"`` child) and the team and
            location category spans.

    Returns:
        dict: ``job_name``, ``job_url``, ``job_id`` (last path segment of the
        URL), ``job_category``, one boolean per role from ``get_roles``,
        ``location``, ``location_simp``, ``job_description``, ``tags`` and
        ``tags_short`` (at most the first five tags).
    """
    # create dict to store results
    job_data = dict()

    # link to job description + job_id + job name
    job = raw.find('a', attrs={'class': 'posting-title'})
    job_data['job_name'] = job.find(attrs={'data-qa': 'posting-name'}).text
    job_data['job_url'] = job['href']
    job_data['job_id'] = job_data['job_url'].split('/')[-1]
    job_data['job_category'] = raw.find('span', class_='sort-by-team posting-category small-category-label').text

    # job roles (filter)
    job_data.update(get_roles(job_data['job_name']))

    # job location
    # Read it from the element passed in; relying on a module-level loop
    # variable breaks the function for every other caller.
    job_data['location'] = raw.find('span', class_='sort-by-location posting-category small-category-label').text
    # long location strings are collapsed to a generic label
    job_data['location_simp'] = job_data['location'] if len(job_data['location']) <= 30 else 'Remote'

    # job description
    job_data['job_description'] = get_job_description(job_data['job_url'])

    # tags
    job_data['tags'] = get_tags_lists(tags.get_tags(), job_data['job_description'])
    job_data['tags_short'] = job_data['tags'][:5]

    return job_data



def get_job_id(raw):
    """Return the job id of a posting: the last path segment of its first link.

    Args:
        raw: Parsed posting element containing at least one ``a`` tag with
            an ``href`` attribute.

    Returns:
        str: Everything after the final ``/`` of the link's ``href``.
    """
    link = raw.find('a')
    return link['href'].rsplit('/', 1)[-1]

In [71]:
# ids of every posting seen during this run
new_listings = []

url_base = 'https://jobs.lever.co'
unix_epoch = datetime.datetime(1970, 1, 1)

for company, company_name in companies.items():
    # fetch the company's board, then pause before sending further requests
    company_url = '/'.join((url_base, company))
    response = requests.get(company_url)
    time.sleep(5)

    response_html = BeautifulSoup(response.content, 'html.parser')

    # openings
    results = response_html.find_all('div', class_='posting')
    for result in results:
        job_data = get_job_data(result)
        listing_id = job_data['job_id']
        new_listings.append(listing_id)

        # additional data
        job_data.update({
            'company_name': company_name,
            'img_url': 'https://storage.googleapis.com/findremote/' + company_name.lower() + '.jpg',
            'status': 'active',
        })

        # reuse the stored timestamp when the job is already in the db,
        # otherwise stamp the listing with the current UTC time
        if listing_id in listings and 'datetime' in listings[listing_id]:
            first_seen = listings[listing_id]['datetime']
        else:
            first_seen = pd.to_datetime(datetime.datetime.utcnow())
        job_data['datetime'] = first_seen

        # stored timestamps may be strings or datetime objects; reduce both
        # to a naive datetime before taking whole seconds since 1970-01-01
        if type(first_seen) is str:
            naive_seen = datetime.datetime.strptime(first_seen, "%Y-%m-%d %H:%M:%S.%f")
        else:
            naive_seen = first_seen.replace(tzinfo=None)
        job_data['unix_timestamp'] = int((naive_seen - unix_epoch).total_seconds())

        # write to Fire Store (Content)
        doc_ref = db.collection(u'new_listings').document(listing_id)
        doc_ref.set(job_data)

        # pause between postings
        time.sleep(5)

KeyboardInterrupt: 