In [85]:
import requests
import json
from lxml import html
from typing import Dict, Any, List
#from googlesearch import search
from tqdm import tqdm
from user_agent import generate_user_agent
from urllib.parse import urlencode, urljoin
from duckduckgo_search import DDGS

In [2]:
def get_job_data(job_url: str, timeout: float = 30.0) -> Dict[str,Any]:
    """Fetch a job description page and return its embedded JSON-LD metadata.

    Right now this only supports Greenhouse (boards.greenhouse.io) JDs, which
    are expected to carry exactly one ``application/ld+json`` script directly
    under ``<body>``.

    Args:
        job_url: URL of the job description page.
        timeout: Seconds to wait for the server; forwarded to ``requests.get``,
            which would otherwise wait indefinitely.

    Returns:
        The decoded JSON-LD object.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        AssertionError: The page does not contain exactly one JSON-LD script.
    """
    resp = requests.get(job_url, timeout=timeout)
    # Fail on the HTTP status itself instead of on a confusing parse failure later.
    resp.raise_for_status()
    tree = html.fromstring(resp.content.decode("utf-8"))
    # The JSON schema for the Greenhouse JDs lives in this script tag.
    context = tree.xpath('/html/body/script[@type="application/ld+json"]/text()')
    assert len(context) == 1, (
        f"expected exactly one application/ld+json script in {job_url}, found {len(context)}"
    )
    # strip() with no arguments removes newlines as well as other whitespace.
    return json.loads(context[0].strip())

In [89]:
def get_jobs_from_company(company_url: str, search_term: str ="data scientist", base_url="https://boards.greenhouse.io", timeout: float = 30.0) -> List[str]:
    """Return the job-posting URLs on a company board that match a search term.

    Every ``<a>`` under ``<body>`` is inspected; an anchor matches when its own
    leading text contains ``search_term`` (case-insensitive).

    Args:
        company_url: URL of the company's job listing page.
        search_term: Text to look for in each anchor's text.
        base_url: Prefix used to turn site-relative hrefs into absolute URLs.
        timeout: Seconds to wait for the server; forwarded to ``requests.get``,
            which would otherwise wait indefinitely.

    Returns:
        Absolute URLs of the matching anchors, in document order.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
    """
    company = requests.get(company_url, timeout=timeout)
    company.raise_for_status()
    tree = html.fromstring(company.content.decode("utf-8"))

    needle = search_term.lower()
    new_links = []
    for a in tree.xpath('/html/body//a'):
        # lxml reports .text as None when the anchor has no leading text
        # (e.g. it starts with a child element), so guard before lowering.
        if needle not in (a.text or "").lower():
            continue
        # Look the href up by name; its position among the attributes varies.
        new_link = a.get("href")
        if new_link is None:
            continue
        if not new_link.startswith(base_url):
            new_link = urljoin(base_url, new_link)
        new_links.append(new_link)
    return new_links
            

In [90]:
#deprecated - just use duckduckgo_search
# class SearchJobs:
#     """Need to use this through a vpn"""
#     def __init__(self,base_url: str = "https://html.duckduckgo.com/html/"):
#         self.base_url = base_url
#         self.headers = {'User-Agent': generate_user_agent(),
#            'accept': '*/*',
#             'accept-encoding': 'utf-8',
#            'accept-language': 'en-US,en;q=0.9',
#            }
#         self.site = "boards.greenhouse.io"
        
#     def search(self,search_term: str) -> List[str]:
#         if self.site:
#             #note that including a comma in the join string will screw up the result__url
#             query = " ".join([search_term ,f"site:{self.site}"])
            
            
#         query = urlencode({"q": query,"s":30, "dc": 31})
        
#         query = f"{self.base_url}?{query}"
        
#         results = requests.get(query, headers=self.headers )
#         return self.parse_results(results)


#     def parse_results(self,results)-> List[str]:

#         decoded = results.content.decode('utf-8')
#         tree = html.fromstring(decoded)
#         #the class selector is specific to duckduckgo
#         anchors = tree.xpath(f'/html/body//a[@class="result__url"]')
#       #  anchors = tree.xpath(f'/html/body//a[@class="result__url"]/text()')
#       #  return anchors
#         hrefs = [a.attrib["href"] for a in anchors]
#         return [l.strip() for l in hrefs]


In [31]:
# NOTE(review): SearchJobs only exists as commented-out code in the cell above,
# so on a fresh kernel this cell raises NameError. The saved output presumably
# predates the class being commented out.
searchjobs =SearchJobs()
results = searchjobs.search("data scientist")
results

['https://boards.greenhouse.io/point72/jobs/4644238002',
 'https://boards.greenhouse.io/gleanwork/jobs/4369272005',
 'https://boards.greenhouse.io/blueoriginllc/jobs/4338813006?gh_jid=4338813006',
 'https://boards.greenhouse.io/oddball/jobs/5897666003',
 'https://boards.greenhouse.io/acorns/jobs/5586409',
 'https://boards.greenhouse.io/garnerhealth/jobs/5063776004',
 'https://boards.greenhouse.io/figma/jobs/5027898004',
 'https://boards.greenhouse.io/valohealth/jobs/7067644002',
 'https://boards.greenhouse.io/scribetherapeutics/jobs/4910734004',
 'https://boards.greenhouse.io/faire/jobs/7014725002?gh_jid=7014725002',
 'https://boards.greenhouse.io/perplexityai/jobs/4231718007',
 'https://boards.greenhouse.io/andurilindustries/jobs/4268757007?gh_jid=4268757007',
 'https://boards.greenhouse.io/smartsheet/jobs/5601508',
 'https://boards.greenhouse.io/systemstechnologyresearch/jobs/4070954006',
 'https://boards.greenhouse.io/moloco/jobs/5841703003',
 'https://boards.greenhouse.io/alpaca/jo

[]

In [5]:
# Repeat of the SearchJobs query; the saved output for this run is an empty list.
# NOTE(review): SearchJobs is commented out above, so this fails on a fresh kernel.
searchjobs =SearchJobs()
searchjobs.search("data scientist")

[]

In [6]:
def parse_duck_results(search_term: str = "data scientist", site: str = "boards.greenhouse.io", timeout: float = 30.0)-> List[str]:
    """Run a site-restricted search on DuckDuckGo's HTML endpoint and return the result URLs.

    Args:
        search_term: Phrase to search for; it is wrapped in double quotes in the query.
        site: Host to restrict results to via the ``site:`` operator.
        timeout: Seconds to wait for the server; forwarded to ``requests.get``,
            which would otherwise wait indefinitely.

    Returns:
        The ``href`` of every ``a.result__url`` anchor on the results page,
        with surrounding whitespace stripped.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
    """
    headers = {'User-Agent': generate_user_agent(),
               'accept': '*/*',
               'accept-encoding': 'utf-8',
               'accept-language': 'en-US,en;q=0.9',
               }

    # With the defaults this decodes to the same query as the previously
    # hard-coded URL, including the space after "site:".
    query = f'"{search_term}" site: {site}'
    duck_search = "https://html.duckduckgo.com/html/?" + urlencode({"q": query})
    duck_results = requests.get(duck_search, headers=headers, timeout=timeout)
    duck_results.raise_for_status()

    tree = html.fromstring(duck_results.content.decode('utf-8'))
    # The class selector is specific to DuckDuckGo's result markup.
    anchors = tree.xpath('/html/body//a[@class="result__url"]')
    return [a.attrib["href"].strip() for a in anchors]

In [7]:
# Smoke-test get_job_data on a single Greenhouse posting and display the parsed schema.
gh_context = get_job_data("https://boards.greenhouse.io/remotecom/jobs/5820671003")
gh_context

{'@context': 'schema.org',
 '@type': 'JobPosting',
 'hiringOrganization': {'@type': 'Organization',
  'name': 'Remote',
  'logo': 'https://s3-recruiting.cdn.greenhouse.io/external_greenhouse_job_boards/logos/400/495/700/resized/Greenhouse_Profile_Cover.png?1644510527'},
 'title': 'Data Analyst',
 'datePosted': '2023-11-30',
 'jobLocation': {'@type': 'Place',
  'address': {'@type': 'PostalAddress',
   'addressLocality': 'Mexico City, Mexico City, Mexico',
   'addressRegion': 'CMX',
   'addressCountry': None,
   'postalCode': None}},
 'description': '<h2>About Remote</h2>\n<p>Remote is solving global remote organizations’ biggest challenge: employing anyone anywhere compliantly. We make it possible for businesses big and small to employ a global team by handling global payroll, benefits, taxes, and compliance (learn more about <a href="https://remote.com/how-it-works">how it works</a>). We\'re backed by A+ investors and our team is world-class, literally and figuratively, as we\'re all s

In [8]:
# Exploratory: request a Google results page for a site-restricted "data scientist"
# query (after:2024/02/28) with no custom headers. In the visible cells only the
# response status is inspected afterwards.
resp = requests.get("https://www.google.com/search?q=site%3A+https%3A%2F%2Fboards.greenhouse.io%2F+data+scientist+after%3A2024%2F02%2F28&oq=site%3A+https%3A%2F%2Fboards.greenhouse.io%2F+data+scientist+after%3A2024%2F02%2F28&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIGCAEQRRg60gEINTQzM2owajeoAgCwAgA&sourceid=chrome&ie=UTF-8")

In [9]:
# Display the response object; the saved output shows HTTP 200.
resp

<Response [200]>

In [15]:
# Request headers for the hand-rolled search requests: a User-Agent from
# generate_user_agent() plus fixed accept* values. The same dict is rebuilt
# verbatim in the paging cells further down.
headers = {'User-Agent': generate_user_agent(),
           'accept': '*/*',
           #'accept-encoding': 'gzip, deflate, br',
            'accept-encoding': 'utf-8',

           'accept-language': 'en-US,en;q=0.9',
        #   'referer': 'https://www.google.com'

           }

__paging__

https://stackoverflow.com/questions/35974954/duck-duck-go-html-version-get-next-page-of-results-url-query-param

In [35]:
# Paging experiment: the same request-and-parse steps as parse_duck_results,
# run at top level with "&s=30&dc=31" appended to the query string.
headers = {'User-Agent': generate_user_agent(),
       'accept': '*/*',
       #'accept-encoding': 'gzip, deflate, br',
        'accept-encoding': 'utf-8',

       'accept-language': 'en-US,en;q=0.9',
    #   'referer': 'https://www.google.com'

       }
duck_search="https://html.duckduckgo.com/html/?q=%22data%20scientist%22%20site:%20boards.greenhouse.io&s=30&dc=31"
duck_results = requests.get(duck_search, headers=headers )
#FIXME take a search string, execute the search
decoded = duck_results.content.decode('utf-8')
tree = html.fromstring(decoded)
# "result__url" is the class DuckDuckGo puts on its result links.
anchors = tree.xpath(f'/html/body//a[@class="result__url"]')
hrefs = [a.attrib["href"] for a in anchors]
# Last expression: the stripped hrefs are shown as the cell output.
[l.strip() for l in hrefs]

['https://boards.greenhouse.io/point72/jobs/4644238002',
 'https://boards.greenhouse.io/figma/jobs/4998647004',
 'https://boards.greenhouse.io/figma/jobs/5027898004',
 'https://boards.greenhouse.io/sentilink/jobs/4060972007',
 'https://boards.greenhouse.io/acorns/jobs/5586409',
 'https://boards.greenhouse.io/figma/jobs/4975816004',
 'https://boards.greenhouse.io/systemstechnologyresearch/jobs/4070954006',
 'https://boards.greenhouse.io/splice/jobs/7216245002',
 'https://boards.greenhouse.io/scribetherapeutics/jobs/4910734004',
 'https://boards.greenhouse.io/valohealth/jobs/7067644002',
 'https://boards.greenhouse.io/smartsheet/jobs/5601508',
 'https://boards.greenhouse.io/moloco/jobs/5841703003',
 'https://boards.greenhouse.io/axios/jobs/5461591',
 'https://boards.greenhouse.io/blueroseresearch/jobs/5085482004',
 'https://boards.greenhouse.io/dataiku/jobs/5088535004',
 'https://boards.greenhouse.io/focalsystems/jobs/4349786004',
 'https://boards.greenhouse.io/blueoriginllc/jobs/4338813

In [37]:
# Same paging experiment as the previous cell, with "&s=60&dc=61".
# NOTE(review): the saved output overlaps heavily with the s=30 run (both start
# with the same posting) -- confirm these GET parameters actually advance the page.
headers = {'User-Agent': generate_user_agent(),
       'accept': '*/*',
       #'accept-encoding': 'gzip, deflate, br',
        'accept-encoding': 'utf-8',

       'accept-language': 'en-US,en;q=0.9',
    #   'referer': 'https://www.google.com'

       }
duck_search="https://html.duckduckgo.com/html/?q=%22data%20scientist%22%20site:%20boards.greenhouse.io&s=60&dc=61"
duck_results = requests.get(duck_search, headers=headers )
#FIXME take a search string, execute the search
decoded = duck_results.content.decode('utf-8')
tree = html.fromstring(decoded)
# "result__url" is the class DuckDuckGo puts on its result links.
anchors = tree.xpath(f'/html/body//a[@class="result__url"]')
hrefs = [a.attrib["href"] for a in anchors]
# Last expression: the stripped hrefs are shown as the cell output.
[l.strip() for l in hrefs]

['https://boards.greenhouse.io/point72/jobs/4644238002',
 'https://boards.greenhouse.io/figma/jobs/5027898004',
 'https://boards.greenhouse.io/figma/jobs/4998647004',
 'https://boards.greenhouse.io/dataiku/jobs/5088535004',
 'https://boards.greenhouse.io/figma/jobs/4975816004',
 'https://boards.greenhouse.io/notion/jobs/5895581003',
 'https://boards.greenhouse.io/acorns/jobs/5586409',
 'https://boards.greenhouse.io/systemstechnologyresearch/jobs/4070954006',
 'https://boards.greenhouse.io/moloco/jobs/5841703003',
 'https://boards.greenhouse.io/axios/jobs/5461591',
 'https://boards.greenhouse.io/splice/jobs/7216245002',
 'https://boards.greenhouse.io/smartsheet/jobs/5601508',
 'https://boards.greenhouse.io/blueroseresearch/jobs/5085482004',
 'https://boards.greenhouse.io/blueoriginllc/jobs/4338813006?gh_jid=4338813006',
 'https://boards.greenhouse.io/scribetherapeutics/jobs/4910734004',
 'https://boards.greenhouse.io/focalsystems/jobs/4349786004',
 'https://boards.greenhouse.io/sentilin

In [52]:
# Query through the duckduckgo_search client rather than scraping the HTML endpoint by hand.
with DDGS() as ddgs:
    results = list(ddgs.text("data scientist site:boards.greenhouse.io", max_results=100))

In [53]:
# Check how many hits came back (max_results=100 was requested).
len(results)

100

In [46]:
# Keep only the URL of each search hit, then display the list.
links = [hit["href"] for hit in results]
links

['https://boards.greenhouse.io/point72/jobs/4644238002',
 'https://boards.greenhouse.io/andurilindustries/jobs/4268757007?gh_jid=4268757007',
 'https://boards.greenhouse.io/gleanwork/jobs/4369272005',
 'https://boards.greenhouse.io/blueoriginllc/jobs/4338813006?gh_jid=4338813006',
 'https://boards.greenhouse.io/acorns/jobs/5586409',
 'https://boards.greenhouse.io/figma/jobs/5027898004',
 'https://boards.greenhouse.io/valohealth/jobs/7104398002',
 'https://boards.greenhouse.io/perplexityai/jobs/4231718007',
 'https://boards.greenhouse.io/scribetherapeutics/jobs/4910734004',
 'https://boards.greenhouse.io/bigid/jobs/7080745002',
 'https://boards.greenhouse.io/smartsheet/jobs/5601508',
 'https://boards.greenhouse.io/openai/jobs/4799588004',
 'https://boards.greenhouse.io/carta/jobs/5892496003',
 'https://boards.greenhouse.io/systemstechnologyresearch/jobs/4070954006',
 'https://boards.greenhouse.io/swishanalytics/jobs/4346382005',
 'https://boards.greenhouse.io/moloco/jobs/5841703003',
 '

In [47]:
# Count distinct URLs to check the results for duplicates.
len(set(links))

100

So basically: if a link has the prefix `https://boards.greenhouse.io/company/jobs`, scrape the job metadata directly. If it is just `https://boards.greenhouse.io/company/`, go to the company's board page, look for links whose anchor text or heading contains the search term, and add those to the list of jobs to scrape.

In [54]:
# Fetch a company-level board URL (no /jobs/<id> suffix) to explore the links it contains.
company_url = "https://boards.greenhouse.io/notion"
company = requests.get(company_url)

In [57]:
# Decode the raw response bytes as UTF-8 text for parsing.
html_resp = company.content.decode("utf-8")

In [64]:
# Parse the board page and collect every <a> element under <body>.
tree = html.fromstring(html_resp)
anchors = tree.xpath('/html/body//a')

In [74]:
# Collect the href of every anchor whose text mentions the search term.
search_term = "data scientist"
new_links = list()
for a in anchors:
    # lxml reports .text as None when the anchor has no leading text, so guard
    # before lowering; read the href by name, since attribute order varies.
    if search_term.lower() in (a.text or "").lower() and a.get("href") is not None:
        new_links.append(a.get("href"))
        

In [75]:
# Show the matches; in the saved output they are site-relative paths, which is
# why get_jobs_from_company joins them onto a base URL.
new_links

['/notion/jobs/5759768003',
 '/notion/jobs/5895559003',
 '/notion/jobs/5895581003',
 '/notion/jobs/5895534003']

In [70]:
# Leftover inspection of the loop variable `a` from the matching loop above;
# no output was saved for this cell.
a.base_url

In [73]:
# List the anchor's attribute values; in the saved output the href is the
# second value, after a 'true' flag.
a.values()

['true', '/notion/jobs/5759768003']

In [87]:
# Run the function version of the exploration above; the saved output shows absolute URLs.
get_jobs_from_company(company_url)

['https://boards.greenhouse.io/notion/jobs/5759768003',
 'https://boards.greenhouse.io/notion/jobs/5895559003',
 'https://boards.greenhouse.io/notion/jobs/5895581003',
 'https://boards.greenhouse.io/notion/jobs/5895534003']