# Development Notebook for Career Pages Watchdog    

This file is meant to be able to test out functionalities before production.

The system is made of the following parts:

1. Career pages extractor:  Using selenium and LLMs for a given company URL we will auto-detect it's career page url where all jobs can be found.
2. Job count extractor: Given a career page, we will try to estimate how many jobs they have published.

In [1]:
import os
from dotenv import load_dotenv
from openai import OpenAI
from selenium.webdriver.common.by import By
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
import time
import random
import json
from bs4 import BeautifulSoup
import csv


# Load environment variables from .env file
load_dotenv()
nw_openai_api_key = os.getenv('NW_OPENAI_API_KEY')
if nw_openai_api_key is None:
    raise ValueError("NW OpenAI API key is not set. Please check your .env file or environment variables.")

openai_api_key = os.getenv('OPENAI_API_KEY')
if openai_api_key is None:
    raise ValueError("OpenAI API key is not set. Please check your .env file or environment variables.")

MODEL_QWEN = 'qwen2.5'
MODEL_LLAMA = 'llama3.2'
MODEL_NW_GPT = 'gpt-4o-mini'
MODEL_GPT = 'gpt-4o-mini'

openai = OpenAI(api_key=openai_api_key)
ollama = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')
nwopenai = OpenAI(base_url='https://litellm.data-int.odyssey.preview.nwse.cloud', api_key=nw_openai_api_key)

system_prompt_career_page_finder = """You will act as a web scraper and search fot the page that contains the list of job openings for a given company. \
Bear in mind, that you'll be looking at german websites, so the page might be in german. \
Some keyword to look for are "Careers", "Jobs", "Join Us", "Work with us", "Karriere", "stellenangebote", or similar. \
Many sites also have their jobs inside Career or Karriere section. \
"""

## Test Qwen running from local

Be sure to run

```bash
ollama serve
```



In [None]:
response = ollama.chat.completions.create(
            model=MODEL_QWEN,
            messages=[
                {"role": "system", "content": system_prompt_career_page_finder},
                {"role": "user", "content": "Hallo qwen, wie geht es dir?"},])

print(response.choices[0].message.content)

## Scrapper functions

Now let's add the website scrapper and link extractor functions.

In [16]:

def order_links_user_prompt(links):
        user_prompt = (
        "You are analyzing a list of links from a company's website. "
        "Your task is to rank them from most likely to least likely to point to the company's job listings page — "
        "the page where users can actively browse or search for open positions.\n\n"
        "Links that contain terms like 'job-search', 'stellenangebote', 'search-jobs', or similar phrases should be ranked "
        "higher than more generic pages like 'karriere', 'career', or 'about'. The goal is to find the actual **job search interface** — "
        "a page that likely has job filters, job counters, and application buttons.\n\n"
        "Note: the site may be in German. Prioritize links that clearly suggest access to a list of current job openings.\n\n"
        "Here are the links (some may be relative):\n"
        )
        user_prompt += "\n".join(links)
        user_prompt += "\n\nRespond only in JSON format like this:\n"
        user_prompt += """
        {
        "links": [
            "https://full.url/job-search",
            "https://full.url/karriere"
        ]
        }
        """
        return user_prompt

def check_job_page_user_prompt(content):
    user_prompt = (
        "You are analyzing the content of a subpage from a company's website. "
        "Your task is to determine whether this specific page is where the company lists its job openings.\n\n"
        "Indicators that the page lists job openings include:\n"
        "- A list of current job openings with titles and descriptions\n"
        "- A job search interface with filters (e.g., location, department)\n"
        "- Buttons or links labeled 'Apply Now' or similar\n"
        "- A job counter indicating the number of available positions (e.g., '123 jobs available')\n\n"
        "Note that general career pages that provide information about working at the company, company culture, or employee testimonials, "
        "but do not list specific job openings, should not be considered as pages that list job openings.\n\n"
        "Content might be in German.\n\n"
        "Only the career page with apply button without search interface or job listings is not a job listing page.\n\n"
        "If you find a job search interface or/and a job counter like '123 jobs available', please answer with **yes**. "
        "Please answer only with yes or no.\n\n"
        f"Here is the page content:\n---\n{content[:7000]}\n---"
    )
    return user_prompt

def order_links_llm(links, openaiSession, model=MODEL_QWEN):
        messages = [
            {"role": "system", "content": system_prompt_career_page_finder},
            {"role": "user", "content": order_links_user_prompt(links)}
        ]
        response = openaiSession.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0,
            response_format={"type": "json_object"}
        )
        result = response.choices[0].message.content
        # Parse the JSON response to a Python dictionary
        result_dict = json.loads(result)
        # Extract the list of links from the dictionary
        ordered_links = result_dict["links"]
        return ordered_links

def check_job_page_llm(content, openaiSession, model=MODEL_QWEN):
        messages = [
            {"role": "system", "content": system_prompt_career_page_finder},
            {"role": "user", "content": check_job_page_user_prompt(content)}
        ]
        response = openaiSession.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0
        )
        result = response.choices[0].message.content.strip().lower()
        # print(f"LLM response: {result}")
        # Check if the response contains "yes" or "no"
        return result == "yes"

def crawl(driver, base_url, url, collected_links, all_seen_links=None, max_depth=2, current_depth=0, openaiSession=None, model=MODEL_LLAMA):
        # Initialize the all_seen_links set if it's None
        if all_seen_links is None:
            all_seen_links = set()

        if current_depth > max_depth:
            return
        
        if url in collected_links:
            return
        
        # Add the current URL to our collected links
        collected_links.add(url)
        
        try:
            driver.get(url)
            time.sleep(random.uniform(1.5, 3.5))  # Human-like pause
            
            print(f"Crawling (depth {current_depth}): {url}")

            if current_depth != 0:
                page_source = driver.page_source
                # Parse the page source with BeautifulSoup
                soup = BeautifulSoup(page_source, 'html.parser')

                # Remove irrelevant tags to clean up the content
                for irrelevant in soup(["script", "style", "img", "input", "meta", "noscript", "iframe"]):
                    irrelevant.decompose()

                # Extract visible text from the page
                page_text = soup.get_text(separator="\n", strip=True)

                # Further clean up the text by removing excessive newlines
                page_text = "\n".join(line for line in page_text.splitlines() if line.strip())

                # Check if the page is a job listings page using the LLM
                if check_job_page_llm(page_text, openaiSession, model):
                    print(f"✅ {url} is a job listings page.")
                    return
                else:
                    print(f"❌ {url} is not the job listing page.")

            # Find all <a> links
            hrefs = []
            links = driver.find_elements(By.TAG_NAME, "a")
            for link in links:
                href = link.get_attribute("href")
                # Only include links that:
                # 1. Are not None
                # 2. Start with the base URL (same domain)
                # 3. Haven't been seen before across all pages
                if href and href.startswith(base_url) and href not in all_seen_links:  
                    hrefs.append(href)
                    # Add to the all_seen_links set to avoid duplicate exploration
                    all_seen_links.add(href)

            print(f"Found {len(hrefs)} new links in {url}")
            if len(hrefs) > 0:
                print("New links:")
                for link in hrefs[:3]:  # Only show first 5 to avoid console clutter
                    print(link) 

                if openaiSession:
                    try:
                        ordered_hrefs = order_links_llm(hrefs, openaiSession, model)
                        print("Ordered links:")
                        for link in ordered_hrefs[:3]:  # Only show first 5
                            print(link) 
                        # Now recursively crawl those hrefs
                        for href in ordered_hrefs[:3]:  # Limit to first 5 to avoid excessive crawling
                            crawl(driver, base_url, href, collected_links, all_seen_links, max_depth, current_depth + 1, openaiSession, model)
                    except Exception as e:
                        print(f"Error ordering links: {e}")
                        # If ordering fails, just use the original links
                        for href in hrefs[:3]:  # Limit to first 5
                            crawl(driver, base_url, href, collected_links, all_seen_links, max_depth, current_depth + 1, openaiSession, model)
            
        except Exception as e:
            print(f"Failed to crawl {url}: {e}")

def get_career_page(url, openaiSession, model=MODEL_LLAMA):
    """
    Create this WebsiteSelenium object from the given URL using Selenium and BeautifulSoup.
    """

    # Extract base URL for domain matching
    base_url_parts = url.split('/')
    if len(base_url_parts) >= 3:
        base_url = base_url_parts[0] + '//' + base_url_parts[2]
    else:
        base_url = url
    
    print(f"Processing website: {url}")
    print(f"Base domain: {base_url}")
    
    # Create a fresh driver for each website
    options = uc.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features=AutomationControlled")
    # More human-like user agent
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36")
    options.add_argument("--headless=new")
    
    # Try with regular Chrome driver if undetected fails
    driver = None
    try:
        driver = uc.Chrome(options=options)
        print("Browser initialized successfully")
    except Exception as e:
        print(f"Failed to create undetected_chromedriver: {e}")

    try:
        # Configure browser to avoid detection
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                })
            """
        })
        
        collected_links = set()
        all_seen_links = set()  # Initialize the set of all seen links
        max_depth = 2  # Keep depth low for testing
        
        # Start crawling
        try:
            crawl(driver, base_url, url, collected_links, all_seen_links, max_depth, 0, openaiSession, model)
            
            return url  # Return original URL if no links were found
        except Exception as e:
            print(f"Error during crawling: {e}")
            return url
    finally:
        try:
            if driver:
                driver.quit()
                print("Browser closed successfully")
        except Exception as e:
            print(f"Error closing driver: {e}")

## Now let's look at the companies.

In [5]:
class Company:
    def __init__(self, company_name, company_url):
        """
        Initialize a Company object with the given name and URL.

        Args:
            company_name (str): The name of the company.
            company_url (str): The URL of the company's website.
        """
        self.company_name = company_name
        self.company_url = company_url
        self.career_url = None  # This will be populated later
        
def process_companies_from_csv(file_path, openaiSession, model=MODEL_LLAMA):
    """
    Process companies from a CSV file, extract their career page URLs, and return a list of Company objects.

    Args:
        file_path (str): Path to the CSV file containing company data.
        model (str): The model to use for extracting career page URLs.

    Returns:
        list: A list of Company objects with career URLs populated.
    """
    companies = []

    # Open the CSV file and iterate over its rows
    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)  # Use DictReader to access columns by name
        for row in reader:
            # Create a Company object for each row
            company = Company(company_name=row['company_name'], company_url=row['company_url'])
            company.career_url = get_career_page(company.company_url, openaiSession, model)  # Get the career page URL
            print(f"Company Name: {company.company_name}, Career URL: {company.career_url}")
            companies.append(company)  # Add the Company object to the companies list

    return companies

In [17]:
process_companies_from_csv('data/sampledomains.csv', openai, MODEL_GPT)

Processing website: https://www.munichre.com
Base domain: https://www.munichre.com
Browser initialized successfully
Crawling (depth 0): https://www.munichre.com
Found 72 new links in https://www.munichre.com
New links:
https://www.munichre.com/en.html
https://www.munichre.com/en/risks.html
https://www.munichre.com/en/solutions.html
Ordered links:
https://www.munichre.com/en/careers.html
Crawling (depth 1): https://www.munichre.com/en/careers.html
✅ https://www.munichre.com/en/careers.html is a job listings page.
Browser closed successfully
Company Name: Munich Re, Career URL: https://www.munichre.com
Processing website: https://www.rwe.com
Base domain: https://www.rwe.com
Browser initialized successfully
Crawling (depth 0): https://www.rwe.com
Found 189 new links in https://www.rwe.com
New links:
https://www.rwe.com/en/contact-services/
https://www.rwe.com/en/contact-services/apps-and-tools/
https://www.rwe.com/en/
Ordered links:
https://www.rwe.com/en/rwe-careers-portal/job-offers/
ht

KeyboardInterrupt: 

In [None]:
process_companies_from_csv('data/sampledomains.csv', ollama)

In [None]:
process_companies_from_csv('data/sampledomains.csv', nwopenai, MODEL_NW_GPT)

In [None]:
# Process without browser automation
process_companies_from_csv('data/sampledomains.csv', openai, MODEL_GPT, use_browser=False)