# Development Notebook for Career Pages Watchdog    

This notebook is for trying out functionality before it goes to production.

The system is made of the following parts:

1. Career pages extractor: Using Selenium and LLMs, we auto-detect, for a given company URL, its career page URL where all jobs can be found.
2. Job count extractor: Given a career page, we will try to estimate how many jobs they have published.

In [None]:
import os
from dotenv import load_dotenv
from openai import OpenAI
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
import time
import random
import json
from bs4 import BeautifulSoup
import csv


# Load environment variables from .env file
load_dotenv()


def _require_env(var_name, label):
    """Return the value of environment variable ``var_name``.

    Raises:
        ValueError: If the variable is not set. ``label`` names the key in the message.
    """
    value = os.getenv(var_name)
    if value is None:
        raise ValueError(f"{label} API key is not set. Please check your .env file or environment variables.")
    return value


# Both keys are validated up front (NW first, then OpenAI), so a missing key
# fails here instead of on the first API call.
nw_openai_api_key = _require_env('NW_OPENAI_API_KEY', "NW OpenAI")
openai_api_key = _require_env('OPENAI_API_KEY', "OpenAI")

# Model names, one constant per backend they are used with in this notebook.
MODEL_QWEN = 'qwen2.5'
MODEL_LLAMA = 'llama3.2'
MODEL_NW_GPT = 'gpt-4o-mini'
MODEL_GPT = 'gpt-4o-mini'
# Maximum number of page-text characters put into a prompt or the debug dump.
CHAR_LIMIT = 20000

# OpenAI-compatible clients: default OpenAI endpoint, local Ollama server, NW LiteLLM endpoint.
openai = OpenAI(api_key=openai_api_key)
ollama = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')
nwopenai = OpenAI(base_url='https://litellm.data.odyssey.preview.nwse.cloud', api_key=nw_openai_api_key)


# System prompt shared by every LLM call below. The trailing backslashes join
# the lines, so the model receives a single line of text.
# NOTE(review): "fot" is a typo for "for"; left untouched because this text is sent to the model verbatim.
system_prompt_career_page_finder = """You will act as a web scraper and search fot the page that contains the list of job openings for a given company. \
Bear in mind, that you'll be looking at german websites, so the page might be in german. \
Some keyword to look for are "Careers", "Jobs", "Join Us", "Work with us", "Karriere", "stellenangebote", or similar. \
Many sites also have their jobs inside Career or Karriere section. \
"""

## Test Qwen running from local

Before running the next cell, make sure the local Ollama server is running:

```bash
ollama serve
```



In [None]:
# Smoke test: send one chat request to the local Ollama client to confirm MODEL_QWEN answers.
smoke_test_messages = [
    {"role": "system", "content": system_prompt_career_page_finder},
    {"role": "user", "content": "Hallo qwen, wie geht es dir?"},
]
response = ollama.chat.completions.create(model=MODEL_QWEN, messages=smoke_test_messages)

print(response.choices[0].message.content)

## Scraper functions

Now let's add the website scraper and link extractor functions.

In [None]:

def order_links_user_prompt(links):
    """
    Build the user prompt that asks the LLM to rank links by how likely they
    lead to the company's job listings page.

    Args:
        links: Iterable of link strings (absolute or relative); they are appended
            to the prompt one per line.

    Returns:
        str: Ranking instructions, the expected JSON answer format, then the links.
    """
    ranking_rules = (
        "You are analyzing a list of links from a company's website. "
        "Your task is to rank them from most likely to least likely to point to the company's job listings page — "
        "the page where users can actively browse or search for open positions.\n\n"
        "Links that contain terms like 'job-search', 'stellenangebote', 'search-jobs', 'jobs.company_url' or similar phrases should be ranked "
        "higher than more generic pages like 'karriere', 'career', or 'about'. The goal is to find the actual **job search interface** — "
        "a page that likely has job filters, job counters, and application buttons.\n\n"
        "Prioritize links without query parameters, such as 'https://jobs.siemens-healthineers.com/careers', over links with query parameters, "
        "such as 'https://jobs.siemens-healthineers.com/careers?pid=563156115690465' .\n\n"
        "Note: the site may be in German. Prioritize links that clearly suggest access to a list of current job openings.\n\n"
    )
    format_request = "\n\nRespond only in JSON format like this and ensure you rank and return ALL the links you get:\n"
    # The indentation inside this literal is part of the prompt text; keep it as is.
    answer_template = """
        {
            "links": [
                "https://full.url/job-search",
                "https://full.url/karriere",
                ...
            ]
        }
        """
    links_header = "Here are the links (some may be relative):\n"
    link_lines = "\n".join(links)
    return "".join([ranking_rules, format_request, answer_template, links_header, link_lines])

def check_job_page_user_prompt(content):
    """
    Build the user prompt that asks the LLM whether a page lists job openings.

    Args:
        content (str): Visible text of the page. Only the first ``CHAR_LIMIT``
            characters are embedded in the prompt.

    Returns:
        str: Classification rules, the expected JSON answer format
        (``job_page``, ``available_jobs``, ``reason``), then the page content.
    """
    user_prompt = (
        "Determine if this company subpage explicitly lists job openings.\n\n"
        "Requirements for a Job Listings Page:\n"
        "- Lists job titles and descriptions.\n"
        "- Shows a job counter like '123 jobs available'.\n\n"
        "Non-Qualifying Pages:\n"
        "- General career information without specific job details.\n"
        "- Mentions of jobs or current jobs.\n"
        "- Single 'Apply Now' button with no job titles or descriptions.\n\n"
        "Answer 'yes' if the page lists specific job titles and descriptions, along with a job counter or if it has at least 3 indicators.\n"
        # The trailing newline keeps this rule from fusing with the next sentence.
        "If you are not sure, no job counter is found or not specifics job descriptions are found, answer 'no'.\n"
        "Specific job titles and descriptions MUST be listed, if such specifics are missing, answer with no.\n\n"
    )
    user_prompt += "\n\nRespond only in JSON format like this:"
    # The indentation inside this literal is part of the prompt text.
    user_prompt += """
    {
        "job_page": 1 for "yes" or 0 for "no",
        "available_jobs": 123,
        "reason": "explanation of the decision"
    }
    """
    # Truncate the page so the prompt stays within CHAR_LIMIT characters of content.
    user_prompt += f"Here is the page content:\n---\n{content[:CHAR_LIMIT]}\n---"
    return user_prompt

def order_links_llm(links, openaiSession, model=MODEL_QWEN):
    """
    Ask the LLM to rank ``links`` by likelihood of being the job listings page.

    Args:
        links: Link strings to rank.
        openaiSession: OpenAI-compatible client used for the chat completion.
        model (str): Model name passed to the completion call.

    Returns:
        The value stored under the "links" key of the model's JSON answer.

    Raises:
        json.JSONDecodeError: If the answer is not valid JSON.
        KeyError: If the answer has no "links" key.
    """
    completion = openaiSession.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt_career_page_finder},
            {"role": "user", "content": order_links_user_prompt(links)},
        ],
        response_format={"type": "json_object"},
    )

    # Strip Markdown code fences in case the model wrapped its JSON in them.
    raw_reply = completion.choices[0].message.content
    for fence in ("```json", "```"):
        raw_reply = raw_reply.replace(fence, "")

    return json.loads(raw_reply.strip())["links"]

def check_job_page_llm(content, openaiSession, model=MODEL_QWEN):
    """
    Ask the LLM whether ``content`` is the text of a job listings page.

    Args:
        content (str): Visible page text to classify.
        openaiSession: OpenAI-compatible client used for the chat completion.
        model (str): Model name passed to the completion call.

    Returns:
        dict: The model's JSON answer parsed as-is. The prompt requests the keys
        "job_page" (1 or 0), "available_jobs" and "reason".

    Raises:
        json.JSONDecodeError: If the answer is not valid JSON.
    """
    completion = openaiSession.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt_career_page_finder},
            {"role": "user", "content": check_job_page_user_prompt(content)},
        ],
        temperature=0,
    )

    # Strip Markdown code fences in case the model wrapped its JSON in them.
    raw_reply = completion.choices[0].message.content
    for fence in ("```json", "```"):
        raw_reply = raw_reply.replace(fence, "")

    return json.loads(raw_reply.strip())

def remove_cookie_banner(driver):
    """
    Delete the element with id ``cookiebanner`` from the current page, if present.

    Args:
        driver: Selenium WebDriver whose current document is modified.

    Returns:
        The script's result as reported by the driver: true when a banner was
        removed, false when no such element existed.
    """
    return driver.execute_script("""
    const banner = document.getElementById('cookiebanner');
    if (banner) {
        banner.remove();
        return true;
    }
    return false;
    """)

def extract_page_source(driver, url):
    """
    Load ``url`` with Selenium and return the page's visible text.

    Steps: open the page, try to click a cookie-consent button, remove a leftover
    ``#cookiebanner`` element, append the source of every iframe, strip non-content
    tags with BeautifulSoup and reduce the text to its non-empty lines. The first
    ``CHAR_LIMIT`` characters are also written to ``data/page_content.txt`` for
    debugging; a failure to write that file does not affect the result.

    Args:
        driver: Selenium WebDriver used to load the page.
        url (str): Address of the page to load.

    Returns:
        str | None: The cleaned page text, or None if loading or parsing failed.
    """
    try:
        driver.get(url)
        time.sleep(random.uniform(1.5, 3.5))  # Human-like pause

        # Case-insensitive substring match on id / data-testid / class.
        # XPath 1.0 has no lower-case(), hence translate(). "acceptall" and
        # "accept-all" need no separate test: both contain "accept".
        lowered = 'translate(@{attr}, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")'
        conditions = [
            f'contains({lowered.format(attr=attr)}, "{keyword}")'
            for attr in ("id", "data-testid", "class")
            for keyword in ("allowall", "accept")
        ]
        cookie_xpath = "//*[" + " or ".join(conditions) + "]"

        try:
            cookie_accept_button = WebDriverWait(driver, 2.5).until(
                EC.element_to_be_clickable((By.XPATH, cookie_xpath))
            )
            cookie_accept_button.click()
            print("Cookie consent with case-insensitive 'allowall' ID accepted.")
            # Give the page a moment to react to the click. Constructing a
            # WebDriverWait without calling .until() does not pause at all.
            time.sleep(random.uniform(0.5, 1.5))
        except Exception:
            # Best effort: most pages simply have no matching button.
            print("No acceptable cookie consent button found with 'allowall' in its id, case-insensitive check.")

        try:
            # Optionally remove the element
            removed = remove_cookie_banner(driver)
            print(f"Banner removed from DOM: {removed}")
        except Exception:
            print("Banner not found or already removed")

        # Collect the main document plus the content of every iframe.
        source_parts = [driver.page_source]
        for iframe in driver.find_elements(By.TAG_NAME, "iframe"):
            try:
                driver.switch_to.frame(iframe)
                source_parts.append(driver.page_source)
            except Exception as e:
                print(f"Failed to extract content from iframe: {e}")
            finally:
                # Always return to the top-level document, even after a failure,
                # so later lookups do not run inside a stale frame.
                driver.switch_to.default_content()

        # Parse the page source with BeautifulSoup
        soup = BeautifulSoup("".join(source_parts), 'html.parser')

        # Remove irrelevant tags to clean up the content
        for irrelevant in soup(["script", "style", "img", "input", "meta", "noscript", "iframe"]):
            irrelevant.decompose()

        # Extract visible text and drop empty lines
        page_text = soup.get_text(separator="\n", strip=True)
        page_text = "\n".join(line for line in page_text.splitlines() if line.strip())

        # Debug dump only: a missing data/ directory must not discard the page.
        try:
            with open("data/page_content.txt", "w", encoding="utf-8") as file:
                file.write(page_text[:CHAR_LIMIT])
        except OSError as e:
            print(f"Could not write debug dump of page content: {e}")

        return page_text
    except Exception as e:
        print(f"Failed to extract page source: {e}")
        return None

def crawl(driver, base_url, url, collected_links, all_seen_links=None, max_depth=2, current_depth=0, openaiSession=None, model=MODEL_LLAMA, found_job_page=None):
    """
    Recursively visit ``url`` and its most promising links until a job listings page is found.

    Each visited page is classified with ``check_job_page_llm``. If it is not a job
    listings page, the hrefs of its <a> and <button> elements are ranked with
    ``order_links_llm`` and the first three are crawled one level deeper.

    Args:
        driver: Selenium WebDriver used to load pages.
        base_url (str): Scheme and host of the start URL. Only passed along to
            recursive calls; links are NOT restricted to this domain.
        url (str): Page to visit.
        collected_links (set): URLs already visited; updated in place.
        all_seen_links (set | None): Hrefs already collected on any page; updated
            in place. A new set is created when None.
        max_depth (int): Deepest level that is still visited.
        current_depth (int): Depth of ``url`` (0 for the start page).
        openaiSession: OpenAI-compatible client used for both LLM calls. Page
            classification is attempted even when this is None and then fails,
            which is reported as a failed crawl of ``url``.
        model (str): Model name passed to the LLM calls.
        found_job_page (list | None): One-element list used as an out-parameter.
            ``found_job_page[0]`` receives the job page URL, or
            'https://www.cloudflare.com/blocked' when the page text suggests bot
            protection. When None, a private list is used and the result is not
            visible to the caller.

    Returns:
        None. The outcome is reported through ``found_job_page[0]``.
    """
    if all_seen_links is None:
        all_seen_links = set()
    # Guard the default: subscripting None below would raise TypeError.
    if found_job_page is None:
        found_job_page = [None]

    # If a job page has already been found, stop further crawling
    if found_job_page[0] is not None:
        return

    if current_depth > max_depth:
        return

    if url in collected_links:
        return

    collected_links.add(url)

    try:
        page_text = extract_page_source(driver, url)
        if page_text is None:
            # extract_page_source already printed the underlying error.
            print(f"Failed to crawl {url}: page content could not be extracted")
            return

        # Heuristic check for Cloudflare or similar scraping protection
        lowered_text = page_text.lower()
        if "cloudflare" in lowered_text or "attention required" in lowered_text:
            print("⚠️ Warning: The page might be protected by Cloudflare or other scraping protections.")
            found_job_page[0] = 'https://www.cloudflare.com/blocked'
            return

        job_page_results = check_job_page_llm(page_text, openaiSession, model)

        if job_page_results.get("job_page") == 1:
            print(f"✅ {url} is a job listings page.")
            print(f"Reason: {job_page_results.get('reason')}")
            found_job_page[0] = url  # Set the found job page URL
            return

        print(f"❌ {url} is not the job listing page.")
        print(f"Reason: {job_page_results.get('reason')}")

        # Collect every non-empty href not seen on any earlier page: first all
        # <a> elements, then all <button> elements. No domain filter is applied.
        hrefs = []
        for tag_name in ("a", "button"):
            for element in driver.find_elements(By.TAG_NAME, tag_name):
                href = element.get_attribute("href")
                if href and href not in all_seen_links:
                    hrefs.append(href)
                    all_seen_links.add(href)

        print(f"Found {len(hrefs)} new links in {url}")
        if not hrefs or not openaiSession:
            return

        try:
            ordered_hrefs = order_links_llm(hrefs, openaiSession, model)
            if ordered_hrefs:
                print(f"LLM returned {len(ordered_hrefs)} ordered links")
                print(f"Best bet: {ordered_hrefs[0]}")

            # Follow only the three best-ranked links to avoid excessive crawling
            for href in ordered_hrefs[:3]:
                crawl(driver, base_url, href, collected_links, all_seen_links, max_depth, current_depth + 1, openaiSession, model, found_job_page)
                # Stop further crawling if a job page has been found
                if found_job_page[0] is not None:
                    return
        except Exception as e:
            print(f"Error ordering links: {e}")

    except Exception as e:
        print(f"Failed to crawl {url}: {e}")

def init_driver():
    """
    Create a headless undetected-chromedriver Chrome instance.

    Returns:
        The driver, or None if ``uc.Chrome`` could not be created. If the driver
        was created but the anti-detection script could not be registered, the
        failure is printed and the driver is still returned.
    """
    chrome_options = uc.ChromeOptions()
    for argument in (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
        # More human-like user agent
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "--headless=new",
    ):
        chrome_options.add_argument(argument)

    # Registered to run in every new document: makes navigator.webdriver read as undefined.
    hide_webdriver_js = """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
            """

    browser = None
    try:
        browser = uc.Chrome(options=chrome_options)
        browser.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": hide_webdriver_js},
        )
    except Exception as err:
        print(f"Failed to create undetected_chromedriver: {err}")

    return browser

def get_page_content(url):
    """
    Open ``url`` in a fresh browser and return the page's visible text.

    Args:
        url (str): Address of the page to load.

    Returns:
        str | None: The text produced by ``extract_page_source``, or None if the
        browser could not be started or the page could not be extracted.
    """
    driver = init_driver()
    if driver is None:
        # init_driver already printed the reason.
        return None

    try:
        return extract_page_source(driver, url)
    finally:
        # Always close the browser; otherwise every call leaks a Chrome process.
        try:
            driver.quit()
        except Exception as e:
            print(f"Error closing driver: {e}")

def get_career_page(url, openaiSession, model=MODEL_LLAMA):
    """
    Find the job listings page of the company website at ``url``.

    Starts a fresh browser, crawls the site with ``crawl`` (depth limit 2) and
    closes the browser again.

    Args:
        url (str): Start page of the company website.
        openaiSession: OpenAI-compatible client used by the crawler's LLM calls.
        model (str): Model name passed to the LLM calls.

    Returns:
        str | None: The URL ``crawl`` reported as the job page (this may be the
        'https://www.cloudflare.com/blocked' marker it sets for protected sites);
        ``url`` itself when nothing was found or crawling raised; None when the
        browser could not be started.
    """
    # Scheme + host of the start URL. A URL without "://" is used unchanged
    # instead of being stitched together from unrelated path segments.
    scheme, separator, remainder = url.partition("://")
    if separator:
        base_url = scheme + "://" + remainder.split("/", 1)[0]
    else:
        base_url = url

    print(f"Processing website: {url}")

    driver = init_driver()
    if driver is None:
        return None

    try:
        collected_links = set()
        all_seen_links = set()
        max_depth = 2  # Keep depth low for testing
        found_job_page = [None]  # Mutable out-parameter filled in by crawl()

        crawl(driver, base_url, url, collected_links, all_seen_links, max_depth, 0, openaiSession, model, found_job_page)

        # Return the found job page URL if any, otherwise the original URL
        return found_job_page[0] if found_job_page[0] is not None else url
    except Exception as e:
        print(f"Error during crawling: {e}")
        return url
    finally:
        try:
            driver.quit()
            print("Browser closed successfully")
        except Exception as e:
            print(f"Error closing driver: {e}")

## Now let's look at the companies.

In [None]:
class Company:
    """A company to scan: its name, its website URL and its career page URL."""

    def __init__(self, company_name: str, company_url: str) -> None:
        """
        Store the company's name and website; the career URL starts out unset.

        Args:
            company_name: The name of the company.
            company_url: The URL of the company's website.
        """
        self.company_name = company_name
        self.company_url = company_url
        # None until a career page lookup assigns a value.
        self.career_url = None
        
def process_companies_from_csv(file_path, openaiSession=None, model=None):
    """
    Read companies from a CSV file, look up each career page and save the results.

    The input needs the columns ``company_name`` and ``company_url``. Results are
    written to ``data/processed_companies.csv`` row by row, so rows finished
    before an error are kept.

    Args:
        file_path (str): Path to the CSV file containing company data.
        openaiSession: OpenAI-compatible client used for the lookups. Defaults to
            the NW client (``nwopenai``).
        model (str | None): Model name for the lookups. Defaults to
            ``MODEL_NW_GPT`` when no session is given; with an explicit session
            and no model, ``get_career_page``'s own default model is used.

    Returns:
        list: A list of Company objects with career URLs populated.

    Raises:
        Exception: Any error raised while looking up or writing a company is
            printed and re-raised.
    """
    if openaiSession is None:
        openaiSession = nwopenai
        if model is None:
            model = MODEL_NW_GPT
    # Only pass a model when one was chosen, so get_career_page's default applies otherwise.
    model_kwargs = {} if model is None else {"model": model}

    # Use DictReader to access columns by name
    with open(file_path, mode='r', encoding='utf-8') as file:
        companies = [
            Company(company_name=row['company_name'], company_url=row['company_url'])
            for row in csv.DictReader(file)
        ]

    output_file = 'data/processed_companies.csv'
    with open(output_file, mode='w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Company Name', 'Company URL', 'Career URL'])
        try:
            for company in companies:
                company.career_url = get_career_page(company.company_url, openaiSession, **model_kwargs)
                writer.writerow([company.company_name, company.company_url, company.career_url])
        except Exception as e:
            # The with-block closes (and flushes) the file; no explicit close needed.
            print(f"An error occurred while processing companies: {e}")
            raise

    print(f"Companies saved to {output_file}")

    return companies



## CHAT GPT OFFICIAL

In [None]:
# Run the CSV pipeline on the sample domains, passing the OpenAI client and MODEL_GPT.
process_companies_from_csv('data/sampledomains.csv', openai, MODEL_GPT)

In [None]:
# Single-site check: look up the job listings page of one company with the OpenAI client.
get_career_page('https://www.munichre.com', openai, MODEL_GPT)

## OLLAMA

In [None]:
# Run the CSV pipeline on the sample domains, passing the local Ollama client (no model argument).
process_companies_from_csv('data/sampledomains.csv', ollama)

## NW CHAT GPT

In [None]:
# Run the CSV pipeline with only the file path and keep the returned Company list.
companies = process_companies_from_csv('data/sampledomains.csv')

### Debugging different cases

Sometimes the prompt for detecting job listing pages does not work properly.

In [None]:
# Fetch the visible text of one page, then run only the job-page classifier on it
# (NW client, MODEL_NW_GPT) so its parsed answer can be inspected in isolation.
page_content = get_page_content('https://www.munichre.com')
check_job_page_llm(page_content, nwopenai, MODEL_NW_GPT)