# Development Notebook for Career Pages Watchdog    

This notebook is for testing functionality before it goes to production.

The system is made of the following parts:

1. Career pages extractor: Using Selenium and LLMs, for a given company URL we auto-detect its career page URL, where all jobs can be found.
2. Job count extractor: Given a career page, we will try to estimate how many jobs they have published.

In [None]:
import os
from dotenv import load_dotenv
from openai import OpenAI
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import undetected_chromedriver as uc
import time
import random
import json
from bs4 import BeautifulSoup
import csv


# Pull the variables defined in the .env file into the environment, then read
# both API keys up front. A missing key aborts the cell before any client is
# created.
load_dotenv()
nw_openai_api_key = os.environ.get('NW_OPENAI_API_KEY')
openai_api_key = os.environ.get('OPENAI_API_KEY')

if nw_openai_api_key is None:
    raise ValueError("NW OpenAI API key is not set. Please check your .env file or environment variables.")
if openai_api_key is None:
    raise ValueError("OpenAI API key is not set. Please check your .env file or environment variables.")

# Model identifiers passed as the `model` argument of the chat completion calls.
MODEL_QWEN, MODEL_LLAMA = 'qwen2.5', 'llama3.2'
MODEL_NW_GPT = 'gpt-4o-mini'
MODEL_GPT = 'gpt-4o-mini'

# Three OpenAI-style clients: the default endpoint, a server on localhost
# (started with `ollama serve`), and a custom base URL using the NW key.
openai = OpenAI(api_key=openai_api_key)
ollama = OpenAI(api_key='ollama', base_url='http://localhost:11434/v1')
nwopenai = OpenAI(
    api_key=nw_openai_api_key,
    base_url='https://litellm.data-int.odyssey.preview.nwse.cloud',
)

# System prompt shared by the link-ordering and page-checking calls.
system_prompt_career_page_finder = """You will act as a web scraper and search fot the page that contains the list of job openings for a given company. \
Bear in mind, that you'll be looking at german websites, so the page might be in german. \
Some keyword to look for are "Careers", "Jobs", "Join Us", "Work with us", "Karriere", "stellenangebote", or similar. \
Many sites also have their jobs inside Career or Karriere section. \
"""

## Test Qwen running locally

Before running the next cell, make sure the Ollama server is running:

```bash
ollama serve
```



In [None]:
# Smoke test: send a short German greeting to the Qwen model through the
# `ollama` client and print the text of the first reply.
response = ollama.chat.completions.create(
    messages=[
        {"role": "system", "content": system_prompt_career_page_finder},
        {"role": "user", "content": "Hallo qwen, wie geht es dir?"},
    ],
    model=MODEL_QWEN,
)

print(response.choices[0].message.content)

## Scraper functions

Now let's add the website scraper and link extractor functions.

In [None]:

def order_links_user_prompt(links):
    """Build the user prompt asking the LLM to rank links by career-page likelihood.

    Args:
        links: Iterable of URL strings collected from a company page.

    Returns:
        str: The prompt text. Each link sits on its own line between the
        instructions and the JSON response example.
    """
    # Each link gets its own line. Without the newline after the header and
    # after the list, the first link is glued to "Links (...):" and the last
    # one to "You should respond...", which corrupts both URLs.
    link_lines = "\n".join(links)

    user_prompt = "You'll analyse a list of links for a company website. "
    user_prompt += "Now you need to order them. The links that are most likely the career page or job listing page that could contain a job opening counter will be first in the list. Order them from most likely to less likely. Bear in mind the site might be in german.\n"
    user_prompt += "Links (some could be relative):\n"
    user_prompt += link_lines + "\n"
    user_prompt += "You should respond in JSON as in this example:"
    user_prompt += """
        {
            "links": [
                "https://full.url/goes/here/about",
                "https://another.full.url/careers"
            ]
        }
        """
    return user_prompt

def check_job_page_user_prompt(content):
    """Build the user prompt asking whether a page lists job openings.

    Args:
        content: Text of the page to analyse. Only the first 3500 characters
            are embedded in the prompt; ``None`` is treated as empty text.

    Returns:
        str: The prompt, which asks the model for a one-word yes/no answer.
    """
    page_text = content or ""
    user_prompt = (
        "You are analyzing the content of a subpage from a company's website. "
        "Your task is to determine whether this specific page is where the company lists its job openings.\n\n"
        "Things to look for include:\n"
        "- Phrases like 'open positions', 'jobs', 'career opportunities', etc. (in English or German)\n"
        "- Buttons or links to apply for jobs\n"
        "- Job search inputs, filters, or department/category selectors\n"
        "- A job counter (e.g., '123 jobs available')\n\n"
        "Content might be in German.\n\n"
        "Please answer with a single word: **yes** or **no**.\n\n"
        f"Here is the page content:\n---\n{page_text[:3500]}\n---"  # Trim to avoid prompt length issues
    )
    # The prompt was previously built and then dropped, so callers got None.
    return user_prompt

def order_links_llm(links, openaiSession, model=MODEL_QWEN):
    """Ask the LLM to rank ``links`` from most to least likely career page.

    Args:
        links: URL strings to rank.
        openaiSession: Client object exposing ``chat.completions.create``.
        model: Model name forwarded to the completion call.

    Returns:
        The value stored under the ``"links"`` key of the model's JSON reply.
    """
    completion = openaiSession.chat.completions.create(
        model=model,
        temperature=0,
        # Request a JSON object so the reply can be parsed directly.
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt_career_page_finder},
            {"role": "user", "content": order_links_user_prompt(links)},
        ],
    )
    raw_json = completion.choices[0].message.content
    return json.loads(raw_json)["links"]

def check_job_page_llm(url, openaiSession, model=MODEL_QWEN):
    """Ask the LLM whether a page is the company's job listings page.

    Args:
        url: Despite its name, this value is handed to
            ``check_job_page_user_prompt`` as the page content to analyse.
        openaiSession: Client object exposing ``chat.completions.create``.
        model: Model name forwarded to the completion call.

    Returns:
        bool: True when the first word of the reply is "yes", ignoring case
        and surrounding punctuation or markdown markers; False otherwise.
    """
    messages = [
        {"role": "system", "content": system_prompt_career_page_finder},
        {"role": "user", "content": check_job_page_user_prompt(url)}
    ]
    response = openaiSession.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0
    )
    # The message content may be None; treat that as "no answer".
    answer = (response.choices[0].message.content or "").strip().lower()
    if not answer:
        return False
    # The prompt shows the expected answer as **yes**, so replies such as
    # "**yes**", "Yes." or "yes, ..." must count as affirmative too. Only the
    # first word is inspected, with decoration stripped from its edges.
    first_word = answer.split(maxsplit=1)[0].strip("*_.,!:;\"'`")
    return first_word == "yes"

def crawl(driver, base_url, url, collected_links, max_depth=2, current_depth=0, openaiSession=None, model=MODEL_LLAMA):
    """Recursively crawl a site looking for the job listings page.

    Args:
        driver: Selenium driver used to load pages and read their links.
        base_url: Prefix a link must start with to be followed.
        url: Page to visit on this call.
        collected_links: Set of already visited URLs; updated in place.
        max_depth: Deepest level that is still visited.
        current_depth: Depth of ``url``; 0 for the start page.
        openaiSession: Client used for the LLM calls. Without it, pages are
            neither classified nor followed.
        model: Model name forwarded to the LLM helpers.

    Returns:
        str | None: URL of the first page the LLM classifies as a job
        listings page, or None when none is found or the page fails to load.
    """
    if current_depth > max_depth:
        return None

    if url in collected_links:
        return None

    collected_links.add(url)

    try:
        driver.get(url)
        time.sleep(random.uniform(1.5, 3.5))  # Human-like pause

        print(f"Crawling (depth {current_depth}): {url}")

        # The start page itself is never classified, only the pages below it.
        if current_depth != 0 and openaiSession:
            page_text = driver.find_element(By.TAG_NAME, "body").text
            try:
                # Ask the LLM. Calling the prompt builder here instead would
                # only test the truthiness of the prompt, not the page.
                is_job_page = check_job_page_llm(page_text, openaiSession, model)
            except Exception as e:
                # A failed check must not stop the crawl of this page.
                print(f"Error checking job page: {e}")
                is_job_page = False

            if is_job_page:
                print(f"✅ {url} is a job listings page.")
                return url
            print(f"❌ {url} is not the job listing page.")

        # Collect same-domain <a> targets; dict.fromkeys drops duplicates
        # while keeping the order in which they appear on the page.
        candidates = (link.get_attribute("href") for link in driver.find_elements(By.TAG_NAME, "a"))
        hrefs = list(dict.fromkeys(
            href for href in candidates if href and href.startswith(base_url)
        ))

        print(f"Found {len(hrefs)} links")
        if not hrefs:
            return None
        print(f"Sample: {hrefs[:3]}")

        if not openaiSession:
            return None

        try:
            ordered_hrefs = order_links_llm(hrefs, openaiSession, model)
            print(f"Ordered links: {ordered_hrefs[:3]}")

            # Follow only the 5 most promising links and stop at the first hit.
            for href in ordered_hrefs[:5]:
                found = crawl(driver, base_url, href, collected_links, max_depth, current_depth + 1, openaiSession, model)
                if found:
                    return found
        except Exception as e:
            print(f"Error ordering links: {e}")

    except Exception as e:
        print(f"Failed to crawl {url}: {e}")

    return None

def get_career_page(url, openaiSession, model=MODEL_LLAMA):
    """Crawl a company website and return the URL of its job listings page.

    A headless Chrome session is started for the site, first with
    undetected_chromedriver and, if that fails, with the regular Selenium
    Chrome driver. The session is always closed before returning.

    Args:
        url: Start URL of the company website.
        openaiSession: Client forwarded to ``crawl`` for the LLM calls.
        model: Model name forwarded to ``crawl``.

    Returns:
        str: The URL reported by ``crawl`` when it returns one. Otherwise the
        original ``url``, including when no browser could be started or the
        crawl raised.
    """

    # Extract base URL for domain matching
    base_url_parts = url.split('/')
    if len(base_url_parts) >= 3:
        base_url = base_url_parts[0] + '//' + base_url_parts[2]
    else:
        base_url = url

    print(f"Base URL for domain matching: {base_url}")

    browser_args = [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
        # More human-like user agent
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "--headless=new",
    ]

    # Create a fresh driver for each website
    driver = None
    try:
        options = uc.ChromeOptions()
        for arg in browser_args:
            options.add_argument(arg)
        driver = uc.Chrome(options=options)
        print(f"Successfully created undetected_chromedriver for {url}")
    except Exception as e:
        print(f"Failed to create undetected_chromedriver: {e}")
        # Try with regular Chrome driver if undetected fails. Without this,
        # driver stays None and the first driver call raises AttributeError.
        try:
            fallback_options = Options()
            for arg in browser_args:
                fallback_options.add_argument(arg)
            driver = webdriver.Chrome(options=fallback_options)
            print(f"Falling back to regular Chrome driver for {url}")
        except Exception as fallback_error:
            print(f"Failed to create regular Chrome driver: {fallback_error}")
            return url

    try:
        # Configure browser to avoid detection
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                })
            """
        })

        collected_links = set()
        max_depth = 1  # Keep depth low for testing

        # Start crawling
        try:
            found = crawl(driver, base_url, url, collected_links, max_depth, 0, openaiSession, model)
        except Exception as e:
            print(f"Error during crawling: {e}")
            return url

        # Use the page reported by crawl; fall back to the original URL when
        # it reports nothing.
        return found or url
    finally:
        try:
            if driver:
                driver.quit()
        except Exception as e:
            print(f"Error closing driver: {e}")

## Simple URL Guesser Alternative

If the browser automation approach is failing, we can try a direct URL guess approach.

In [None]:
def guess_career_url(url, openaiSession, model=MODEL_GPT):
    """Ask the LLM to predict a company's career page URL without opening a browser.

    Args:
        url: Company website URL embedded in the prompt.
        openaiSession: Client object exposing ``chat.completions.create``.
        model: Model name forwarded to the completion call.

    Returns:
        str: The model's reply with surrounding whitespace removed.
    """
    system_message = {
        "role": "system",
        "content": "You are an expert at guessing company career page URLs.",
    }
    user_message = {
        "role": "user",
        "content": f"Given the company website {url}, predict what their careers/jobs page URL would be. Look for common patterns like /careers, /jobs, /karriere for German sites. Return only the full URL without explanation.",
    }

    completion = openaiSession.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
    )
    guessed_url = completion.choices[0].message.content
    return guessed_url.strip()

In [None]:
# Try the URL guesser on a site that is problematic for the crawler and
# print the guessed career URL.
print(
    guess_career_url(
        url="https://www.sparkasse.de",
        openaiSession=openai,
        model=MODEL_GPT,
    )
)

## Test with one URL

A simple test

In [None]:
# Run the crawler on a single site through the `ollama` client (the model is
# left at the function's default) and print the URL it returns.
print(
    get_career_page(
        url="https://www.sparkasse.de",
        openaiSession=ollama,
    )
)

## Now let's look at the companies.

In [None]:
class Company:
    """A company together with its website and, once found, its career page."""

    def __init__(self, company_name, company_url):
        """
        Initialize a Company object with the given name and URL.

        Args:
            company_name (str): The name of the company.
            company_url (str): The URL of the company's website.
        """
        self.company_name = company_name
        self.company_url = company_url
        self.career_url = None  # This will be populated later

    def __repr__(self):
        """Return a readable representation showing all three attributes.

        Without it, a list of companies displayed as a cell result shows
        only opaque object addresses.
        """
        return (
            f"{type(self).__name__}("
            f"company_name={self.company_name!r}, "
            f"company_url={self.company_url!r}, "
            f"career_url={self.career_url!r})"
        )

In [None]:
def process_companies_from_csv(file_path, openaiSession, model=MODEL_LLAMA, use_browser=True):
    """
    Process companies from a CSV file, extract their career page URLs, and return a list of Company objects.

    Args:
        file_path (str): Path to the CSV file containing company data. It must
            have the columns ``company_name`` and ``company_url``.
        openaiSession: Client forwarded to the career page lookup.
        model (str): The model to use for extracting career page URLs.
        use_browser (bool): When True (default), crawl each site with
            ``get_career_page``. When False, skip browser automation and ask
            the LLM to guess the URL with ``guess_career_url``.

    Returns:
        list: A list of Company objects with career URLs populated.
    """
    companies = []

    # newline='' lets the csv module handle line endings itself, as the csv
    # documentation recommends for files passed to a reader.
    with open(file_path, mode='r', encoding='utf-8', newline='') as file:
        reader = csv.DictReader(file)  # Use DictReader to access columns by name
        for row in reader:
            # Create a Company object for each row
            company = Company(company_name=row['company_name'], company_url=row['company_url'])
            if use_browser:
                company.career_url = get_career_page(company.company_url, openaiSession, model)
            else:
                company.career_url = guess_career_url(company.company_url, openaiSession, model)
            print(f"Company Name: {company.company_name}, Career URL: {company.career_url}")
            companies.append(company)  # Add the Company object to the companies list

    return companies

In [None]:
# Batch run over the sample CSV using the `openai` client and MODEL_GPT.
process_companies_from_csv(
    file_path='data/sampledomains.csv',
    openaiSession=openai,
    model=MODEL_GPT,
)

In [None]:
# Batch run over the sample CSV using the `ollama` client; the model is left
# at the function's default.
process_companies_from_csv(
    file_path='data/sampledomains.csv',
    openaiSession=ollama,
)

In [None]:
# Batch run over the sample CSV using the `nwopenai` client and MODEL_NW_GPT.
process_companies_from_csv(
    file_path='data/sampledomains.csv',
    openaiSession=nwopenai,
    model=MODEL_NW_GPT,
)

In [None]:
# Process without browser automation
# NOTE(review): this call passes use_browser=False. Confirm that
# process_companies_from_csv accepts that keyword; if its signature does not
# declare it, this cell raises TypeError before any company is processed.
process_companies_from_csv('data/sampledomains.csv', openai, MODEL_GPT, use_browser=False)