# Development Notebook for Career Pages Watchdog    

This notebook is for trying out functionality before it goes to production.

The system is made of the following parts:

1. Career pages extractor: Using Selenium and LLMs, we auto-detect, for a given company URL, its career page URL where all jobs can be found.
2. Job count extractor: Given a career page, we will try to estimate how many jobs they have published.

In [13]:
import os
from dotenv import load_dotenv
from openai import OpenAI
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import undetected_chromedriver as uc
import time
import random
from bs4 import BeautifulSoup
import csv


# Pull settings from the .env file into the process environment.
load_dotenv()


def _require_env(var_name, error_message):
    """Return the value of environment variable *var_name*.

    Raises:
        ValueError: with *error_message* if the variable is not set.
    """
    value = os.getenv(var_name)
    if value is None:
        raise ValueError(error_message)
    return value


# Both hosted endpoints need a key; stop right away if either is missing.
nw_openai_api_key = _require_env(
    'NW_OPENAI_API_KEY',
    "NW OpenAI API key is not set. Please check your .env file or environment variables.",
)
openai_api_key = _require_env(
    'OPENAI_API_KEY',
    "OpenAI API key is not set. Please check your .env file or environment variables.",
)

# Model identifiers passed as `model=` to the chat-completions calls below.
MODEL_QWEN = 'qwen2.5'
MODEL_LLAMA = 'llama3.2'
MODEL_NW_GPT = 'gpt-4o-mini'
MODEL_GPT = 'gpt-4o-mini'

# Three clients sharing the OpenAI client class: the default OpenAI endpoint,
# a server on localhost:11434, and the LiteLLM-hosted endpoint.
# NOTE(review): the 'ollama' api_key looks like a placeholder value - confirm.
openai = OpenAI(api_key=openai_api_key)
ollama = OpenAI(
    base_url='http://localhost:11434/v1',
    api_key='ollama',
)
nwopenai = OpenAI(
    base_url='https://litellm.data-int.odyssey.preview.nwse.cloud',
    api_key=nw_openai_api_key,
)

# German system prompt: find the company's career page and answer with a single URL.
system_prompt_career_page_finder = """Sie werden eine Reihe von Unternehmens-URLs deutschsprechender Unternehmen analysieren.
 Ihre Aufgabe besteht darin, die Karriereseite des Unternehmens zu finden und den URL zurückzugeben. Bitte antworte mit nur eine URL."""

## Test Qwen running locally

Be sure to run

```bash
ollama serve
```



In [None]:
# Smoke test: send a short German greeting to the Qwen model through the
# `ollama` client and print the text of the first returned choice.
response = ollama.chat.completions.create(
    model=MODEL_QWEN,
    messages=[
        {"role": "system", "content": system_prompt_career_page_finder},
        {"role": "user", "content": "Hallo qwen, wie geht es dir?"},
    ],
)

print(response.choices[0].message.content)

## Scraper functions

Now let's add the website scraper and link extractor functions.

In [17]:
class Website:
    """A web page reduced to its title, text content and link targets.

    Attributes:
        url: The URL that was requested.
        title: Text of the <title> element, or "No title found" when the
            page has no usable title.
        text: Text of <body> after removing script, style, img and input
            elements; an empty string when the page has no <body>.
        links: The non-empty ``href`` values of all <a> elements. Values are
            stored as found in the page and are not resolved against ``url``.
    """

    def __init__(self, url):
        """
        Create this Website object from the given URL using Selenium and BeautifulSoup.

        The page is loaded in a headless Chrome instance started through
        undetected_chromedriver. The browser is always shut down, including
        when loading the page fails.

        Args:
            url (str): Address of the page to load.
        """
        self.url = url

        options = uc.ChromeOptions()
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                                    "Chrome/122.0.0.0 Safari/537.36")
        options.add_argument("--headless=new")  # Only if really needed

        driver = uc.Chrome(options=options)
        try:
            # Kept inside the try block so the browser process is still
            # terminated if this CDP call itself raises.
            driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                })
            """
            })

            # Fetch the webpage
            driver.get(url)
            # Random 2-5 second pause before reading the DOM.
            # NOTE(review): presumably gives scripts time to render - confirm.
            time.sleep(random.uniform(2, 5))

            # Get the page source
            page_source = driver.page_source
        finally:
            # Close the WebDriver
            driver.quit()

        # Parsing needs only the captured HTML, so it runs after the browser
        # has been released.
        soup = BeautifulSoup(page_source, 'html.parser')

        # <title> may be missing, empty, or contain nested markup; in the last
        # two cases .string is None, so fall back to the placeholder as well.
        title = soup.title.string if soup.title else None
        self.title = title or "No title found"

        body = soup.body
        if body is None:
            # No <body> in the parsed document: there is no text to extract.
            self.text = ""
        else:
            for irrelevant in body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = body.get_text(separator="\n", strip=True)

        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

def get_links_user_prompt(website):
    """Build the German user prompt asking the model to pick the career-page link.

    The prompt names the website, states the task, and then lists every link
    on its own line beneath a "Links (...)" header line.

    Args:
        website: Object exposing ``url`` (str) and ``links`` (iterable of
            str), such as a ``Website`` instance.

    Returns:
        str: The complete user prompt.
    """
    header = (
        f"Hier ist die Liste der Links auf der Website von {website.url} - "
        "Sie sollten den einen Link finden, der am wahrscheinlichsten die Karriereseite enthält, auf der die offenen Stellenlisten stehen. Bitte nur mit einen einzigen link antworten.\n"
        # Trailing newline so the first link starts on its own line instead of
        # being glued directly onto the header.
        "Links (einige könnten relative Links sein):\n"
    )
    return header + "\n".join(website.links)

def get_career_page(url, openaiSession, model=MODEL_LLAMA):
    """Ask a chat model which link on *url* is the company's career page.

    The page is loaded via ``Website``, its links are placed in the user
    prompt, and the model's reply is returned as received.

    Args:
        url (str): Company website to inspect.
        openaiSession: Client exposing ``chat.completions.create``.
        model (str): Model identifier forwarded to the client.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    site = Website(url)
    completion = openaiSession.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt_career_page_finder},
            {"role": "user", "content": get_links_user_prompt(site)},
        ],
    )
    return completion.choices[0].message.content
        

## Test with one URL

A simple test

In [None]:
# One-off check against a single site, using the `ollama` client and the
# function's default model.
print(get_career_page(url="https://www.sparkasse.de", openaiSession=ollama))

## Now let's look at the companies.

In [None]:
class Company:
    """A company together with its website and (once known) career page URL."""

    def __init__(self, company_name, company_url):
        """
        Initialize a Company object with the given name and URL.

        Args:
            company_name (str): The name of the company.
            company_url (str): The URL of the company's website.
        """
        self.company_name = company_name
        self.company_url = company_url
        self.career_url = None  # This will be populated later

    def __repr__(self):
        """Return a readable representation listing all three fields.

        Without this, a list of companies shown in the notebook displays only
        object addresses.
        """
        return (
            f"{type(self).__name__}("
            f"company_name={self.company_name!r}, "
            f"company_url={self.company_url!r}, "
            f"career_url={self.career_url!r})"
        )

In [None]:
def process_companies_from_csv(file_path, openaiSession, model=MODEL_LLAMA):
    """
    Process companies from a CSV file, extract their career page URLs, and return a list of Company objects.

    Each row is handled in file order; a progress line is printed per company.

    Args:
        file_path (str): Path to the CSV file containing company data. The
            file needs a header row with the columns 'company_name' and
            'company_url'.
        openaiSession: Chat client forwarded to get_career_page.
        model (str): The model to use for extracting career page URLs.

    Returns:
        list: A list of Company objects with career URLs populated.

    Raises:
        KeyError: If a required column is missing from the CSV.
        Exception: Errors raised by get_career_page are not caught here, so
            the first failing company aborts the run.
    """
    companies = []

    # newline='' lets the csv module do its own line-ending handling, which it
    # needs for quoted fields containing line breaks. 'utf-8-sig' strips a
    # leading byte-order mark that would otherwise become part of the first
    # column name; files without one are read as plain UTF-8.
    with open(file_path, mode='r', encoding='utf-8-sig', newline='') as file:
        reader = csv.DictReader(file)  # Use DictReader to access columns by name
        for row in reader:
            # Create a Company object for each row
            company = Company(company_name=row['company_name'], company_url=row['company_url'])
            company.career_url = get_career_page(company.company_url, openaiSession, model)  # Get the career page URL
            print(f"Company Name: {company.company_name}, Career URL: {company.career_url}")
            companies.append(company)  # Add the Company object to the companies list

    return companies

In [18]:
# Batch run over the sample domains using the `openai` client and MODEL_GPT.
process_companies_from_csv(
    'data/sampledomains.csv',
    openaiSession=openai,
    model=MODEL_GPT,
)

Company Name: Munich Re, Career URL: https://www.munichre.com/en/careers.html
Company Name: RWE, Career URL: https://www.rwe.com/en/rwe-careers-portal/


KeyboardInterrupt: 

In [None]:
# Batch run over the sample domains using the `ollama` client and the
# function's default model.
process_companies_from_csv('data/sampledomains.csv', openaiSession=ollama)

In [None]:
# Batch run over the sample domains using the `nwopenai` client and MODEL_NW_GPT.
process_companies_from_csv(
    'data/sampledomains.csv',
    openaiSession=nwopenai,
    model=MODEL_NW_GPT,
)