## Day 5 Challenge
I decided to spice things up a bit. Instead of a brochure, I built an expert system for a company.
It takes a company name and a URL, and then answers questions about the company based on the information aggregated from its website.
Check out my README, where I break down my thought process.

In [None]:
# Imports
import os
import json
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from scraper import fetch_website_links, fetch_website_contents
from openai import OpenAI

In [None]:
# Environment setup and shared constants

# Models used for the two stages: link selection and question answering.
# NOTE(review): OpenRouter model IDs are usually provider-prefixed
# (e.g. 'openai/gpt-4o-mini') — confirm these bare names resolve.
LINKS_MODEL = 'gpt-4o-mini'
EXPERT_MODEL = 'gpt-5-nano'

# OpenAI-compatible endpoint that the client is pointed at.
base_url = "https://openrouter.ai/api/v1"

# Pull variables from a .env file, letting them override the existing environment.
load_dotenv(override=True)
api_key = os.environ.get('OPENROUTER_API_KEY')

# Cheap sanity check only: a key that is present and longer than 10 characters.
print(
    "API key looks good so far"
    if api_key and len(api_key) > 10
    else "There might be a problem with your API key? Please visit the troubleshooting notebook!"
)

In [None]:
# Prompt templates (separated for clarity and easier maintenance)
# Use {placeholder} for values filled at runtime

# System prompt for the link-selection call. It is sent to the model verbatim
# (it is NOT passed through str.format), so the JSON braces below are literal.
LINKS_SYSTEM_PROMPT = """
You are provided with a list of links found on a webpage.
For a **data bank** (vs. a brochure), we want a **broader** set of links — anything that could inform answers:
- About, Company, Team
- Products, Services, Solutions
- Blog, News, Articles
- Careers, Jobs, Culture
- Contact, Support, FAQ, Help
- Documentation, Docs, Guides

**Exclude:** Terms of Service, Privacy Policy, cookie banners, social media, email `mailto:` links.

Respond in JSON only:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

# User prompt for the link-selection call. Filled via str.format with:
#   {url}   - the page the links were collected from
#   {links} - the candidate links, joined one per line by the caller
LINKS_USER_PROMPT_TEMPLATE = """
Here are the links found on {url}. Select those relevant for a company knowledge base.
Return full absolute URLs in JSON format. Do not include Terms of Service, Privacy, or email links.

Links:
{links}
"""

# System prompt for the question-answering call. Filled via str.format with
# {company_name} only; because it goes through str.format, any literal brace
# added to this text in the future must be doubled ({{ and }}).
EXPERT_SYSTEM_PROMPT_TEMPLATE = """
You are an expert system for {company_name}. Your ONLY source of information is the knowledge base provided below. Answer questions accurately based on it. You must NOT use external knowledge or assumptions.

RULES:
1. Answer questions ONLY based on the provided knowledge base.
2. If the knowledge base does not contain enough information to answer, say: "I don't have enough information in the provided content to answer this question."
3. If the question is unrelated to the company (e.g., general trivia, other companies), say: "This question appears unrelated to {company_name}. I can only answer questions about the information in the knowledge base."
4. When you do have an answer, cite the relevant section or page when helpful (e.g., "According to the About page...").
5. Be concise but accurate. Do not speculate or hallucinate.

OUTPUT FORMAT:
- Do NOT repeat the user's question.
- Start with your answer. If you cite sources, end with a "References:" section listing relevant URLs from the knowledge base.
- Use markdown. Do NOT wrap your response in code blocks.

Example (when you have an answer):
**Answer:** The main product is the Hugging Face Hub and API, which provides access to models, datasets, and ML tools.

**References:**
- https://huggingface.co/about

Example (when you lack information):
I don't have enough information in the provided content to answer this question.
"""

# User prompt for the question-answering call. Filled via str.format with:
#   {company_name}   - display name of the company
#   {url}            - the site the knowledge base was built from
#   {knowledge_base} - the aggregated page text, placed between the --- rules
#   {question}       - the user's question
EXPERT_USER_PROMPT_TEMPLATE = """
Knowledge base for {company_name} (from {url}):

---
{knowledge_base}
---

Question: {question}
"""

In [None]:
# Expert System Class

class ExpertSystem:
    """
    Question-answering "expert system" built from a company's website.

    On construction the instance fetches the links found on ``url``, asks the
    links model which of them are relevant, and concatenates the landing page
    plus every selected page into a knowledge base (truncated to
    ``MAX_KNOWLEDGE_BASE_CHARS`` characters). ``answer_question`` then streams
    an answer from the expert model and caches it per normalized question so
    that a repeated question does not trigger another API call.
    """

    # Upper bound, in characters, on the knowledge base sent with every question.
    # Override on a subclass (or on the class) to trade cost for coverage.
    MAX_KNOWLEDGE_BASE_CHARS = 10_000

    # Label used for a selected link whose entry carries no "type".
    _DEFAULT_LINK_TYPE = "page"

    def __init__(self, url, company_name):
        """
        Build the knowledge base for ``company_name`` from ``url``.

        Performs network I/O: one link fetch, one model call to select the
        relevant links, and one content fetch per selected page.
        """
        self.company_name = company_name
        self.url = url
        self.user_question = None
        self.client = OpenAI(base_url=base_url, api_key=api_key)
        self.links = None
        self.relevant_links = None
        self.links_model = LINKS_MODEL
        self.expert_model = EXPERT_MODEL
        self.knowledge_base = None
        self.expert_system = None
        self._cache = {}  # normalized question -> answer text

        # Order matters: each step reads the attribute set by the previous one.
        self.links = fetch_website_links(self.url)
        self.relevant_links = self.select_relevant_links()
        self.knowledge_base = self.build_knowledge_base()[: self.MAX_KNOWLEDGE_BASE_CHARS]

    @staticmethod
    def _normalize_question(question: str) -> str:
        """Return the cache key for ``question``: stripped, lowercased, whitespace collapsed."""
        return " ".join(question.strip().lower().split())

    @staticmethod
    def _normalize_links(raw) -> dict:
        """
        Parse the link-selection model output into ``{"links": [...]}``.

        Empty or missing output, a missing or non-list ``"links"`` value, and
        entries without a usable ``"url"`` are tolerated rather than raising
        ``KeyError`` later on. Entries without a ``"type"`` get a default
        label. Malformed JSON still raises ``json.JSONDecodeError``.
        """
        parsed = json.loads(raw) if raw else {}
        if not isinstance(parsed, dict):
            parsed = {}
        candidates = parsed.get("links")
        if not isinstance(candidates, list):
            candidates = []
        cleaned = [
            {**entry, "type": entry.get("type") or ExpertSystem._DEFAULT_LINK_TYPE}
            for entry in candidates
            if isinstance(entry, dict)
            and isinstance(entry.get("url"), str)
            and entry["url"]
        ]
        return {**parsed, "links": cleaned}

    @staticmethod
    def _chunk_text(chunk) -> str:
        """
        Return the text carried by one streamed chunk, or ``""`` if it has none.

        Streamed chunks may arrive with an empty ``choices`` list (for example
        a trailing usage-only chunk) or with ``delta.content`` set to ``None``;
        both are treated as "no text" instead of raising.
        """
        choices = getattr(chunk, "choices", None)
        if not choices:
            return ""
        delta = getattr(choices[0], "delta", None)
        return getattr(delta, "content", None) or ""

    def clear_cache(self):
        """Forget every cached answer."""
        self._cache.clear()

    def _build_prompt(self, template: str, **kwargs) -> str:
        """Fill ``template`` with ``kwargs`` using ``str.format``."""
        return template.format(**kwargs)

    def query_model(self, model, messages, **kwargs):
        """
        Send ``messages`` to ``model`` through the chat-completions endpoint.

        Extra keyword arguments are forwarded to the API call. With
        ``stream=True`` the raw stream object is returned; otherwise the text
        content of the first choice is returned.
        """
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            **kwargs
        )
        if kwargs.get('stream', False):
            return response
        return response.choices[0].message.content

    def select_relevant_links(self):
        """
        Ask the links model which of ``self.links`` belong in the knowledge base.

        Returns a dict with a ``"links"`` list of ``{"type": ..., "url": ...}``
        entries (possibly empty).
        """
        user_prompt = self._build_prompt(
            LINKS_USER_PROMPT_TEMPLATE,
            url=self.url,
            links="\n".join(str(link) for link in self.links or [])
        )
        messages = [
            {"role": "system", "content": LINKS_SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt}
        ]
        raw = self.query_model(
            self.links_model, messages, response_format={"type": "json_object"}
        )
        links = self._normalize_links(raw)
        print(f"Found {len(links['links'])} relevant links")
        return links

    def build_knowledge_base(self):
        """
        Concatenate the landing page and every relevant page into one string.

        The result is not truncated here; the constructor applies the size cap.
        """
        landing = fetch_website_contents(self.url)
        # Collect the pieces and join once instead of repeated string +=.
        parts = [f"## Landing Page:\n\n{landing}\n## Relevant Links:\n"]
        selected = (self.relevant_links or {}).get("links") or []
        for link in selected:
            url = link.get("url")
            if not url:
                # Nothing to fetch for an entry without a URL.
                continue
            label = link.get("type") or self._DEFAULT_LINK_TYPE
            parts.append(f"\n\n### Link: {label}\n")
            parts.append(fetch_website_contents(url))
        return "".join(parts)

    def _get_expert_messages(self, question: str):
        """Build the system and user messages for the expert Q&A call."""
        system_prompt = self._build_prompt(
            EXPERT_SYSTEM_PROMPT_TEMPLATE,
            company_name=self.company_name
        )
        user_prompt = self._build_prompt(
            EXPERT_USER_PROMPT_TEMPLATE,
            company_name=self.company_name,
            url=self.url,
            knowledge_base=self.knowledge_base,
            question=question
        )
        return [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

    def _stream_response(self, stream) -> str:
        """Render the stream live as Markdown and return the full accumulated text."""
        full_response = ""
        display_handle = display(Markdown(""), display_id=True)
        for chunk in stream:
            content = self._chunk_text(chunk)
            if not content:
                # Text-less chunks change nothing on screen; skip the re-render.
                continue
            full_response += content
            update_display(Markdown(full_response), display_id=display_handle.display_id)
        return full_response

    def answer_question(self, question: str, use_cache: bool = True):
        """
        Answer a question about the company and display the answer as Markdown.

        With ``use_cache=True`` a previously answered question (compared after
        normalization) is replayed from the cache without an API call.
        Otherwise the answer is streamed from the expert model and stored in
        the cache. Returns ``None``; the answer is shown, not returned.
        """
        self.user_question = question
        cache_key = self._normalize_question(question)

        if use_cache and cache_key in self._cache:
            print("(from cache)")
            display(Markdown(self._cache[cache_key]))
            return

        messages = self._get_expert_messages(question)
        stream = self.query_model(self.expert_model, messages, stream=True)
        full_response = self._stream_response(stream)
        # An empty answer is not worth replaying; leave the cache untouched so
        # the next attempt queries the model again.
        if full_response:
            self._cache[cache_key] = full_response

In [None]:
# Demo

# Build the expert system (fetches the site and assembles the knowledge base)
expert_system = ExpertSystem(url="https://huggingface.co", company_name="Hugging Face")

# Questions are asked in order:
#   1st - hits the API
#   2nd - identical to the 1st, so it is served from cache with no API call
#   3rd - a different question
demo_questions = (
    "What is the main product of Hugging Face?",
    "What is the main product of Hugging Face?",
    "Does Hugging Face have a careers page?",
)
for demo_question in demo_questions:
    expert_system.answer_question(demo_question)

# To bypass cache: expert_system.answer_question("...", use_cache=False)
# To clear cache: expert_system.clear_cache()
