In [None]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import json
import os
import re

from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI

from week1.scraper import fetch_website_links, fetch_website_contents

In [None]:
# Load environment settings, sanity-check the API key, then set the model and client

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Warn when the key is missing, lacks the expected prefix, or is implausibly short
if not api_key or not api_key.startswith('sk-proj-') or len(api_key) <= 10:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
else:
    print("API key looks good so far")

MODEL = 'gpt-5-nano'
openai = OpenAI()

In [None]:
# Call the scraper helper on the Figma home page and echo its return value
# as the cell output (the bare `links` expression on the last line).
links = fetch_website_links("https://www.figma.com")
links

In [None]:
# System prompt for the link-selection call in select_relevant_links.
# The JSON example also acts as a few-shot hint for the model, so its entries
# illustrate tutorial-style pages, matching the task stated in the prompt.
link_system_prompt = """
You are provided with a list of links found on a webpage.
You are able to decide which of the links would be most relevant to build a knowledgebase/how-to/tutorials section on Figma.
You should respond in JSON as in this example:

{
    "links": [
        {"type": "getting started guide", "url": "https://full.url/goes/here/getting-started"},
        {"type": "how-to article", "url": "https://another.full.url/tutorials/how-to"}
    ]
}
"""

In [None]:
def get_links_user_prompt(url):
    """Build the user message for the link-selection call.

    The message is a fixed block of instructions that mentions ``url``,
    followed by every item returned by ``fetch_website_links(url)``,
    one per line.
    """
    # Each element is one line of the prompt; trailing spaces are intentional
    # and kept so the resulting text matches the original prompt exactly.
    instruction_lines = (
        "",
        f"Here is the list of links on the website {url} -",
        "Please decide which of these are relevant to build a knowledgebase/how-to/tutorials section on Figma, ",
        "respond with the full https URL in JSON format.",
        "Do not include Terms of Service, Privacy, email links. Do necessary translation to understand what is what if the website not in English. Mind that tutorials might be included on a separate subpage of the main website, broken down into granular how-to articles. If so, find and include rather them than generic website pages. ",
        "",
        "Links (some might be relative links):",
        "",
        "",
    )
    header = "\n".join(instruction_lines)
    found_links = fetch_website_links(url)
    return header + "\n".join(found_links)

In [None]:
def select_relevant_links(url):
    """Ask the model which links on ``url`` are relevant and return them as a dict.

    Sends ``link_system_prompt`` plus the user prompt built by
    ``get_links_user_prompt(url)`` to the chat completions API in JSON mode
    and parses the reply.

    JSON mode only promises syntactically valid JSON, not a particular shape,
    so the parsed reply is normalised before it is returned: callers index
    ``result['links']`` and each entry's ``'type'`` and ``'url'`` directly.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        A dict that always has a ``"links"`` key holding a list of dicts,
        each with a non-empty string ``"url"`` and a ``"type"`` (``"unknown"``
        when the model omitted it). The list is empty when the model returned
        no content or a reply without a usable ``"links"`` list.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(url)}
        ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    if not result:
        # No message content: nothing to parse
        return {"links": []}

    parsed = json.loads(result)
    if not isinstance(parsed, dict):
        return {"links": []}

    candidates = parsed.get("links")
    if not isinstance(candidates, list):
        candidates = []

    # Keep only entries callers can actually use
    cleaned = []
    for entry in candidates:
        if isinstance(entry, dict) and isinstance(entry.get("url"), str) and entry["url"]:
            entry.setdefault("type", "unknown")
            cleaned.append(entry)
    parsed["links"] = cleaned
    return parsed

In [None]:
# Try the link selector on the Figma home page; the returned dict is shown as
# the cell output. This makes a live OpenAI API call.
select_relevant_links("https://www.figma.com")

In [None]:
def fetch_page_and_all_relevant_links(url):
    """Return one text blob: the landing page's contents, then each selected link's contents.

    The landing page is fetched first, the relevant links are chosen with
    ``select_relevant_links(url)``, and every chosen link is fetched in turn
    and appended under a ``### Link: <type>`` heading.
    """
    landing_text = fetch_website_contents(url)
    selection = select_relevant_links(url)

    pieces = [f"## Landing Page:\n\n{landing_text}\n## Relevant Links:\n"]
    for entry in selection['links']:
        pieces.append(f"\n\n### Link: {entry['type']}\n")
        pieces.append(fetch_website_contents(entry["url"]))
    return "".join(pieces)

In [None]:
# Print (rather than echo) the combined text so its newlines render as line
# breaks instead of appearing as "\n" in a string repr.
print(fetch_page_and_all_relevant_links("https://www.figma.com"))

In [None]:
# System prompt for the tutorial-writing call in create_tutorial. It asks for
# tutorials grouped by skill level with AI articles emphasised, and forbids
# any URL that was not supplied in the user message.
tutorial_system_prompt = """
You are an assistant that analyzes the contents of several relevant pages from a company website
and creates a well-organized, clean and informative list of tutorials/how-to pages that relate to Figma. 
The tutorial articles should be grouped in a few buckets depending on level of those who'd be undertaking learning (levels being - beginner, intermediate, pro). No need to list the entire text of the articles - titles and descriptions will do.
Apply judgement to define the levels. Make sure to include and stylistically emphasize any articles on AI. Include the links to the given tutorials in your final response. 
CRITICAL: You must ONLY use URLs that are explicitly provided in the user's message. 
    Do NOT invent, guess, or construct any URLs. If you need a URL for a topic but it 
    wasn't provided, either:
    1. Skip that link entirely, OR
    2. Mention the topic without a link
    
Never include placeholder URLs like 'XXX' or assume URL patterns.
Respond in markdown without code blocks. Formulate the wording in an easy-to-understand way and engaging, playful manner.
"""

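# Leftover example from the brochure exercise, kept to show how easily 'tone' can be changed.
# Note: uncommenting it defines `brochure_system_prompt`, which create_tutorial does not use;
# to change the tone of the tutorial, edit `tutorial_system_prompt` above instead.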

# brochure_system_prompt = """
# You are an assistant that analyzes the contents of several relevant pages from a company website
# and creates a short, humorous, entertaining, witty brochure about the company for prospective customers, investors and recruits.
# Respond in markdown without code blocks.
# Include details of company culture, customers and careers/jobs if you have the information.
# """


In [None]:
def get_tutorial_user_prompt(company_name, url, links, contents):
    """Build the user message for the tutorial call, with the allowed URLs listed explicitly.

    ``links`` is rendered as a ``- <link>`` bullet list under an
    "AVAILABLE URLS" heading, followed by ``contents`` under a
    "WEBSITE CONTENT" heading and a closing reminder to use only listed URLs.
    """
    url_bullets = "\n".join(f"- {item}" for item in links)

    # One element per prompt line; the trailing space on the "Remember" line
    # is part of the original prompt text and is kept deliberately.
    prompt_lines = [
        f"Create a comprehensive tutorial guide for {company_name} based on {url}.",
        "",
        "AVAILABLE URLS (these are the ONLY valid URLs you can use):",
        url_bullets,
        "",
        "WEBSITE CONTENT:",
        f"{contents}",
        "",
        "Create an engaging, well-structured tutorial organized by skill level.",
        'Remember: Only use URLs from the "AVAILABLE URLS" list above. ',
        "If you want to mention a topic but don't have a valid URL for it, just describe it without a link.",
    ]
    return "\n".join(prompt_lines)

In [None]:
def validate_urls_in_response(response_text, valid_urls):
    """
    Remove markdown links whose target is not in an allow-list.

    Every inline markdown link ``[text](url)`` in ``response_text`` is checked
    against ``valid_urls``. A link whose URL is allowed is left exactly as
    written; any other link (placeholder URLs such as ``.../XXX`` included) is
    replaced by its bolded text, ``**text**``, so the topic survives but the
    unverified URL does not.

    URLs are compared leniently so that cosmetic differences do not strip a
    good link: surrounding whitespace, an optional markdown title
    (``[text](url "title")``), enclosing angle brackets and a trailing slash
    are all ignored on both sides of the comparison.

    Args:
        response_text: Markdown text produced by the model. ``None`` or an
            empty string yields ``""``.
        valid_urls: Iterable of allowed URL strings (``None`` is treated as
            empty; non-string items are ignored).

    Returns:
        The text with every non-allowed link replaced by ``**link text**``.
    """
    if not response_text:
        return ""

    def normalize(url):
        # Canonical form used only for comparison, never written to the output
        return url.strip().strip("<>").rstrip("/")

    allowed = {normalize(item) for item in (valid_urls or ()) if isinstance(item, str)}

    # Inline markdown links: [text](target)
    link_pattern = r'\[([^\]]+)\]\(([^\)]+)\)'

    def check_and_fix_link(match):
        link_text = match.group(1)
        target = match.group(2).strip()
        # The target may carry a title after the URL: (url "title")
        url = target.split(None, 1)[0] if target else ""

        if normalize(url) in allowed:
            return match.group(0)
        # Unknown or placeholder URL: keep the text, drop the link
        return f"**{link_text}**"

    return re.sub(link_pattern, check_and_fix_link, response_text)


In [None]:
def create_tutorial(company_name, url, model="gpt-4.1-mini", max_content_chars=5000):
    """Scrape a site, ask the model for a levelled tutorial guide, and render it.

    Steps:
      1. Fetch the landing page and let ``select_relevant_links`` pick the
         relevant links, then fetch each of those pages.
      2. Build an allow-list of URLs: the selected links, ``url`` itself and
         every http(s) URL that appears in the scraped text.
      3. Send the first ``max_content_chars`` characters of the scraped text
         to the model together with the selected links.
      4. Strip any link from the reply that is not in the allow-list and
         display the result as Markdown.

    Args:
        company_name: Name used in the prompt (e.g. ``"Figma"``).
        url: Landing page to start from.
        model: Chat model used to write the tutorial.
        max_content_chars: How many characters of scraped text are sent to
            the model; ``None`` sends everything.

    Returns:
        None. The tutorial is shown via ``display(Markdown(...))``.
    """
    # Fetch all page content and relevant links
    contents = fetch_website_contents(url)
    relevant_links_dict = select_relevant_links(url)

    # The model's JSON is not schema-checked: tolerate a missing "links" key
    # and skip entries that have no usable URL instead of raising KeyError.
    raw_links = relevant_links_dict.get('links') if isinstance(relevant_links_dict, dict) else None
    link_entries = [
        entry for entry in (raw_links if isinstance(raw_links, list) else [])
        if isinstance(entry, dict) and entry.get('url')
    ]

    # Extract just the URLs from the link entries
    links = [entry['url'] for entry in link_entries]

    # Fetch content from all relevant links
    sections = [f"## Landing Page:\n\n{contents}\n## Relevant Links:\n"]
    for entry in link_entries:
        sections.append(f"\n\n### Link: {entry.get('type', 'unknown')}\n")
        sections.append(fetch_website_contents(entry['url']))
    all_contents = "".join(sections)

    # Create valid URL list for validation
    valid_urls = set(links)
    valid_urls.add(url)

    # Extract any other URLs from the content
    url_pattern = r'https?://[^\s\)\]\'"<>]+'
    for match in re.finditer(url_pattern, all_contents):
        found = match.group(0)
        valid_urls.add(found)
        # A URL at the end of a sentence is captured with its punctuation;
        # also allow the clean form, which is what the model will write.
        valid_urls.add(found.rstrip('.,;:!?'))

    print(f"Found {len(valid_urls)} valid URLs from scraped content")

    # Get the user prompt with all required parameters
    user_content = get_tutorial_user_prompt(company_name, url, links, all_contents[:max_content_chars])

    # Call the AI
    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": tutorial_system_prompt},
            {"role": "user", "content": user_content}
        ],
    )

    # content can be None; fall back to an empty string so validation and
    # rendering do not raise
    result = response.choices[0].message.content or ""

    # VALIDATE and fix URLs before displaying
    validated_result = validate_urls_in_response(result, valid_urls)

    display(Markdown(validated_result))


In [None]:
# Build and render the Figma tutorial guide. This scrapes the site and makes
# live OpenAI API calls, so it needs network access and a valid API key.
create_tutorial("Figma", "https://www.figma.com") 