In [11]:
import json
import os
from typing import List
from urllib.parse import urljoin

import ollama
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display

In [23]:
# Model name passed as `model=` to the ollama.chat calls further down
MODEL = "llama3.2"

In [4]:
# a class to represent a Webpage

class Website:
    """
    A fetched web page reduced to its title, visible text and outgoing links.

    Attributes:
        url:   the address that was requested
        title: text of the <title> tag, or "No title found"
        body:  raw response payload as returned by requests
        links: every non-empty href found on <a> tags, in document order;
               values are kept exactly as written in the page, so they may
               be relative ("/careers") or non-http ("mailto:...")
        text:  visible text of <body> with script/style/img/input removed
    """
    url: str
    title: str
    body: bytes
    links: List[str]
    text: str

    def __init__(self, url, timeout=30):
        """
        Download and parse `url`.

        Args:
            url: page address to fetch
            timeout: seconds to wait for the server before requests raises
                a Timeout; without one a stalled server blocks the notebook
                indefinitely

        Raises:
            requests.RequestException: on connection failure or timeout
        """
        self.url = url
        response = requests.get(url, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # .string is None when <title> is empty or wraps nested tags, which
        # would otherwise put the literal text "None" into the prompt
        title = soup.title.string if soup.title else None
        self.title = title if title else "No title found"
        if soup.body:
            # Drop elements that carry no readable text before extracting it
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        links = [link.get('href') for link in soup.find_all('a')]
        # <a> tags without an href yield None; discard those and empty strings
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the title and visible text formatted as one prompt section."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [5]:
# Fetch one page and show the hrefs collected from it
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/10/16/from-software-engineer-to-ai-data-scientist-resources/',
 'https://edwarddonner.com/2024/10/16/from-software-engineer-to-ai-data-scientist-resources/',
 'https://edwarddonner.com/2024/08/06/outsmart/',
 'https://edwarddonner.com/2024/08/06/outsmart/',
 'https://edwarddonner.com/2024/06/26/choosing-the-right-llm-resources/

## Figure out which links are relevant

In [77]:
# System prompt for the link-selection call: a task description, a
# JSON-only instruction, and one example of the expected response shape.
# The three pieces are concatenated with no separator between them.
link_system_prompt = "".join([
    "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n",
    "You should respond in just only JSON format and this is an example:",
    """
{
    "links": [
        {"type": "about page", "url": "link of the page"},
        {"type": "careers page", "url": "link of the page"}
    ]
}
""",
])

In [78]:
# print() rather than a bare expression so the embedded newlines are rendered
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in just only JSON format and this is an example:
{
    "links": [
        {"type": "about page", "url": "link of the page"},
        {"type": "careers page", "url": "link of the page"}
    ]
}



In [79]:
def get_links_user_prompt(website):
    """
    Build the user message asking the model to pick brochure-worthy links.

    Args:
        website: object exposing `url` (str) and `links` (iterable of str),
            such as a Website instance

    Returns:
        The instruction text followed by one link per line.
    """
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    # Adjacent literals instead of a backslash continuation inside the string:
    # the continuation carried the next line's indentation into the prompt
    user_prompt += (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [80]:
# Inspect the exact user message that will be sent for the page fetched above
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the fill https URL in JSON format.     Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2024/11/13/llm-engineering-resources/
https://edwarddonner.com/2024/11/13/llm-engineering-resources/
https://edwarddonner.com/2024/10/16/from-software-engineer-to-ai-data-scientist-resources/
https://edwarddo

In [84]:
def get_links(url):
    """
    Ask the model which links on `url` belong in a company brochure.

    Args:
        url: address of the page whose links should be classified

    Returns:
        A dict with a "links" key holding a list of
        {"type": ..., "url": ...} entries as produced by the model. The
        list is empty when the model's JSON does not have that shape.

    Raises:
        json.JSONDecodeError: if the model reply is not valid JSON
    """
    website = Website(url)
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        # Constrain the reply to valid JSON; without this the model is free
        # to wrap its answer in prose or Markdown fences, breaking json.loads
        format="json",
    )
    result = json.loads(response.message.content)
    # Callers index result["links"], so make sure that key is always a list
    if not isinstance(result, dict):
        return {"links": []}
    if not isinstance(result.get("links"), list):
        result["links"] = []
    return result

In [82]:
# Repeat the link extraction on a second site
anthropic = Website("https://anthropic.com")
anthropic.links

['/',
 '/claude',
 '/team',
 '/enterprise',
 '/api',
 '/pricing',
 '/research',
 '/company',
 '/careers',
 '/news',
 'https://www.anthropic.com/research#entry:8@1:url',
 'https://www.anthropic.com/claude',
 'https://claude.ai/',
 '/api',
 '/news/3-5-models-and-computer-use',
 '/claude/sonnet',
 '/claude/haiku',
 '/news/claude-for-enterprise',
 '/research/constitutional-ai-harmlessness-from-ai-feedback',
 '/news/core-views-on-ai-safety',
 '/jobs',
 '/',
 '/claude',
 '/api',
 '/team',
 '/pricing',
 '/research',
 '/company',
 '/customers',
 '/news',
 '/careers',
 'mailto:press@anthropic.com',
 'https://support.anthropic.com/',
 'https://status.anthropic.com/',
 '/supported-countries',
 'https://twitter.com/AnthropicAI',
 'https://www.linkedin.com/company/anthropicresearch',
 'https://www.youtube.com/@anthropic-ai',
 '/legal/consumer-terms',
 '/legal/commercial-terms',
 '/legal/privacy',
 '/legal/aup',
 '/responsible-disclosure-policy',
 'https://trust.anthropic.com/']

In [85]:
# Let the model pick the brochure-relevant links for that site
get_links("https://anthropic.com")

{'links': [{'type': 'about page', 'url': 'https://www.anthropic.com/company'},
  {'type': 'careers page', 'url': 'https://www.anthropic.com/careers'},
  {'type': 'enterprise page', 'url': 'https://anthropic.ai/'},
  {'type': 'news page', 'url': 'https://news.anthropic.com/'}]}

## Make the brochure

In [92]:
def get_all_details(url):
    """
    Collect the landing page text plus the text of each model-selected link.

    Args:
        url: address of the company's landing page

    Returns:
        One string: a "Landing page:" section followed by one section per
        linked page that could be fetched, each headed by the link's type.

    Raises:
        requests.RequestException: only if the landing page itself cannot
            be fetched; failures on linked pages are reported and skipped.
    """
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links.get("links", []):
        link_url = link.get("url")
        if not link_url:
            continue
        try:
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones ("/careers") against the landing page
            page = Website(urljoin(url, link_url))
        except requests.RequestException as error:
            # The model can return URLs that do not exist; one bad link
            # should not discard everything gathered so far
            print(f"Skipping {link_url}: {error}")
            continue
        result += f"\n\n{link.get('type', 'page')}\n"
        result += page.get_contents()
    return result

In [93]:
# Gather the landing page and the selected linked pages into one string
get_all_details("https://anthropic.com")

Found links: {'links': [{'type': 'about page', 'url': 'https://www.anthropic.com/company'}, {'type': 'careers page', 'url': 'https://www.anthropic.com/careers'}, {'type': 'news/claude for enterprise', 'url': 'https://www.anthropic.com/news/claude-for-enterprise'}]}


"Landing page:\nWebpage Title:\nHome \\ Anthropic\nWebpage Contents:\nClaude\nOverview\nTeam\nEnterprise\nAPI\nPricing\nResearch\nCompany\nCareers\nNews\nAI\nresearch\nand\nproducts\nthat put safety at the frontier\nClaude.ai\nMeet Claude 3.5 Sonnet\nClaude 3.5 Sonnet, our most intelligent AI model, is now available.\nTalk to Claude\nAPI\nBuild with Claude\nStart using Claude to drive efficiency and create new revenue streams.\nLearn more\nAnnouncements\nIntroducing computer use, a new Claude 3.5 Sonnet, and Claude 3.5 Haiku\nOct 22, 2024\nModel updates\n3.5 Sonnet\n3.5 Haiku\nOur Work\nProduct\nClaude for Enterprise\nSep 4, 2024\nAlignment\n·\nResearch\nConstitutional AI: Harmlessness from AI Feedback\nDec 15, 2022\nAnnouncements\nCore Views on AI Safety: When, Why, What, and How\nMar 8, 2023\nWork with Anthropic\nAnthropic is an AI safety and research company based in San Francisco. Our interdisciplinary team has experience across ML, physics, policy, and product. Together, we genera

In [94]:
# System prompt for the brochure-writing call.
# Adjacent literals are used instead of backslash continuations so that the
# space between sentences is explicit; the earlier form produced
# "Respond in markdown.Include details ..." with no separating space.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)



In [95]:
def get_brochure_user_prompt(company_name, url, max_chars=20000):
    """
    Build the user message for the brochure-writing call.

    Args:
        company_name: name inserted into the opening line of the prompt
        url: landing page address; its contents and those of the selected
            linked pages are appended to the prompt
        max_chars: cap on the prompt length in characters, so the scraped
            text cannot grow without bound; pass None to disable truncation

    Returns:
        The prompt text, cut to at most `max_chars` characters.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Slicing with None as the bound keeps the whole string
    return user_prompt[:max_chars]

In [96]:
# Inspect the full user message that the brochure call will receive
get_brochure_user_prompt("Anthropic", "https://anthropic.com")

Found links: {'links': [{'type': 'about page', 'url': 'https://www.anthropic.com/company'}, {'type': 'careers page', 'url': 'https://www.anthropic.com/careers'}, {'type': 'news page', 'url': 'https://www.anthropic.com/news'}, {'type': 'research page', 'url': 'https://www.anthropic.com/research'}, {'type': 'products/ pricing page', 'url': 'https://www.anthropic.com/pricing'}, {'type': 'supported countries page', 'url': 'https://www.anthropic.com/supported-countries'}, {'type': 'trust page', 'url': 'https://trust.anthropic.com/'}]}


"You are looking at a company called: Anthropic\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHome \\ Anthropic\nWebpage Contents:\nClaude\nOverview\nTeam\nEnterprise\nAPI\nPricing\nResearch\nCompany\nCareers\nNews\nAI\nresearch\nand\nproducts\nthat put safety at the frontier\nClaude.ai\nMeet Claude 3.5 Sonnet\nClaude 3.5 Sonnet, our most intelligent AI model, is now available.\nTalk to Claude\nAPI\nBuild with Claude\nStart using Claude to drive efficiency and create new revenue streams.\nLearn more\nAnnouncements\nIntroducing computer use, a new Claude 3.5 Sonnet, and Claude 3.5 Haiku\nOct 22, 2024\nModel updates\n3.5 Sonnet\n3.5 Haiku\nOur Work\nProduct\nClaude for Enterprise\nSep 4, 2024\nAlignment\n·\nResearch\nConstitutional AI: Harmlessness from AI Feedback\nDec 15, 2022\nAnnouncements\nCore Views on AI Safety: When, Why, What, and How\nMar 8, 2023\nWo

In [97]:
def create_brochure(company_name, url):
    """
    Generate a brochure for the company and render it as Markdown.

    Args:
        company_name: name of the company, used in the user prompt
        url: landing page address used to gather the source material

    Returns:
        None; the model's reply is shown in the cell output.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    reply = ollama.chat(model=MODEL, messages=conversation)
    display(Markdown(reply.message.content))

In [99]:
# End-to-end run: scrape, select links, and render the generated brochure
create_brochure("Anthropic", "https://anthropic.com")

Found links: {'links': [{'type': 'about page', 'url': 'https://www.anthropic.com/company'}, {'type': 'careers page', 'url': 'https://www.anthropic.com/careers'}, {'type': 'news page', 'url': 'https://www.anthropic.com/news'}, {'type': 'research page', 'url': 'https://www.anthropic.com/research'}, {'type': 'support page', 'url': 'https://support.anthropic.com/'}, {'type': 'status page', 'url': 'https://status.anthropic.com/'}]}


You want to know about prompt caching with Claude. 

Prompt caching is a technique used to improve the performance and efficiency of large language models like Claude by storing previously computed responses to frequently asked questions or prompts. This way, when the same prompt is asked again, the model can quickly retrieve the stored response instead of re-computing it from scratch.

Claude, being an open-source large language model, provides a built-in caching mechanism for prompt caching. However, I couldn't find any specific information on how to enable or configure prompt caching in Claude's documentation or GitHub repository.

To implement prompt caching with Claude, you might need to:

1.  Store the cached prompts and responses in a database or a file.
2.  Implement a cache expiration policy to ensure that outdated responses are removed from the cache.
3.  Use a cache invalidation mechanism to notify the model when a new version of a response is available.

Here's an example code snippet in Python that demonstrates how you can implement prompt caching using Claude:
```python
import os

class CachedClaude:
    def __init__(self, prompt_cache_dir):
        self.prompt_cache_dir = prompt_cache_dir
        if not os.path.exists(self.prompt_cache_dir):
            os.makedirs(self.prompt_cache_dir)

    def cache_prompt(self, prompt, response):
        cache_file = os.path.join(self.prompt_cache_dir, f"{prompt}.txt")
        with open(cache_file, "w") as f:
            f.write(response)
        return cache_file

    def get_cached_response(self, prompt):
        cache_file = self.cache_dir + f"/{prompt}.txt"
        if os.path.exists(cache_file):
            with open(cache_file, "r") as f:
                response = f.read()
            return response
        else:
            # Return a default response or raise an error if the prompt is not cached
            return "Prompt not found in cache"

# Usage example:
cached_cl Claude = CachedClaude("/path/to/prompt/cache")

prompt = "What is the capital of France?"
response = "Paris"
cache_file = cached_cl Claude.cache_prompt(prompt, response)

# Get the cached response
cached_response = cached_cl Claude.get_cached_response(prompt)
```
Please note that this code snippet is a simplified example and may not cover all edge cases. You may need to modify it to suit your specific use case.

Also, keep in mind that prompt caching can lead to inconsistencies if multiple models are using the same cache, or if new responses are generated after the original response was cached. To mitigate these issues, you may want to consider implementing additional mechanisms such as:

*   Using a distributed caching system to ensure consistency across different instances of the model.
*   Implementing a version control system for the responses to track changes and prevent inconsistencies.

In summary, prompt caching with Claude can be implemented by storing previously computed responses to frequently asked questions in a cache directory. However, you'll need to consider additional factors such as cache expiration, invalidation, and consistency across different models and instances.