Web Research Tool

In [8]:
# Ollama must already be installed on this machine; pulling llama3.2 lets pages be processed locally, for free
!ollama pull llama3.2

[?2026h[?25l[1Gpulling manifest â ‹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest â ™ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest â ¹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest â ¸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest â ¼ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling dde5aa3fc5ff... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–� 2.0 GB                         [K
pulling 966de95ca8a6... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–� 1.4 KB                         [K
pulling fcc5a6bec9da... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–� 7.7 KB                         [K
pulling a70ff7e570d9... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–� 6.0 KB                         [K
pulling 56bb8bd477a5... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–�   96 B                         [K
pulling 34bb5ab01051... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–�  5

In [9]:
#Imports
import os
import requests
from bs4 import BeautifulSoup
from typing import List
from dotenv import load_dotenv
from openai import OpenAI
import google.generativeai
import anthropic
from IPython.display import Markdown, display, update_display
import ollama

In [10]:
# Read API keys from a .env file into the process environment, then report
# which ones are present. Only a short prefix of each key is printed, as a
# debugging aid.

load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')

# The slice is only evaluated when the key is set, so a missing key never
# reaches the subscript.
print(f"OpenAI API Key exists and begins {openai_api_key[:8]}" if openai_api_key else "OpenAI API Key not set")
print(f"Anthropic API Key exists and begins {anthropic_api_key[:7]}" if anthropic_api_key else "Anthropic API Key not set")
print(f"Google API Key exists and begins {google_api_key[:8]}" if google_api_key else "Google API Key not set")


OpenAI API Key exists and begins sk-proj-
Anthropic API Key exists and begins sk-ant-
Google API Key exists and begins AIzaSyBX


In [11]:
# Connect to OpenAI, Anthropic and Google; comment out the Claude or Google lines if you're not using them
# No key is passed explicitly to any of these calls — presumably each library
# picks its key up from the environment populated by load_dotenv (TODO confirm).

# Module-level client used by stream_gpt.
openai = OpenAI()

# Module-level client used by stream_claude.
claude = anthropic.Anthropic()

# Configures the Google SDK; nothing else visible in this notebook calls it yet.
google.generativeai.configure()

In [12]:
# A class to represent a Webpage
class Website:
    """A web page fetched over HTTP and reduced to its title and visible text.

    ``__init__`` fills in ``url`` (the address requested), ``body`` (the raw
    response bytes), ``title`` and ``text``.
    """
    url: str
    title: str
    text: str

    def __init__(self, url, timeout=30):
        """Download ``url`` and extract its title and text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds handed to ``requests.get`` so a stalled server
                cannot hang the notebook indefinitely.

        Raises:
            Whatever ``requests.get`` or the parser raises; nothing is caught
            here.
        """
        self.url = url
        response = requests.get(url, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # <title> may be missing, or present without a single string child.
        has_title = soup.title and soup.title.string
        self.title = soup.title.string if has_title else "No title found"
        # Some pages have no <body>; fall back to the whole document instead
        # of failing on ``None``.
        root = soup.body if soup.body is not None else soup
        for irrelevant in root(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = root.get_text(separator="\n", strip=True)

    def get_contents(self):
        """Return the title and text as one labelled string for use in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [13]:
# Function to stream responses from OpenAI GPT models
def stream_gpt(prompt, mod, history):
    """Stream a chat completion from the OpenAI client, one text fragment at a time.

    The request is the module-level ``system_message``, followed by
    ``history``, followed by ``prompt`` as the user turn.

    Args:
        prompt: Text of the user message.
        mod: Model name passed to the API.
        history: List of earlier message dicts. Once the stream is exhausted,
            the assistant's full reply is appended to it in place.

    Yields:
        Each non-empty text fragment as it arrives.
    """
    conversation = [{"role": "system", "content": system_message}]
    conversation = conversation + history
    conversation = conversation + [{"role": "user", "content": prompt}]

    stream = openai.chat.completions.create(
        model=mod,
        messages=conversation,
        stream=True
    )

    full_reply = ""
    for event in stream:
        fragment = event.choices[0].delta.content
        if not fragment:
            # Chunks with no text (None or "") carry nothing to emit.
            continue
        full_reply = full_reply + fragment
        yield fragment
    # Runs only once the caller has consumed the whole stream.
    history.append({"role": "assistant", "content": full_reply})

# Function to stream responses from Claude models
def stream_claude(prompt, mod, history=None):
    """Stream a Claude reply to ``prompt``, yielding the text accumulated so far.

    Every yielded value is the whole reply received up to that point, not
    just the newest fragment. The module-level ``system_message`` is sent as
    the system prompt.

    Args:
        prompt: Text of the single user message sent to the model.
        mod: Claude model name.
        history: Optional list. When given, the finished reply is appended to
            it as an assistant message once the stream is exhausted. It is not
            sent to the model. Previously the function appended to a name
            ``history`` that was neither a parameter nor defined, which raised
            ``NameError`` at the end of every stream.

    Yields:
        The cumulative reply text after each non-empty fragment.
    """
    result = claude.messages.stream(
        model=mod,
        max_tokens=1000,
        temperature=0.7,
        system=system_message,
        messages=[
            {"role": "user", "content": prompt},
        ],
    )
    response = ""
    with result as stream:
        for text in stream.text_stream:
            if text:
                response += text
                yield response
    # Record the reply only when the caller supplied somewhere to put it.
    if history is not None:
        history.append({"role": "assistant", "content": response})


def stream_ollama(prompt, MODEL, system_message):
    """Ask a local Ollama model to answer ``prompt`` and return the whole reply.

    The reply is requested in streaming mode, but the pieces are gathered
    here and returned as one string rather than yielded.

    Args:
        prompt: Text of the user message.
        MODEL: Name of the Ollama model to use.
        system_message: Text of the system message placed before the prompt.

    Returns:
        The full reply text, or ``None`` if anything raised (the error is
        printed).
    """
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]
    try:
        pieces = []
        for part in ollama.chat(model=MODEL, messages=conversation, stream=True):
            # Ignore chunks that carry no message content.
            if 'message' in part and 'content' in part['message']:
                pieces.append(part['message']['content'])
        return "".join(pieces)
    except Exception as e:
        print(f"Ollama Error: {e}")
        return None

In [14]:
# System prompt used by the stream_* helpers: it sets the model's persona and
# the required output (a page summary, word-for-word quotes, and how those
# quotes relate to other sacred texts).
system_message = """You are a deep esoteric mystic. You are a devout student of all things sacred and you treat each text you read with deep respect. 
You have expert inside knowledge about religion and mysticism from around the world. You make excellent notes
For each page, make a summary of the entire page, extract important quotes word for word, and then separately relate those quotes to other sacred texts. 
Include as many quotes as possible and do not add any unnecessary text. Maximize your word count in every output. Make sure at least 65% of all text is quotes from the input."""

In [15]:
import requests
from bs4 import BeautifulSoup

# A class to represent a Webpage
class Website:
    """A web page fetched over HTTP and reduced to its title and visible text.

    Construction never raises: if the fetch or the parse fails, the error is
    printed, ``title`` is set to ``"Error"``, ``text`` holds the error
    message and ``body`` is ``None``. Callers can test ``body is None`` to
    detect a failed fetch.
    """
    url: str
    title: str
    text: str

    def __init__(self, url, timeout=30):
        """Download ``url`` and extract its title and text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds handed to ``requests.get`` so a stalled server
                cannot hang a long scraping run. A timeout is reported
                through the same request-error path as any other failure.
        """
        self.url = url
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise HTTPError for bad responses
            self.body = response.content
            soup = BeautifulSoup(self.body, 'html.parser')
            # <title> may be missing, or present without a single string child.
            has_title = soup.title and soup.title.string
            self.title = soup.title.string if has_title else "No title found"
            # Some pages have no <body>; use the whole document rather than
            # reporting the page as an error.
            root = soup.body if soup.body is not None else soup
            for irrelevant in root(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = root.get_text(separator="\n", strip=True)
        except requests.exceptions.RequestException as e:
            print(f"Request error during Website init: {e}")
            self.title = "Error"
            self.text = f"Request error: {e}"
            self.body = None
        except Exception as e:
            print(f"Error during Website init: {e}")
            self.title = "Error"
            self.text = f"Error: {e}"
            self.body = None

    def get_contents(self):
        """Return the title and text as one labelled string for use in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

def stream_scrape(url, model, file_path, system_message):
    """Fetch ``url``, have the chosen model process the page, and save the reply.

    Args:
        url: Page to fetch.
        model: One of "GPT-Mini", "4o", "Haiku", "Sonnet" or "Ollama".
        file_path: Destination file; overwritten and written as UTF-8.
        system_message: System prompt forwarded to ``stream_ollama``. The GPT
            and Claude helpers are called without it, exactly as before.

    Returns:
        None. Every failure, including an unknown ``model``, is reported with
        ``print`` rather than raised.
    """
    gpt_models = {"GPT-Mini": 'gpt-4o-mini', "4o": 'gpt-4o'}
    claude_models = {"Haiku": 'claude-3-haiku-20240307', "Sonnet": 'claude-3-sonnet-20240229'}
    try:
        website = Website(url)
        # A Website whose body is None could not be fetched; its text is only
        # an error message, so do not send that to the model or save a reply.
        if getattr(website, "body", None) is None:
            print(f"Scraping failed for {url} with model {model}. No data written to {file_path}")
            return
        prompt = website.get_contents()  # Page title and contents

        if model in gpt_models:
            # stream_gpt yields successive fragments: join them into the reply.
            response = "".join(stream_gpt(prompt, gpt_models[model], history=[]))
        elif model in claude_models:
            # stream_claude yields the cumulative text: keep the last value.
            response = None
            for response in stream_claude(prompt, claude_models[model]):
                pass
        elif model == "Ollama":
            response = stream_ollama(prompt, "llama3.2", system_message)
        else:
            raise ValueError("Unknown model")

        if response is not None:
            print("Response generated successfully")
            with open(file_path, "w", encoding="utf-8") as file:
                file.write(response)
            print(response)
            print(f"Scraped text written to {file_path}")
        else:
            print(f"Scraping failed for {url} with model {model}. No data written to {file_path}")
    except Exception as e:
        print(f"An error occurred during scraping: {e}")

def stream_ollama(prompt, MODEL, system_message):
    """Ask a local Ollama model to answer ``prompt`` and return the whole reply.

    The reply is requested in streaming mode, but the pieces are gathered
    here and returned as one string rather than yielded. A chunk that cannot
    be processed is reported and skipped; the rest of the reply is kept.

    Args:
        prompt: Text of the user message.
        MODEL: Name of the Ollama model to use.
        system_message: Text of the system message placed before the prompt.

    Returns:
        The collected reply text, or ``None`` if the request itself raised
        (the error is printed).
    """
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]
    try:
        collected = ""
        for part in ollama.chat(model=MODEL, messages=conversation, stream=True):
            try:
                # Nothing to add when the chunk carries no message content.
                if 'message' not in part or 'content' not in part['message']:
                    continue
                collected = collected + part['message']['content']
            except Exception as e:
                # One malformed chunk should not discard the whole reply.
                print(f"Error processing chunk: {e}")
        return collected
    except Exception as e:
        print(f"Ollama Error: {e}")
        return None


# Example Usage (replace with your actual URL and file path)
#stream_scrape("https://sacred-texts.com/cla/hh/hh1010.htm", "Ollama", r"C:/Users/Lucian/LLM_Platform/llm_engineering/tester.md", system_message)

The system message instructs the model how to summarize the raw text extracted by Beautiful Soup.

Manually enter the web pages you wish to scrape.

In [17]:
# Build the URL of every page to scrape: pyt03.htm through pyt62.htm.
# Page numbers are zero-padded to two digits to match the site's file names.
web_list = [
    "https://www.sacred-texts.com/egy/pyt/pyt" + str(page).zfill(2) + ".htm"
    for page in range(3, 63)
]

In [18]:
# Display the generated URLs so they can be checked before scraping starts.
web_list

['https://www.sacred-texts.com/egy/pyt/pyt03.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt04.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt05.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt06.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt07.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt08.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt09.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt10.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt11.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt12.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt13.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt14.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt15.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt16.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt17.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt18.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt19.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt20.htm',
 'https://www.sacred-texts.com/egy/pyt/pyt21.htm',
 'https://www.sacred-texts.com/

In [None]:
# Use Ollama for free webscraping without calling an API.
# NOTE: file_loc is specific to the author's machine; point it at your own
# output folder before running.
file_loc = r"C:/Users/Lucian/Vizier/sacred_collection/Egyptian/"

# Create the folder up front so a missing directory is not discovered only
# after the model has already processed a page.
os.makedirs(file_loc, exist_ok=True)

for w in web_list:
    print(w)
    # Name each output after its page: ".../pyt03.htm" -> "pyt03.md".
    # (The previous w[-8:] slice produced "yt03.htm.md".)
    page_name = os.path.splitext(w.rsplit("/", 1)[-1])[0]
    stream_scrape(w, "Ollama", os.path.join(file_loc, page_name + ".md"), system_message)

https://www.sacred-texts.com/egy/pyt/pyt03.htm
Response generated successfully
Here is the revised version of the introduction with the changes in numbering:

Introduction

This translation of the Pyramid Texts is based on the edition by Sethe (1906-1916) with corrections and additions by other scholars. The text is divided into sections, with each section containing a group of related utterances or spells.

The following pages contain the translations of the Pyramid Texts, with commentary at appropriate points.

CHANGES IN NUMBERING OF LINES

Note: The changes in numbering are listed below:

* 1757 = Sethe
* 1760a = "1760b"
* 1760b = "1760c"
* 1825a = "1825a-b"
* 1845b = "1845"
* 1857a = "1857"
* 1886a (in part) = "
* 1887b (in part) = "
* 1902c = "
* 1902d = "
* 1903a = "
* 1906f-g = "
* 1909c = "
* 1909d = "
* 1908a-b = "
* 1928a-b = "
* 1930 = "
* 1939 = "

CHANGES IN NUMBERING OF UTTERANCES

Note: The changes in numbering are listed below:

* Ut. 665 (§§ 1898-1907)
* "665A (§§ 190