In [3]:
# Imports
import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

ERROR! Session/line number was not unique in database. History logging moved to new session 17


In [4]:
# Load environment variables
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Guard clauses: stop at the first problem found with the key, otherwise confirm it.
if not api_key:
    raise Exception('API key is not found!')
if api_key != api_key.strip():
    raise Exception('API key is found but it has leading or trailing spaces which should be removed')
print('API key is found!')

API key is found!


In [5]:
# Hello chatgpt! Client plus a minimal one-message greeting request.
# This is expected to fail if the API key is not set correctly.
openai = OpenAI()
model = 'gpt-4o-mini'
messages = [
    dict(role='user', content='Hello llm!'),
]
# The request itself is left disabled; uncomment the two lines below to send it.
# response = openai.chat.completions.create(model=model, messages=messages)
# print(response.choices[0].message.content)

## Web Scraping for Content

In [6]:
# Web scraping example

# Request headers passed to requests.get by the Website class below.
# NOTE(review): presumably a browser-like User-Agent is sent so that sites which
# reject non-browser clients still respond — confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

class Website:
    """
    A web page downloaded once at construction time and reduced to title and text.

    Attributes:
        url: The address that was requested.
        content: Set to None by the constructor and not otherwise used in this class.
        title: Text of the page's <title> tag, or 'No title found' when the tag
            is missing or has no single string value.
        text: Text extracted from the parsed page, fragments stripped and
            joined with newlines.
    """

    def __init__(self, url, timeout=10):
        """
        Fetch `url` with the module-level `headers` and parse it with BeautifulSoup.

        Args:
            url: Address of the page to download.
            timeout: Seconds handed to requests.get as its timeout. Defaults to 10.

        Raises:
            requests.exceptions.Timeout: If the server does not respond within
                `timeout` seconds (previously the call could block indefinitely).
        """
        self.url = url
        self.content = None
        # Without an explicit timeout, requests may wait forever on an unresponsive server.
        response = requests.get(url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        # `.string` is None when <title> is empty or has several children, so the
        # fallback must cover that case as well as a missing <title> tag.
        page_title = soup.title.string if soup.title else None
        self.title = page_title if page_title is not None else 'No title found'
        self.text = soup.get_text(separator='\n', strip=True)

In [14]:
# Scrape the Wikipedia portal page once; `wikipedia` is reused by later cells.
url = 'https://www.wikipedia.org/'

wikipedia = Website(url)
# Quick sanity check of what was scraped.
print(f'Title of the webpage: {wikipedia.title}')
print(f'Content of the webpage: {wikipedia.text[:100]}...')  # Print the first 100 characters of the content.

Title of the webpage: Wikipedia
Content of the webpage: Wikipedia
Wikipedia
The Free Encyclopedia
English
7,009,000+
articles
日本語
1,462,000+
記事
Русский
2 05...


## Prompts

### Types of prompt
- System prompt - Sets the type of task and the tone the model should use.
- User prompt - The conversation message the model should reply to.

In [8]:
# Instructions given to the model in the 'system' role: summarize concisely,
# skip markup/navigation text, and answer in markdown.
system_prompt = (
    'You are an AI assistant that summarizes webpages. '
    'Provide a concise summary of the content. '
    'Ignore any HTML tags and navigation related text and focus on the main text. '
    'Respond in markdown format.'
)

def get_user_prompt(website):
    """
    Build the user prompt asking for a summary of `website`.

    Args:
        website: Object exposing `url`, `title` and `text` attributes
            (for example a Website instance).

    Returns:
        A string naming the page's URL and title, followed by a newline and
        the page text.
    """
    # Use the URL stored on the object itself. Reading the notebook-global `url`
    # mislabels every page other than the one that global happens to point at,
    # and fails on a fresh kernel where the global is not defined.
    return (
        f'Summarize the content of the webpage {website.url} with the title "{website.title}". '
        f'The content of the website is as follows:\n{website.text}'
    )

# Show both prompts. The user prompt embeds the full page text, so this output is long.
print(f'System prompt: {system_prompt}')
print(f'User prompt: {get_user_prompt(wikipedia)}')



System prompt: You are an AI assistant that summarizes webpages. Provide a concise summary of the content. Ignore any HTML tags and navigation related text and focus on the main text. Respond in markdown format.
User prompt: Summarize the content of the webpage https://www.wikipedia.org/ with the title "Wikipedia". The content of the website is as follows:
Wikipedia
Wikipedia
The Free Encyclopedia
English
7,009,000+
articles
日本語
1,462,000+
記事
Русский
2 050 000+
статей
Deutsch
3.024.000+
Artikel
Español
2.041.000+
artículos
Français
2 690 000+
articles
中文
1,482,000+
条目 / 條目
Italiano
1.922.000+
voci
Português
1.148.000+
artigos
فارسی
۱٬۰۴۳٬۰۰۰+
مقاله
Search Wikipedia
Afrikaans
العربية
Asturianu
Azərbaycanca
Български
閩南語 / Bân-lâm-gú
বাংলা
Беларуская
Català
Čeština
Cymraeg
Dansk
Deutsch
Eesti
Ελληνικά
English
Español
Esperanto
Euskara
فارسی
Français
Galego
한국어
Հայերեն
हिन्दी
Hrvatski
Bahasa Indonesia
Italiano
עברית
ქართული
Ladin
Latina
Latviešu
Lietuvių
Magyar
Македонски
Malagasy
مصرى
Ba

## Messages

In [None]:
## Simple message.

# Two-entry chat: a system entry that sets the persona, then the user's question.
messages = [
    dict(
        role='system',
        content='You are an AI assistant that is very funny but also an expert in mathematics.',
    ),
    dict(
        role='user',
        content='What is the square root of 16?',
    ),
]

In [9]:
def get_messages(website: Website) -> list:
    """
    Build the two-entry chat payload used to summarize `website`.

    The first entry carries the module-level system prompt in the 'system'
    role; the second carries the prompt produced by get_user_prompt for
    `website` in the 'user' role.
    """
    system_message = {'role': 'system', 'content': system_prompt}
    user_message = {'role': 'user', 'content': get_user_prompt(website)}
    return [system_message, user_message]

# Inspect the raw message list built for the Wikipedia page.
print(get_messages(wikipedia))

[{'role': 'system', 'content': 'You are an AI assistant that summarizes webpages. Provide a concise summary of the content. Ignore any HTML tags and navigation related text and focus on the main text. Respond in markdown format.'}, {'role': 'user', 'content': 'Summarize the content of the webpage https://www.wikipedia.org/ with the title "Wikipedia". The content of the website is as follows:\nWikipedia\nWikipedia\nThe Free Encyclopedia\nEnglish\n7,009,000+\narticles\n日本語\n1,462,000+\n記事\nРусский\n2\xa0050\xa0000+\nстатей\nDeutsch\n3.024.000+\nArtikel\nEspañol\n2.041.000+\nartículos\nFrançais\n2\u202f690\u202f000+\narticles\n中文\n1,482,000+\n条目 / 條目\nItaliano\n1.922.000+\nvoci\nPortuguês\n1.148.000+\nartigos\nفارسی\n۱٬۰۴۳٬۰۰۰+\nمقاله\nSearch Wikipedia\nAfrikaans\nالعربية\nAsturianu\nAzərbaycanca\nБългарски\n閩南語 / Bân-lâm-gú\nবাংলা\nБеларуская\nCatalà\nČeština\nCymraeg\nDansk\nDeutsch\nEesti\nΕλληνικά\nEnglish\nEspañol\nEsperanto\nEuskara\nفارسی\nFrançais\nGalego\n한국어\nՀայերեն\nहिन्दी\nHr

## Summarize Website

In [None]:
def summarize_website_with_openai(website, model='gpt-4o-mini'):
    """
    Summarize `website` with an OpenAI chat model.

    Args:
        website: Page object accepted by get_messages.
        model: Name of the chat model to request. Defaults to 'gpt-4o-mini',
            the value that was previously hard-coded.

    Returns:
        An IPython Markdown object wrapping the content of the first choice
        in the response.
    """
    response = openai.chat.completions.create(
        model=model,
        messages=get_messages(website),
    )
    return Markdown(response.choices[0].message.content)

# Last expression in the cell, so the returned Markdown becomes the cell's output.
summarize_website_with_openai(wikipedia)

In [16]:
import ollama

def summarize_website_with_ollama(website, model='llama3.2'):
    """
    Summarize `website` with a model served through the ollama client.

    Args:
        website: Page object accepted by get_messages.
        model: Name of the model to request. Defaults to 'llama3.2', the value
            that was previously hard-coded.

    Returns:
        An IPython Markdown object wrapping response['message']['content'].
    """
    response = ollama.chat(
        model=model,
        messages=get_messages(website),
        stream=False,
    )
    return Markdown(response['message']['content'])

# Last expression in the cell, so the returned Markdown becomes the cell's output.
summarize_website_with_ollama(wikipedia)

**Wikipedia**
================

Wikipedia is a free online encyclopedia with over 7 million articles in more than 300 languages.

**Features and Statistics**

* Over 1.5 billion unique visitors per month
* 7,009,000+ articles available in English
* 3,024,000+ articles available in German
* 2,041,000+ articles available in Spanish
* Available in over 300 languages with more than 100,000 articles each

**Support and Resources**

* Supported by the Wikimedia Foundation, a non-profit organization
* Can be supported with donations to continue operations
* Official Wikipedia app available for Android and iOS devices
* Free media collection on Commons
* Free travel guide on Wikivoyage

**Other Projects and Resources**

* Wiktionary: free dictionary
* Wikibooks: free textbooks
* Wikinews: free news source
* Wikidata: free knowledge base
* Wikiversity: free learning resources
* Wikiquote: free quote compendium