In [60]:
import openai
import os
import pyperclip

import requests
from bs4 import BeautifulSoup
import tiktoken

from dotenv import load_dotenv, find_dotenv
from langchain.llms import OpenAI
from langchain.document_loaders import BSHTMLLoader, WebBaseLoader
from langchain.chat_models import ChatOpenAI

# Read the nearest .env file (as located by find_dotenv) into the process
# environment; the result is bound to `_` and not used again.
_ = load_dotenv(find_dotenv())
# Hand the key to the openai module. A missing OPENAI_API_KEY yields None here
# and is not reported at this point.
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Two streaming chat-model handles that differ only in the model name:
# the standard gpt-3.5-turbo and its 16k variant.
llm_default = ChatOpenAI(streaming=True, model_name="gpt-3.5-turbo")
llm_16k = ChatOpenAI(streaming=True, model_name="gpt-3.5-turbo-16k")

def num_tokens_from_string(string: str, encoding_name: str = "gpt-3.5-turbo") -> int:
    """Count the tokens tiktoken produces for ``string``.

    Args:
        string: Text to tokenize.
        encoding_name: Tried first as a tiktoken encoding name. If
            ``tiktoken.get_encoding`` rejects it with ``ValueError``, the same
            value is passed to ``tiktoken.encoding_for_model`` instead. The
            default looks like a model name rather than an encoding name, so
            it presumably resolves through that second lookup.

    Returns:
        The length of the encoded token sequence.

    Raises:
        Whatever ``tiktoken.encoding_for_model`` raises when the name is
        neither a known encoding nor a known model.
    """
    try:
        tokenizer = tiktoken.get_encoding(encoding_name)
    except ValueError:
        # Not an encoding name; treat it as a model name instead.
        tokenizer = tiktoken.encoding_for_model(encoding_name)
    return len(tokenizer.encode(string))

In [50]:
# Completion-style LangChain LLM built with default arguments.
# NOTE(review): `llm` is not referenced by any other cell visible here.
llm = OpenAI()

# Wikipedia pages used throughout the notebook. Later cells index into this
# list by position, so the order matters.
wikipedia_articles = [
    "https://en.wikipedia.org/wiki/Large_language_model",
    "https://en.wikipedia.org/wiki/Transformer_(machine_learning_model)",
    "https://en.wikipedia.org/wiki/Dual-phase_evolution",
    "https://en.wikipedia.org/wiki/Tessellation",
    "https://en.wikipedia.org/wiki/Climate_change",
    "https://en.wikipedia.org/wiki/DNA_nanotechnology",
    "https://en.wikipedia.org/wiki/Self-driving_car",
    "https://en.wikipedia.org/wiki/Unmanned_aerial_vehicle",
    "https://en.wikipedia.org/wiki/2022%E2%80%932023_food_crises",
    "https://en.wikipedia.org/wiki/Economic_impacts_of_climate_change",
]

# Index 2 is the "Dual-phase evolution" URL.
dual_phase = wikipedia_articles[2]

# Index 0 is the "Large language model" URL; this is the page the following
# download and WebBaseLoader cells work on.
article = wikipedia_articles[0]

# Clear the results up front so that a failed or interrupted download cannot
# leave values from an earlier run of this cell in the kernel.
html_content = None
soup = None

# Send an HTTP request to the URL. The timeout (in seconds) stops the cell
# from blocking forever when the server never answers; requests raises a
# timeout exception instead.
response = requests.get(article, timeout=30)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Get the HTML content from the response
    html_content = response.text

    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
else:
    # `html_content` and `soup` stay None so later cells can detect the failure.
    print(f"Failed to download the webpage. Status code: {response.status_code}")

In [18]:
# Load the same `article` URL through LangChain's WebBaseLoader.
loader = WebBaseLoader(article)
# load() returns a list of documents; the recorded output shows one for this URL.
data = loader.load()
len(data)

1

In [56]:
# NOTE: despite its name, `dual_phase` points at the Climate change article
# here, not at wikipedia_articles[2] (Dual-phase evolution). The name is kept
# so that anything else relying on it keeps working.
dual_phase = 'https://en.wikipedia.org/wiki/Climate_change'

# Time out rather than hang on a stalled connection, and fail loudly on an
# HTTP error so the headings of an error page are never listed as if they
# belonged to the article.
response_dp = requests.get(dual_phase, timeout=30)
response_dp.raise_for_status()
html_content_dp = response_dp.text
soup_dp = BeautifulSoup(html_content_dp, 'html.parser')

# Collect the <title> tags first, then every heading tag (h1 to h6), each
# group in the order find_all returns it.
title_tags = soup_dp.find_all('title')
header_tags = soup_dp.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

# Extract the whitespace-trimmed text of every collected tag.
titles_and_headers = [tag.text.strip() for tag in [*title_tags, *header_tags]]

# Print the extracted headers and titles, one per line
for item in titles_and_headers:
    print(item)


Climate change - Wikipedia
Contents
Climate change
Terminology
Observed temperature rise
Attribution of recent temperature rise
Greenhouse gases
Aerosols and clouds
Land surface changes
Solar and volcanic activity
Climate change feedback
Modelling
Impacts
Environmental effects
Tipping points and long-term impacts
Nature and wildlife
Humans
Food and health
Livelihoods
Reducing and recapturing emissions
Clean energy
Energy conservation
Agriculture and industry
Carbon sequestration
Adaptation
Policies and politics
Policy options
Climate justice
International climate agreements
National responses
Society
Denial and misinformation
Public awareness and opinion
Climate movement
History
Early discoveries
Development of a scientific consensus
See also
References
Sources
IPCC reports
Other peer-reviewed sources
Books, reports and legal documents
Non-technical sources
Further reading
External links


In [48]:
# Fetch every URL in one WebBaseLoader call, then report a token count per
# article. Documents are paired with URLs by position, which assumes the
# loader returns one document per URL in input order.
all_articles = WebBaseLoader(wikipedia_articles).load()
print('Total number of articles: ', len(all_articles))
for idx, (document, url) in enumerate(zip(all_articles, wikipedia_articles)):
    token_count = num_tokens_from_string(document.page_content)
    print(f"Article {idx} contains {token_count:,} tokens - {url}")

Total number of articles:  10
Article 0 contains 23,811 tokens - https://en.wikipedia.org/wiki/Large_language_model
Article 1 contains 15,847 tokens - https://en.wikipedia.org/wiki/Transformer_(machine_learning_model)
Article 2 contains 3,643 tokens - https://en.wikipedia.org/wiki/Dual-phase_evolution
Article 3 contains 14,128 tokens - https://en.wikipedia.org/wiki/Tessellation
Article 4 contains 51,822 tokens - https://en.wikipedia.org/wiki/Climate_change
Article 5 contains 17,996 tokens - https://en.wikipedia.org/wiki/DNA_nanotechnology
Article 6 contains 34,859 tokens - https://en.wikipedia.org/wiki/Self-driving_car
Article 7 contains 24,120 tokens - https://en.wikipedia.org/wiki/Unmanned_aerial_vehicle
Article 8 contains 24,161 tokens - https://en.wikipedia.org/wiki/2022%E2%80%932023_food_crises
Article 9 contains 14,238 tokens - https://en.wikipedia.org/wiki/Economic_impacts_of_climate_change


In [58]:
# Select the article with the fewest tokens instead of hard-coding its list
# position, so the choice stays correct if the URL list is reordered or
# extended. With the token counts printed for the ten articles this is still
# index 2 (Dual-phase evolution); ties resolve to the earliest article.
shortest_article = min(
    all_articles,
    key=lambda doc: num_tokens_from_string(doc.page_content),
)
# Display the raw extracted page text of the selected article.
shortest_article.page_content

'\n\n\n\nDual-phase evolution - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate\n\n\n\n\n\t\tContribute\n\t\n\nHelpLearn to editCommunity portalRecent changesUpload file\n\n\n\n\nLanguages\n\nLanguage links are at the top of the page across from the title.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\nCreate accountLog in\n\n\n\n\n\nPersonal tools\n\n\n\n\n Create account Log in\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n\n\n\n\n\n1Introduction\n\n\n\n\n\n\n\n2The DPE mechanism\n\n\n\nToggle The DPE mechanism subsection\n\n\n\n\n\n2.1Underlying network\n\n\n\n\n\n\n\n2.2Phase shifts\n\n\n\n\n\n\n\n2.3Sel

In [61]:
# Ask the 16k model for a summary, passing the whole extracted page text
# (navigation and menu text included) inside the prompt.
llm_16k.predict(f"Summarize the following text: {shortest_article.page_content}")

"Dual-phase evolution is a process that drives self-organization within complex adaptive systems. It occurs when a system switches between different phases, each with different processes acting on the system's components. Dual-phase evolution is found in various physical, biological, and social systems. It has applications in technology, such as manufacturing novel materials and solving complex computational problems. Examples of dual-phase evolution include social networks, socio-economics, forest ecology, and search algorithms. It is related to self-organized criticality but differs in several fundamental ways."

In [45]:
# llm_default.predict(f'Translate the following text into German: {all_articles[3].page_content}')
# Ask the 16k model to translate article 3 (the Tessellation page) into German.
# NOTE(review): that article was counted at 14,128 tokens while the stored
# reply was counted at only 545, so the translation is probably truncated or
# partial — confirm before relying on `output`.
output = llm_16k.predict(f'Translate the following text into German: {all_articles[3].page_content}')

In [47]:
# Token count of the translation reply stored in `output`.
num_tokens_from_string(output)

545

In [21]:
# Show which URL sits at index 3 (the article used for the translation test).
wikipedia_articles[3]

'https://en.wikipedia.org/wiki/Tessellation'

In [31]:
# NOTE(review): this character count is computed but never shown — only the
# last expression of a cell is displayed.
len(all_articles[3].page_content)

# NOTE(review): `enc` is not used by any cell visible here.
enc = tiktoken.encoding_for_model('gpt-3.5-turbo')
# Rough word count: the number of pieces obtained by splitting on single spaces.
len(all_articles[3].page_content.split(' '))

6375

In [36]:
# Token count of article 3's extracted page text, using the helper's default name.
num_tokens_from_string(all_articles[3].page_content)

14128