In [16]:
import openai
import os
import pyperclip
from pprint import pprint

import requests
from bs4 import BeautifulSoup, NavigableString, Comment
import tiktoken

from dotenv import load_dotenv, find_dotenv
from langchain.llms import OpenAI
from langchain.document_loaders import BSHTMLLoader, WebBaseLoader
from langchain.chat_models import ChatOpenAI

_ = load_dotenv(find_dotenv())
openai.api_key = os.getenv('OPENAI_API_KEY')

llm_default = ChatOpenAI(model_name="gpt-3.5-turbo", streaming=True)
llm_16k = ChatOpenAI(model_name="gpt-3.5-turbo-16k", streaming=True)

def num_tokens_from_string(string: str, encoding_name: str = "gpt-3.5-turbo") -> int:
    """Returns the number of tokens in a text string."""
    try:
        encoding = tiktoken.get_encoding(encoding_name)
    except ValueError:
        encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [2]:
llm = OpenAI()

wikipedia_articles = [
    "https://en.wikipedia.org/wiki/Large_language_model",
    "https://en.wikipedia.org/wiki/Transformer_(machine_learning_model)",
    "https://en.wikipedia.org/wiki/Dual-phase_evolution",
    "https://en.wikipedia.org/wiki/Tessellation",
    "https://en.wikipedia.org/wiki/Climate_change",
    "https://en.wikipedia.org/wiki/DNA_nanotechnology",
    "https://en.wikipedia.org/wiki/Self-driving_car",
    "https://en.wikipedia.org/wiki/Unmanned_aerial_vehicle",
    "https://en.wikipedia.org/wiki/2022%E2%80%932023_food_crises",
    "https://en.wikipedia.org/wiki/Economic_impacts_of_climate_change",
]

dual_phase = wikipedia_articles[2]

article = wikipedia_articles[0]

# Send an HTTP request to the URL
response = requests.get(article)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Get the HTML content from the response
    html_content = response.text

    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
else:
    print(f"Failed to download the webpage. Status code: {response.status_code}")

In [3]:
loader = WebBaseLoader(article)
data = loader.load()
len(data)

1

In [4]:
dual_phase = 'https://en.wikipedia.org/wiki/Dual-phase_evolution'
# dual_phase = 'https://en.wikipedia.org/wiki/Climate_change'
response_dp = requests.get(dual_phase)
html_content_dp = response_dp.text
soup_dp = BeautifulSoup(html_content_dp, 'html.parser')

# Extract all headers and titles
titles_and_headers = []

# Find all title tags and extract their text
title_tags = soup_dp.find_all('title')
for title in title_tags:
    titles_and_headers.append(title.text.strip())

# Find all header tags (h1 to h6) and extract their text
header_tags = soup_dp.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
for header in header_tags:
    titles_and_headers.append(header.text.strip())

# Print the extracted headers and titles
for item in titles_and_headers:
    print(item)


Dual-phase evolution - Wikipedia
Contents
Dual-phase evolution
Introduction[edit]
The DPE mechanism[edit]
Underlying network[edit]
Phase shifts[edit]
Selection and variation[edit]
System memory[edit]
Examples[edit]
Social networks[edit]
Socio-economics[edit]
Forest ecology[edit]
Search algorithms[edit]
Related processes[edit]
References[edit]


In [14]:
soup_dp.h1

<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">Dual-phase evolution</span></h1>

In [78]:
url = 'https://en.wikipedia.org/wiki/Dual-phase_evolution'
url = 'https://en.wikipedia.org/wiki/Self-driving_car'
r = requests.get(url)
html_content = r.text

# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')

# Remove unwanted tags: script, style, [document], head, title
for element in soup(["script", "style", "head", "title", "[document]"]):
    element.decompose()

# Also remove HTML comments
for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
    element.extract()

# Define the tags to find
tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
found_tags = soup.find_all(tags)

# Extract tags and their associated content into a dictionary
doc_dict = {}
i = 0
for tag in found_tags:
    content = []
    next_tag = tag.find_next()

    # Look for next tags until the next header tag
    while next_tag and next_tag.name not in tags:
        # Reference section can contain both p and li tags
        if 'reference' in str(next_tag).lower() and next_tag.name in ['p', 'li']:
            # print(str(next_tag).lower())
            content.append(next_tag.get_text(strip=False))
        elif next_tag.name == 'p':
            content.append(next_tag.get_text(strip=False))
        next_tag = next_tag.find_next()

    key = f"{tag.name}_{tag.get_text(strip=True)}"
    doc_dict[key] = " ".join(content)

for key in list(doc_dict.keys()):  # Using list() to avoid changing the dictionary size during iteration
    new_key = key.rstrip('[edit]')
    if new_key != key:
        doc_dict[new_key] = doc_dict.pop(key)

del doc_dict['h2_Contents']

pprint(doc_dict, sort_dicts=False)
pprint(doc_dict.keys())

{'h1_Self-driving car': '\n'
                        ' A self-driving car, also known as an autonomous car, '
                        'driverless car, or robotic car (robo-car),[1][2][3] '
                        'is a car that is capable of traveling without human '
                        'input.[4][5] Self-driving cars use sensors to '
                        'perceive their surroundings, such as optical and '
                        'thermographic cameras, radar, lidar, '
                        'ultrasound/sonar, GPS, odometry and inertial '
                        'measurement units.[6] Control systems interpret '
                        'sensory information to create a three-dimensional '
                        "model of the vehicle's surroundings. Based on the "
                        'model, the car then identifies an appropriate '
                        'navigation path and strategies for managing traffic '
                        'controls (stop signs, traffic lights, spe

In [86]:
with open('self_driving.html', 'w', encoding='utf-8') as file:
    file.write(soup.prettify())

In [85]:
[x for x in soup.find_all('h3') if 'Approaches' in x.get_text()][0].find_next()

<span class="mw-headline" id="Approaches">Approaches</span>

In [79]:
def convert_to_markdown(article_dict):
    md_text = ""

    for key, value in article_dict.items():
        # Split the key on the underscore to separate the heading level and the title
        split_key = key.split("_")
        # Define the markdown equivalent for the heading level
        heading_level = "#" * int(split_key[0][1])
        heading = split_key[1]
        # Append the heading and the content to the markdown text
        md_text += f"{heading_level} {heading}\n\n{value}\n\n"

    return md_text

article_dict = {
    'h1_Self-driving car': 'Content for Self-driving car',
    'h2_History': 'Content for History',
    'h2_Definitions': 'Content for Definitions',
    'h3_Terminology and safety considerations': 'Content for Terminology and safety considerations',
    'h3_Autonomous vs. automa': 'Content for Autonomous vs. automa',
    'h3_Autonomous versus cooperativ': 'Content for Autonomous versus cooperativ',
    'h2_Classifications': 'Content for Classifications',
    'h3_Self-driving car': 'Content for Self-driving car',
    'h3_SAE classification': 'Content for SAE classification',
    'h3_Levels of driving automation': 'Content for Levels of driving automation'
}

print(convert_to_markdown(doc_dict))


# Self-driving car


 A self-driving car, also known as an autonomous car, driverless car, or robotic car (robo-car),[1][2][3] is a car that is capable of traveling without human input.[4][5] Self-driving cars use sensors to perceive their surroundings, such as optical and thermographic cameras, radar, lidar, ultrasound/sonar, GPS, odometry and inertial measurement units.[6] Control systems interpret sensory information to create a three-dimensional model of the vehicle's surroundings. Based on the model, the car then identifies an appropriate navigation path and strategies for managing traffic controls (stop signs, traffic lights, speed limits, yield signs, etc.) and obstacles.[7][8][9][10][11]
 Once the technology matures, autonomous vehicles are predicted to impact the automotive industry, health, welfare, urban planning, traffic, insurance, labor market, and other fields. Their regulation is becoming an increasingly important issue.
 Autonomy in vehicles is often divided into six l

In [60]:
tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
found_tags = soup.find_all(tags)

# Extract tags and their associated content into a dictionary
found_tags[1].name

'h1'

In [51]:
# Assuming 'my_dict' is your dictionary
my_dict = {
    'Contents': None,
    'Dual-phase evolution': None,
    'Introduction[edit]': None,
    'The DPE mechanism[edit]': None,
    'Underlying network[edit]': None,
    'Phase shifts[edit]': None,
    'Selection and variation[edit]': None
}

# Remove '[edit]' from the end of the keys
for key in list(my_dict.keys()):  # Using list() to avoid changing the dictionary size during iteration
    new_key = key.rstrip('[edit]')
    if new_key != key:
        my_dict[new_key] = my_dict.pop(key)

print(my_dict)


{'Contents': None, 'Dual-phase evolution': None, 'Introduction': None, 'The DPE mechanism': None, 'Underlying network': None, 'Phase shifts': None, 'Selection and variation': None}


In [34]:
soup.h1.find_next_sibling().find_next_sibling().find_next_sibling()

AttributeError: 'NoneType' object has no attribute 'find_next_sibling'

In [28]:
# Load the webpage content
url = 'https://en.wikipedia.org/wiki/Dual-phase_evolution'
r = requests.get(url)
html_content = r.text

# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')

# Remove unwanted tags: script, style, [document], head, title
for element in soup(["script", "style", "head", "title", "[document]"]):
    element.decompose()

# Also remove HTML comments
for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
    element.extract()

# Define the tags to find
tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
found_tags = soup.find_all(tags)

# Extract tags and their associated content into a dictionary
doc_dict = {}
for tag in found_tags:
    content = []
    next_sibling = tag.find_next_sibling('p')

    while next_sibling and next_sibling.name not in tags:
        if next_sibling.name == 'p':
            content.append(next_sibling.get_text(strip=True))
        next_sibling = next_sibling.find_next_sibling()

    doc_dict[tag.get_text(strip=True)] = " ".join(content)

pprint(doc_dict, sort_dicts=False)

{'Contents': '',
 'Dual-phase evolution': '',
 'Introduction[edit]': 'Dual phase evolution (DPE) is a process that promotes '
                       'the emergence of large-scale order incomplex systems. '
                       'It occurs when a system repeatedly switches between '
                       'various kinds of phases, and in each phase different '
                       'processes act on the components or connections in the '
                       'system. DPE arises because of a property '
                       'ofgraphsandnetworks: the connectivity avalanche that '
                       'occurs in graphs as the number of edges increases.[2] '
                       'Social networks provide a familiar example. In asocial '
                       'networkthe nodes of the network are people and the '
                       'network connections (edges) are relationships or '
                       'interactions between people. For any individual, '
                       

In [36]:
with open('soup.html', 'w', encoding='utf-8') as file:
    file.write(soup.prettify())

In [37]:
broken_html = """<h1 class="firstHeading mw-first-heading" id="firstHeading">
        <span class="mw-page-title-main">
         Dual-phase evolution
        </span>
       </h1>
       <div class="vector-dropdown mw-portlet mw-portlet-lang" id="p-lang-btn">
        <input aria-haspopup="true" aria-label="Go to an article in another language. Available in 1 language" class="vector-dropdown-checkbox mw-interlanguage-selector" data-event-name="ui.dropdown-p-lang-btn" id="p-lang-btn-checkbox" role="button" type="checkbox">
         <label aria-hidden="true" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--action-progressive mw-portlet-lang-heading-1" for="p-lang-btn-checkbox" id="p-lang-btn-label">
         </label>
         <span class="vector-icon mw-ui-icon-language-progressive mw-ui-icon-wikimedia-language-progressive">
         </span>
         <span class="vector-dropdown-label-text">
          1 language
         </span>
         <div class="vector-dropdown-content">
          <div class="vector-menu-content">
           <ul class="vector-menu-content-list">
            <li class="interlanguage-link interwiki-fa mw-list-item">
             <a class="interlanguage-link-target" href="https://fa.wikipedia.org/wiki/%D9%81%D8%B1%DA%AF%D8%B4%D8%AA_%D8%AF%D9%88%D9%81%D8%A7%D8%B2%DB%8C" hreflang="fa" lang="fa" title="فرگشت دوفازی – Persian">
              <span>
               فارسی
              </span>
             </a>
            </li>
           </ul>
           <div class="after-portlet after-portlet-lang">
            <span class="wb-langlinks-edit wb-langlinks-link">
             <a class="wbc-editpage" href="https://www.wikidata.org/wiki/Special:EntityPage/Q25109659#sitelinks-wikipedia" title="Edit interlanguage links">
              Edit links
             </a>
            </span>
           </div>
          </div>
         </div>
        </input>
       </div>
      </header>
      <div class="vector-page-toolbar">
       <div class="vector-page-toolbar-container">
        <div id="left-navigation">
         <nav aria-label="Namespaces">
          <div class="vector-menu vector-menu-tabs mw-portlet mw-portlet-associated-pages" id="p-associated-pages">
           <div class="vector-menu-content">
            <ul class="vector-menu-content-list">
             <li class="selected vector-tab-noicon mw-list-item" id="ca-nstab-main">
              <a accesskey="c" class="" data-mw="interface" href="/wiki/Dual-phase_evolution" title="View the content page [c]">
               <span>
                Article
               </span>
              </a>
             </li>
             <li class="vector-tab-noicon mw-list-item" id="ca-talk">
              <a accesskey="t" class="" data-mw="interface" href="/wiki/Talk:Dual-phase_evolution" rel="discussion" title="Discuss improvements to the content page [t]">
               <span>
                Talk
               </span>
              </a>
             </li>
            </ul>
           </div>
          </div>
          <div class="vector-dropdown mw-portlet mw-portlet-variants emptyPortlet" id="p-variants">
           <input aria-haspopup="true" aria-label="Change language variant" class="vector-dropdown-checkbox" data-event-name="ui.dropdown-p-variants" id="p-variants-checkbox" role="button" type="checkbox"/>
           <label aria-hidden="true" class="vector-dropdown-label" for="p-variants-checkbox" id="p-variants-label">
           </label>
           <span class="vector-dropdown-label-text">
            English
           </span>
           <div class="vector-dropdown-content">
            <div class="vector-menu-content">
             <ul class="vector-menu-content-list">
             </ul>
            </div>
           </div>
          </div>
         </nav>
        </div>
        <div class="vector-collapsible" id="right-navigation">
         <nav aria-label="Views">
          <div class="vector-menu vector-menu-tabs mw-portlet mw-portlet-views" id="p-views">
           <div class="vector-menu-content">
            <ul class="vector-menu-content-list">
             <li class="selected vector-tab-noicon mw-list-item" id="ca-view">
              <a class="" data-mw="interface" href="/wiki/Dual-phase_evolution">
               <span>
                Read
               </span>
              </a>
             </li>
             <li class="vector-tab-noicon mw-list-item" id="ca-edit">
              <a accesskey="e" class="" data-mw="interface" href="/w/index.php?title=Dual-phase_evolution&amp;action=edit" title="Edit this page [e]">
               <span>
                Edit
               </span>
              </a>
             </li>
             <li class="vector-tab-noicon mw-list-item" id="ca-history">
              <a accesskey="h" class="" data-mw="interface" href="/w/index.php?title=Dual-phase_evolution&amp;action=history" title="Past revisions of this page [h]">
               <span>
                View history
               </span>
              </a>
             </li>
            </ul>
           </div>
          </div>
         </nav>
         <nav aria-label="More options" class="vector-page-tools-landmark">
          <div class="vector-dropdown vector-page-tools-dropdown" id="vector-page-tools-dropdown">
           <input aria-haspopup="true" aria-label="Tools" class="vector-dropdown-checkbox" data-event-name="ui.dropdown-vector-page-tools-dropdown" id="vector-page-tools-dropdown-checkbox" role="button" type="checkbox"/>
           <label aria-hidden="true" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" for="vector-page-tools-dropdown-checkbox" id="vector-page-tools-dropdown-label">
           </label>
           <span class="vector-dropdown-label-text">
            Tools
           </span>
           <div class="vector-dropdown-content">
            <div class="vector-unpinned-container" id="vector-page-tools-unpinned-container">
             <div class="vector-page-tools vector-pinnable-element" id="vector-page-tools">
              <div class="vector-pinnable-header vector-page-tools-pinnable-header vector-pinnable-header-unpinned" data-feature-name="page-tools-pinned" data-pinnable-element-id="vector-page-tools" data-pinned-container-id="vector-page-tools-pinned-container" data-unpinned-container-id="vector-page-tools-unpinned-container">
               <div class="vector-pinnable-header-label">
                Tools
               </div>
               <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-page-tools.pin">
                move to sidebar
               </button>
               <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-page-tools.unpin">
                hide
               </button>
              </div>
              <div class="vector-menu mw-portlet mw-portlet-cactions emptyPortlet vector-has-collapsible-items" id="p-cactions" title="More options">
               <div class="vector-menu-heading">
                Actions
               </div>
               <div class="vector-menu-content">
                <ul class="vector-menu-content-list">
                 <li class="selected vector-more-collapsible-item mw-list-item" id="ca-more-view">
                  <a href="/wiki/Dual-phase_evolution">
                   <span>
                    Read
                   </span>
                  </a>
                 </li>
                 <li class="vector-more-collapsible-item mw-list-item" id="ca-more-edit">
                  <a href="/w/index.php?title=Dual-phase_evolution&amp;action=edit">
                   <span>
                    Edit
                   </span>
                  </a>
                 </li>
                 <li class="vector-more-collapsible-item mw-list-item" id="ca-more-history">
                  <a href="/w/index.php?title=Dual-phase_evolution&amp;action=history">
                   <span>
                    View history
                   </span>
                  </a>
                 </li>
                </ul>
               </div>
              </div>
              <div class="vector-menu mw-portlet mw-portlet-tb" id="p-tb">
               <div class="vector-menu-heading">
                General
               </div>
               <div class="vector-menu-content">
                <ul class="vector-menu-content-list">
                 <li class="mw-list-item" id="t-whatlinkshere">
                  <a accesskey="j" href="/wiki/Special:WhatLinksHere/Dual-phase_evolution" title="List of all English Wikipedia pages containing links to this page [j]">
                   <span>
                    What links here
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-recentchangeslinked">
                  <a accesskey="k" href="/wiki/Special:RecentChangesLinked/Dual-phase_evolution" rel="nofollow" title="Recent changes in pages linked from this page [k]">
                   <span>
                    Related changes
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-upload">
                  <a accesskey="u" href="/wiki/Wikipedia:File_Upload_Wizard" title="Upload files [u]">
                   <span>
                    Upload file
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-specialpages">
                  <a accesskey="q" href="/wiki/Special:SpecialPages" title="A list of all special pages [q]">
                   <span>
                    Special pages
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-permalink">
                  <a href="/w/index.php?title=Dual-phase_evolution&amp;oldid=1126289474" title="Permanent link to this revision of this page">
                   <span>
                    Permanent link
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-info">
                  <a href="/w/index.php?title=Dual-phase_evolution&amp;action=info" title="More information about this page">
                   <span>
                    Page information
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-cite">
                  <a href="/w/index.php?title=Special:CiteThisPage&amp;page=Dual-phase_evolution&amp;id=1126289474&amp;wpFormIdentifier=titleform" title="Information on how to cite this page">
                   <span>
                    Cite this page
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-wikibase">
                  <a accesskey="g" href="https://www.wikidata.org/wiki/Special:EntityPage/Q25109659" title="Structured data on this page hosted by Wikidata [g]">
                   <span>
                    Wikidata item
                   </span>
                  </a>
                 </li>
                </ul>
               </div>
              </div>
              <div class="vector-menu mw-portlet mw-portlet-coll-print_export" id="p-coll-print_export">
               <div class="vector-menu-heading">
                Print/export
               </div>
               <div class="vector-menu-content">
                <ul class="vector-menu-content-list">
                 <li class="mw-list-item" id="coll-download-as-rl">
                  <a href="/w/index.php?title=Special:DownloadAsPdf&amp;page=Dual-phase_evolution&amp;action=show-download-screen" title="Download this page as a PDF file">
                   <span>
                    Download as PDF
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-print">
                  <a accesskey="p" href="/w/index.php?title=Dual-phase_evolution&amp;printable=yes" title="Printable version of this page [p]">
                   <span>
                    Printable version
                   </span>
                  </a>
                 </li>
                </ul>
               </div>
              </div>
             </div>
            </div>
           </div>
          </div>
         </nav>
        </div>
       </div>
      </div>
      <div class="vector-column-end">
       <nav aria-label="More options" class="vector-page-tools-landmark vector-sticky-pinned-container">
        <div class="vector-pinned-container" id="vector-page-tools-pinned-container">
        </div>
       </nav>
      </div>
      <div aria-labelledby="firstHeading" class="vector-body" data-mw-ve-target-container="" id="bodyContent">
       <div class="vector-body-before-content">
        <div class="mw-indicators">
        </div>
        <div class="noprint" id="siteSub">
         From Wikipedia, the free encyclopedia
        </div>
       </div>
       <div id="contentSub">
        <div id="mw-content-subtitle">
        </div>
       </div>
       <div class="mw-body-content mw-content-ltr" dir="ltr" id="mw-content-text" lang="en">
        <div class="mw-parser-output">
         <div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">
          Process that drives self-organization within complex adaptive systems
         </div>
         <table class="box-Multiple_issues plainlinks metadata ambox ambox-content ambox-multiple_issues compact-ambox" role="presentation">
          <tbody>
           <tr>
            <td class="mbox-image">
             <div class="mbox-image-div">
              <img alt="" data-file-height="40" data-file-width="40" decoding="async" height="40" src="//upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/40px-Ambox_important.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/60px-Ambox_important.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/80px-Ambox_important.svg.png 2x" width="40"/>
             </div>
            </td>
            <td class="mbox-text">
             <div class="mbox-text-span">
              <div class="multiple-issues-text mw-collapsible">
               <b>
                This article has multiple issues.
               </b>
               Please help
               <b>
                <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Dual-phase_evolution&amp;action=edit">
                 improve it
                </a>
               </b>
               or discuss these issues on the
               <b>
                <a href="/wiki/Talk:Dual-phase_evolution" title="Talk:Dual-phase evolution">
                 talk page
                </a>
               </b>
               .
               <small>
                <i>
                 (
                 <a href="/wiki/Help:Maintenance_template_removal" title="Help:Maintenance template removal">
                  Learn how and when to remove these template messages
                 </a>
                 )
                </i>
               </small>
               <div class="mw-collapsible-content">
                <link href="mw-data:TemplateStyles:r1097763485" rel="mw-deduplicated-inline-style"/>
                <table class="box-More_citations_needed plainlinks metadata ambox ambox-content ambox-Refimprove" role="presentation">
                 <tbody>
                  <tr>
                   <td class="mbox-image">
                    <div class="mbox-image-div">
                     <a class="image" href="/wiki/File:Question_book-new.svg">
                      <img alt="" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/>
                     </a>
                    </div>
                   </td>
                   <td class="mbox-text">
                    <div class="mbox-text-span">
                     This article
                     <b>
                      needs additional citations for
                      <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">
                       verification
                      </a>
                     </b>
                     .
                     <span class="hide-when-compact">
                      Please help
                      <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Dual-phase_evolution&amp;action=edit">
                       improve this article
                      </a>
                      by
                      <a href="/wiki/Help:Referencing_for_beginners" title="Help:Referencing for beginners">
                       adding citations to reliable sources
                      </a>
                      . Unsourced material may be challenged and removed.
                      <br/>
                      <small>
                       <span class="plainlinks">
                        <i>
                         Find sources:
                        </i>
                        <a class="external text" href="https://www.google.com/search?as_eq=wikipedia&amp;q=%22Dual-phase+evolution%22" rel="nofollow">
                         "Dual-phase evolution"
                        </a>
                        –
                        <a class="external text" href="https://www.google.com/search?tbm=nws&amp;q=%22Dual-phase+evolution%22+-wikipedia&amp;tbs=ar:1" rel="nofollow">
                         news
                        </a>
                        <b>
                         ·
                        </b>
                        <a class="external text" href="https://www.google.com/search?&amp;q=%22Dual-phase+evolution%22&amp;tbs=bkt:s&amp;tbm=bks" rel="nofollow">
                         newspapers
                        </a>
                        <b>
                         ·
                        </b>
                        <a class="external text" href="https://www.google.com/search?tbs=bks:1&amp;q=%22Dual-phase+evolution%22+-wikipedia" rel="nofollow">
                         books
                        </a>
                        <b>
                         ·
                        </b>
                        <a class="external text" href="https://scholar.google.com/scholar?q=%22Dual-phase+evolution%22" rel="nofollow">
                         scholar
                        </a>
                        <b>
                         ·
                        </b>
                        <a class="external text" href="https://www.jstor.org/action/doBasicSearch?Query=%22Dual-phase+evolution%22&amp;acc=on&amp;wc=on" rel="nofollow">
                         JSTOR
                        </a>
                       </span>
                      </small>
                     </span>
                     <span class="date-container">
                      <i>
                       (
                       <span class="date">
                        May 2015
                       </span>
                       )
                      </i>
                     </span>
                     <span class="hide-when-compact">
                      <i>
                       (
                       <small>
                        <a href="/wiki/Help:Maintenance_template_removal" title="Help:Maintenance template removal">
                         Learn how and when to remove this template message
                        </a>
                       </small>
                       )
                      </i>
                     </span>
                    </div>
                   </td>
                  </tr>
                 </tbody>
                </table>
                <link href="mw-data:TemplateStyles:r1097763485" rel="mw-deduplicated-inline-style"/>
                <table class="box-Technical plainlinks metadata ambox ambox-style ambox-technical" role="presentation">
                 <tbody>
                  <tr>
                   <td class="mbox-image">
                    <div class="mbox-image-div">
                     <img alt="" data-file-height="48" data-file-width="48" decoding="async" height="40" src="//upload.wikimedia.org/wikipedia/en/thumb/f/f2/Edit-clear.svg/40px-Edit-clear.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/f/f2/Edit-clear.svg/60px-Edit-clear.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/f/f2/Edit-clear.svg/80px-Edit-clear.svg.png 2x" width="40"/>
                    </div>
                   </td>
                   <td class="mbox-text">
                    <div class="mbox-text-span">
                     This article
                     <b>
                      may be too technical for most readers to understand
                     </b>
                     .
                     <span class="hide-when-compact">
                      Please
                      <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Dual-phase_evolution&amp;action=edit">
                       help improve it
                      </a>
                      to
                      <a href="/wiki/Wikipedia:Make_technical_articles_understandable" title="Wikipedia:Make technical articles understandable">
                       make it understandable to non-experts
                      </a>
                      , without removing the technical details.
                     </span>
                     <span class="date-container">
                      <i>
                       (
                       <span class="date">
                        May 2015
                       </span>
                       )
                      </i>
                     </span>
                     <span class="hide-when-compact">
                      <i>
                       (
                       <small>
                        <a href="/wiki/Help:Maintenance_template_removal" title="Help:Maintenance template removal">
                         Learn how and when to remove this template message
                        </a>
                       </small>
                       )
                      </i>
                     </span>
                    </div>
                   </td>
                  </tr>
                 </tbody>
                </table>
               </div>
              </div>
              <span class="hide-when-compact">
               <i>
                (
                <small>
                 <a href="/wiki/Help:Maintenance_template_removal" title="Help:Maintenance template removal">
                  Learn how and when to remove this template message
                 </a>
                </small>
                )
               </i>
              </span>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
         <p>
          <b>
           Dual phase evolution
          </b>
          (
          <b>
           DPE
          </b>
          ) is a process that drives
          <a href="/wiki/Self-organization" title="Self-organization">
           self-organization
          </a>
          within
          <a href="/wiki/Complex_adaptive_system" title="Complex adaptive system">
           complex adaptive systems
          </a>
          .
          <sup class="reference" id="cite_ref-DPE2_1-0">
           <a href="#cite_note-DPE2-1">
            [1]
           </a>
          </sup>
          It arises in response to phase changes within the network of connections formed by a system's components. DPE occurs in a wide range of physical, biological and social systems. Its applications to technology include methods for manufacturing novel materials and algorithms to solve complex problems in computation.
         </p>
         <meta property="mw:PageProp/toc">
          <h2>
           <span class="mw-headline" id="Introduction">
            Introduction
           </span>
           <span class="mw-editsection">
            <span class="mw-editsection-bracket">
             [
            </span>
            <a href="/w/index.php?title=Dual-phase_evolution&amp;action=edit&amp;section=1" title="Edit section: Introduction">
             edit
            </a>
            <span class="mw-editsection-bracket">
             ]
            </span>
           </span>
          </h2>
          <p>
           Dual phase evolution (DPE) is a process that promotes the emergence of large-scale order in
           <a class="mw-redirect" href="/wiki/Complex_systems" title="Complex systems">
            complex systems
           </a>"""

working_code = """from bs4 import BeautifulSoup, Comment
import requests

# Load the webpage content
url = 'http://your-website-url.com'
r = requests.get(url)
html_content = r.text

# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')

# Remove unwanted tags: script, style, [document], head, title
for element in soup(["script", "style", "head", "title", "[document]"]):
    element.decompose()

# Also remove HTML comments
for element in soup.find_all(text=lambda text: isinstance(text, Comment)):
    element.extract()

# Define the tags to find
tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
found_tags = soup.find_all(tags)

# Extract tags and their associated content into a dictionary
doc_dict = {}
for tag in found_tags:
    content = []
    next_sibling = tag.find_next_sibling()

    while next_sibling and next_sibling.name not in tags:
        if next_sibling.name == 'p':
            content.append(next_sibling.get_text(strip=True))
        next_sibling = next_sibling.find_next_sibling()

    doc_dict[tag.get_text(strip=True)] = " ".join(content)

# Print the resulting dictionary
for k, v in doc_dict.items():
    print(f'{k}: {v}')
"""

prompt = f"""The following code was created by GPT-4 and successfully for each header tag finds and returns all the subsequent p tags until the next header tag and keeps doing that for each header tag and adds them to a dict.

However, it does not find the p tag for the h1 tag. Please correct it so that
it finds the p tag after the h1 tag

Working code: ####
{working_code}
####

html soup: ####
{broken_html}
"""
llm_16k.predict(prompt)

'To fix the code, you need to modify the while loop condition to include the h1 tag. Here\'s the modified code:\n\n```python\ndoc_dict = {}\nfor tag in found_tags:\n    content = []\n    next_sibling = tag.find_next_sibling()\n\n    while next_sibling and (next_sibling.name not in tags or tag.name == \'h1\'):\n        if next_sibling.name == \'p\':\n            content.append(next_sibling.get_text(strip=True))\n        next_sibling = next_sibling.find_next_sibling()\n\n    doc_dict[tag.get_text(strip=True)] = " ".join(content)\n```\n\nBy adding `tag.name == \'h1\'` to the while loop condition, it will include the p tags after the h1 tag as well.'

In [38]:
pprint('To fix the code, you need to modify the while loop condition to include the h1 tag. Here\'s the modified code:\n\n```python\ndoc_dict = {}\nfor tag in found_tags:\n    content = []\n    next_sibling = tag.find_next_sibling()\n\n    while next_sibling and (next_sibling.name not in tags or tag.name == \'h1\'):\n        if next_sibling.name == \'p\':\n            content.append(next_sibling.get_text(strip=True))\n        next_sibling = next_sibling.find_next_sibling()\n\n    doc_dict[tag.get_text(strip=True)] = " ".join(content)\n```\n\nBy adding `tag.name == \'h1\'` to the while loop condition, it will include the p tags after the h1 tag as well.')

('To fix the code, you need to modify the while loop condition to include the '
 "h1 tag. Here's the modified code:\n"
 '\n'
 '```python\n'
 'doc_dict = {}\n'
 'for tag in found_tags:\n'
 '    content = []\n'
 '    next_sibling = tag.find_next_sibling()\n'
 '\n'
 '    while next_sibling and (next_sibling.name not in tags or tag.name == '
 "'h1'):\n"
 "        if next_sibling.name == 'p':\n"
 '            content.append(next_sibling.get_text(strip=True))\n'
 '        next_sibling = next_sibling.find_next_sibling()\n'
 '\n'
 '    doc_dict[tag.get_text(strip=True)] = " ".join(content)\n'
 '```\n'
 '\n'
 "By adding `tag.name == 'h1'` to the while loop condition, it will include "
 'the p tags after the h1 tag as well.')


In [39]:
working_code = """from bs4 import BeautifulSoup, Comment
import requests

# Load the webpage content
url = 'http://your-website-url.com'
r = requests.get(url)
html_content = r.text

# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')

# Remove unwanted tags: script, style, [document], head, title
for element in soup(["script", "style", "head", "title", "[document]"]):
    element.decompose()

# Also remove HTML comments
for element in soup.find_all(text=lambda text: isinstance(text, Comment)):
    element.extract()

# Define the tags to find
tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
found_tags = soup.find_all(tags)

# Extract tags and their associated content into a dictionary
doc_dict = {}
for tag in found_tags:
    content = []
    next_sibling = tag.find_next_sibling()

    while next_sibling and next_sibling.name not in tags:
        if next_sibling.name not in ["script", "style", "head", "title", "[document]"]:
            content.append(next_sibling.get_text(strip=True))
        next_sibling = next_sibling.find_next_sibling()

    doc_dict[tag.get_text(strip=True)] = " ".join(content)

# Print the resulting dictionary
for k, v in doc_dict.items():
    print(f'{k}: {v}')
"""

prompt = f"""The following code was created by GPT-4 and successfully achives this goal: I only care about what a person reading the page will read. So just get me the headers and actual content text content that a human would see. The current code I tried gets things like 'document.documentElement.className="client-js ' which is obviously js code that the user would not read. add everything to a dict in the order you find them.

However, it does not find the p tag for the h1 tag. Please correct it so that
it finds the p tag after the h1 tag

Working code: ####
{working_code}
####

html soup: ####
{broken_html}
####
"""
output = llm_16k.predict(prompt)
pprint(output)

('To modify the existing code to include the `<p>` tag after the `<h1>` tag, '
 'you can update the `while` loop inside the `for` loop as follows:\n'
 '\n'
 '```python\n'
 'doc_dict = {}\n'
 'for tag in found_tags:\n'
 '    content = []\n'
 '    next_sibling = tag.find_next_sibling()\n'
 '\n'
 '    # Add the content of the <p> tag after the <h1> tag\n'
 "    if tag.name == 'h1':\n"
 "        p_tag = tag.find_next_sibling('p')\n"
 '        if p_tag:\n'
 '            content.append(p_tag.get_text(strip=True))\n'
 '\n'
 '    while next_sibling and next_sibling.name not in tags:\n'
 '        if next_sibling.name not in ["script", "style", "head", "title", '
 '"[document]"]:\n'
 '            content.append(next_sibling.get_text(strip=True))\n'
 '        next_sibling = next_sibling.find_next_sibling()\n'
 '\n'
 '    doc_dict[tag.get_text(strip=True)] = " ".join(content)\n'
 '```\n'
 '\n'
 'This code checks if the current tag is an `<h1>` tag and then finds the next '
 'sibling `<p>` tag usin

In [40]:
working_code = """from bs4 import BeautifulSoup, Comment
import requests

# Load the webpage content
url = 'http://your-website-url.com'
r = requests.get(url)
html_content = r.text

# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')

# Remove unwanted tags: script, style, [document], head, title
for element in soup(["script", "style", "head", "title", "[document]"]):
    element.decompose()

# Also remove HTML comments
for element in soup.find_all(text=lambda text: isinstance(text, Comment)):
    element.extract()

# Define the tags to find
tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
found_tags = soup.find_all(tags)

# Extract tags and their associated content into a dictionary
doc_dict = {}
for tag in found_tags:
    content = []
    next_sibling = tag.find_next_sibling()

    while next_sibling and next_sibling.name not in tags:
        if next_sibling.name not in ["script", "style", "head", "title", "[document]"]:
            content.append(next_sibling.get_text(strip=True))
        next_sibling = next_sibling.find_next_sibling()

    doc_dict[tag.get_text(strip=True)] = " ".join(content)

# Print the resulting dictionary
for k, v in doc_dict.items():
    print(f'{k}: {v}')
"""

prompt = f"""The following code finds all the p tags after header tags. But it does not find the p tags for the h1 tag. Why does it not find it?

Code: ####
{working_code}
####

html soup: ####
{broken_html}
####
"""
output = llm_16k.predict(prompt)
pprint(output)

('The code does not find the p tags for the h1 tag because the h1 tag is not '
 'included in the list of tags to find. In the code, the tags variable is '
 "defined as ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], so only h2 to h6 tags will "
 'be found. If you want to include the h1 tag as well, you can modify the tags '
 "variable to ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].")


In [35]:
soup

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled" dir="ltr" lang="en">

<body class="skin-vector skin-vector-search-vue mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject mw-editable page-Dual-phase_evolution rootpage-Dual-phase_evolution skin-vector-2022 action-view"><a class="mw-jump-link" href="#bodyContent">Jump to content</a>
<div class="vector-header-container">
<header class="vector-header mw-header">
<div class="vector-header-start">
<nav aria-label="Site" class="vector-main-menu-landmark" role="navigation">
<div class="vector-dropdown vector-main-menu-dropdown vector-button-flush-left vector-button-flush-right" i

In [5]:
Introduction
The DPE mechanism
Underlying network
Phase shifts
Selection and variation
System memory
Examples
Social networks
Socio-economics

SyntaxError: invalid syntax (221947299.py, line 2)

In [None]:
all_articles = WebBaseLoader(wikipedia_articles).load()
print('Total number of articles: ', len(all_articles))
for i, doc in enumerate(all_articles):
    print(f"Article {i} contains {num_tokens_from_string(doc.page_content):,} tokens - {wikipedia_articles[i]}")

In [None]:
shortest_article = all_articles[2]
shortest_article.page_content

In [None]:
llm_16k.predict(f"Summarize the following text: {shortest_article.page_content}")

In [None]:
# llm_default.predict(f'Translate the following text into German: {all_articles[3].page_content}')
output = llm_16k.predict(f'Translate the following text into German: {all_articles[3].page_content}')

In [None]:
num_tokens_from_string(output)

In [None]:
wikipedia_articles[3]

In [None]:
len(all_articles[3].page_content)

enc = tiktoken.encoding_for_model('gpt-3.5-turbo')
len(all_articles[3].page_content.split(' '))

In [None]:
num_tokens_from_string(all_articles[3].page_content)