In [75]:
import os
from copy import copy, deepcopy
from pprint import pprint
from typing import List
from uuid import uuid4

import evaluate
import openai
import requests
import tiktoken
from bs4 import BeautifulSoup, Comment
from doctran import Doctran, ExtractProperty
from dotenv import load_dotenv, find_dotenv
from evaluate import load
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import (
    OpenAIEmbeddings,
    HuggingFaceEmbeddings,
)
from langchain.llms import OpenAI
from langchain.text_splitter import (
    MarkdownTextSplitter,
    MarkdownHeaderTextSplitter,
    LineType,
    RecursiveCharacterTextSplitter,
)
from loguru import logger
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

_ = load_dotenv(find_dotenv())
openai.api_key = os.getenv('OPENAI_API_KEY')

llm_default = ChatOpenAI(model_name="gpt-3.5-turbo", streaming=True)
llm_16k = ChatOpenAI(model_name="gpt-3.5-turbo-16k", streaming=True)

def num_tokens_from_string(string: str, encoding_name: str = "gpt-3.5-turbo") -> int:
    """Returns the number of tokens in a text string."""
    try:
        encoding = tiktoken.get_encoding(encoding_name)
    except ValueError:
        encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def convert_to_markdown(article_dict):
    md_text = ""

    for heading, content in article_dict.items():
        # heading is of form: 'h3 Example'
        # Define the markdown equivalent for the heading level
        heading_level = "#" * int(heading[1])
        heading = heading[3:]
        # Append the heading and the content to the markdown text
        md_text += f"{heading_level} {heading}\n\n{content}\n\n"

    return md_text

In [2]:
embed_ada = OpenAIEmbeddings(
    model="text-embedding-ada-002",
)

embed_e5 = HuggingFaceEmbeddings(
    model_name='intfloat/e5-base-v2',
    encode_kwargs = {'normalize_embeddings': True}
)

text_1 = 'hello'
text_2 = 'hi'
text_3 = 'England is a small country'

# e5-base-v2 requires all embeddings to be prepended with 'query: '
e51 = embed_e5.embed_query("query: " + text_1)
e52 = embed_e5.embed_query("query: " + text_2)
e53 = embed_e5.embed_query("query: " + text_3)
print(len(e51))
ada1 = embed_ada.embed_query(text_1)
ada2 = embed_ada.embed_query(text_2)
ada3 = embed_ada.embed_query(text_3)
print(len(ada1))

768
1536


In [70]:
a = [1, 2, 3]
b = [10, 20, 30]

print(np.mean([a, b], axis=0))

[ 5.5 11.  16.5]


In [22]:
a = [1, 2, 3, 4, 5]
b = [10, 20, 30, 40, 50]
# a = np.array(a)
# b = np.array(b)

d = dict(a=ada1, b=ada2)

# np.mean([ada1, ada2], axis=0)
np.mean([*d.values()], axis=0)

array([-0.03005476, -0.02001589, -0.02160983, ..., -0.01395465,
       -0.0030084 , -0.00075671])

In [24]:
url = 'https://en.wikipedia.org/wiki/Dual-phase_evolution'
url = 'https://en.wikipedia.org/wiki/Self-driving_car'
r = requests.get(url)
html_content = r.text

# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')

# Remove unwanted tags: script, style, [document], head, title
for element in soup(["script", "style", "head", "title", "[document]"]):
    element.decompose()

# Also remove HTML comments
for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
    element.extract()

# Define the tags to find
tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
found_tags = soup.find_all(tags)

# Extract tags and their associated content into a dictionary
doc_dict = {}
i = 0
for tag in found_tags:
    content = []
    next_tag = tag.find_next()

    # Look for next tags until the next header tag
    while next_tag and next_tag.name not in tags:
        # Reference section can contain both p and li tags
        if 'reference' in str(next_tag).lower() and next_tag.name in ['p', 'li']:
            # print(str(next_tag).lower())
            content.append(next_tag.get_text(strip=False))
        elif next_tag.name == 'p':
            content.append(next_tag.get_text(strip=False))
        next_tag = next_tag.find_next()

    key = f"{tag.name} {tag.get_text(strip=True)}"
    doc_dict[key] = " ".join(content)

for key in list(doc_dict.keys()):  # Using list() to avoid changing the dictionary size during iteration
    if key.endswith('[edit]'):
        new_key = key.rsplit('[edit]', 1)[0]
        doc_dict[new_key] = doc_dict.pop(key)
#     new_key = key.rstrip('\[edit\]')
#     if new_key != key:
#         doc_dict[new_key] = doc_dict.pop(key)
#
# processed = []
# for s in strings:
#     if s.endswith('[edit]'):
#         s = s.rsplit('[edit]', 1)[0]
#     processed.append(s)

del doc_dict['h2 Contents']

pprint(doc_dict, sort_dicts=False)
pprint(doc_dict.keys())

{'h1 Self-driving car': '\n'
                        ' A self-driving car, also known as an autonomous car, '
                        'driverless car, or robotic car (robo-car),[1][2][3] '
                        'is a car that is capable of traveling without human '
                        'input.[4][5] Self-driving cars use sensors to '
                        'perceive their surroundings, such as optical and '
                        'thermographic cameras, radar, lidar, '
                        'ultrasound/sonar, GPS, odometry and inertial '
                        'measurement units.[6] Control systems interpret '
                        'sensory information to create a three-dimensional '
                        "model of the vehicle's surroundings. Based on the "
                        'model, the car then identifies an appropriate '
                        'navigation path and strategies for managing traffic '
                        'controls (stop signs, traffic lights, spe

In [27]:
doctran = Doctran(openai_api_key=os.getenv('OPENAI_API_KEY'), openai_model='gpt-3.5-turbo')

async def extract_topics(string: str):
    """Extract topics discussed in a string."""
    document = doctran.parse(content=string)
    properties = ExtractProperty(
        # name="keywords_max_three",
        # description="The 1, 2, or 3 most important keywords from the document.",
        name='title',
        description='The title of the document (max 7 words).',
        type="string",
        required=True
    )
    document = await document.extract(properties=[properties]).execute()
    return document.transformed_content

In [28]:
pprint(doc_dict, sort_dicts=False)

{'h1 Self-driving car': '\n'
                        ' A self-driving car, also known as an autonomous car, '
                        'driverless car, or robotic car (robo-car),[1][2][3] '
                        'is a car that is capable of traveling without human '
                        'input.[4][5] Self-driving cars use sensors to '
                        'perceive their surroundings, such as optical and '
                        'thermographic cameras, radar, lidar, '
                        'ultrasound/sonar, GPS, odometry and inertial '
                        'measurement units.[6] Control systems interpret '
                        'sensory information to create a three-dimensional '
                        "model of the vehicle's surroundings. Based on the "
                        'model, the car then identifies an appropriate '
                        'navigation path and strategies for managing traffic '
                        'controls (stop signs, traffic lights, spe

In [29]:
doc_dict.keys()

dict_keys(['h1 Self-driving car', 'h2 History', 'h2 Definitions', 'h3 Terminology and safety considerations', 'h3 Autonomous vs. automated', 'h3 Autonomous versus cooperative', 'h2 Classifications', 'h3 Self-driving car', 'h3 SAE classification', 'h3 Levels of driving automation', 'h3 Criticism of SAE', 'h2 Technology', 'h3 General perspectives', 'h3 Hybrid navigation', 'h3 Drive by wire', 'h3 Driver monitoring system', 'h3 Vehicular communication', 'h3 Re-programmable', 'h3 Modularity', 'h3 Homogenization', 'h3 Mathematical safety model', 'h2 Challenges', 'h3 Obstacles', 'h3 Concerns', 'h3 Human factors', 'h3 Moral issues', 'h3 Level 4 infrastructure', 'h2 Applications', 'h2 Testing', 'h3 Approaches', 'h3 Disengagements in the 2010s', 'h3 In the 2020s', 'h2 Incidents', 'h3 Tesla Autopilot', 'h3 Waymo', "h3 Uber's Advanced Technologies Group (ATG)", 'h3 Navya Arma driving system', 'h3 NIO Navigate on Pilot', 'h3 Toyota e-Palette operation', 'h2 Public opinion surveys', 'h3 In the 2010s

In [30]:
final_dict: dict = {}
start_dict = copy(doc_dict)
for heading, content in start_dict.items():
    print(heading)

h1 Self-driving car
h2 History
h2 Definitions
h3 Terminology and safety considerations
h3 Autonomous vs. automated
h3 Autonomous versus cooperative
h2 Classifications
h3 Self-driving car
h3 SAE classification
h3 Levels of driving automation
h3 Criticism of SAE
h2 Technology
h3 General perspectives
h3 Hybrid navigation
h3 Drive by wire
h3 Driver monitoring system
h3 Vehicular communication
h3 Re-programmable
h3 Modularity
h3 Homogenization
h3 Mathematical safety model
h2 Challenges
h3 Obstacles
h3 Concerns
h3 Human factors
h3 Moral issues
h3 Level 4 infrastructure
h2 Applications
h2 Testing
h3 Approaches
h3 Disengagements in the 2010s
h3 In the 2020s
h2 Incidents
h3 Tesla Autopilot
h3 Waymo
h3 Uber's Advanced Technologies Group (ATG)
h3 Navya Arma driving system
h3 NIO Navigate on Pilot
h3 Toyota e-Palette operation
h2 Public opinion surveys
h3 In the 2010s
h2 Regulation
h2 Commercialization
h3 Level 2 commercialization
h4 Level 2 development
h3 Level 3 commercialization
h4 Level 3 deve

In [31]:
final_dict: dict = {}
start_dict = copy(doc_dict)
for heading, content in start_dict.items():
    if 'reference' in heading.lower() or 'further reading' in heading.lower() or 'see also' in heading.lower():
        print(f"Found 'referece' in {heading}")
        final_dict[heading] = content
        continue
    num_tokens = num_tokens_from_string(content)
    max_allowed_tokens = 512
    if num_tokens <= max_allowed_tokens:
        final_dict[heading] = content
    else:
        # Split the document into smaller chunks, then add topics
        char_splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_allowed_tokens,
            chunk_overlap=0,
            # ' ' separator means sometimes sentences will be cut in two to ensure
            # the chunk size is not exceeded
            separators=["\n\n", "\n", ' '],
            length_function=num_tokens_from_string,
        )
        splits: List[str] = char_splitter.split_text(content)
        for split in splits:
            # Headings are of the form h1, h2, h3 etc. we split it into more of the same level
            heading_level = int(heading[1])
            title = await extract_topics(split)
            new_heading = f"h{heading_level}_{title}"
            final_dict[new_heading] = split
            print(f'Added {new_heading} split original heading {heading}')

n_keys_start = len(start_dict.keys())
n_keys_final = len(final_dict.keys())
print(f"We started with {n_keys_start} sections and got {n_keys_final} final sections")
print(f"That's a {n_keys_final / n_keys_start:.2f}x increase in sections")
# pprint(final_dict, sort_dicts=False)

Added h2_History of Automated Driving Systems split original heading h2 History
Added h2_Automated Driving split original heading h2 History
Added h2_Autonomous Vehicles split original heading h2 History
Added h3_SAE Automation Level Definitions split original heading h3 Levels of driving automation
Added h3_Levels of Vehicle Automation split original heading h3 Levels of driving automation
Added h3_SAE Levels of Automation split original heading h3 Levels of driving automation
Added h3_Hybrid Navigation split original heading h3 Hybrid navigation
Added h3_Path Planning split original heading h3 Hybrid navigation
Added h3_Deceptive Marketing and National Security in Self-Driving Cars split original heading h3 Concerns
Added h3_Information on Hacking Attack and Automotive Data Security split original heading h3 Concerns
Added h3_Implications of Liability and the Trolley Problem in Self-Driving Cars split original heading h3 Moral issues
Added h3_Autonomous Vehicles and Ethical Dilemmas 

In [32]:
pprint(list(final_dict.keys()))

['h1 Self-driving car',
 'h2_History of Automated Driving Systems',
 'h2_Automated Driving',
 'h2_Autonomous Vehicles',
 'h2 Definitions',
 'h3 Terminology and safety considerations',
 'h3 Autonomous vs. automated',
 'h3 Autonomous versus cooperative',
 'h2 Classifications',
 'h3 Self-driving car',
 'h3 SAE classification',
 'h3_SAE Automation Level Definitions',
 'h3_Levels of Vehicle Automation',
 'h3_SAE Levels of Automation',
 'h3 Criticism of SAE',
 'h2 Technology',
 'h3 General perspectives',
 'h3_Hybrid Navigation',
 'h3_Path Planning',
 'h3 Drive by wire',
 'h3 Driver monitoring system',
 'h3 Vehicular communication',
 'h3 Re-programmable',
 'h3 Modularity',
 'h3 Homogenization',
 'h3 Mathematical safety model',
 'h2 Challenges',
 'h3 Obstacles',
 'h3_Deceptive Marketing and National Security in Self-Driving Cars',
 'h3_Information on Hacking Attack and Automotive Data Security',
 'h3 Human factors',
 'h3_Implications of Liability and the Trolley Problem in Self-Driving Cars',


In [33]:
pprint(final_dict, sort_dicts=False)

{'h1 Self-driving car': '\n'
                        ' A self-driving car, also known as an autonomous car, '
                        'driverless car, or robotic car (robo-car),[1][2][3] '
                        'is a car that is capable of traveling without human '
                        'input.[4][5] Self-driving cars use sensors to '
                        'perceive their surroundings, such as optical and '
                        'thermographic cameras, radar, lidar, '
                        'ultrasound/sonar, GPS, odometry and inertial '
                        'measurement units.[6] Control systems interpret '
                        'sensory information to create a three-dimensional '
                        "model of the vehicle's surroundings. Based on the "
                        'model, the car then identifies an appropriate '
                        'navigation path and strategies for managing traffic '
                        'controls (stop signs, traffic lights, spe

In [34]:
text_1 = final_dict['h2 Regulation']
text_2 = final_dict['h2 Commercialization']

# Normalized by default
embed_ada = OpenAIEmbeddings(
    model="text-embedding-ada-002",
)

embed_e5 = HuggingFaceEmbeddings(
    model_name='intfloat/e5-base-v2',
    encode_kwargs = {'normalize_embeddings': True}
)

# e5-base-v2 requires all embeddings to be prepended with 'query: '
e51 = embed_e5.embed_query("query: " + text_1)
e52 = embed_e5.embed_query("query: " + text_2)
print(len(e51))
ada1 = embed_ada.embed_query(text_1)
ada2 = embed_ada.embed_query(text_2)
print(len(ada1))

768
1536


In [35]:
cosine_similarity([e51], [e51])[0][0]

1.0000000000000016

In [30]:
modified_dict = copy(final_dict)
modified_dict['h3_extra key'] = 'extra value'
article_1 = convert_to_markdown(final_dict)
article_2 = convert_to_markdown(modified_dict)

short_1 = article_1[:1000]
short_2 = article_2[:1000]

In [33]:
num_tokens_from_string(short_2)

208

In [34]:
evaluate.logging.set_verbosity_info()
# Mauve has to compare the actual text strings themselves, not embeddings
mauve = load('mauve')
mauve_results = mauve.compute(
    predictions=[short_1],
    references=[short_2],
    device_id=0,
)
mauve_results.mauve

Tokenizing text...
Featurizing tokens


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds


  explained_variance_ratio_ = explained_variance_ / total_var


1.0

In [None]:
mauve_results.mauve

In [35]:
rouge = load('rouge')
results = rouge.compute(predictions=[short_1], references=[short_2], rouge_types=['rougeL'])
print(results['rougeL'])

INFO:evaluate.module:Removing C:\Users\User\.cache\huggingface\metrics\rouge\default\default_experiment-1-0.arrow


1.0


In [37]:
a = embed_ada.embed_query('hello')
a_ = str(a)
print(num_tokens_from_string(a_))

15605


In [55]:
for heading, content in final_dict.items():
    print(heading, num_tokens_from_string(content))

h1 Self-driving car 450
h2_History of Automated Driving Systems 427
h2_Automated Driving 410
h2_Autonomous Vehicles 432
h2 Definitions 113
h3 Terminology and safety considerations 409
h3 Autonomous vs. automated 470
h3 Autonomous versus cooperative 78
h2 Classifications 0
h3 Self-driving car 154
h3 SAE classification 175
h3_SAE Automation Level Definitions 54
h3_Levels of Vehicle Automation 511
h3_SAE Levels of Automation 49
h3 Criticism of SAE 74
h2 Technology 0
h3 General perspectives 87
h3_Hybrid Navigation 425
h3_Path Planning 184
h3 Drive by wire 30
h3 Driver monitoring system 70
h3 Vehicular communication 100
h3 Re-programmable 164
h3 Modularity 69
h3 Homogenization 191
h3 Mathematical safety model 108
h2 Challenges 0
h3 Obstacles 442
h3_Deceptive Marketing and National Security in Self-Driving Cars 501
h3_Information on Hacking Attack and Automotive Data Security 155
h3 Human factors 472
h3_Implications of Liability and the Trolley Problem in Self-Driving Cars 382
h3_Autonomous 

In [56]:
final_dict['h2 See also']

''

In [62]:
embed_e5.embed_query('query:  ')

[-0.027671251446008682,
 -0.014324747025966644,
 -0.07623715698719025,
 -0.008881746791303158,
 0.05483413487672806,
 -0.05948346108198166,
 0.019534360617399216,
 0.03808154910802841,
 -0.05160670727491379,
 -0.030902404338121414,
 -0.01734822243452072,
 0.02849997766315937,
 -0.05307517200708389,
 -0.011364475823938847,
 -0.05454074963927269,
 0.059753913432359695,
 0.024845324456691742,
 -0.03145405277609825,
 0.025407718494534492,
 -0.003935365471988916,
 -0.008711738511919975,
 -0.016896313056349754,
 0.056399039924144745,
 -0.006800119765102863,
 0.020830068737268448,
 -0.005601944401860237,
 0.0018398178508505225,
 0.04083454608917236,
 -0.042921312153339386,
 -0.0027586885262280703,
 0.01847963221371174,
 0.03277849033474922,
 0.04608035460114479,
 -0.008207514882087708,
 -0.06770320236682892,
 0.04357730224728584,
 -0.05002405494451523,
 0.014490899629890919,
 -0.037664540112018585,
 -0.004635797347873449,
 -0.05585611239075661,
 0.00017891127208713442,
 -0.01470094732940197,


In [63]:
# Create json plan

final_headings = list(final_dict.keys())
final_content = list(final_dict.values())

def create_section_json(heading, content, id=1):
    result = {
        'section_id': id,
        'section': heading,
        'content': content,
        'section_embedding_1': embed_ada.embed_query(heading),
        'section_embedding_2': embed_e5.embed_query("query: " + heading),
        'content_embedding_1': embed_ada.embed_query(content),
        'content_embedding_2': embed_e5.embed_query("query: " + content),
    }
    print(f"Created embeddings for {heading}")
    return result

def calcuate_plan_embedding(plan: List[dict], i: int) -> List[float]:
    """Calculate plan embedding by averaging the section embeddings and content embeddings
    sequentially.

    Parameters
    ----------
    plan : List[dict]
        List of dictionaries, each containing the section and content embeddings.
    i : int
        The index of the embedding to use.
    """
    try:
        s1_mean = np.mean([x[f'section_embedding_{i}'] for x in plan], axis=0)
        c1_mean = np.mean([x[f'content_embedding_{i}'] for x in plan], axis=0)
    except KeyError:
        raise KeyError(
            f'Could not find section_embedding_{i} or content_embedding_{i} in '
            f'every element in plan. Please check that every element has the correct keys.'
        )
    total_mean = np.mean([c1_mean, s1_mean], axis=0)
    total_mean = list(total_mean)
    print(f"Calculated plan embedding {i}")
    return total_mean

# title is h1_example
title = final_headings[0][3:]
abstract = final_content[0]
plan = [
    create_section_json(heading, content, id=i)
    for i, (heading, content) in enumerate(zip(
        final_headings[1:], final_content[1:]), start=1
    )
    if 0 < num_tokens_from_string(content) <= 8000
]
plan_embed_1 = calcuate_plan_embedding(plan, 1)
plan_embed_2 = calcuate_plan_embedding(plan, 2)
plan_json = {
    'id': str(uuid4()),
    'title': title,
    'abstract': abstract,
    'title_embedding_1': embed_ada.embed_query(title),
    'title_embedding_2': embed_e5.embed_query("query: " + title),
    'abstract_embedding_1': embed_ada.embed_query(abstract),
    'abstract_embedding_2': embed_e5.embed_query("query: " + abstract),
    'plan': plan,
    'plan_embedding_1': plan_embed_1,
    'plan_embedding_2': plan_embed_2,
    'embedding1_model': 'text-embedding-ada-002',
    'embedding2_model': 'e5-base-v2',
    'success': True,
    'error': None,
}
plan_json

Created embeddings for h2_History of Automated Driving Systems
Created embeddings for h2_Automated Driving
Created embeddings for h2_Autonomous Vehicles
Created embeddings for h2 Definitions
Created embeddings for h3 Terminology and safety considerations
Created embeddings for h3 Autonomous vs. automated
Created embeddings for h3 Autonomous versus cooperative
Created embeddings for h3 Self-driving car
Created embeddings for h3 SAE classification
Created embeddings for h3_SAE Automation Level Definitions
Created embeddings for h3_Levels of Vehicle Automation
Created embeddings for h3_SAE Levels of Automation
Created embeddings for h3 Criticism of SAE
Created embeddings for h3 General perspectives
Created embeddings for h3_Hybrid Navigation
Created embeddings for h3_Path Planning
Created embeddings for h3 Drive by wire
Created embeddings for h3 Driver monitoring system
Created embeddings for h3 Vehicular communication
Created embeddings for h3 Re-programmable
Created embeddings for h3 Mo

{'id': '39535ce4-241a-4fb5-8c9f-7be440a258ca',
 'title': 'Self-driving car',
 'abstract': "\n A self-driving car, also known as an autonomous car, driverless car, or robotic car (robo-car),[1][2][3] is a car that is capable of traveling without human input.[4][5] Self-driving cars use sensors to perceive their surroundings, such as optical and thermographic cameras, radar, lidar, ultrasound/sonar, GPS, odometry and inertial measurement units.[6] Control systems interpret sensory information to create a three-dimensional model of the vehicle's surroundings. Based on the model, the car then identifies an appropriate navigation path and strategies for managing traffic controls (stop signs, traffic lights, speed limits, yield signs, etc.) and obstacles.[7][8][9][10][11]\n Once the technology matures, autonomous vehicles are predicted to impact the automotive industry, health, welfare, urban planning, traffic, insurance, labor market, and other fields. Their regulation is becoming an increa

In [89]:
'plan' in

True

In [81]:
plan_json['plan'][0].keys()

dict_keys(['section_id', 'section', 'content', 'section_embedding_1', 'section_embedding_2', 'content_embedding_1', 'content_embedding_2'])

In [102]:
async def get_embeddings(
    input: str | List[str], model: str = "text-embedding-ada-002"
) -> List[float] | List[List[float]]:
    """This function takes one string or a list of strings and a model name,
    generates an embedding for each string in the list using the specified model,
    and returns a list of embeddings, each represented as a list of floating point
    numbers.

    Parameters
    ----------
    input : str | List[str]
        The input string or list of strings to be embedded.
    model : str, optional ['text-embedding-ada-002', 'e5-base-v2']
        The name of the model to be used for embedding.
    """
    if model == "text-embedding-ada-002":
        embedder = OpenAIEmbeddings(model=model)
    elif model == "e5-base-v2":
        embedder = HuggingFaceEmbeddings(
            model_name=f"intfloat/{model}", encode_kwargs={"normalize_embeddings": True}
        )
    else:
        raise ValueError(
            f"Model name must be 'text-embedding-ada-002' or 'e5-base-v2'. Received {model}"
        )

    if isinstance(input, str):
        try:
            return await embedder.aembed_query(input)
        except NotImplementedError as e:
            return embedder.embed_query(input)
    elif isinstance(input, list):
        try:
            return await embedder.aembed_documents(input)
        except NotImplementedError as e:
            return embedder.embed_documents(input)
    else:
        raise ValueError(
            f"Input must be a string or a list of strings. Received {type(input)}"
        )

In [105]:
await get_embeddings(['hello', 'there'])

[[-0.025024859691861624,
  -0.01943293435610205,
  -0.027708454563770944,
  -0.031039815963126702,
  -0.024667928445974433,
  0.027417623007502064,
  -0.01249920972174263,
  -0.008513475923798008,
  -0.01746320197596711,
  -0.008453986761935083,
  0.03252042144342819,
  0.0042930921392632625,
  -0.02449607148814104,
  -0.0006386759458372029,
  0.014145060700799534,
  -0.001504565150955045,
  0.03942109343901068,
  0.0020209590807955756,
  0.026875615610915893,
  -0.012585138200659327,
  -0.020992856856242255,
  0.00889684648673897,
  0.008427547444881313,
  -0.00306200893601763,
  -0.005413460178477275,
  -0.009491732517432683,
  0.01109131352749225,
  -0.001705339093181914,
  0.003476776478998253,
  -0.023200544021183715,
  0.006725513499161767,
  -0.007931810017292476,
  -0.023887966264581735,
  -0.008943115524413715,
  0.006847795689781426,
  -0.013682372186697262,
  0.009485121989677298,
  -0.014131840576611354,
  0.02170671934801664,
  -0.010569136782849643,
  0.003407373155316783

In [92]:
prediction = deepcopy(plan_json)
document = deepcopy(plan_json)

def compare_documents(document: dict, prediction: dict, compare_on: str ='section'):
    """Compare the 'compare_on' sections of document and prediction. Calculate MAUVE,
    and ROUGE-L scores on the actual text, and cosine similarity on the embeddings.
    """
    if compare_on not in ['section', 'content']:
        raise ValueError(
            f"`compare_on` must be 'section' or 'content'. Received {compare_on}"
        )

    if not isinstance(document, dict) or not isinstance(prediction, dict):
        raise TypeError(
            'Both `document` and `prediction` must be dictionaries. Received '
            f'{type(document)} and {type(prediction)}'
        )

    if 'plan' not in document or 'plan' not in prediction:
        raise ValueError(
            f'Both `document` and `prediction` must contain the key "plan". At least '
            f'one of them does not.'
        )

    mauve = load("mauve")
    rouge = load("rouge")

    section_results = []
    predict_plan = prediction['plan']
    doc_plan = document['plan']
    for i, (p_dict, d_dict) in enumerate(zip(predict_plan, doc_plan), start=1):
        idx = i
        # Compute MAUVE
        mauve_results = mauve.compute(
            predictions=[p_dict['section']], references=[d_dict['section']]
        )
        mauve_score = mauve_results.mauve
        # Compute ROUGE-L
        results = rouge.compute(
            predictions=[p_dict['section']],
            references=[d_dict['section']],
            rouge_types=["rougeL"],
        )
        rouge_score = results["rougeL"]
        # Compute cosine distance between both section embeddings
        cosine_1 = cosine_similarity(
            [p_dict["section_embedding_1"]], [d_dict["section_embedding_1"]]
        )[0][0]
        cosine_2 = cosine_similarity(
            [p_dict["section_embedding_2"]], [d_dict["section_embedding_2"]]
        )[0][0]
        # Combine results
        result = {
            'section_id': idx,
            'mauve_similarity': mauve_score,
            'rouge_L_similarity': rouge_score,
            'embedding1_cosine_similarity': cosine_1,
            'embedding2_cosine_similarity': cosine_2,
        }
        section_results.append(result)

    # Calcualte total scores
    mauve_total = np.mean([x['mauve_similarity'] for x in section_results])
    rouge_total = np.mean([x['rouge_L_similarity'] for x in section_results])
    cosine_1_total = np.mean([x['embedding1_cosine_similarity'] for x in section_results])
    cosine_2_total = np.mean([x['embedding2_cosine_similarity'] for x in section_results])

    total_results = {
        'mauve_similarity': mauve_total,
        'rouge_L_similarity': rouge_total,
        'embedding1_cosine_similarity': cosine_1_total,
        'embedding2_cosine_similarity': cosine_2_total,
    }

    if compare_on == 'section':
        compare_on = 'plan'

    output = {
        'document_id': document['id'],
        'prediction_id': prediction['id'],
        f'{compare_on}_total_similarity': total_results,
        f'{compare_on}_bysection_similarity': section_results,
    }
    return output

In [93]:
sec_results = compare_documents(document, prediction, compare_on='section')

Tokenizing text...
Featurizing tokens


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds


  explained_variance_ratio_ = explained_variance_ / total_var


In [94]:
content_results = compare_documents(document, prediction, compare_on='content')

Tokenizing text...
Featurizing tokens


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.0 seconds
Tokenizing text...
Featurizing tokens


  explained_variance_ratio_ = explained_variance_ / total_var


Featurizing p:   0%|          | 0/1 [00:00<?, ?it/s]

Tokenizing text...
Featurizing tokens


Featurizing q:   0%|          | 0/1 [00:00<?, ?it/s]

seed = 25
performing clustering in lower dimension = 0
kmeans time: 0.0 s
total discretization time: 0.01 seconds


  explained_variance_ratio_ = explained_variance_ / total_var


In [96]:
sec_results

{'document_id': '39535ce4-241a-4fb5-8c9f-7be440a258ca',
 'prediction_id': '39535ce4-241a-4fb5-8c9f-7be440a258ca',
 'plan_total_similarity': {'mauve_similarity': 1.0,
  'rouge_L_similarity': 1.0,
  'embedding1_cosine_similarity': 1.0,
  'embedding2_cosine_similarity': 1.000000000000001},
 'plan_bysection_similarity': [{'section_id': 1,
   'mauve_similarity': 1.0,
   'rouge_L_similarity': 1.0,
   'embedding1_cosine_similarity': 0.9999999999999983,
   'embedding2_cosine_similarity': 1.0000000000000009},
  {'section_id': 2,
   'mauve_similarity': 1.0,
   'rouge_L_similarity': 1.0,
   'embedding1_cosine_similarity': 0.999999999999999,
   'embedding2_cosine_similarity': 1.0000000000000022},
  {'section_id': 3,
   'mauve_similarity': 1.0,
   'rouge_L_similarity': 1.0,
   'embedding1_cosine_similarity': 1.0000000000000013,
   'embedding2_cosine_similarity': 1.0},
  {'section_id': 4,
   'mauve_similarity': 1.0,
   'rouge_L_similarity': 1.0,
   'embedding1_cosine_similarity': 1.0000000000000009,

In [87]:
section_results

[{'section_id': 1,
  'mauve_similarity': 1.0,
  'rouge_L_similarity': 1.0,
  'embedding1_cosine_similarity': 0.9999999999999983,
  'embedding2_cosine_similarity': 1.0000000000000009},
 {'section_id': 2,
  'mauve_similarity': 1.0,
  'rouge_L_similarity': 1.0,
  'embedding1_cosine_similarity': 0.999999999999999,
  'embedding2_cosine_similarity': 1.0000000000000022}]

In [85]:
total_results

{'mauve_similarity': 1.0,
 'rouge_L_similarity': 1.0,
 'embedding1_cosine_similarity': 0.9999999999999987,
 'embedding2_cosine_similarity': 1.0000000000000016}

In [68]:
num_tokens_from_string('')

0

In [51]:
def create_section_json(heading, content, id=1):
    return {
        'section_id': id,
        'section': heading,
        'content': content,
        'section_embedding_1': embed_ada.embed_query(heading),
        'section_embedding_2': embed_e5.embed_query("query: " + heading),
        'content_embedding_1': embed_ada.embed_query(content),
        'content_embedding_2': embed_e5.embed_query("query: " + content),
    }

# plan_embedding_1 = np.mean([s['section_embedding_1']])
plan = [
    dict(s1=[1,2,3], c1=[4,5,6]),
    dict(s1=[7,8,9], c1=[10,11,12]),
]
s1_mean = np.mean([x['s1'] for x in plan], axis=0)
c1_mean = np.mean([x['c1'] for x in plan], axis=0)
total_mean = np.mean([c1_mean, s1_mean], axis=0)
total_mean

array([5.5, 6.5, 7.5])

In [53]:
(26)/4

6.5

In [None]:
def compare_content(plan_1: List[dict], plan_2: List[dict]):
    document_id = uuid4()
    prediction_id = uuid4()
    total_similarity = 'not possible'
    by_section_similarity = ...

In [None]:
def compare_plans(plan_1, plan_2):
    # Mauve has to compare the actual text strings themselves, not embeddings
    mauve = load('mauve')
    mauve_results = mauve.compute(predictions=[article_1], references=[article_2])
    mauve_results.mauve

    cosine_1 = cosine_similarity([plan_1['plan_embedding_1']], [plan_2['plan_embedding_1']])[0][0]
    cosine_2 = cosine_similarity([plan_1['plan_embedding_2']], [plan_2['plan_embedding_2']])[0][0]
    mauve_1 = ...
    return {
        'document_id': 'what is this?',
        'prediction_id': 'what is this?',
        'plan_similarity': '...'
    }

In [70]:
ex = [{"section_id": 1, "section": "Introduction", "content": "Introduction content...", "section_embedding1": [0.1, 0.2, 0.3]}]
embed_ada.embed_query(str(ex))
embed_e5.embed_query("query: " + str(ex))

[0.032588593661785126,
 -0.06461814790964127,
 0.03948038071393967,
 0.0012389448238536716,
 -0.06215894967317581,
 0.047782134264707565,
 -0.034225013107061386,
 -0.02329987846314907,
 -0.02498895302414894,
 -0.008283743634819984,
 0.0016184649430215359,
 -0.039514657109975815,
 0.006588333752006292,
 0.053870778530836105,
 0.01625111699104309,
 0.0062661306001245975,
 -0.003257924458011985,
 0.01289695780724287,
 0.039429157972335815,
 -0.03887464851140976,
 0.021556127816438675,
 -0.05280131474137306,
 -0.0013507960829883814,
 0.031462058424949646,
 -0.00983183179050684,
 -0.006684713996946812,
 -0.04383727163076401,
 -0.029189366847276688,
 -0.011523071676492691,
 0.03038288652896881,
 0.003982644062489271,
 0.007118346635252237,
 -0.020087596029043198,
 -0.02586466632783413,
 0.027217458933591843,
 -0.021199923008680344,
 -0.059106871485710144,
 -0.037464115768671036,
 0.030624210834503174,
 0.022320523858070374,
 0.004694132134318352,
 0.036746736615896225,
 -0.01298319362103939,

In [66]:
id(12)

2248336212560

# Unused Code

In [25]:
def convert_to_markdown(article_dict):
    md_text = ""

    for heading, content in article_dict.items():
        # heading is of form: 'h3 Example'
        # Define the markdown equivalent for the heading level
        heading_level = "#" * int(heading[1])
        heading = heading[3:]
        # Append the heading and the content to the markdown text
        md_text += f"{heading_level} {heading}\n\n{content}\n\n"

    return md_text

article_dict = {
    'h1_Self-driving car': 'Content for Self-driving car',
    'h2_History': 'Content for History',
    'h2_Definitions': 'Content for Definitions',
    'h3_Terminology and safety considerations': 'Content for Terminology and safety considerations',
    'h3_Autonomous vs. automa': 'Content for Autonomous vs. automa',
    'h3_Autonomous versus cooperativ': 'Content for Autonomous versus cooperativ',
    'h2_Classifications': 'Content for Classifications',
    'h3_Self-driving car': 'Content for Self-driving car',
    'h3_SAE classification': 'Content for SAE classification',
    'h3_Levels of driving automation': 'Content for Levels of driving automation'
}

self_driving_md = convert_to_markdown(doc_dict)

In [None]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4"),
    ("#####", "Header 5"),
    ("######", "Header 6"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
# markdown_splitter = MarkdownTextSplitter(chunk_size=512, chunk_overlap=0)
docs = markdown_splitter.split_text(self_driving_md)
# docs[0]Document(page_content="Dual phase evolution (DPE) is a process that drives self-organization within complex adaptive systems.[1] It arises in response to phase changes within the network of connections formed by a system's components. DPE occurs in a wide range of physical, biological and social systems. Its applications to technology include methods for manufacturing novel materials and algorithms to solve complex problems in computation.", metadata={'Header 1': 'Dual-phase evolution'})
docs[1]

In [19]:
embed_e5 = SentenceTransformer('intfloat/e5-base-v2')

Downloading (…)b9212/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

Downloading (…)0777bb9212/README.md:   0%|          | 0.00/67.5k [00:00<?, ?B/s]

Downloading (…)77bb9212/config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading (…)777bb9212/handler.py:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)b9212/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)0777bb9212/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)7bb9212/modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

In [21]:
"query: " + final_docs[0].page_content

"query: A self-driving car, also known as an autonomous car, driverless car, or robotic car (robo-car),[1][2][3] is a car that is capable of traveling without human input.[4][5] Self-driving cars use sensors to perceive their surroundings, such as optical and thermographic cameras, radar, lidar, ultrasound/sonar, GPS, odometry and inertial measurement units.[6] Control systems interpret sensory information to create a three-dimensional model of the vehicle's surroundings. Based on the model, the car then identifies an appropriate navigation path and strategies for managing traffic controls (stop signs, traffic lights, speed limits, yield signs, etc.) and obstacles.[7][8][9][10][11]\nOnce the technology matures, autonomous vehicles are predicted to impact the automotive industry, health, welfare, urban planning, traffic, insurance, labor market, and other fields. Their regulation is becoming an increasingly important issue.\nAutonomy in vehicles is often divided into six levels,[12] acc

In [22]:
embeddings = embed_e5.encode("query: " + final_docs[0].page_content, normalize_embeddings=True)
embeddings

array([ 0.0232219 , -0.07830162,  0.02831967, ..., -0.03946006,
        0.03561037,  0.02590975], dtype=float32)

In [23]:
embedding_openai = openai.Embedding.create(
    input = [final_docs[0].page_content],
    model="text-embedding-ada-002")
['data'][0]['embedding']
embedding_openai

TypeError: string indices must be integers

In [6]:
for i, doc in enumerate(docs):
    tokens = num_tokens_from_string(doc.page_content)
    if tokens > 512:
        print(f"Document {i}")
        print(doc.metadata)
        print(f"{num_tokens_from_string(doc.page_content):,} tokens")
        print(doc.page_content[:100])
        print()

Document 1
{'Header 1': 'Self-driving car', 'Header 2': 'History'}
1,269 tokens
Experiments have been conducted on automated driving systems (ADS) since at least the 1920s;[22] tri

Document 8
{'Header 1': 'Self-driving car', 'Header 2': 'Classifications', 'Header 3': 'Levels of driving automation'}
614 tokens
In SAE's automation level definitions, "driving mode" means "a type of driving scenario with charact

Document 11
{'Header 1': 'Self-driving car', 'Header 2': 'Technology', 'Header 3': 'Hybrid navigation'}
609 tokens
Hybrid navigation is the simultaneous use of more than one navigation system for location data deter

Document 20
{'Header 1': 'Self-driving car', 'Header 2': 'Challenges', 'Header 3': 'Concerns'}
656 tokens
Deceptive marketing
As Tesla's "Full Self-Driving (FSD)" actually corresponds to Level 2,[135]
senat

Document 22
{'Header 1': 'Self-driving car', 'Header 2': 'Challenges', 'Header 3': 'Moral issues'}
734 tokens
Rationale for liabilityThere are different opinions

In [7]:
history = docs[1]
history

Document(page_content='Experiments have been conducted on automated driving systems (ADS) since at least the 1920s;[22] trials began in the 1950s. The first semi-automated car was developed in 1977, by Japan\'s Tsukuba Mechanical Engineering Laboratory, which required specially marked streets that were interpreted by two cameras on the vehicle and an analog computer. The vehicle reached speeds up to 30 kilometres per hour (19\xa0mph) with the support of an elevated rail.[23][24]\nA landmark autonomous car appeared in the 1980s, with Carnegie Mellon University\'s Navlab[25] and ALV[26][27] projects funded by the United States\' Defense Advanced Research Projects Agency (DARPA) starting in 1984 and Mercedes-Benz and Bundeswehr University Munich\'s EUREKA Prometheus Project in 1987.[28] By 1985, the ALV had demonstrated self-driving speeds on two-lane roads of 31 kilometres per hour (19\xa0mph), with obstacle avoidance added in 1986, and off-road driving in day and night time conditions b

In [7]:
final_docs: List[Document] = []
start_docs = copy(docs)
for i, doc in enumerate(start_docs):
    num_tokens = num_tokens_from_string(doc.page_content)
    max_allowed_tokens = 512
    if num_tokens <= max_allowed_tokens:
        # Add topics
        topics = await extract_topics(doc.page_content)
        doc.metadata['topics'] = topics
        final_docs.append(doc)
    else:
        # Split the document into smaller chunks, then add topics
        char_splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_allowed_tokens,
            chunk_overlap=0,
            # ' ' separator means sometimes sentences will be cut in two to ensure
            # the chunk size is not exceeded
            separators=["\n\n", "\n", ' '],
            length_function=num_tokens_from_string,
        )
        splits: List[str] = char_splitter.split_text(doc.page_content)
        for split in splits:
            new_metadata = doc.metadata.copy()
            topics = await extract_topics(split)
            new_metadata['topics'] = topics
            final_docs.append(Document(page_content=split, metadata=new_metadata))

print(f"We started with {len(start_docs)} docs and got {len(final_docs)} final docs")
print(f"That's a {len(final_docs) / len(start_docs):.2f}x increase in docs")
for d in final_docs:
    print(d.metadata)
    print(f"{num_tokens_from_string(d.page_content):,} tokens")
    print(d.page_content[:100])
    print()

We started with 14 docs and got 15 final docs
That's a 1.07x increase in docs
{'Header 1': 'Dual-phase evolution', 'topics': 'Dual Phase Evolution'}
75 tokens
Dual phase evolution (DPE) is a process that drives self-organization within complex adaptive system

{'Header 1': 'Dual-phase evolution', 'Header 2': 'Introduction', 'topics': 'Dual Phase Evolution'}
265 tokens
Dual phase evolution (DPE) is a process that promotes the emergence of large-scale order in complex 

{'Header 1': 'Dual-phase evolution', 'Header 2': 'The DPE mechanism', 'topics': 'DPE'}
13 tokens
The following features are necessary for DPE to occur.[1]

{'Header 1': 'Dual-phase evolution', 'Header 2': 'The DPE mechanism', 'Header 3': 'Underlying network', 'topics': 'DPE'}
279 tokens
DPE occurs where a system has an underlying network. That is, the system's components form a set of 

{'Header 1': 'Dual-phase evolution', 'Header 2': 'The DPE mechanism', 'Header 3': 'Phase shifts', 'topics': 'Graphs and Networks'}
200 to

In [42]:
char_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=0,
    separators=["\n\n", "\n"],
    length_function=num_tokens_from_string,
)
splits = char_splitter.split_text(history.page_content)
print(f"We got {len(splits)} splits")

for i, split in enumerate(splits):
    topics = await extract_topics(split)
    print(f"Split {i} - Topics: {topics}")
    print(f"{num_tokens_from_string(split):,} tokens, {len(split)} characters")
    print(split)
    print()

We got 3 splits
Split 0 - Topic: automated driving systems, autonomous car, milestones
427 tokens, 1737 characters
Experiments have been conducted on automated driving systems (ADS) since at least the 1920s;[22] trials began in the 1950s. The first semi-automated car was developed in 1977, by Japan's Tsukuba Mechanical Engineering Laboratory, which required specially marked streets that were interpreted by two cameras on the vehicle and an analog computer. The vehicle reached speeds up to 30 kilometres per hour (19 mph) with the support of an elevated rail.[23][24]
A landmark autonomous car appeared in the 1980s, with Carnegie Mellon University's Navlab[25] and ALV[26][27] projects funded by the United States' Defense Advanced Research Projects Agency (DARPA) starting in 1984 and Mercedes-Benz and Bundeswehr University Munich's EUREKA Prometheus Project in 1987.[28] By 1985, the ALV had demonstrated self-driving speeds on two-lane roads of 31 kilometres per hour (19 mph), with obstacle

In [52]:
splits

["Experiments have been conducted on automated driving systems (ADS) since at least the 1920s;[22] trials began in the 1950s. The first semi-automated car was developed in 1977, by Japan's Tsukuba Mechanical Engineering Laboratory, which required specially marked streets that were interpreted by two cameras on the vehicle and an analog computer. The vehicle reached speeds up to 30 kilometres per hour (19\xa0mph) with the support of an elevated rail.[23][24]\nA landmark autonomous car appeared in the 1980s, with Carnegie Mellon University's Navlab[25] and ALV[26][27] projects funded by the United States' Defense Advanced Research Projects Agency (DARPA) starting in 1984 and Mercedes-Benz and Bundeswehr University Munich's EUREKA Prometheus Project in 1987.[28] By 1985, the ALV had demonstrated self-driving speeds on two-lane roads of 31 kilometres per hour (19\xa0mph), with obstacle avoidance added in 1986, and off-road driving in day and night time conditions by 1987.[29] A major miles

In [51]:
history.metadata['example'] = 'hello'
history.metadata

{'Header 1': 'Self-driving car', 'Header 2': 'History', 'example': 'hello'}

In [49]:
base_metadata = history.metadata.copy()
base_metadata['topics'] = topics
Document(page_content=split, metadata=base_metadata)

Document(page_content='In October 2018, Waymo announced that its test vehicles had traveled in automated mode for over 10,000,000 miles (16,000,000\xa0km), increasing by about 1,000,000 miles (1,600,000 kilometres) per month.[52] In December 2018, Waymo was the first to commercialize a fully autonomous taxi service in the US, in Phoenix, Arizona.[53] In October 2020, Waymo launched a geo-fenced driverless ride hailing service in Phoenix.[54][55] The cars are being monitored in real-time by a team of remote engineers, and there are cases where the remote engineers need to intervene.[56][55]\nIn March 2019, ahead of the autonomous racing series Roborace, Robocar set the Guinness World Record for being the fastest autonomous car in the world. In pushing the limits of self-driving vehicles, Robocar reached 282.42\xa0km/h (175.49\xa0mph) – an average confirmed by the UK Timing Association at Elvington in Yorkshire, UK.[57]\nIn 2020, a National Transportation Safety Board chairman stated tha

In [None]:
class MarkdownHeaderTextSplitterMaxTokenLimit(MarkdownHeaderTextSplitter):
    def __init__(self, max_tokens=512, **kwargs):
        super().__init__(**kwargs)
        self.max_tokens = max_tokens

    def calculate_tokens(self, string: str, encoding_name: str = "gpt-3.5-turbo") -> int:
        """Returns the number of tokens in a text string."""
        try:
            encoding = tiktoken.get_encoding(encoding_name)
        except ValueError:
            encoding = tiktoken.encoding_for_model(encoding_name)
        num_tokens = len(encoding.encode(string))
        return num_tokens

    def generate_extra_metadata(self, string: str):
        return 'EXTRA INFO ADDED'

    def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
        """Combine lines with common metadata into chunks
        Args:
            lines: Line of text / associated header metadata
        """
        aggregated_chunks: List[LineType] = []

        for line in lines:
            if (
                aggregated_chunks
                and aggregated_chunks[-1]["metadata"] == line["metadata"]
            ):
                # If the last line in the aggregated list
                # has the same metadata as the current line,
                # append the current content to the last lines's content
                new_content = aggregated_chunks[-1]["content"] + "  \n" + line["content"]
                new_token_count = self.calculate_tokens(new_content)

                # If less than 512 tokens, update the last line's content
                if new_token_count <= 512:
                    aggregated_chunks[-1]["content"] = new_content
                # If more than 512 tokens, start a new chunk and add extra metadata to it
                else:
                    # Generate the extra metadata for the current chunk.
                    new_metadata = self.generate_extra_metadata(line["content"])
                    line["metadata"] += new_metadata
                    # If the new token count would exceed the limit, start a new chunk
                    aggregated_chunks.append(line)
            else:
                # Otherwise, append the current line to the aggregated list
                aggregated_chunks.append(line)

        return [
            Document(page_content=chunk["content"], metadata=chunk["metadata"])
            for chunk in aggregated_chunks
        ]


In [6]:

import urllib.request
from urllib.parse import quote

def download_html(url, output_file):
    response = urllib.request.urlopen(url)
    data = response.read()

    with open(output_file, 'wb') as file:
        file.write(data)

def convert_html_to_markdown(input_file, output_file):
    command = ['pandoc', '-f', 'html', '-t', 'markdown', '-o', output_file, input_file]
    subprocess.run(command, check=True)

# Prepare the URL
wiki_page = 'https://en.wikipedia.org/wiki/OpenAI'
wiki_page_encoded = quote(wiki_page, safe=':/')
html_file = 'openai.html'
markdown_file = 'openai.md'

# Download the HTML page
download_html(wiki_page_encoded, html_file)

# Convert HTML to Markdown
convert_html_to_markdown(html_file, markdown_file)

print(f"Markdown file {markdown_file} has been created.")


Markdown file openai.md has been created.


In [15]:
with open('openai.md', 'r', encoding='utf-8') as file:
    markdown_text = file.read()

markdown_splitter = MarkdownTextSplitter(chunk_size=512, chunk_overlap=0)
docs = markdown_splitter.create_documents([markdown_text])
docs[1]

Document(page_content='::: vector-dropdown-content\n::: {#vector-main-menu-unpinned-container .vector-unpinned-container}\n::: {#vector-main-menu .vector-main-menu .vector-pinnable-element}\n::: {.vector-pinnable-header .vector-main-menu-pinnable-header .vector-pinnable-header-unpinned feature-name="main-menu-pinned" pinnable-element-id="vector-main-menu" pinned-container-id="vector-main-menu-pinned-container" unpinned-container-id="vector-main-menu-unpinned-container"}\n::: vector-pinnable-header-label\nMain menu\n:::', metadata={})

In [16]:
import subprocess
import urllib.request
from urllib.parse import quote
from bs4 import BeautifulSoup

def download_html(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode()
    return data

def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    content_text = soup.find('div', {'id': 'mw-content-text'})
    return str(content_text)

def convert_html_to_markdown(html_content, output_file):
    with open('temp.html', 'w') as f:
        f.write(html_content)
    command = ['pandoc', '-f', 'html', '-t', 'markdown', '-o', output_file, 'temp.html']
    subprocess.run(command, check=True)

# Prepare the URL
wiki_page = 'https://en.wikipedia.org/wiki/OpenAI'
wiki_page_encoded = quote(wiki_page, safe=':/')
markdown_file = 'openai.md'

# Download the HTML page
html_data = download_html(wiki_page_encoded)

# Parse HTML
parsed_html = parse_html(html_data)

# Convert HTML to Markdown
convert_html_to_markdown(parsed_html, markdown_file)

print(f"Markdown file {markdown_file} has been created.")


UnicodeEncodeError: 'charmap' codec can't encode character '\u2212' in position 7333: character maps to <undefined>

In [2]:
llm = OpenAI()

wikipedia_articles = [
    "https://en.wikipedia.org/wiki/Large_language_model",
    "https://en.wikipedia.org/wiki/Transformer_(machine_learning_model)",
    "https://en.wikipedia.org/wiki/Dual-phase_evolution",
    "https://en.wikipedia.org/wiki/Tessellation",
    "https://en.wikipedia.org/wiki/Climate_change",
    "https://en.wikipedia.org/wiki/DNA_nanotechnology",
    "https://en.wikipedia.org/wiki/Self-driving_car",
    "https://en.wikipedia.org/wiki/Unmanned_aerial_vehicle",
    "https://en.wikipedia.org/wiki/2022%E2%80%932023_food_crises",
    "https://en.wikipedia.org/wiki/Economic_impacts_of_climate_change",
]

dual_phase = wikipedia_articles[2]

article = wikipedia_articles[0]

# Send an HTTP request to the URL
response = requests.get(article)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Get the HTML content from the response
    html_content = response.text

    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
else:
    print(f"Failed to download the webpage. Status code: {response.status_code}")

In [3]:
loader = WebBaseLoader(article)
data = loader.load()
len(data)

1

In [4]:
dual_phase = 'https://en.wikipedia.org/wiki/Dual-phase_evolution'
# dual_phase = 'https://en.wikipedia.org/wiki/Climate_change'
response_dp = requests.get(dual_phase)
html_content_dp = response_dp.text
soup_dp = BeautifulSoup(html_content_dp, 'html.parser')

# Extract all headers and titles
titles_and_headers = []

# Find all title tags and extract their text
title_tags = soup_dp.find_all('title')
for title in title_tags:
    titles_and_headers.append(title.text.strip())

# Find all header tags (h1 to h6) and extract their text
header_tags = soup_dp.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
for header in header_tags:
    titles_and_headers.append(header.text.strip())

# Print the extracted headers and titles
for item in titles_and_headers:
    print(item)


Dual-phase evolution - Wikipedia
Contents
Dual-phase evolution
Introduction[edit]
The DPE mechanism[edit]
Underlying network[edit]
Phase shifts[edit]
Selection and variation[edit]
System memory[edit]
Examples[edit]
Social networks[edit]
Socio-economics[edit]
Forest ecology[edit]
Search algorithms[edit]
Related processes[edit]
References[edit]


In [14]:
soup_dp.h1

<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">Dual-phase evolution</span></h1>

In [86]:
with open('self_driving.html', 'w', encoding='utf-8') as file:
    file.write(soup.prettify())

In [85]:
[x for x in soup.find_all('h3') if 'Approaches' in x.get_text()][0].find_next()

<span class="mw-headline" id="Approaches">Approaches</span>

In [60]:
tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
found_tags = soup.find_all(tags)

# Extract tags and their associated content into a dictionary
found_tags[1].name

'h1'

In [51]:
# Assuming 'my_dict' is your dictionary
my_dict = {
    'Contents': None,
    'Dual-phase evolution': None,
    'Introduction[edit]': None,
    'The DPE mechanism[edit]': None,
    'Underlying network[edit]': None,
    'Phase shifts[edit]': None,
    'Selection and variation[edit]': None
}

# Remove '[edit]' from the end of the keys
for key in list(my_dict.keys()):  # Using list() to avoid changing the dictionary size during iteration
    new_key = key.rstrip('[edit]')
    if new_key != key:
        my_dict[new_key] = my_dict.pop(key)

print(my_dict)


{'Contents': None, 'Dual-phase evolution': None, 'Introduction': None, 'The DPE mechanism': None, 'Underlying network': None, 'Phase shifts': None, 'Selection and variation': None}


In [34]:
soup.h1.find_next_sibling().find_next_sibling().find_next_sibling()

AttributeError: 'NoneType' object has no attribute 'find_next_sibling'

In [28]:
# Load the webpage content
url = 'https://en.wikipedia.org/wiki/Dual-phase_evolution'
r = requests.get(url)
html_content = r.text

# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')

# Remove unwanted tags: script, style, [document], head, title
for element in soup(["script", "style", "head", "title", "[document]"]):
    element.decompose()

# Also remove HTML comments
for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
    element.extract()

# Define the tags to find
tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
found_tags = soup.find_all(tags)

# Extract tags and their associated content into a dictionary
doc_dict = {}
for tag in found_tags:
    content = []
    next_sibling = tag.find_next_sibling('p')

    while next_sibling and next_sibling.name not in tags:
        if next_sibling.name == 'p':
            content.append(next_sibling.get_text(strip=True))
        next_sibling = next_sibling.find_next_sibling()

    doc_dict[tag.get_text(strip=True)] = " ".join(content)

pprint(doc_dict, sort_dicts=False)

{'Contents': '',
 'Dual-phase evolution': '',
 'Introduction[edit]': 'Dual phase evolution (DPE) is a process that promotes '
                       'the emergence of large-scale order incomplex systems. '
                       'It occurs when a system repeatedly switches between '
                       'various kinds of phases, and in each phase different '
                       'processes act on the components or connections in the '
                       'system. DPE arises because of a property '
                       'ofgraphsandnetworks: the connectivity avalanche that '
                       'occurs in graphs as the number of edges increases.[2] '
                       'Social networks provide a familiar example. In asocial '
                       'networkthe nodes of the network are people and the '
                       'network connections (edges) are relationships or '
                       'interactions between people. For any individual, '
                       

In [36]:
with open('soup.html', 'w', encoding='utf-8') as file:
    file.write(soup.prettify())

In [37]:
broken_html = """<h1 class="firstHeading mw-first-heading" id="firstHeading">
        <span class="mw-page-title-main">
         Dual-phase evolution
        </span>
       </h1>
       <div class="vector-dropdown mw-portlet mw-portlet-lang" id="p-lang-btn">
        <input aria-haspopup="true" aria-label="Go to an article in another language. Available in 1 language" class="vector-dropdown-checkbox mw-interlanguage-selector" data-event-name="ui.dropdown-p-lang-btn" id="p-lang-btn-checkbox" role="button" type="checkbox">
         <label aria-hidden="true" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--action-progressive mw-portlet-lang-heading-1" for="p-lang-btn-checkbox" id="p-lang-btn-label">
         </label>
         <span class="vector-icon mw-ui-icon-language-progressive mw-ui-icon-wikimedia-language-progressive">
         </span>
         <span class="vector-dropdown-label-text">
          1 language
         </span>
         <div class="vector-dropdown-content">
          <div class="vector-menu-content">
           <ul class="vector-menu-content-list">
            <li class="interlanguage-link interwiki-fa mw-list-item">
             <a class="interlanguage-link-target" href="https://fa.wikipedia.org/wiki/%D9%81%D8%B1%DA%AF%D8%B4%D8%AA_%D8%AF%D9%88%D9%81%D8%A7%D8%B2%DB%8C" hreflang="fa" lang="fa" title="فرگشت دوفازی – Persian">
              <span>
               فارسی
              </span>
             </a>
            </li>
           </ul>
           <div class="after-portlet after-portlet-lang">
            <span class="wb-langlinks-edit wb-langlinks-link">
             <a class="wbc-editpage" href="https://www.wikidata.org/wiki/Special:EntityPage/Q25109659#sitelinks-wikipedia" title="Edit interlanguage links">
              Edit links
             </a>
            </span>
           </div>
          </div>
         </div>
        </input>
       </div>
      </header>
      <div class="vector-page-toolbar">
       <div class="vector-page-toolbar-container">
        <div id="left-navigation">
         <nav aria-label="Namespaces">
          <div class="vector-menu vector-menu-tabs mw-portlet mw-portlet-associated-pages" id="p-associated-pages">
           <div class="vector-menu-content">
            <ul class="vector-menu-content-list">
             <li class="selected vector-tab-noicon mw-list-item" id="ca-nstab-main">
              <a accesskey="c" class="" data-mw="interface" href="/wiki/Dual-phase_evolution" title="View the content page [c]">
               <span>
                Article
               </span>
              </a>
             </li>
             <li class="vector-tab-noicon mw-list-item" id="ca-talk">
              <a accesskey="t" class="" data-mw="interface" href="/wiki/Talk:Dual-phase_evolution" rel="discussion" title="Discuss improvements to the content page [t]">
               <span>
                Talk
               </span>
              </a>
             </li>
            </ul>
           </div>
          </div>
          <div class="vector-dropdown mw-portlet mw-portlet-variants emptyPortlet" id="p-variants">
           <input aria-haspopup="true" aria-label="Change language variant" class="vector-dropdown-checkbox" data-event-name="ui.dropdown-p-variants" id="p-variants-checkbox" role="button" type="checkbox"/>
           <label aria-hidden="true" class="vector-dropdown-label" for="p-variants-checkbox" id="p-variants-label">
           </label>
           <span class="vector-dropdown-label-text">
            English
           </span>
           <div class="vector-dropdown-content">
            <div class="vector-menu-content">
             <ul class="vector-menu-content-list">
             </ul>
            </div>
           </div>
          </div>
         </nav>
        </div>
        <div class="vector-collapsible" id="right-navigation">
         <nav aria-label="Views">
          <div class="vector-menu vector-menu-tabs mw-portlet mw-portlet-views" id="p-views">
           <div class="vector-menu-content">
            <ul class="vector-menu-content-list">
             <li class="selected vector-tab-noicon mw-list-item" id="ca-view">
              <a class="" data-mw="interface" href="/wiki/Dual-phase_evolution">
               <span>
                Read
               </span>
              </a>
             </li>
             <li class="vector-tab-noicon mw-list-item" id="ca-edit">
              <a accesskey="e" class="" data-mw="interface" href="/w/index.php?title=Dual-phase_evolution&amp;action=edit" title="Edit this page [e]">
               <span>
                Edit
               </span>
              </a>
             </li>
             <li class="vector-tab-noicon mw-list-item" id="ca-history">
              <a accesskey="h" class="" data-mw="interface" href="/w/index.php?title=Dual-phase_evolution&amp;action=history" title="Past revisions of this page [h]">
               <span>
                View history
               </span>
              </a>
             </li>
            </ul>
           </div>
          </div>
         </nav>
         <nav aria-label="More options" class="vector-page-tools-landmark">
          <div class="vector-dropdown vector-page-tools-dropdown" id="vector-page-tools-dropdown">
           <input aria-haspopup="true" aria-label="Tools" class="vector-dropdown-checkbox" data-event-name="ui.dropdown-vector-page-tools-dropdown" id="vector-page-tools-dropdown-checkbox" role="button" type="checkbox"/>
           <label aria-hidden="true" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" for="vector-page-tools-dropdown-checkbox" id="vector-page-tools-dropdown-label">
           </label>
           <span class="vector-dropdown-label-text">
            Tools
           </span>
           <div class="vector-dropdown-content">
            <div class="vector-unpinned-container" id="vector-page-tools-unpinned-container">
             <div class="vector-page-tools vector-pinnable-element" id="vector-page-tools">
              <div class="vector-pinnable-header vector-page-tools-pinnable-header vector-pinnable-header-unpinned" data-feature-name="page-tools-pinned" data-pinnable-element-id="vector-page-tools" data-pinned-container-id="vector-page-tools-pinned-container" data-unpinned-container-id="vector-page-tools-unpinned-container">
               <div class="vector-pinnable-header-label">
                Tools
               </div>
               <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-page-tools.pin">
                move to sidebar
               </button>
               <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-page-tools.unpin">
                hide
               </button>
              </div>
              <div class="vector-menu mw-portlet mw-portlet-cactions emptyPortlet vector-has-collapsible-items" id="p-cactions" title="More options">
               <div class="vector-menu-heading">
                Actions
               </div>
               <div class="vector-menu-content">
                <ul class="vector-menu-content-list">
                 <li class="selected vector-more-collapsible-item mw-list-item" id="ca-more-view">
                  <a href="/wiki/Dual-phase_evolution">
                   <span>
                    Read
                   </span>
                  </a>
                 </li>
                 <li class="vector-more-collapsible-item mw-list-item" id="ca-more-edit">
                  <a href="/w/index.php?title=Dual-phase_evolution&amp;action=edit">
                   <span>
                    Edit
                   </span>
                  </a>
                 </li>
                 <li class="vector-more-collapsible-item mw-list-item" id="ca-more-history">
                  <a href="/w/index.php?title=Dual-phase_evolution&amp;action=history">
                   <span>
                    View history
                   </span>
                  </a>
                 </li>
                </ul>
               </div>
              </div>
              <div class="vector-menu mw-portlet mw-portlet-tb" id="p-tb">
               <div class="vector-menu-heading">
                General
               </div>
               <div class="vector-menu-content">
                <ul class="vector-menu-content-list">
                 <li class="mw-list-item" id="t-whatlinkshere">
                  <a accesskey="j" href="/wiki/Special:WhatLinksHere/Dual-phase_evolution" title="List of all English Wikipedia pages containing links to this page [j]">
                   <span>
                    What links here
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-recentchangeslinked">
                  <a accesskey="k" href="/wiki/Special:RecentChangesLinked/Dual-phase_evolution" rel="nofollow" title="Recent changes in pages linked from this page [k]">
                   <span>
                    Related changes
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-upload">
                  <a accesskey="u" href="/wiki/Wikipedia:File_Upload_Wizard" title="Upload files [u]">
                   <span>
                    Upload file
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-specialpages">
                  <a accesskey="q" href="/wiki/Special:SpecialPages" title="A list of all special pages [q]">
                   <span>
                    Special pages
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-permalink">
                  <a href="/w/index.php?title=Dual-phase_evolution&amp;oldid=1126289474" title="Permanent link to this revision of this page">
                   <span>
                    Permanent link
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-info">
                  <a href="/w/index.php?title=Dual-phase_evolution&amp;action=info" title="More information about this page">
                   <span>
                    Page information
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-cite">
                  <a href="/w/index.php?title=Special:CiteThisPage&amp;page=Dual-phase_evolution&amp;id=1126289474&amp;wpFormIdentifier=titleform" title="Information on how to cite this page">
                   <span>
                    Cite this page
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-wikibase">
                  <a accesskey="g" href="https://www.wikidata.org/wiki/Special:EntityPage/Q25109659" title="Structured data on this page hosted by Wikidata [g]">
                   <span>
                    Wikidata item
                   </span>
                  </a>
                 </li>
                </ul>
               </div>
              </div>
              <div class="vector-menu mw-portlet mw-portlet-coll-print_export" id="p-coll-print_export">
               <div class="vector-menu-heading">
                Print/export
               </div>
               <div class="vector-menu-content">
                <ul class="vector-menu-content-list">
                 <li class="mw-list-item" id="coll-download-as-rl">
                  <a href="/w/index.php?title=Special:DownloadAsPdf&amp;page=Dual-phase_evolution&amp;action=show-download-screen" title="Download this page as a PDF file">
                   <span>
                    Download as PDF
                   </span>
                  </a>
                 </li>
                 <li class="mw-list-item" id="t-print">
                  <a accesskey="p" href="/w/index.php?title=Dual-phase_evolution&amp;printable=yes" title="Printable version of this page [p]">
                   <span>
                    Printable version
                   </span>
                  </a>
                 </li>
                </ul>
               </div>
              </div>
             </div>
            </div>
           </div>
          </div>
         </nav>
        </div>
       </div>
      </div>
      <div class="vector-column-end">
       <nav aria-label="More options" class="vector-page-tools-landmark vector-sticky-pinned-container">
        <div class="vector-pinned-container" id="vector-page-tools-pinned-container">
        </div>
       </nav>
      </div>
      <div aria-labelledby="firstHeading" class="vector-body" data-mw-ve-target-container="" id="bodyContent">
       <div class="vector-body-before-content">
        <div class="mw-indicators">
        </div>
        <div class="noprint" id="siteSub">
         From Wikipedia, the free encyclopedia
        </div>
       </div>
       <div id="contentSub">
        <div id="mw-content-subtitle">
        </div>
       </div>
       <div class="mw-body-content mw-content-ltr" dir="ltr" id="mw-content-text" lang="en">
        <div class="mw-parser-output">
         <div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">
          Process that drives self-organization within complex adaptive systems
         </div>
         <table class="box-Multiple_issues plainlinks metadata ambox ambox-content ambox-multiple_issues compact-ambox" role="presentation">
          <tbody>
           <tr>
            <td class="mbox-image">
             <div class="mbox-image-div">
              <img alt="" data-file-height="40" data-file-width="40" decoding="async" height="40" src="//upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/40px-Ambox_important.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/60px-Ambox_important.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/80px-Ambox_important.svg.png 2x" width="40"/>
             </div>
            </td>
            <td class="mbox-text">
             <div class="mbox-text-span">
              <div class="multiple-issues-text mw-collapsible">
               <b>
                This article has multiple issues.
               </b>
               Please help
               <b>
                <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Dual-phase_evolution&amp;action=edit">
                 improve it
                </a>
               </b>
               or discuss these issues on the
               <b>
                <a href="/wiki/Talk:Dual-phase_evolution" title="Talk:Dual-phase evolution">
                 talk page
                </a>
               </b>
               .
               <small>
                <i>
                 (
                 <a href="/wiki/Help:Maintenance_template_removal" title="Help:Maintenance template removal">
                  Learn how and when to remove these template messages
                 </a>
                 )
                </i>
               </small>
               <div class="mw-collapsible-content">
                <link href="mw-data:TemplateStyles:r1097763485" rel="mw-deduplicated-inline-style"/>
                <table class="box-More_citations_needed plainlinks metadata ambox ambox-content ambox-Refimprove" role="presentation">
                 <tbody>
                  <tr>
                   <td class="mbox-image">
                    <div class="mbox-image-div">
                     <a class="image" href="/wiki/File:Question_book-new.svg">
                      <img alt="" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/>
                     </a>
                    </div>
                   </td>
                   <td class="mbox-text">
                    <div class="mbox-text-span">
                     This article
                     <b>
                      needs additional citations for
                      <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">
                       verification
                      </a>
                     </b>
                     .
                     <span class="hide-when-compact">
                      Please help
                      <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Dual-phase_evolution&amp;action=edit">
                       improve this article
                      </a>
                      by
                      <a href="/wiki/Help:Referencing_for_beginners" title="Help:Referencing for beginners">
                       adding citations to reliable sources
                      </a>
                      . Unsourced material may be challenged and removed.
                      <br/>
                      <small>
                       <span class="plainlinks">
                        <i>
                         Find sources:
                        </i>
                        <a class="external text" href="https://www.google.com/search?as_eq=wikipedia&amp;q=%22Dual-phase+evolution%22" rel="nofollow">
                         "Dual-phase evolution"
                        </a>
                        –
                        <a class="external text" href="https://www.google.com/search?tbm=nws&amp;q=%22Dual-phase+evolution%22+-wikipedia&amp;tbs=ar:1" rel="nofollow">
                         news
                        </a>
                        <b>
                         ·
                        </b>
                        <a class="external text" href="https://www.google.com/search?&amp;q=%22Dual-phase+evolution%22&amp;tbs=bkt:s&amp;tbm=bks" rel="nofollow">
                         newspapers
                        </a>
                        <b>
                         ·
                        </b>
                        <a class="external text" href="https://www.google.com/search?tbs=bks:1&amp;q=%22Dual-phase+evolution%22+-wikipedia" rel="nofollow">
                         books
                        </a>
                        <b>
                         ·
                        </b>
                        <a class="external text" href="https://scholar.google.com/scholar?q=%22Dual-phase+evolution%22" rel="nofollow">
                         scholar
                        </a>
                        <b>
                         ·
                        </b>
                        <a class="external text" href="https://www.jstor.org/action/doBasicSearch?Query=%22Dual-phase+evolution%22&amp;acc=on&amp;wc=on" rel="nofollow">
                         JSTOR
                        </a>
                       </span>
                      </small>
                     </span>
                     <span class="date-container">
                      <i>
                       (
                       <span class="date">
                        May 2015
                       </span>
                       )
                      </i>
                     </span>
                     <span class="hide-when-compact">
                      <i>
                       (
                       <small>
                        <a href="/wiki/Help:Maintenance_template_removal" title="Help:Maintenance template removal">
                         Learn how and when to remove this template message
                        </a>
                       </small>
                       )
                      </i>
                     </span>
                    </div>
                   </td>
                  </tr>
                 </tbody>
                </table>
                <link href="mw-data:TemplateStyles:r1097763485" rel="mw-deduplicated-inline-style"/>
                <table class="box-Technical plainlinks metadata ambox ambox-style ambox-technical" role="presentation">
                 <tbody>
                  <tr>
                   <td class="mbox-image">
                    <div class="mbox-image-div">
                     <img alt="" data-file-height="48" data-file-width="48" decoding="async" height="40" src="//upload.wikimedia.org/wikipedia/en/thumb/f/f2/Edit-clear.svg/40px-Edit-clear.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/f/f2/Edit-clear.svg/60px-Edit-clear.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/f/f2/Edit-clear.svg/80px-Edit-clear.svg.png 2x" width="40"/>
                    </div>
                   </td>
                   <td class="mbox-text">
                    <div class="mbox-text-span">
                     This article
                     <b>
                      may be too technical for most readers to understand
                     </b>
                     .
                     <span class="hide-when-compact">
                      Please
                      <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Dual-phase_evolution&amp;action=edit">
                       help improve it
                      </a>
                      to
                      <a href="/wiki/Wikipedia:Make_technical_articles_understandable" title="Wikipedia:Make technical articles understandable">
                       make it understandable to non-experts
                      </a>
                      , without removing the technical details.
                     </span>
                     <span class="date-container">
                      <i>
                       (
                       <span class="date">
                        May 2015
                       </span>
                       )
                      </i>
                     </span>
                     <span class="hide-when-compact">
                      <i>
                       (
                       <small>
                        <a href="/wiki/Help:Maintenance_template_removal" title="Help:Maintenance template removal">
                         Learn how and when to remove this template message
                        </a>
                       </small>
                       )
                      </i>
                     </span>
                    </div>
                   </td>
                  </tr>
                 </tbody>
                </table>
               </div>
              </div>
              <span class="hide-when-compact">
               <i>
                (
                <small>
                 <a href="/wiki/Help:Maintenance_template_removal" title="Help:Maintenance template removal">
                  Learn how and when to remove this template message
                 </a>
                </small>
                )
               </i>
              </span>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
         <p>
          <b>
           Dual phase evolution
          </b>
          (
          <b>
           DPE
          </b>
          ) is a process that drives
          <a href="/wiki/Self-organization" title="Self-organization">
           self-organization
          </a>
          within
          <a href="/wiki/Complex_adaptive_system" title="Complex adaptive system">
           complex adaptive systems
          </a>
          .
          <sup class="reference" id="cite_ref-DPE2_1-0">
           <a href="#cite_note-DPE2-1">
            [1]
           </a>
          </sup>
          It arises in response to phase changes within the network of connections formed by a system's components. DPE occurs in a wide range of physical, biological and social systems. Its applications to technology include methods for manufacturing novel materials and algorithms to solve complex problems in computation.
         </p>
         <meta property="mw:PageProp/toc">
          <h2>
           <span class="mw-headline" id="Introduction">
            Introduction
           </span>
           <span class="mw-editsection">
            <span class="mw-editsection-bracket">
             [
            </span>
            <a href="/w/index.php?title=Dual-phase_evolution&amp;action=edit&amp;section=1" title="Edit section: Introduction">
             edit
            </a>
            <span class="mw-editsection-bracket">
             ]
            </span>
           </span>
          </h2>
          <p>
           Dual phase evolution (DPE) is a process that promotes the emergence of large-scale order in
           <a class="mw-redirect" href="/wiki/Complex_systems" title="Complex systems">
            complex systems
           </a>"""

working_code = """from bs4 import BeautifulSoup, Comment
import requests

# Load the webpage content
url = 'http://your-website-url.com'
r = requests.get(url)
html_content = r.text

# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')

# Remove unwanted tags: script, style, [document], head, title
for element in soup(["script", "style", "head", "title", "[document]"]):
    element.decompose()

# Also remove HTML comments
for element in soup.find_all(text=lambda text: isinstance(text, Comment)):
    element.extract()

# Define the tags to find
tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
found_tags = soup.find_all(tags)

# Extract tags and their associated content into a dictionary
doc_dict = {}
for tag in found_tags:
    content = []
    next_sibling = tag.find_next_sibling()

    while next_sibling and next_sibling.name not in tags:
        if next_sibling.name == 'p':
            content.append(next_sibling.get_text(strip=True))
        next_sibling = next_sibling.find_next_sibling()

    doc_dict[tag.get_text(strip=True)] = " ".join(content)

# Print the resulting dictionary
for k, v in doc_dict.items():
    print(f'{k}: {v}')
"""

prompt = f"""The following code was created by GPT-4 and successfully for each header tag finds and returns all the subsequent p tags until the next header tag and keeps doing that for each header tag and adds them to a dict.

However, it does not find the p tag for the h1 tag. Please correct it so that
it finds the p tag after the h1 tag

Working code: ####
{working_code}
####

html soup: ####
{broken_html}
"""
llm_16k.predict(prompt)

'To fix the code, you need to modify the while loop condition to include the h1 tag. Here\'s the modified code:\n\n```python\ndoc_dict = {}\nfor tag in found_tags:\n    content = []\n    next_sibling = tag.find_next_sibling()\n\n    while next_sibling and (next_sibling.name not in tags or tag.name == \'h1\'):\n        if next_sibling.name == \'p\':\n            content.append(next_sibling.get_text(strip=True))\n        next_sibling = next_sibling.find_next_sibling()\n\n    doc_dict[tag.get_text(strip=True)] = " ".join(content)\n```\n\nBy adding `tag.name == \'h1\'` to the while loop condition, it will include the p tags after the h1 tag as well.'

In [38]:
pprint('To fix the code, you need to modify the while loop condition to include the h1 tag. Here\'s the modified code:\n\n```python\ndoc_dict = {}\nfor tag in found_tags:\n    content = []\n    next_sibling = tag.find_next_sibling()\n\n    while next_sibling and (next_sibling.name not in tags or tag.name == \'h1\'):\n        if next_sibling.name == \'p\':\n            content.append(next_sibling.get_text(strip=True))\n        next_sibling = next_sibling.find_next_sibling()\n\n    doc_dict[tag.get_text(strip=True)] = " ".join(content)\n```\n\nBy adding `tag.name == \'h1\'` to the while loop condition, it will include the p tags after the h1 tag as well.')

('To fix the code, you need to modify the while loop condition to include the '
 "h1 tag. Here's the modified code:\n"
 '\n'
 '```python\n'
 'doc_dict = {}\n'
 'for tag in found_tags:\n'
 '    content = []\n'
 '    next_sibling = tag.find_next_sibling()\n'
 '\n'
 '    while next_sibling and (next_sibling.name not in tags or tag.name == '
 "'h1'):\n"
 "        if next_sibling.name == 'p':\n"
 '            content.append(next_sibling.get_text(strip=True))\n'
 '        next_sibling = next_sibling.find_next_sibling()\n'
 '\n'
 '    doc_dict[tag.get_text(strip=True)] = " ".join(content)\n'
 '```\n'
 '\n'
 "By adding `tag.name == 'h1'` to the while loop condition, it will include "
 'the p tags after the h1 tag as well.')


In [39]:
working_code = """from bs4 import BeautifulSoup, Comment
import requests

# Load the webpage content
url = 'http://your-website-url.com'
r = requests.get(url)
html_content = r.text

# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')

# Remove unwanted tags: script, style, [document], head, title
for element in soup(["script", "style", "head", "title", "[document]"]):
    element.decompose()

# Also remove HTML comments
for element in soup.find_all(text=lambda text: isinstance(text, Comment)):
    element.extract()

# Define the tags to find
tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
found_tags = soup.find_all(tags)

# Extract tags and their associated content into a dictionary
doc_dict = {}
for tag in found_tags:
    content = []
    next_sibling = tag.find_next_sibling()

    while next_sibling and next_sibling.name not in tags:
        if next_sibling.name not in ["script", "style", "head", "title", "[document]"]:
            content.append(next_sibling.get_text(strip=True))
        next_sibling = next_sibling.find_next_sibling()

    doc_dict[tag.get_text(strip=True)] = " ".join(content)

# Print the resulting dictionary
for k, v in doc_dict.items():
    print(f'{k}: {v}')
"""

prompt = f"""The following code was created by GPT-4 and successfully achives this goal: I only care about what a person reading the page will read. So just get me the headers and actual content text content that a human would see. The current code I tried gets things like 'document.documentElement.className="client-js ' which is obviously js code that the user would not read. add everything to a dict in the order you find them.

However, it does not find the p tag for the h1 tag. Please correct it so that
it finds the p tag after the h1 tag

Working code: ####
{working_code}
####

html soup: ####
{broken_html}
####
"""
output = llm_16k.predict(prompt)
pprint(output)

('To modify the existing code to include the `<p>` tag after the `<h1>` tag, '
 'you can update the `while` loop inside the `for` loop as follows:\n'
 '\n'
 '```python\n'
 'doc_dict = {}\n'
 'for tag in found_tags:\n'
 '    content = []\n'
 '    next_sibling = tag.find_next_sibling()\n'
 '\n'
 '    # Add the content of the <p> tag after the <h1> tag\n'
 "    if tag.name == 'h1':\n"
 "        p_tag = tag.find_next_sibling('p')\n"
 '        if p_tag:\n'
 '            content.append(p_tag.get_text(strip=True))\n'
 '\n'
 '    while next_sibling and next_sibling.name not in tags:\n'
 '        if next_sibling.name not in ["script", "style", "head", "title", '
 '"[document]"]:\n'
 '            content.append(next_sibling.get_text(strip=True))\n'
 '        next_sibling = next_sibling.find_next_sibling()\n'
 '\n'
 '    doc_dict[tag.get_text(strip=True)] = " ".join(content)\n'
 '```\n'
 '\n'
 'This code checks if the current tag is an `<h1>` tag and then finds the next '
 'sibling `<p>` tag usin

In [40]:
working_code = """from bs4 import BeautifulSoup, Comment
import requests

# Load the webpage content
url = 'http://your-website-url.com'
r = requests.get(url)
html_content = r.text

# Create a BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')

# Remove unwanted tags: script, style, [document], head, title
for element in soup(["script", "style", "head", "title", "[document]"]):
    element.decompose()

# Also remove HTML comments
for element in soup.find_all(text=lambda text: isinstance(text, Comment)):
    element.extract()

# Define the tags to find
tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
found_tags = soup.find_all(tags)

# Extract tags and their associated content into a dictionary
doc_dict = {}
for tag in found_tags:
    content = []
    next_sibling = tag.find_next_sibling()

    while next_sibling and next_sibling.name not in tags:
        if next_sibling.name not in ["script", "style", "head", "title", "[document]"]:
            content.append(next_sibling.get_text(strip=True))
        next_sibling = next_sibling.find_next_sibling()

    doc_dict[tag.get_text(strip=True)] = " ".join(content)

# Print the resulting dictionary
for k, v in doc_dict.items():
    print(f'{k}: {v}')
"""

prompt = f"""The following code finds all the p tags after header tags. But it does not find the p tags for the h1 tag. Why does it not find it?

Code: ####
{working_code}
####

html soup: ####
{broken_html}
####
"""
output = llm_16k.predict(prompt)
pprint(output)

('The code does not find the p tags for the h1 tag because the h1 tag is not '
 'included in the list of tags to find. In the code, the tags variable is '
 "defined as ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], so only h2 to h6 tags will "
 'be found. If you want to include the h1 tag as well, you can modify the tags '
 "variable to ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].")


In [35]:
soup

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled" dir="ltr" lang="en">

<body class="skin-vector skin-vector-search-vue mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject mw-editable page-Dual-phase_evolution rootpage-Dual-phase_evolution skin-vector-2022 action-view"><a class="mw-jump-link" href="#bodyContent">Jump to content</a>
<div class="vector-header-container">
<header class="vector-header mw-header">
<div class="vector-header-start">
<nav aria-label="Site" class="vector-main-menu-landmark" role="navigation">
<div class="vector-dropdown vector-main-menu-dropdown vector-button-flush-left vector-button-flush-right" i

In [5]:
Introduction
The DPE mechanism
Underlying network
Phase shifts
Selection and variation
System memory
Examples
Social networks
Socio-economics

SyntaxError: invalid syntax (221947299.py, line 2)

In [None]:
all_articles = WebBaseLoader(wikipedia_articles).load()
print('Total number of articles: ', len(all_articles))
for i, doc in enumerate(all_articles):
    print(f"Article {i} contains {num_tokens_from_string(doc.page_content):,} tokens - {wikipedia_articles[i]}")

In [None]:
shortest_article = all_articles[2]
shortest_article.page_content

In [None]:
llm_16k.predict(f"Summarize the following text: {shortest_article.page_content}")

In [None]:
# llm_default.predict(f'Translate the following text into German: {all_articles[3].page_content}')
output = llm_16k.predict(f'Translate the following text into German: {all_articles[3].page_content}')

In [None]:
num_tokens_from_string(output)

In [None]:
wikipedia_articles[3]

In [None]:
len(all_articles[3].page_content)

enc = tiktoken.encoding_for_model('gpt-3.5-turbo')
len(all_articles[3].page_content.split(' '))

In [None]:
num_tokens_from_string(all_articles[3].page_content)