In [1]:
# Load HTML
import nest_asyncio
import asyncio
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
# Allow nested use of asyncio.run() (presumably needed because the notebook
# kernel already has an event loop running -- confirm for your environment).
nest_asyncio.apply()
# Build a loader for a single page: the Wikipedia article on India.
loader = AsyncChromiumLoader(['https://en.wikipedia.org/wiki/India'])
# NOTE: despite its name, `html` holds whatever loader.load() returns; the next
# cell passes it to transform_documents(), i.e. it is treated as documents.
html = loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Transform: reduce the loaded pages to the content of the selected HTML tags.
bs_transformer = BeautifulSoupTransformer()
# NOTE(review): only "span" tags are requested here. For this Wikipedia page the
# preview below shows mostly navigation / table-of-contents labels rather than
# article text -- consider a different tag list (e.g. "p") if body text is wanted.
docs_transformed = bs_transformer.transform_documents(html, tags_to_extract=["span"])

In [4]:
# Preview the first 2000 characters of the first transformed document.
docs_transformed[0].page_content[0:2000]

"Main menu Main page Contents Current events Random article About Wikipedia Contact us Donate Help Learn to edit Community portal Recent changes Upload file    Search Appearance Create account Log in Personal tools Create account Log in learn more Contributions Talk 1 Etymology 2 History Toggle History subsection 2.1 Ancient India 2.2 Medieval India 2.3 Early modern India 2.4 Modern India 3 Geography 4 Biodiversity 5 Politics and government Toggle Politics and government subsection 5.1 Politics 5.2 Government 5.3 Administrative divisions 5.3.1 States 5.3.2 Union territories 6 Foreign, economic and strategic relations 7 Economy Toggle Economy subsection 7.1 Industries 7.2 Energy 7.3 Socio-economic challenges 8 Demographics, languages and religion 9 Culture Toggle Culture subsection 9.1 Visual art 9.2 Architecture 9.3 Literature 9.4 Performing arts and media 9.5 Society 9.6 Education 9.7 Clothing 9.8 Cuisine 9.9 Sports and recreation 10 See also 11 Notes 12 References 13 Bibliography 14 

In [5]:
from langchain_community.document_loaders import AsyncHtmlLoader

# Second approach: load two pages with AsyncHtmlLoader instead of AsyncChromiumLoader.
urls = ["https://en.wikipedia.org/wiki/India", "https://lilianweng.github.io/posts/2023-06-23-agent/"]
# NOTE: this rebinds `loader`, replacing the AsyncChromiumLoader created in the first cell.
loader = AsyncHtmlLoader(urls)
# `docs` is the loader's result for the URLs above; it is fed to Html2TextTransformer next.
docs = loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.
Fetching pages: 100%|##########| 2/2 [00:00<00:00,  2.24it/s]


In [6]:
from langchain_community.document_transformers import Html2TextTransformer

# Convert the documents loaded by AsyncHtmlLoader into text.
html2text = Html2TextTransformer()
# NOTE: this overwrites `docs_transformed` from the BeautifulSoup cell above, so the
# span-based result is no longer available after this cell runs.
docs_transformed = html2text.transform_documents(docs)
# Preview the first 500 characters of the first converted document.
docs_transformed[0].page_content[0:500]

'Jump to content\n\nMain menu\n\nMain menu\n\nmove to sidebar hide\n\nNavigation\n\n  * Main page\n  * Contents\n  * Current events\n  * Random article\n  * About Wikipedia\n  * Contact us\n  * Donate\n\nContribute\n\n  * Help\n  * Learn to edit\n  * Community portal\n  * Recent changes\n  * Upload file\n\nSearch\n\nSearch\n\nAppearance\n\n  * Create account\n  * Log in\n\nPersonal tools\n\n  * Create account\n  * Log in\n\nPages for logged out editors learn more\n\n  * Contributions\n  * Talk\n\n## Contents\n\nmove to sidebar hide\n\n  * (Top)'

In [7]:
import os
from langchain_groq import ChatGroq
# Read the Groq API key from the environment (None when GROQ_API_KEY is unset).
# NOTE(review): `groq_api_key` is not referenced again in the visible notebook --
# it is not passed to ChatGroq below; confirm whether it is needed at all.
groq_api_key = os.environ.get("GROQ_API_KEY")

In [8]:
# Chat model used by extract() below; responses are capped at 1000 tokens.
# NOTE(review): no API key is passed explicitly -- presumably ChatGroq reads
# GROQ_API_KEY from the environment itself; confirm.
llm = ChatGroq(model="llama3-70b-8192",max_tokens=1000)

In [None]:
# Alternative: use a local Ollama model instead of Groq (uncomment both lines to switch).
# from langchain_community.chat_models import ChatOllama
# llm = ChatOllama(model="llama3.1:8b")

In [9]:
from langchain.chains import create_extraction_chain

# Extraction schema: two string fields, both mandatory.
# Each field gets its own {"type": "string"} spec, and the "required" list is
# derived from the property names so the two cannot drift apart.
schema = {
    "properties": {
        field_name: {"type": "string"}
        for field_name in ("news_article_title", "news_article_summary")
    },
}
schema["required"] = list(schema["properties"])


def extract(content: str, schema: dict):
    """Run an LLM extraction chain over ``content`` and return the extracted data.

    Args:
        content: Text to extract structured fields from.
        schema: Extraction schema passed to ``create_extraction_chain``
            (here a dict with "properties" and "required").

    Returns:
        The chain's single output value -- the same value the former
        ``chain.run(content)`` call returned.
    """
    chain = create_extraction_chain(schema=schema, llm=llm)
    # Chain.run() is deprecated in favour of invoke(). invoke() returns the full
    # result mapping, so select the chain's (single) output key, which is what
    # run() did internally.
    result = chain.invoke(content)
    return result[chain.output_keys[0]]

In [12]:
import pprint

from langchain_text_splitters import RecursiveCharacterTextSplitter

def scrape_with_playwright(urls, schema, tags_to_extract=("span",)):
    """Load pages, reduce them to tag text, and extract structured data with the LLM.

    The pages are loaded with ``AsyncChromiumLoader``, filtered with
    ``BeautifulSoupTransformer``, split into chunks of 1000 tokens, and only the
    very first chunk is sent to ``extract``. With several URLs this means only
    the beginning of the first document is processed.

    Args:
        urls: List of page URLs to load.
        schema: Extraction schema forwarded to ``extract``.
        tags_to_extract: HTML tag names passed to the transformer. Defaults to
            ``("span",)``, the value that used to be hard-coded.

    Returns:
        Whatever ``extract`` returns for the first chunk, or an empty list when
        the loaded pages produce no text to split.
    """
    loader = AsyncChromiumLoader(urls)
    docs = loader.load()
    bs_transformer = BeautifulSoupTransformer()
    docs_transformed = bs_transformer.transform_documents(
        docs, tags_to_extract=list(tags_to_extract)
    )
    print("Extracting content with LLM")

    # Grab the first 1000 tokens of the site
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1000, chunk_overlap=0
    )
    splits = splitter.split_documents(docs_transformed)

    # Guard against pages that yield no text for the requested tags; indexing
    # splits[0] would otherwise raise an IndexError.
    if not splits:
        print("No text was extracted from the given URLs; skipping LLM extraction")
        return []

    # Process the first split
    extracted_content = extract(schema=schema, content=splits[0].page_content)
    pprint.pprint(extracted_content)
    return extracted_content

# Run the scrape-and-extract pipeline on the Wikipedia article on India.
# NOTE: this rebinds `urls`, replacing the two-URL list defined in the AsyncHtmlLoader cell.
urls = ["https://en.wikipedia.org/wiki/India"]
# The result is also pretty-printed inside scrape_with_playwright().
extracted_content = scrape_with_playwright(urls, schema=schema)

USER_AGENT environment variable not set, consider setting it to identify your requests.


Extracting content with LLM
[{'news_article_summary': 'Main menu Main page Contents Current events Random '
                          'article About Wikipedia Contact us Donate Help '
                          'Learn to edit Community portal Recent changes '
                          'Upload file    Search Appearance Create account Log '
                          'in Personal tools Create account Log in learn more '
                          'Contributions Talk 1 Etymology 2 History Toggle '
                          'History subsection 2.1 Ancient India 2.2 Medieval '
                          'India 2.3 Early modern India 2.4 Modern India 3 '
                          'Geography 4 Biodiversity 5 Politics and government '
                          'Toggle Politics and government subsection 5.1 '
                          'Politics 5.2 Government 5.3 Administrative '
                          'divisions 5.3.1 States 5.3.2 Union territories 6 '
                          'Foreign, econom