In [1]:
import asyncio
import nest_asyncio
import os
import sys

from dotenv import load_dotenv
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import Html2TextTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
os.environ['OPENAI_API_KEY'] = api_key

In [2]:
nest_asyncio.apply()

In [3]:
urls=[
    "https://www.spirit.com/",
    "https://www.spirit.com/s/about_us",
    "https://foundation.spirit.com/es-US/foundation",
    "https://www.spirit.com/free-spirit",
    "https://www.spirit.com/savers-club",
    "https://www.spirit.com/press-release",
    "https://ir.spirit.com/overview/default.aspx",
    "https://careers.spirit.com/careers-home",
    "https://ir.spirit.com/overview/default.aspx",
    "https://customersupport.spirit.com/es-US/",
    "https://www.spirit.com/s/contact-us",
    "https://www.spirit.com/legal",
    "https://www.spirit.com/s/privacy-policy",
    "https://content.spirit.com/Shared/es-pr/Documents/Contract_of_Carriage.pdf",
    "https://privacyportal.onetrust.com/webform/a3ba0bce-b005-4f16-b003-c296ec63f389/bce5864e-c8c7-43de-85e9-952e0e564167",
    "https://www.spirit.com/#scroll-home-search",
    "https://customersupport.spirit.com/en-us/category/article/KA-01333",
    "https://www.spirit.com/es/vuelos",
    "https://vacations.spirit.com/es/vacaciones",
    "https://www.spirit.com/my-trips/find-trip",
    "https://www.spirit.com/flight-status",
    "https://www.spirit.com/s/onboardexperience",
    "https://www.spirit.com/s/wifi",
    "https://www.spirit.com/s/info",
    "https://www.spirit.com/spirit-101",
    "https://www.spirit.com/travel-advisory",
    ]

In [4]:
async def load_and_process_documents():
    loader = AsyncChromiumLoader(urls)
    tt = Html2TextTransformer()

    docs = tt.transform_documents(loader.load())
    print(docs)
    
    ts = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    fd = ts.split_documents(docs)
    
    return fd

In [5]:
fd = await load_and_process_documents()
print(len(fd))
l = []
client = OpenAI()

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://www.spirit.com/'}, page_content='Sign-In\n\n  * # >> Since \n\n#  >> Since\n\n  * 0 PTS\n  * My Dashboard\n  * Edit profile\n  * Free spirit Status\n  * Activity\n  * Saver$ Club\n  * Sign Out\n\nEspañol __\n\n  * Hi,\n  * # >> Since \n\n#  >> Since\n\n  * 0 PTS\n  * My Dashboard\n  * Edit profile\n  * Free spirit Status\n  * Activity\n  * Saver$ Club\n  * Sign Out\n\nEspañol __\n\n  * Book\n  * * * *\n\nMy Trips\n\n  * * * *\n\nCheck In\n\n  * * * *\n\nFlight Status\n\n  * * * *\n\nTravel Info\n\n __\n\n  * * * *\n\nLoyalty\n\n __\n\n  * * * *\n\nDeals\n\n __\n\n  * * * *\n\nContact Us\n\n  * * * *\n\n __Español\n\n  * Book\n  * My Trips\n  * Check In\n  * Flight Status\n  * Travel Info \n\nOnboard Experience\n\nWi-fi\n\n  * Loyalty \n\nFree Spirit®\n\nSaver$ Club\n\n  * Deals \n\nFlight Deals\n\nVacation Deals\n\n  * Contact Us \n  * Sign-In __\n\n  * ES __\n\n  * FLIGHT\n\n  * BUNDLE & SAVE \n\n  * HOTEL \n\n  * CAR \n\n  * CRUISE \n\nRound Tri

In [6]:
for xx in fd:
    response = completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "system",
            "content": "You are a highly intelligent assistant capable of breaking text into structured summaries for create chunks for a pinecone vector database."
        },
        {
            "role": "user",
            "content": (
                "Break down the following text into structured iterable bullet points using the format: "
                "'topic | subtopic | content | url information link',"
                "'topic | subtopic | content | url information link',"
                "Guidelines:"
                "- Focus only on meaningful sentences that has a chunk structure."
                "- Ignore headings, filler words, or unimportant details."
                "- Use concise yet descriptive sentences for the 'content' keeping context."
                "- Return everything on Spanish'."
                "- For the url information link just write the explicit the link, not extra info"
                "Text to summarize:"
                f"{xx}"
            )
        }]
    )
    l.append(response.choices[0].message.content)

print("".join(l))

- reservas | libros de viajes | Puedes reservar vuelos y gestionar tus viajes a través del portal en línea. | https://www.spirit.com/
- información de viaje | información de vuelo | Encuentra el estado de tus vuelos y revisa detalles importantes de viaje. | https://www.spirit.com/
- programa de lealtad | Free Spirit® | Únete al programa de lealtad para obtener beneficios adicionales. | https://www.spirit.com/
- ofertas | ofertas de vuelos | Explora ofertas y descuentos en vuelos y vacaciones. | https://www.spirit.com/
- contacto | servicio al cliente | Puedes contactar al servicio al cliente para obtener asistencia. | https://www.spirit.com/
- experiencias a bordo | wi-fi | Disfruta de conexión Wi-fi y otras comodidades a bordo. | https://www.spirit.com/
- opciones de reserva | viaje redondo y un solo sentido | Haz reservas para viajes de ida y vuelta, un solo sentido, o múltiples ciudades. | https://www.spirit.com/- Viajes | Fechas de viaje | Seleccione las fechas de viaje: salida el 

In [7]:
import json

# formatted_list = [item.replace("\n", "\n") for item in l]

with open("output.json", "w", encoding="utf-8") as f:
    json.dump(l, f, ensure_ascii=False, indent=4)

print("Results saved to output.json")

Results saved to output.json


In [8]:
def clean_chunks(text: str) -> list:
    """
    Splits a text into a list of chunks where special characters like "\n" act as delimiters
    and replaces those special characters with commas in the resulting chunks.
    
    Args:
        text (str): The input text to process.

    Returns:
        list: A list of cleaned text chunks.
    """
    # Split the text into chunks by "\n"
    chunks = text.split("\n")
    
    # Replace special characters like "\n" with a comma in each chunk
    cleaned_chunks = [chunk.strip().replace("\n", ",") for chunk in chunks if chunk.strip()]
    
    return cleaned_chunks

In [9]:
def clean_chunks(text: str) -> list:
    """
    Splits a text into a list of chunks where special characters like "\n" act as delimiters
    and replaces those special characters with commas in the resulting chunks.
    
    Args:
        text (str): The input text to process.

    Returns:
        list: A list of cleaned text chunks.
    """
    chunks = text.split("\n")    
    cleaned_chunks = [chunk.strip().replace("\n", ",") for chunk in chunks if chunk.strip()]
    return cleaned_chunks

In [10]:
preprocessed_list = [clean_chunks(item) for item in l]

flattened_list = [chunk for sublist in preprocessed_list for chunk in sublist]

with open("output.json", "w", encoding="utf-8") as f:
    json.dump(flattened_list, f, ensure_ascii=False, indent=4)

print("Results saved to output.json")

Results saved to output.json


## Langchain tutorial Html2TextTransformer

In [40]:
%pip install -q langchain-openai langchain playwright beautifulsoup4
%playwright install


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


UsageError: Line magic function `%playwright` not found.


In [52]:
import asyncio
import dotenv

from langchain_community.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import Html2TextTransformer

dotenv.load_dotenv()
USER_AGENT = os.getenv("USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36")

Overwrite playwright settings in AsyncChromiumLoader

In [54]:
from playwright.async_api import TimeoutError as PlaywrightTimeoutError

class RobustChromiumLoader(AsyncChromiumLoader):
    """
    Enhanced AsyncChromiumLoader that handles:
    - Auto-navigation and redirections
    - Pop-ups & unwanted elements
    - Pages that dynamically change content
    """
    async def _fetch(self, page, url):
        try:
            # Set User-Agent to avoid detection
            await page.set_extra_http_headers({"User-Agent": USER_AGENT})

            # Block auto-navigation & unwanted elements (ads, pop-ups)
            await page.route("**/*", lambda route: route.continue_())

            # Navigate to the page, ensuring it fully loads
            await page.goto(url, wait_until="domcontentloaded", timeout=15000)

            # Wait until network is idle (no new requests for dynamic content)
            await page.wait_for_load_state("networkidle")

            # Ensure a key content element is present before extracting (avoids broken loads)
            await page.wait_for_selector("body", timeout=5000)

            # Finally, extract page content
            return await page.content()

        except PlaywrightTimeoutError:
            print(f"⏳ Timeout while loading {url}. Retrying...")
            return await self._retry_fetch(page, url)
        
        except Exception as e:
            print(f"❌ Error loading {url}: {e}")
            return None

    async def _retry_fetch(self, page, url, retries=2):
        """Retry fetching the page in case of timeout errors."""
        for attempt in range(retries):
            try:
                print(f"🔄 Retrying ({attempt + 1}/{retries}) for {url}...")
                await page.goto(url, wait_until="domcontentloaded", timeout=20000)
                await page.wait_for_load_state("networkidle")
                await page.wait_for_selector("body", timeout=5000)
                return await page.content()
            except PlaywrightTimeoutError:
                continue  # Try again
        print(f"❌ Final retry failed for {url}")
        return None

In [55]:
async def fetch_and_extract_text(urls):
    """
    Uses AsyncChromiumLoader to load the webpage asynchronously and extract clean text.
    """
    loader = RobustChromiumLoader(urls=urls)
    docs = await loader.aload()

    if not docs:
        return None

    transformer = Html2TextTransformer()
    extracted_docs = transformer.transform_documents(docs)
    
    return extracted_docs

In [56]:
def save_documents_to_files(documents, directory="./output/"):
    """
    Saves each extracted document to a .txt file.
    """
    os.makedirs(directory, exist_ok=True)

    for index, doc in enumerate(documents):
        content = doc.page_content
        file_path = os.path.join(directory, f"file_{index + 1}.txt") 

        with open(file_path, "w", encoding="utf-8") as file:
            file.write(content)


In [None]:
urls = [
    "https://www.vensure.com/",
]

DIR_Html2TextTransformer = "./html2text_output/"

documents = await fetch_and_extract_text(urls)

if documents:
    save_documents_to_files(documents, DIR_Html2TextTransformer)
else:
    print("No content extracted!")