In [9]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts) 
    return [u" ".join(t.strip() for t in visible_texts)]

def extract_title(body):
    soup = BeautifulSoup(body, 'html.parser')
    title = soup.find('title')
    return title

tokenlen = 30000
max_length = 12000
# recursion limit
recurse = 50

def smart_truncate(text: str) -> str:
    """Intelligently truncate text at natural boundaries"""
    if len(text) <= max_length:
        return text
    
    # Try to cut at paragraph breaks first
    truncated = text[:max_length]
    
    # Find last paragraph break
    last_para = truncated.rfind('\n\n')
    if last_para > max_length * 0.7:  # If we don't lose too much
        return truncated[:last_para] + "\n\n[Content truncated at paragraph break...]"
    
    # Fallback to sentence breaks
    last_sentence = truncated.rfind('. ')
    if last_sentence > max_length * 0.8:
        return truncated[:last_sentence + 1] + "\n\n[Content truncated...]"
    
    # Hard truncate as last resort
    return truncated + "\n\n[Content truncated...]"

def scrape(url:str)->str:
    """
    Scrape visible text content from a website URL.
    
    Args:
        url (str): The URL to scrape. Must be a valid HTTP/HTTPS URL.
        
    Returns:
        str: The visible text content wrapped in <text> tags, or error message.
             the text will be truncated if it is too long.
        
    Example:
        >>> scrape("https://example.com")
        "<text>Example Domain This domain is for use in illustrative examples...</text>"
    
    Note:
        - Only scrapes visible text, not HTML markup
        - Has basic error handling for network issues
        - Returns content wrapped in XML-style tags for parsing
    """
    try:
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        
        with urllib.request.urlopen(url, timeout=30) as response:
            html = response.read()
            
        # Extract title and text
        title = extract_title(html)
        visible_text = text_from_html(html)
        
        # Truncate text if too long
        visible_text = smart_truncate(visible_text)
        print(visible_text)
        return f"""
            <webpage>
                <url>{url}</url>
                <title>{title or "No title found"}</title>
                <content>
                    {visible_text.join().strip()}
                </content>
            </webpage>
            """
        
    except urllib.error.URLError as e:
        logger.error(f"URL error while scraping {url}: {e}")
        return f"URL error occurred: {e}"
    except urllib.error.HTTPError as e:
        logger.error(f"HTTP error while scraping {url}: {e}")
        return f"HTTP error occurred: {e.code} - {e.reason}"
    except Exception as e:
        logger.error(f"Unexpected error while scraping {url}: {e}")
        return f"An unexpected error occurred: {e}"

scrape("https://portland.craigslist.org/clk/act/d/vancouver-lets-go-walking-hiking/7859242796.html")

["             CL      portland  >      clark/cowlitz  >      community  >      activity partners         post    account       favorites         hidden         CL   clark/cowlitz            >\n\nactivity partners   ...       ◀  prev  ▲  next ▶      reply      favorite     favorite       hide     unhide       ⚐  ⚑   flag    ⚑  flagged        Posted 2025-06-18 15:16   Contact Information:    print      Lets go walking/hiking (Vancouver)            QR Code Link to This Post   I am a senior and would like to find another senior or a group to go walking with. I consider myself to be generally fit and would enjoy hikes 45 to 60 minutes long. Prefer paved trails but will consider all.....thanks     post id: 7859242796  posted: 2025-06-18 15:16    ♥ best of   [ ? ]           © 2025 craigslist CL  help  safety  privacy  terms  about  app         loading  reading  writing  saving  searching    refresh the page.      Lets go walking/hiking - activity partners - craigslist  I am a senior and woul

  texts = soup.findAll(text=True)


NameError: name 'logger' is not defined