In [None]:
!playwright install
# Instalar dependencias primero
# !pip install requests beautifulsoup4 html2text lxml pandas
# !pip install crawl4ai nest-asyncio


In [1]:
import requests
from bs4 import BeautifulSoup
import html2text
import pandas as pd
from urllib.parse import urljoin, urlparse

def scrape_website(url, css_selector=None):
    """
    Scraper simple y confiable para Jupyter
    """
    try:
        # Headers para evitar bloqueos
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'es-ES,es;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        
        print(f"🔍 Scrapeando: {url}")
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        
        # Parsear HTML
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Limpiar contenido no deseado
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
            element.decompose()
        
        # Extraer contenido específico si se proporciona selector
        if css_selector:
            selected_elements = soup.select(css_selector)
            if selected_elements:
                content_soup = BeautifulSoup('', 'html.parser')
                for elem in selected_elements:
                    content_soup.append(elem)
                soup = content_soup
        
        # Convertir a markdown
        h = html2text.HTML2Text()
        h.ignore_links = False
        h.ignore_images = False
        h.body_width = 0  # No limitar ancho
        markdown_content = h.handle(str(soup))
        
        # Extraer texto plano
        text_content = soup.get_text(strip=True, separator=' ')
        
        return {
            'url': url,
            'title': soup.title.string if soup.title else 'Sin título',
            'markdown': markdown_content,
            'text': text_content,
            'html': response.text,
            'status_code': response.status_code,
            'success': True
        }
        
    except Exception as e:
        return {
            'url': url,
            'error': str(e),
            'success': False
        }

In [2]:
# Probar con la URL de exactas
url = "https://exactas.uba.ar/ensenanza/carreras-de-grado/ciencias-de-datos/"
result = scrape_website(url)

if result['success']:
    print("✅ Scraping exitoso!")
    print(f"📄 Título: {result['title']}")
    print(f"📊 Longitud del contenido: {len(result['markdown'])} caracteres")
    print("\n📝 Primeros 1000 caracteres del markdown:")
    print("-" * 50)
    print(result['markdown'][:1000])
    print("-" * 50)
else:
    print(f"❌ Error: {result['error']}")

🔍 Scrapeando: https://exactas.uba.ar/ensenanza/carreras-de-grado/ciencias-de-datos/
✅ Scraping exitoso!
📄 Título: Ciencias de Datos | Facultad de Ciencias Exactas y Naturales de la Universidad de Buenos Aires
📊 Longitud del contenido: 12598 caracteres

📝 Primeros 1000 caracteres del markdown:
--------------------------------------------------
  * [![twitter](https://exactas.uba.ar/wp-content/themes/polytechnic/mythology-core/core-assets/images/social-icons/circles/twitter.png)](https://twitter.com/exactas_uba)
  * [![facebook](https://exactas.uba.ar/wp-content/themes/polytechnic/mythology-core/core-assets/images/social-icons/circles/fb.png)](https://www.facebook.com/UBAExactas)
  * [![youtube](https://exactas.uba.ar/wp-content/themes/polytechnic/mythology-core/core-assets/images/social-icons/circles/youtube.png)](https://www.youtube.com/exactasubaoficial)
  * [![linkedin](https://exactas.uba.ar/wp-content/themes/polytechnic/mythology-core/core-assets/images/social-icons/circles/linkedi

In [1]:
import nest_asyncio
import asyncio
from crawl4ai import AsyncWebCrawler
# Habilitar loops anidados para Jupyter
nest_asyncio.apply()

In [2]:
async def crawl_with_crawl4ai(url):
    """
    Usar Crawl4AI en Jupyter con configuración específica
    """
    try:
        # Configuración optimizada para Jupyter
        config = {
            'headless': True,
            'verbose': False,
            'browser_type': 'chromium',
            'ignore_https_errors': True,
            'java_script_enabled': False,  # Deshabilitar JS para mayor compatibilidad
        }
        
        print(f"🚀 Iniciando crawler para: {url}")
        
        async with AsyncWebCrawler(**config) as crawler:
            result = await crawler.arun(
                url=url,
                bypass_cache=True,
                process_iframes=False,
                wait_for_images=False,
                simulate_user=False,
                magic=True,  # Usar extracción inteligente
            )
            
            print(f"✅ Crawling completado!")
            print(f"📋 Tipo de resultado: {type(result)}")
            
            # Mostrar atributos disponibles
            attrs = [attr for attr in dir(result) if not attr.startswith('_')]
            print(f"📋 Atributos disponibles: {attrs}")
            
            return result
            
    except Exception as e:
        print(f"❌ Error en Crawl4AI: {e}")
        return None



In [4]:
# Ejecutar el crawler
url = "https://exactas.uba.ar/ensenanza/carreras-de-grado/ciencias-de-datos/"
result = asyncio.run(crawl_with_crawl4ai(url))

🚀 Iniciando crawler para: https://exactas.uba.ar/ensenanza/carreras-de-grado/ciencias-de-datos/


Task exception was never retrieved
future: <Task finished name='Task-2' coro=<Connection.run() done, defined at c:\Users\arca\Desarrollo\Exacty\.venv\Lib\site-packages\playwright\_impl\_connection.py:303> exception=NotImplementedError()>
Traceback (most recent call last):
  File "C:\laragon\bin\python\Python312\Lib\asyncio\tasks.py", line 314, in __step_run_and_handle_result
    result = coro.send(None)
             ^^^^^^^^^^^^^^^
  File "c:\Users\arca\Desarrollo\Exacty\.venv\Lib\site-packages\playwright\_impl\_connection.py", line 310, in run
    await self._transport.connect()
  File "c:\Users\arca\Desarrollo\Exacty\.venv\Lib\site-packages\playwright\_impl\_transport.py", line 133, in connect
    raise exc
  File "c:\Users\arca\Desarrollo\Exacty\.venv\Lib\site-packages\playwright\_impl\_transport.py", line 120, in connect
    self._proc = await asyncio.create_subprocess_exec(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\laragon\bin\python\Python312\Lib\asyncio\s

❌ Error en Crawl4AI: 


In [None]:


if result:
    # Verificar qué atributos tiene el resultado
    if hasattr(result, 'markdown') and result.markdown:
        print("\n📄 Contenido Markdown:")
        print("-" * 50)
        print(result.markdown[:1000])
    elif hasattr(result, 'extracted_content') and result.extracted_content:
        print("\n📄 Contenido Extraído:")
        print("-" * 50)
        print(result.extracted_content[:1000])
    else:
        print("\n🔍 Explorando resultado:")
        for attr in ['html', 'text', 'content']:
            if hasattr(result, attr):
                content = getattr(result, attr)
                if content:
                    print(f"✅ Encontrado {attr}: {str(content)[:200]}...")
                    break