<a href="https://colab.research.google.com/github/drfperez/utilities/blob/main/Webtopdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# -*- coding: utf-8 -*-
"""
SISTEMA AVANÇAT DE CONVERSIÓ WEB A PDF
Per Google Colab - Versió 2.0
"""

# ============ INSTAL·LACIÓ DE DEPENDÈNCIES ============
print("üîÑ Instal¬∑lant depend√®ncies necess√†ries...")

!apt-get update -qq > /dev/null 2>&1
!apt-get install -y -qq wkhtmltopdf > /dev/null 2>&1
!pip install -q pdfkit requests beautifulsoup4 PyPDF2 selenium webdriver-manager pillow > /dev/null 2>&1

import json
import os
import sys
import time
import warnings
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse, urljoin
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup
warnings.filterwarnings('ignore')

print("‚úÖ Depend√®ncies instal¬∑lades correctament")

# ============ CONFIGURACIÓ DEL SISTEMA ============
class PDFGeneratorConfig:
    """Static settings for the PDF generation system.

    Holds the working-directory layout and the default conversion options.
    Creating an instance makes sure all working directories exist on disk.
    """

    # Working directories, all rooted under BASE_DIR
    BASE_DIR = Path("/content/pdf_generator")
    TEMP_DIR = BASE_DIR / "temp"
    OUTPUT_DIR = BASE_DIR / "output"
    LOGS_DIR = BASE_DIR / "logs"

    # Default conversion options; key order is kept as declared
    DEFAULT_OPTIONS = {
        'page-size': 'A4',
        # Same 0.5in margin on every side
        **{f'margin-{side}': '0.5in' for side in ('top', 'right', 'bottom', 'left')},
        'encoding': 'UTF-8',
        'enable-local-file-access': None,
        'quiet': '',
        'no-outline': None,
        'disable-smart-shrinking': None,
        'zoom': 1.0,
        'custom-header': [
            ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        ],
        'javascript-delay': 1000,
    }

    # Supported output formats
    SUPPORTED_FORMATS = ['pdf', 'html', 'png']

    def __init__(self):
        self.create_directories()

    def create_directories(self):
        """Create every working directory, including any missing parents."""
        required = (self.BASE_DIR, self.TEMP_DIR, self.OUTPUT_DIR, self.LOGS_DIR)
        for folder in required:
            folder.mkdir(parents=True, exist_ok=True)

# ============ EINES AUXILIARS ============
class WebTools:
    """Stateless helpers for validating and inspecting web URLs."""

    @staticmethod
    def validate_url(url):
        """Validate and normalise a URL.

        Surrounding whitespace is removed and ``https://`` is prepended when
        the value does not already start with ``http://`` or ``https://``.

        Args:
            url: Candidate URL. Anything that is not a ``str`` is rejected
                instead of raising (e.g. numbers coming from a JSON list).

        Returns:
            The normalised URL, or ``None`` when the input is not a string,
            cannot be parsed, or lacks a scheme or network location.
        """
        if not isinstance(url, str):
            return None

        url = url.strip()
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        try:
            parsed = urlparse(url)
        except ValueError:
            # urlparse rejects malformed authorities such as "http://[bad"
            return None

        return url if parsed.scheme and parsed.netloc else None

    @staticmethod
    def extract_domain(url):
        """Return the network location (host[:port]) part of ``url``."""
        return urlparse(url).netloc

    @staticmethod
    def get_page_title(url):
        """Fetch a page and return its ``<title>`` text.

        Args:
            url: Address of the page to download (10 second timeout).

        Returns:
            The title, stripped and truncated to 100 characters. A
            placeholder is returned when the page has no usable title, and a
            different placeholder when the request itself fails.
        """
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            return "Document sense t√≠tol"

        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.title
        # .string is None for an empty <title> or one with nested markup
        title = title_tag.string if title_tag is not None else None
        if not title:
            title = "Sense t√≠tol"
        return title.strip()[:100]

    @staticmethod
    def estimate_reading_time(text_length):
        """Estimate reading time in whole minutes (never less than 1).

        Args:
            text_length: Number of characters in the text.

        Returns:
            Minutes, assuming 5 characters per word and 200 words per minute.
        """
        words = text_length / 5  # rough characters-to-words conversion
        minutes = words / 200  # average reading speed
        return max(1, int(minutes))

# ============ CRAWLER AVANÇAT ============
class AdvancedWebCrawler:
    """Same-domain crawler that discovers the pages of a web site.

    Attributes:
        base_url: URL where the crawl starts.
        max_pages: Upper bound on the number of pages collected.
        respect_robots: Whether robots.txt was requested at construction.
        visited: URLs already attempted (fetched, disallowed or failed).
        to_visit: URLs queued for processing.
        discovered_pages: Successfully fetched URLs, in crawl order.
        domain: Network location of ``base_url``; other hosts are ignored.
        robots_txt: Raw robots.txt text, or ``None`` when not available.
    """

    # Extensions of resources that are never HTML pages
    EXCLUDED_EXTENSIONS = (
        '.pdf', '.jpg', '.jpeg', '.png', '.gif', '.svg',
        '.css', '.js', '.zip', '.tar', '.gz', '.exe',
        '.mp3', '.mp4', '.avi', '.mov',
    )

    def __init__(self, base_url, max_pages=50, respect_robots=True):
        """Set up crawl state and, if requested, download robots.txt.

        Args:
            base_url: Starting URL; its host defines the crawl boundary.
            max_pages: Maximum number of pages to collect.
            respect_robots: When true, robots.txt is fetched immediately.
        """
        self.base_url = base_url
        self.max_pages = max_pages
        self.respect_robots = respect_robots
        self.visited = set()
        self.to_visit = {base_url}
        self.discovered_pages = []
        self.domain = urlparse(base_url).netloc
        self.robots_txt = None
        # Parsed form of robots_txt, built lazily by is_allowed()
        self._robots_parser = None
        self._robots_source = None

        if respect_robots:
            self.fetch_robots_txt()

    def fetch_robots_txt(self):
        """Download robots.txt for the base URL's host (best effort).

        On a 200 response the body is stored in ``robots_txt``. Network
        errors are ignored: a missing robots.txt simply means no rules.
        """
        robots_url = urljoin(self.base_url, '/robots.txt')
        try:
            response = requests.get(robots_url, timeout=5)
        except requests.RequestException:
            return
        if response.status_code == 200:
            self.robots_txt = response.text
            print(f"üìÑ robots.txt trobat per {self.domain}")

    def is_allowed(self, url):
        """Check whether robots.txt permits fetching ``url``.

        The rules are evaluated with the standard-library parser for the
        generic ``*`` user agent, so Disallow/Allow paths are matched against
        the URL path rather than as substrings of the whole URL.

        Returns:
            True when there is no robots.txt or the URL is permitted.
        """
        if not self.robots_txt:
            return True

        # (Re)build the parser only when the robots.txt text has changed
        if self._robots_parser is None or self._robots_source is not self.robots_txt:
            parser = RobotFileParser()
            parser.parse(self.robots_txt.splitlines())
            self._robots_parser = parser
            self._robots_source = self.robots_txt

        return self._robots_parser.can_fetch('*', url)

    def extract_links(self, html_content, current_url):
        """Return the crawlable links found in a page.

        Args:
            html_content: HTML source of the page.
            current_url: URL of the page, used to resolve relative links.

        Returns:
            List of absolute URLs (fragments removed) accepted by
            ``is_valid_link``. Duplicates are not removed here.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        links = []

        for anchor in soup.find_all('a', href=True):
            # Drop the fragment so "page#section" resolves to "page" instead
            # of being rejected outright by the '#' filter
            full_url = urljoin(current_url, anchor['href']).split('#', 1)[0]
            if full_url and self.is_valid_link(full_url):
                links.append(full_url)

        return links

    def is_valid_link(self, url):
        """Decide whether ``url`` should be crawled.

        A link is accepted when it is http(s), stays on the crawl domain,
        does not point at a known non-HTML resource and is not a special
        (mailto/tel/javascript/fragment) link.
        """
        parsed = urlparse(url)

        if parsed.scheme not in ('http', 'https'):
            return False

        # Same domain only
        if parsed.netloc != self.domain:
            return False

        # Test the path, not the whole URL, so "file.pdf?download=1" is
        # still recognised and "view?doc=a.pdf" is not rejected by mistake
        if parsed.path.lower().endswith(self.EXCLUDED_EXTENSIONS):
            return False

        lowered = url.lower()
        if any(marker in lowered for marker in ('mailto:', 'tel:', 'javascript:', '#')):
            return False

        return True

    def crawl(self):
        """Crawl the site breadth-first and return the pages found.

        Returns:
            ``discovered_pages``: URLs that answered with HTTP 200, in the
            order they were fetched, at most ``max_pages`` of them.
        """
        print(f"üîç Comen√ßant crawling de {self.base_url}")
        print(f"   L√≠mit m√†xim de p√†gines: {self.max_pages}")

        # FIFO processing keeps the crawl order deterministic (links are
        # followed in the order they are found); the to_visit set is kept in
        # sync for fast membership tests.
        queue = list(self.to_visit)
        position = 0

        while position < len(queue) and len(self.discovered_pages) < self.max_pages:
            current_url = queue[position]
            position += 1
            self.to_visit.discard(current_url)

            if current_url in self.visited:
                continue

            # Record the attempt up front so failing or disallowed URLs are
            # not queued and fetched again every time another page links them
            self.visited.add(current_url)

            if not self.is_allowed(current_url):
                continue

            try:
                print(f"   Analitzant: {current_url}")

                response = requests.get(current_url, timeout=10)
                if response.status_code != 200:
                    continue

                self.discovered_pages.append(current_url)

                for link in self.extract_links(response.text, current_url):
                    if link not in self.visited and link not in self.to_visit:
                        self.to_visit.add(link)
                        queue.append(link)

                time.sleep(0.5)  # be polite to the server

            except Exception as e:
                # Best effort: one bad page must not abort the whole crawl
                print(f"   Error amb {current_url}: {e}")
                continue

        print(f"‚úÖ Crawling completat. S'han trobat {len(self.discovered_pages)} p√†gines.")
        return self.discovered_pages

# ============ GENERADOR DE PDFS ============
class PDFGenerator:
    """Render web pages to PDF files and track statistics for each run.

    pdfkit (wkhtmltopdf) is the primary engine. When it cannot be set up,
    conversions go through headless Chrome driven by Selenium instead.
    """

    def __init__(self, config):
        """Store the configuration and probe for a usable pdfkit setup.

        Args:
            config: Object exposing ``DEFAULT_OPTIONS``, ``TEMP_DIR``,
                ``OUTPUT_DIR`` and ``LOGS_DIR``.
        """
        self.config = config
        self.stats = self._fresh_stats()

        # Configure pdfkit; a missing package or wkhtmltopdf binary switches
        # the generator to the Selenium-based fallback
        try:
            import pdfkit
            self.wkhtml_path = '/usr/bin/wkhtmltopdf'
            self.pdfkit_config = pdfkit.configuration(wkhtmltopdf=self.wkhtml_path)
            self.pdfkit_available = True
        except (ImportError, OSError):
            self.pdfkit_available = False
            print("‚ö†Ô∏è  pdfkit no disponible, utilitzant mode alternatiu")

    @staticmethod
    def _fresh_stats():
        """Return a statistics dict with every counter at its initial value."""
        return {
            'total_pages': 0,
            'successful': 0,
            'failed': 0,
            'total_size': 0,
            'start_time': None,
            'end_time': None,
        }

    def generate_single_pdf(self, url, output_path, options=None):
        """Convert one URL to a PDF file.

        Args:
            url: Page to convert.
            output_path: Destination file path (string).
            options: Optional pdfkit options overriding the defaults.

        Returns:
            ``(True, size_in_bytes)`` on success, ``(False, 0)`` otherwise.
            A result of 1024 bytes or less is treated as a failure.
        """
        if not self.pdfkit_available:
            return self.generate_alternative(url, output_path)

        import pdfkit

        try:
            # Merge caller options over a copy of the defaults
            pdf_options = self.config.DEFAULT_OPTIONS.copy()
            if options:
                pdf_options.update(options)

            pdfkit.from_url(url, output_path,
                            configuration=self.pdfkit_config,
                            options=pdf_options)

            if os.path.exists(output_path) and os.path.getsize(output_path) > 1024:
                size = os.path.getsize(output_path)
                self.stats['successful'] += 1
                self.stats['total_size'] += size
                return True, size
            return False, 0

        except Exception as e:
            # Conversion boundary: report and let the caller carry on
            print(f"   Error generant PDF: {e}")
            return False, 0

    def generate_alternative(self, url, output_path):
        """Convert one URL to PDF with headless Chrome (fallback engine).

        Args:
            url: Page to convert.
            output_path: Destination file path.

        Returns:
            ``(True, size_in_bytes)`` on success, ``(False, 0)`` otherwise.
        """
        driver = None
        try:
            import base64
            from selenium import webdriver
            from selenium.webdriver.chrome.options import Options
            from selenium.webdriver.chrome.service import Service
            from webdriver_manager.chrome import ChromeDriverManager

            # Headless Chrome configuration
            chrome_options = Options()
            for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage',
                         '--disable-gpu', '--window-size=1920,1080'):
                chrome_options.add_argument(flag)

            # Selenium 4 takes the driver location through a Service object;
            # passing the path positionally collides with ``options``
            driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=chrome_options
            )

            driver.get(url)
            time.sleep(3)  # give the page time to load

            # Ask Chrome's DevTools protocol to print the page as a PDF
            pdf_data = driver.execute_cdp_cmd('Page.printToPDF', {
                'landscape': False,
                'displayHeaderFooter': False,
                'printBackground': True,
                'preferCSSPageSize': True,
            })

            with open(output_path, 'wb') as f:
                f.write(base64.b64decode(pdf_data['data']))

            size = os.path.getsize(output_path)
            self.stats['successful'] += 1
            self.stats['total_size'] += size
            return True, size

        except Exception as e:
            print(f"   Error en m√®tode alternatiu: {e}")
            return False, 0

        finally:
            # Always release the browser, including on failure
            if driver is not None:
                try:
                    driver.quit()
                except Exception:
                    pass  # best-effort cleanup; the result is already decided

    def generate_batch(self, urls, output_filename, merge=True, progress_callback=None, delay=1):
        """Convert several URLs and collect the result in the output directory.

        Args:
            urls: URLs to convert, in output order.
            output_filename: Name of the final file inside ``OUTPUT_DIR``.
            merge: When true and more than one PDF was produced, combine
                them into a single file.
            progress_callback: Optional ``callback(index, total)`` invoked
                before each URL is processed.
            delay: Seconds to wait between two conversions.

        Returns:
            Path of the final PDF inside ``OUTPUT_DIR``, or ``None`` when
            nothing was generated or merging failed. When PDFs are not
            merged, the first one is stored as ``output_filename`` (and
            returned) and the others are stored next to it under their own
            names.
        """
        # Start from clean counters so the report describes this batch only
        self.stats.update(self._fresh_stats())
        self.stats['start_time'] = datetime.now()
        self.stats['total_pages'] = len(urls)

        print(f"\nüîÑ Generant {len(urls)} PDFs...")

        # Per-batch temporary directory
        temp_dir = self.config.TEMP_DIR / f"batch_{int(time.time())}"
        temp_dir.mkdir(parents=True, exist_ok=True)

        generated_files = []
        total = len(urls)

        for i, url in enumerate(urls, 1):
            if progress_callback:
                progress_callback(i, total)

            print(f"   [{i}/{len(urls)}] Processant: {url[:80]}...")

            # The numeric prefix keeps names unique and ordered
            safe_name = self.get_safe_filename(url)
            temp_pdf = temp_dir / f"{i:03d}_{safe_name}.pdf"

            success, size = self.generate_single_pdf(url, str(temp_pdf))

            if success:
                generated_files.append(temp_pdf)
                print(f"     ‚úì PDF generat ({size:,} bytes)")
            else:
                self.stats['failed'] += 1
                print(f"     ‚úó Error generant PDF")

            # Pause between requests; pointless after the last one
            if delay and i < total:
                time.sleep(delay)

        if merge and len(generated_files) > 1:
            final_path = self.merge_pdfs(generated_files, output_filename)
        elif generated_files:
            final_path = self._export_unmerged(generated_files, output_filename)
        else:
            final_path = None

        self.stats['end_time'] = datetime.now()
        self.generate_report()

        self.cleanup_temp(temp_dir)

        return final_path

    def _export_unmerged(self, pdf_files, output_filename):
        """Move unmerged PDFs out of the temp directory into ``OUTPUT_DIR``.

        The first file becomes ``output_filename``; the rest keep their own
        names. Returns the new path of the first file, which is the location
        callers can actually open once the temp directory is deleted.
        """
        import shutil

        final_path = self.config.OUTPUT_DIR / output_filename
        shutil.move(str(pdf_files[0]), str(final_path))
        for extra in pdf_files[1:]:
            shutil.move(str(extra), str(self.config.OUTPUT_DIR / extra.name))
        return final_path

    def merge_pdfs(self, pdf_files, output_filename):
        """Combine several PDFs into one file inside ``OUTPUT_DIR``.

        Args:
            pdf_files: Paths of the PDFs, in the order they should appear.
            output_filename: Name of the merged file.

        Returns:
            Path of the merged file, or ``None`` when merging failed.
        """
        try:
            from PyPDF2 import PdfMerger

            merger = PdfMerger()
            output_path = self.config.OUTPUT_DIR / output_filename

            try:
                for pdf_file in pdf_files:
                    merger.append(str(pdf_file))
                merger.write(str(output_path))
            finally:
                # Release the source file handles even if a step failed
                merger.close()

            print(f"\n‚úÖ PDFs combinats correctament: {output_filename}")
            return output_path

        except Exception as e:
            print(f"‚ùå Error combinant PDFs: {e}")
            return None

    def get_safe_filename(self, url):
        """Build a filesystem-safe base name (no extension) from a URL.

        The name is ``<domain>_<path>`` with dots in the domain and slashes
        in the path turned into underscores, any other unsafe character
        replaced by ``_``, truncated to 100 characters.

        Returns:
            The sanitised name, or ``"document"`` when nothing is left.
        """
        import re

        parsed = urlparse(url)
        path = parsed.path.strip('/').replace('/', '_')
        domain = parsed.netloc.replace('.', '_')

        filename = f"{domain}_{path}" if path else domain
        filename = re.sub(r'[^\w\-_\. ]', '_', filename)
        filename = filename[:100]  # keep names reasonably short

        return filename if filename else "document"

    def cleanup_temp(self, temp_dir):
        """Delete a temporary directory tree, ignoring removal errors."""
        import shutil
        shutil.rmtree(temp_dir, ignore_errors=True)

    def generate_report(self):
        """Print the run report and save it as a text file in ``LOGS_DIR``.

        Expects ``start_time`` and ``end_time`` to be set in ``stats``.
        """
        duration = self.stats['end_time'] - self.stats['start_time']

        report = f"""
        üìä INFORME DE GENERACI√ì DE PDF
        =================================
        ‚Ä¢ Data i hora: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
        ‚Ä¢ P√†gines processades: {self.stats['total_pages']}
        ‚Ä¢ √àxits: {self.stats['successful']}
        ‚Ä¢ Errors: {self.stats['failed']}
        ‚Ä¢ Temps total: {duration.total_seconds():.1f} segons
        ‚Ä¢ Mida total: {self.stats['total_size']:,} bytes
        =================================
        """

        print(report)

        # Persist the report
        report_file = self.config.LOGS_DIR / f"report_{int(time.time())}.txt"
        with open(report_file, 'w', encoding='utf-8') as f:
            f.write(report)

# ============ INTERFÍCIE D'USUARI ============
class PDFGeneratorUI:
    """Interactive, menu-driven console front end for the PDF generator."""

    def __init__(self):
        """Create the configuration, the URL helpers and the PDF generator."""
        self.config = PDFGeneratorConfig()
        self.tools = WebTools()
        self.generator = PDFGenerator(self.config)
        self.current_project = None

    @staticmethod
    def _ensure_pdf_extension(filename):
        """Return ``filename`` with a ``.pdf`` suffix appended when missing."""
        return filename if filename.endswith('.pdf') else filename + '.pdf'

    @staticmethod
    def _parse_selection(selected, total):
        """Turn a "1, 3, 7" style answer into valid zero-based indices.

        Tokens that are not plain numbers, and numbers outside ``1..total``
        (including 0, which would otherwise wrap around to the last item),
        are ignored.
        """
        indices = []
        for token in selected.split(','):
            token = token.strip()
            if not token.isdigit():
                continue
            index = int(token) - 1
            if 0 <= index < total:
                indices.append(index)
        return indices

    def display_banner(self):
        """Print the application banner."""
        banner = """
        ‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
        ‚ïë                                                       ‚ïë
        ‚ïë    üåê SISTEMA DE CONVERSI√ì WEB A PDF                 ‚ïë
        ‚ïë          per Google Colab                            ‚ïë
        ‚ïë                                                       ‚ïë
        ‚ïë    Versi√≥ 2.0 | Amb mode crawling avan√ßat           ‚ïë
        ‚ïë                                                       ‚ïë
        ‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
        """
        print(banner)

    def display_menu(self):
        """Show the main menu and return the user's stripped answer."""
        menu = """
        üìã MENU PRINCIPAL:

        1. üéØ Convertir una sola p√†gina web
        2. üìù Convertir una llista d'URLs
        3. üï∑Ô∏è  Convertir tot un lloc web (crawling)
        4. üìä Veure informes anteriors
        5. ‚öôÔ∏è  Configurar opcions avan√ßades
        6. üìö Ajuda i exemples
        7. üö™ Sortir

        Escull una opci√≥ (1-7): """

        return input(menu).strip()

    def option_single_page(self):
        """Option 1: convert a single web page to PDF."""
        print("\nüéØ CONVERTIR UNA SOLA P√ÄGINA WEB")
        print("-" * 40)

        url = input("Introdueix la URL de la p√†gina: ").strip()
        validated_url = self.tools.validate_url(url)

        if not validated_url:
            print("‚ùå URL inv√†lida. Torna-ho a provar.")
            return

        print(f"\nüîç Analitzant: {validated_url}")

        try:
            # Gather basic information about the page
            title = self.tools.get_page_title(validated_url)
            domain = self.tools.extract_domain(validated_url)

            print(f"   T√≠tol: {title}")
            print(f"   Domini: {domain}")

            # Ask for the output file name
            default_name = self.generator.get_safe_filename(validated_url)
            filename = input(f"\nüìù Nom del fitxer PDF [{default_name}.pdf]: ").strip()
            filename = self._ensure_pdf_extension(filename if filename else f"{default_name}.pdf")

            # Ask for page options
            print("\n‚öôÔ∏è  Configuraci√≥ (prem Enter per valors per defecte):")

            page_size = input(f"   Mida de p√†gina [A4]: ").strip() or 'A4'
            orientation = input(f"   Orientaci√≥ [portrait/landscape]: ").strip() or 'portrait'

            options = {
                'page-size': page_size,
                'orientation': orientation,
            }

            print(f"\nüîÑ Convertint {validated_url} a PDF...")

            output_path = self.config.OUTPUT_DIR / filename
            success, size = self.generator.generate_single_pdf(
                validated_url,
                str(output_path),
                options
            )

            if success:
                print(f"\n‚úÖ PDF generat correctament!")
                print(f"   Fitxer: {filename}")
                print(f"   Mida: {size:,} bytes")
                print(f"   Ubicaci√≥: {output_path}")

                self.offer_download(output_path)
            else:
                print("‚ùå Error generant el PDF. Torna-ho a provar.")

        except Exception as e:
            print(f"‚ùå Error: {e}")

    def option_url_list(self):
        """Option 2: convert a list of URLs typed in or uploaded as a file."""
        print("\nüìù CONVERTIR UNA LLISTA D'URLs")
        print("-" * 40)

        print("\nPots introduir URLs de les seg√ºents maneres:")
        print("1. Directament aqu√≠ (una per l√≠nia)")
        print("2. Desde un fitxer de text")
        print("3. Desde un fitxer JSON")

        choice = input("\nEscull una opci√≥ (1-3): ").strip()

        urls = []

        if choice == '1':
            print("\nüì• Introdueix les URLs (una per l√≠nia).")
            print("   Escriu 'END' en una l√≠nia nova per acabar:")

            while True:
                url = input().strip()
                if url.upper() == 'END':
                    break
                if url:
                    validated = self.tools.validate_url(url)
                    if validated:
                        urls.append(validated)
                        print(f"   ‚úì URL afegida: {validated[:80]}...")
                    else:
                        print(f"   ‚úó URL inv√†lida: {url}")

        elif choice == '2':
            print("\nüì§ Pujant fitxer de text...")
            from google.colab import files
            uploaded = files.upload()

            # Only the first .txt file is used
            for filename, content in uploaded.items():
                if filename.endswith('.txt'):
                    lines = content.decode('utf-8').split('\n')
                    urls = [self.tools.validate_url(line.strip())
                            for line in lines if line.strip()]
                    urls = [u for u in urls if u]  # drop invalid entries
                    break

        elif choice == '3':
            print("\nüì§ Pujant fitxer JSON...")
            from google.colab import files
            uploaded = files.upload()

            for filename, content in uploaded.items():
                if filename.endswith('.json'):
                    try:
                        data = json.loads(content.decode('utf-8'))
                    except ValueError:
                        # Covers both invalid JSON and undecodable bytes
                        print("‚ùå Error llegint el fitxer JSON")
                        continue
                    if isinstance(data, list):
                        # Non-string entries cannot be URLs; skip them
                        urls = [self.tools.validate_url(url)
                                for url in data if isinstance(url, str)]
                        urls = [u for u in urls if u]

        if not urls:
            print("‚ùå No s'han trobat URLs v√†lides.")
            return

        print(f"\n‚úÖ S'han trobat {len(urls)} URLs v√†lides.")

        # Output settings
        filename = input("\nüìù Nom del fitxer PDF final [llista_webs.pdf]: ").strip()
        filename = self._ensure_pdf_extension(filename if filename else "llista_webs.pdf")

        merge = input("\nüìë Vols combinar tot en un sol PDF? (s/n) [s]: ").strip().lower()
        merge = merge != 'n'

        print(f"\nüîÑ Convertint {len(urls)} p√†gines a PDF...")

        output_path = self.generator.generate_batch(
            urls,
            filename,
            merge=merge,
            progress_callback=lambda i, total: print(f"   Processant {i}/{total}...")
        )

        if output_path:
            self.offer_download(output_path)

    def option_crawl_site(self):
        """Option 3: crawl a whole site, then convert or save what was found."""
        print("\nüï∑Ô∏è  CRAWLING DE LLOC WEB COMPLET")
        print("-" * 40)

        url = input("Introdueix la URL inicial: ").strip()
        validated_url = self.tools.validate_url(url)

        if not validated_url:
            print("‚ùå URL inv√†lida.")
            return

        print(f"\nüîç Preparant crawling de: {validated_url}")

        # Crawl settings; anything that is not a positive number means 50
        max_pages = input(f"   M√†xim de p√†gines a analitzar [50]: ").strip()
        max_pages = int(max_pages) if max_pages.isdigit() and int(max_pages) > 0 else 50

        respect_robots = input(f"   Respectar robots.txt? (s/n) [s]: ").strip().lower()
        respect_robots = respect_robots != 'n'

        print(f"\nüîÑ Iniciant crawling...")

        crawler = AdvancedWebCrawler(
            validated_url,
            max_pages=max_pages,
            respect_robots=respect_robots
        )

        discovered_urls = crawler.crawl()

        if not discovered_urls:
            print("‚ùå No s'han trobat p√†gines.")
            return

        # Summary
        print(f"\nüìä RESULTATS DEL CRAWLING:")
        print(f"   ‚Ä¢ P√†gines trobades: {len(discovered_urls)}")
        print(f"   ‚Ä¢ P√†gines √∫niques: {len(set(discovered_urls))}")

        # Ask what to do with the result
        print("\nüìù Qu√® vols fer amb les p√†gines trobades?")
        print("1. Convertir totes a PDF")
        print("2. Seleccionar manualment")
        print("3. Guardar llista per despr√©s")

        action = input("Escull una opci√≥ (1-3): ").strip()

        if action == '1':
            filename = input("\nNom del fitxer PDF [lloc_complet.pdf]: ").strip()
            filename = self._ensure_pdf_extension(filename if filename else "lloc_complet.pdf")

            output_path = self.generator.generate_batch(
                discovered_urls,
                filename,
                merge=True
            )

            if output_path:
                self.offer_download(output_path)

        elif action == '2':
            print("\nüìã P√†gines trobades:")
            for i, url in enumerate(discovered_urls[:20], 1):
                print(f"{i:3d}. {url[:80]}...")

            if len(discovered_urls) > 20:
                print(f"... i {len(discovered_urls) - 20} m√©s")

            selected = input("\nIntrodueix els n√∫meros separats per comes: ").strip()
            indices = self._parse_selection(selected, len(discovered_urls))

            selected_urls = [discovered_urls[i] for i in indices]

            if selected_urls:
                filename = input("\nNom del fitxer PDF [seleccio.pdf]: ").strip()
                filename = self._ensure_pdf_extension(filename if filename else "seleccio.pdf")

                output_path = self.generator.generate_batch(selected_urls, filename)

                if output_path:
                    self.offer_download(output_path)

        elif action == '3':
            # One URL per line, the same format option 2 accepts as a .txt upload
            list_path = self.config.OUTPUT_DIR / f"urls_{int(time.time())}.txt"
            with open(list_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(discovered_urls) + '\n')
            print(f"\nLlista guardada: {list_path}")
            self.offer_download(list_path)

    def option_view_reports(self):
        """Option 4: list the ten most recent reports and optionally show one."""
        print("\nüìä INFORMES ANTERIORS")
        print("-" * 40)

        reports = list(self.config.LOGS_DIR.glob("*.txt"))

        if not reports:
            print("No hi ha informes disponibles.")
            return

        # Newest first
        reports.sort(key=lambda x: x.stat().st_mtime, reverse=True)

        print(f"\nüìÅ S'han trobat {len(reports)} informes:")

        for i, report in enumerate(reports[:10], 1):
            mtime = datetime.fromtimestamp(report.stat().st_mtime)
            size = report.stat().st_size

            print(f"{i:2d}. {report.name}")
            print(f"    Data: {mtime.strftime('%Y-%m-%d %H:%M')}")
            print(f"    Mida: {size:,} bytes")
            print()

        view = input("Veure algun informe? (n√∫mero o 'n' per cap): ").strip()

        if view.isdigit():
            idx = int(view) - 1
            if 0 <= idx < len(reports):
                with open(reports[idx], 'r', encoding='utf-8') as f:
                    print(f"\n{'='*50}")
                    print(f.read())
                    print(f"{'='*50}")

    def offer_download(self, file_path):
        """Show a file's name and size and offer to download it via Colab.

        Args:
            file_path: Path of the file to offer.
        """
        print(f"\nüì• DESCARREGA DEL FITXER")
        print("-" * 40)

        if not os.path.exists(file_path):
            print("‚ùå El fitxer no existeix.")
            return

        size = os.path.getsize(file_path)
        print(f"Fitxer: {os.path.basename(file_path)}")
        print(f"Mida: {size:,} bytes ({size/1024/1024:.2f} MB)")

        download = input("\nVols descarregar el fitxer ara? (s/n) [s]: ").strip().lower()

        if download != 'n':
            try:
                from google.colab import files
                files.download(str(file_path))
                print("‚úÖ Descarrega iniciada!")
            except Exception as e:
                # Outside Colab the download helper is unavailable; point
                # the user at the file instead
                print(f"‚ùå Error en la desc√†rrega: {e}")
                print(f"üìÅ El fitxer est√† a: {file_path}")

    def run(self):
        """Run the main menu loop until the user exits or input ends."""
        self.display_banner()

        while True:
            try:
                choice = self.display_menu()

                if choice == '1':
                    self.option_single_page()
                elif choice == '2':
                    self.option_url_list()
                elif choice == '3':
                    self.option_crawl_site()
                elif choice == '4':
                    self.option_view_reports()
                elif choice == '5':
                    print("\n‚öôÔ∏è  Configuraci√≥ avan√ßada... (en desenvolupament)")
                elif choice == '6':
                    self.show_help()
                elif choice == '7' or choice.lower() == 'exit':
                    print("\nüëã Fins aviat!")
                    break
                else:
                    print("\n‚ùå Opci√≥ inv√†lida. Torna-ho a provar.")

                input("\nPrem Enter per continuar...")

            except (KeyboardInterrupt, EOFError):
                # EOFError means stdin is closed: every further input() would
                # fail too, so leaving is the only way to avoid an endless loop
                print("\n\nüëã Execuci√≥ interrompuda. Fins aviat!")
                break
            except Exception as e:
                print(f"\n‚ùå Error inesperat: {e}")

    def show_help(self):
        """Print the help text with usage examples."""
        help_text = """
        üìö AJUDA I EXEMPLES

        1. FORMATS D'ENTRADA ACCEPTATS:
           ‚Ä¢ URLs individuals: https://exemple.com
           ‚Ä¢ Llistes en text pla: una URL per l√≠nia
           ‚Ä¢ JSON: array d'URLs

        2. OPCIONS DE CONFIGURACI√ì:
           ‚Ä¢ Mida de p√†gina: A4, Letter, Legal, etc.
           ‚Ä¢ Orientaci√≥: portrait o landscape
           ‚Ä¢ Marges: personalitzables

        3. MODE CRAWLING:
           ‚Ä¢ Analitza tot un lloc web autom√†ticament
           ‚Ä¢ Respecta robots.txt (opcional)
           ‚Ä¢ L√≠mit configurable de p√†gines

        4. EXEMPLES DE US:
           ‚Ä¢ Convertir article: https://exemple.com/article
           ‚Ä¢ Convertir blog: mode crawling
           ‚Ä¢ Convertir llista: URLs de cursos/articles

        5. CONSELLS:
           ‚Ä¢ Per a p√†gines amb JavaScript, el sistema
             utilitza un motor alternatiu
           ‚Ä¢ Pots ajustar el temps d'espera si les
             p√†gines s√≥n lentes a carregar
        """

        print(help_text)

# ============ INICIALITZACIÓ ============
def main():
    """Program entry point: announce start-up, then build and run the UI."""
    print("üîß Inicialitzant sistema de conversi√≥ web a PDF...")

    # The interface object is only needed for this one call
    PDFGeneratorUI().run()

# Executar el sistema
if __name__ == "__main__":
    main()

üîÑ Instal¬∑lant depend√®ncies necess√†ries...
‚úÖ Depend√®ncies instal¬∑lades correctament
üîß Inicialitzant sistema de conversi√≥ web a PDF...

        ‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
        ‚ïë                                                       ‚ïë
        ‚ïë    üåê SISTEMA DE CONVERSI√ì WEB A PDF                 ‚ïë
        ‚ïë          per Google Colab                            ‚ïë
        ‚ïë                                                       ‚ïë
        ‚ïë    Versi√≥ 2.0 | Amb mode crawling avan√ßat           ‚ïë
        ‚ïë                                                       ‚ïë
        ‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
        

üï∑Ô∏è  CRAWLING DE LLOC WEB COMPLET
-----------------------

In [None]:
# Ask Colab to send 'openair_book_complete.pdf' to the browser as a download.
# NOTE(review): presumably the file was produced by an earlier cell and sits in
# the current working directory — confirm before running this cell on its own.
from google.colab import files
files.download('openair_book_complete.pdf')

In [None]:

# Instal·lem les dependències necessàries
!apt-get update -qq
!apt-get install -y -qq wkhtmltopdf
!pip install -q pdfkit requests beautifulsoup4 PyPDF2

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pdfkit
import time
import os
from PyPDF2 import PdfMerger

def get_all_chapter_urls(start_url, book_prefix='openair-project.github.io/book/', timeout=10):
    """Collect the URLs of the book pages linked from ``start_url``.

    Args:
        start_url: Page whose links are scanned; relative links are resolved
            against it.
        book_prefix: Substring a URL must contain to count as part of the
            book. Defaults to the openair book location.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        List of unique absolute URLs without fragments, in the order they
        first appear on the page. Stylesheets, scripts and images are left
        out.

    Raises:
        requests.RequestException: When the page cannot be downloaded.
    """
    # Without a timeout a stalled server would hang the notebook cell forever
    response = requests.get(start_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Non-HTML resources to leave out
    excluded_extensions = ('.css', '.js', '.png', '.jpg', '.jpeg', '.gif', '.svg')

    chapters = []
    seen = set()  # O(1) duplicate check; the list keeps the page order

    for link in soup.find_all('a', href=True):
        href = link['href']

        # Skip empty links and same-page anchors
        if not href or href.startswith('#'):
            continue

        # Resolve to an absolute URL and drop the fragment *before* filtering,
        # so "figure.png#x" is still recognised as an image
        full_url = urljoin(start_url, href).split('#', 1)[0]

        # Keep book pages only (this also discards external links)
        if book_prefix not in full_url:
            continue

        if full_url.lower().endswith(excluded_extensions):
            continue

        if full_url not in seen:
            seen.add(full_url)
            chapters.append(full_url)

    return chapters

def convert_urls_to_pdf(urls, output_file):
    """Render each URL to a PDF and merge the results into ``output_file``.

    Every URL is converted with pdfkit (wkhtmltopdf) into a temporary file
    under ``temp_pdfs/``. Conversions that raise or produce an empty file are
    reported and skipped. The successful ones are appended, in input order,
    to a single PDF written to ``output_file``. Temporary files are removed
    on every exit path, including failures.

    Args:
        urls: Sequence of page URLs to convert, in the desired output order.
        output_file: Path of the merged PDF to write.

    Returns:
        None. Progress and results are reported with ``print``.
    """
    # wkhtmltopdf options shared by every page.
    options = {
        'page-size': 'A4',
        'margin-top': '0.5in',
        'margin-right': '0.5in',
        'margin-bottom': '0.5in',
        'margin-left': '0.5in',
        'encoding': 'UTF-8',
        'no-outline': None,
        'enable-local-file-access': None,  # Important on Colab
        'quiet': ''
    }

    # On Colab, apt installs wkhtmltopdf into /usr/bin/.
    config = pdfkit.configuration(wkhtmltopdf='/usr/bin/wkhtmltopdf')

    temp_dir = 'temp_pdfs'
    os.makedirs(temp_dir, exist_ok=True)

    attempted_files = []  # every temp path we may have created (for cleanup)
    pdf_files = []        # temp PDFs that were generated successfully

    try:
        for i, url in enumerate(urls):
            output_filename = f"{temp_dir}/chapter_{i}.pdf"
            attempted_files.append(output_filename)
            try:
                print(f"Processant ({i+1}/{len(urls)}): {url}")

                pdfkit.from_url(url, output_filename, options=options, configuration=config)

                # Only keep files that exist and are not empty.
                if os.path.exists(output_filename) and os.path.getsize(output_filename) > 0:
                    pdf_files.append(output_filename)
                    print(f"  ‚úì PDF generat ({os.path.getsize(output_filename)} bytes)")
                else:
                    print(f"  ‚úó Error: PDF buit o no generat")

                time.sleep(0.5)  # Be gentle with the remote server

            except Exception as e:
                # Best effort: one broken page must not abort the whole book.
                print(f"  ‚úó Error processant {url}: {str(e)[:100]}...")

        if not pdf_files:
            print("\n‚ùå No s'ha generat cap PDF. Revisa els errors.")
            return

        print(f"\nCombinant {len(pdf_files)} PDFs en un sol fitxer...")
        merger = PdfMerger()
        try:
            for pdf_file in pdf_files:
                try:
                    merger.append(pdf_file)
                except Exception as e:
                    print(f"Error afegint {pdf_file}: {e}")

            merger.write(output_file)
        finally:
            # Release the merger's file handles even if writing fails.
            merger.close()

        print(f"\n‚úÖ PDF complet generat: {output_file}")
        print(f"   Mida del fitxer: {os.path.getsize(output_file)} bytes")

        # The hint names the file that was actually written; nothing from
        # google.colab is imported, so the function also works outside Colab.
        print(f"\nüì• Per descarregar el PDF, executa: files.download({str(output_file)!r})")

    finally:
        # Remove every temp file we may have produced, including empty or
        # unmergeable ones, so the directory can actually be deleted.
        for temp_file in attempted_files:
            try:
                os.remove(temp_file)
            except OSError:
                pass  # never created or already gone

        try:
            os.rmdir(temp_dir)
        except OSError:
            pass  # directory holds unrelated files; leave it alone

# Executar el script
if __name__ == "__main__":
    start_url = "https://openair-project.github.io/book/"

    print("üîç Obtenint llista de cap√≠tols...")

    # Gather the linked pages, make sure the landing page is among them,
    # then drop duplicates and order the URLs alphabetically.
    unique_pages = set(get_all_chapter_urls(start_url))
    unique_pages.add(start_url)
    chapters = sorted(unique_pages)

    # Show a short preview: at most the first ten URLs.
    preview_limit = 10
    print(f"\nüìö Trobades {len(chapters)} p√†gines:")
    for position, page in enumerate(chapters[:preview_limit], start=1):
        print(f"  {position}. {page}")

    remaining = len(chapters) - preview_limit
    if remaining > 0:
        print(f"  ... i {remaining} m√©s")

    print("\nüîÑ Convertint a PDF...")
    convert_urls_to_pdf(chapters, "openair_book_complete.pdf")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Extracting templates from packages: 100%
(Reading database ... 117540 files and directories currently installed.)
Preparing to unpack .../00-libavahi-client3_0.8-5ubuntu5.4_amd64.deb ...
Unpacking libavahi-client3:amd64 (0.8-5ubuntu5.4) over (0.8-5ubuntu5.3) ...
Preparing to unpack .../01-libavahi-common3_0.8-5ubuntu5.4_amd64.deb ...
Unpacking libavahi-common3:amd64 (0.8-5ubuntu5.4) over (0.8-5ubuntu5.3) ...
Preparing to unpack .../02-libavahi-common-data_0.8-5ubuntu5.4_amd64.deb ...
Unpacking libavahi-common-data:amd64 (0.8-5ubuntu5.4) over (0.8-5ubuntu5.3) ...
Selecting previously unselected package libavahi-core7:amd64.
Preparing to unpack .../03-libavahi-core7_0.8-5ubuntu5.4_amd64.deb ...
Unpacking libavahi-core7:amd64 (0.8-5ubuntu5.4) ...
Selecting previously unselected package libdaemon0:amd64.