<a href="https://colab.research.google.com/github/davidlealo/100profes/blob/master/textos_largos_anthropic_api2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Textos largos Anthropic API

In [None]:
# Instalación de dependencias
!pip install anthropic PyMuPDF python-docx epub_meta ipywidgets

# Importar librerías
from google.colab import files
from ipywidgets import widgets
from anthropic import Anthropic
import fitz
import docx
import epub_meta
import os
from typing import List, Dict
import time

# Masked text box where the user types the Anthropic API key.
# The explicit width and 'initial' description width keep the label and
# the long key visible on one line.
api_key_input = widgets.Password(
    description='API Key:',
    layout=dict(width='500px'),
    style=dict(description_width='initial'),
)
display(api_key_input)

class DocumentReader:
    """Extract text from a document and summarise it chunk by chunk.

    Supported inputs are PDF, DOCX, UTF-8 plain text and EPUB files. The
    extracted text is split into word-aligned chunks and each chunk is sent
    to the Anthropic Messages API with a "summarise this" prompt.
    """

    # Model used when the caller does not pass one to ``analyze_document``.
    # NOTE(review): model IDs are retired over time - confirm this one is
    # still served before relying on the default.
    DEFAULT_MODEL = "claude-3-sonnet-20240229"

    def __init__(self, api_key: str):
        """Create the Anthropic client used for every request."""
        self.client = Anthropic(api_key=api_key)

    def read_pdf(self, filepath: str) -> str:
        """Return the text of every page of the PDF, concatenated in order."""
        with fitz.open(filepath) as doc:
            return "".join(page.get_text() for page in doc)

    def read_docx(self, filepath: str) -> str:
        """Return the paragraphs of a .docx file joined by newlines."""
        doc = docx.Document(filepath)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)

    def read_txt(self, filepath: str) -> str:
        """Return the contents of a UTF-8 encoded text file."""
        with open(filepath, 'r', encoding='utf-8') as file:
            return file.read()

    @staticmethod
    def _xhtml_to_text(markup: str) -> str:
        """Strip tags from an (X)HTML document and return its visible text."""
        from html.parser import HTMLParser

        hidden_tags = {"script", "style", "title"}
        block_tags = {
            "p", "div", "li", "tr", "section", "blockquote",
            "h1", "h2", "h3", "h4", "h5", "h6",
        }

        class _TextCollector(HTMLParser):
            def __init__(self):
                super().__init__(convert_charrefs=True)
                self.parts = []
                self._hidden_depth = 0

            def handle_starttag(self, tag, attrs):
                if tag in hidden_tags:
                    self._hidden_depth += 1
                elif tag == "br":
                    self.parts.append("\n")

            def handle_endtag(self, tag):
                if tag in hidden_tags:
                    if self._hidden_depth:
                        self._hidden_depth -= 1
                elif tag in block_tags:
                    # Keep adjacent blocks from running together
                    # ("<p>a</p><p>b</p>" must not become "ab").
                    self.parts.append("\n")

            def handle_data(self, data):
                if not self._hidden_depth:
                    self.parts.append(data)

        collector = _TextCollector()
        collector.feed(markup)
        collector.close()
        return "".join(collector.parts)

    def read_epub(self, filepath: str) -> str:
        """Return the readable text of an EPUB, in spine (reading) order.

        An EPUB is a ZIP archive: ``META-INF/container.xml`` points at the
        OPF package file, whose manifest maps ids to content documents and
        whose spine lists those ids in reading order. Only the standard
        library is used here because ``epub_meta`` exposes metadata, not the
        book's body text.

        Raises:
            ValueError: if the archive has no usable ``rootfile`` entry.
            zipfile.BadZipFile: if the file is not a ZIP archive.
            KeyError: if ``container.xml`` or the OPF file is missing.
        """
        import posixpath
        import xml.etree.ElementTree as ET
        import zipfile
        from urllib.parse import unquote

        def local_name(tag: str) -> str:
            # ElementTree spells namespaced tags as "{uri}name".
            return tag.rsplit("}", 1)[-1]

        with zipfile.ZipFile(filepath) as archive:
            container = ET.fromstring(archive.read("META-INF/container.xml"))
            opf_path = next(
                (
                    node.get("full-path")
                    for node in container.iter()
                    if local_name(node.tag) == "rootfile"
                ),
                None,
            )
            if not opf_path:
                raise ValueError("EPUB inválido: no se encontró el archivo OPF")

            package = ET.fromstring(archive.read(opf_path))
            manifest = {
                node.get("id"): node.get("href")
                for node in package.iter()
                if local_name(node.tag) == "item"
            }
            spine = [
                node.get("idref")
                for node in package.iter()
                if local_name(node.tag) == "itemref"
            ]

            # Manifest hrefs are relative to the OPF file and URL-encoded.
            opf_dir = posixpath.dirname(opf_path)
            chapters = []
            for idref in spine:
                href = manifest.get(idref)
                if not href:
                    continue
                member = posixpath.normpath(
                    posixpath.join(opf_dir, unquote(href))
                )
                try:
                    raw = archive.read(member)
                except KeyError:
                    # Spine entry points at a file missing from the
                    # archive; skip it rather than lose the whole book.
                    continue
                # NOTE(review): assumes UTF-8 content documents; undecodable
                # bytes are replaced instead of aborting the read.
                chapters.append(
                    self._xhtml_to_text(raw.decode("utf-8", errors="replace"))
                )

        return "\n".join(chapters)

    def process_text(self, text: str, max_chunk_size: int = 2000) -> List[str]:
        """Split ``text`` into chunks of whole words.

        Words are joined with single spaces and a chunk is closed as soon as
        adding the next word would make it longer than ``max_chunk_size``
        characters. A single word longer than the limit is kept intact as a
        chunk of its own. Empty chunks are never produced; text without any
        words yields an empty list.
        """
        chunks: List[str] = []
        current_words: List[str] = []
        current_length = 0  # always len(" ".join(current_words))

        for word in text.split():
            separator = 1 if current_words else 0
            projected = current_length + separator + len(word)
            # Only flush when there is something to flush; otherwise an
            # oversized first word would emit an empty chunk.
            if current_words and projected > max_chunk_size:
                chunks.append(' '.join(current_words))
                current_words = [word]
                current_length = len(word)
            else:
                current_words.append(word)
                current_length = projected

        if current_words:
            chunks.append(' '.join(current_words))

        return chunks

    @staticmethod
    def _message_text(message) -> str:
        """Concatenate the text blocks of a Messages API response."""
        return "".join(
            block.text
            for block in message.content
            if getattr(block, "type", None) == "text"
        )

    def analyze_document(
        self,
        filepath: str,
        model: str = DEFAULT_MODEL,
        max_tokens: int = 1024,
    ) -> List[Dict]:
        """Read ``filepath``, chunk its text and summarise every chunk.

        Args:
            filepath: path to a .pdf, .docx, .txt or .epub file; the format
                is chosen from the (case-insensitive) extension.
            model: model ID passed to the Messages API.
            max_tokens: response token limit passed to the Messages API.

        Returns:
            One dict per successfully analysed chunk with the keys
            ``"chunk"`` (the text sent) and ``"analysis"`` (the summary
            text returned by the model). Chunks whose request fails are
            reported on stdout and left out of the result.

        Raises:
            ValueError: if the file extension is not supported.
        """
        print("Leyendo documento...")
        _, ext = os.path.splitext(filepath)
        ext = ext.lower()

        readers = {
            '.pdf': self.read_pdf,
            '.docx': self.read_docx,
            '.txt': self.read_txt,
            '.epub': self.read_epub,
        }
        if ext not in readers:
            raise ValueError(f"Formato no soportado: {ext}")
        text = readers[ext](filepath)

        print("Procesando texto en chunks...")
        chunks = self.process_text(text)
        responses = []
        total_chunks = len(chunks)

        for i, chunk in enumerate(chunks, 1):
            print(f"Analizando chunk {i}/{total_chunks}")
            try:
                message = self.client.messages.create(
                    model=model,
                    max_tokens=max_tokens,
                    messages=[{
                        "role": "user",
                        "content": f"Analiza el siguiente texto y proporciona un resumen:\n\n{chunk}"
                    }]
                )
                responses.append({
                    "chunk": chunk,
                    # The response body is a list of content blocks; keep
                    # the readable text instead of the SDK objects.
                    "analysis": self._message_text(message)
                })
                # Short pause between requests to stay under rate limits.
                time.sleep(1)
            except Exception as e:
                # Best effort: one failed chunk must not abort the document.
                print(f"Error en chunk {i}: {str(e)}")
                continue

        return responses

print("Por favor, ingresa tu API key de Anthropic arriba y luego ejecuta la siguiente celda.")

# [NEW CELL]
# Upload a file and summarise it
print("Selecciona el archivo a analizar:")
uploaded = files.upload()
# Only the first uploaded file is analysed. ``None`` means nothing was
# uploaded (presumably the dialog was cancelled), which would otherwise
# crash with an IndexError.
filename = next(iter(uploaded), None)

if filename is None:
    print("No se subió ningún archivo.")
elif api_key_input.value:
    reader = DocumentReader(api_key_input.value)
    try:
        results = reader.analyze_document(filename)
        print("\nResultados del análisis:")
        for i, result in enumerate(results, 1):
            analysis = result["analysis"]
            if not isinstance(analysis, str):
                # The analysis can be a list of SDK content blocks; show
                # their text rather than the objects' repr.
                analysis = "".join(
                    getattr(block, "text", "") for block in analysis
                )
            print(f"\nAnálisis #{i}:")
            print(analysis)
            print("-" * 50)
    except Exception as e:
        print(f"Error al procesar el documento: {str(e)}")
else:
    print("Por favor, ingresa una API key válida.")