<a href="https://colab.research.google.com/github/davidlealo/100profes/blob/master/textos_largos_anthropic_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Textos largos Anthropic API

In [1]:
# Instalación de dependencias
!pip install anthropic PyMuPDF python-docx epub_meta



Collecting anthropic
  Downloading anthropic-0.42.0-py3-none-any.whl.metadata (23 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting epub_meta
  Downloading epub_meta-0.0.7.tar.gz (7.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading anthropic-0.42.0-py3-none-any.whl (203 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.4/203.4 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m76.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?

In [None]:
# Importar librerías
import os
from getpass import getpass
from typing import List, Dict

import docx
import epub_meta
import fitz
from anthropic import Anthropic
from google.colab import files

class DocumentReader:
    """Extract the text of a document and summarize it chunk by chunk.

    Supported formats are PDF, DOCX, TXT and EPUB (selected by file
    extension). The extracted text is split into whitespace-delimited chunks
    and each chunk is sent to the Anthropic Messages API with a summary prompt.
    """

    # Defaults used by analyze_document when the caller does not override them.
    # NOTE(review): this model snapshot id is kept for backward compatibility;
    # confirm it is still served by the API, or pass `model=` explicitly.
    DEFAULT_MODEL = "claude-3-sonnet-20240229"
    DEFAULT_MAX_TOKENS = 1024

    def __init__(self, api_key: str):
        """Create the Anthropic client used for every request.

        Args:
            api_key: Anthropic API key passed to the SDK client.
        """
        self.client = Anthropic(api_key=api_key)

    @staticmethod
    def _read_with_fitz(filepath: str) -> str:
        """Return the concatenated text of every page PyMuPDF finds in `filepath`."""
        with fitz.open(filepath) as doc:
            # join() avoids rebuilding the string on every page
            return "".join(page.get_text() for page in doc)

    def read_pdf(self, filepath: str) -> str:
        """Return the text of all pages of a PDF file."""
        return self._read_with_fitz(filepath)

    def read_docx(self, filepath: str) -> str:
        """Return the paragraphs of a DOCX file joined by newlines."""
        doc = docx.Document(filepath)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)

    def read_txt(self, filepath: str) -> str:
        """Return the full contents of a UTF-8 encoded text file."""
        with open(filepath, 'r', encoding='utf-8') as file:
            return file.read()

    def read_epub(self, filepath: str) -> str:
        """Return the text of an EPUB file.

        epub_meta only exposes book metadata (its `get_epub_metadata` has no
        `read_content` option and returns no 'content' entry), so the text is
        extracted with PyMuPDF, which can open EPUB documents as well.
        """
        return self._read_with_fitz(filepath)

    def process_text(self, text: str, max_chunk_size: int = 4000) -> List[str]:
        """Split `text` into chunks of whole words.

        Words are the whitespace-separated tokens of `text`; inside a chunk
        they are joined by single spaces. Each chunk is at most
        `max_chunk_size` characters long, except that a single word longer
        than the limit is kept intact as a chunk of its own.

        Args:
            text: Text to split.
            max_chunk_size: Maximum chunk length in characters (>= 1).

        Returns:
            The list of chunks, in order; empty when `text` has no words.

        Raises:
            ValueError: If `max_chunk_size` is smaller than 1.
        """
        if max_chunk_size < 1:
            raise ValueError("max_chunk_size debe ser un entero positivo")

        chunks: List[str] = []
        current_words: List[str] = []
        current_length = 0  # always equals len(' '.join(current_words))

        for word in text.split():
            # A separating space is only needed when the chunk already has a word.
            separator = 1 if current_words else 0
            projected_length = current_length + separator + len(word)

            # Never flush an empty chunk: an oversized first word starts its own chunk.
            if current_words and projected_length > max_chunk_size:
                chunks.append(' '.join(current_words))
                current_words = [word]
                current_length = len(word)
            else:
                current_words.append(word)
                current_length = projected_length

        if current_words:
            chunks.append(' '.join(current_words))

        return chunks

    def _extract_text(self, filepath: str) -> str:
        """Pick the reader that matches the (case-insensitive) file extension."""
        _, ext = os.path.splitext(filepath)
        ext = ext.lower()

        readers = {
            '.pdf': self.read_pdf,
            '.docx': self.read_docx,
            '.txt': self.read_txt,
            '.epub': self.read_epub,
        }
        reader = readers.get(ext)
        if reader is None:
            raise ValueError(f"Formato no soportado: {ext}")
        return reader(filepath)

    def analyze_document(
        self,
        filepath: str,
        model: str = DEFAULT_MODEL,
        max_tokens: int = DEFAULT_MAX_TOKENS,
    ) -> List[Dict]:
        """Summarize a document chunk by chunk.

        Args:
            filepath: Path to a .pdf, .docx, .txt or .epub file.
            model: Model id sent with every request.
            max_tokens: `max_tokens` value sent with every request.

        Returns:
            One dict per chunk with the keys "chunk" (the text that was sent)
            and "analysis" (the `content` attribute of the API response,
            stored unchanged).

        Raises:
            ValueError: If the file extension is not supported.
        """
        text = self._extract_text(filepath)
        responses = []

        # One request per chunk; results keep the order of the chunks.
        for chunk in self.process_text(text):
            message = self.client.messages.create(
                model=model,
                max_tokens=max_tokens,
                messages=[{
                    "role": "user",
                    "content": f"Analiza el siguiente texto y proporciona un resumen:\n\n{chunk}"
                }]
            )
            responses.append({
                "chunk": chunk,
                "analysis": message.content
            })

        return responses

# Upload a file through the Colab file picker
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError when nothing was uploaded
    raise ValueError("No se subió ningún archivo")
# Only the first uploaded file is analyzed
filename = next(iter(uploaded))

# Use the DocumentReader.
# The API key is never stored in the notebook: it is read from the
# ANTHROPIC_API_KEY environment variable, or typed in without being echoed.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY") or getpass("Introduce tu API key de Anthropic: ")
reader = DocumentReader(ANTHROPIC_API_KEY)
results = reader.analyze_document(filename)

# Show results
for i, result in enumerate(results, 1):
    analysis = result["analysis"]
    # The API response content is a list of content blocks; print the text of
    # the blocks that carry one rather than the objects' repr.
    if not isinstance(analysis, str):
        analysis = "".join(getattr(block, "text", "") for block in analysis)
    print(f"\nAnálisis #{i}:")
    print(analysis)
    print("-" * 50)