In [2]:
pip install python-dotenv==1.0.1 langchain==0.2.2 langchain-community==0.2.3 langchain-openai==0.1.8 unstructured==0.14.4 chromadb==0.5.0

Collecting langchain==0.2.2
  Downloading langchain-0.2.2-py3-none-any.whl.metadata (13 kB)
Collecting langchain-community==0.2.3
  Downloading langchain_community-0.2.3-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-openai==0.1.8
  Downloading langchain_openai-0.1.8-py3-none-any.whl.metadata (2.5 kB)
Collecting unstructured==0.14.4
  Downloading unstructured-0.14.4-py3-none-any.whl.metadata (28 kB)
Collecting chromadb==0.5.0
  Downloading chromadb-0.5.0-py3-none-any.whl.metadata (7.3 kB)
Collecting langchain-core<0.3.0,>=0.2.0 (from langchain==0.2.2)
  Using cached langchain_core-0.2.43-py3-none-any.whl.metadata (6.2 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain==0.2.2)
  Using cached langsmith-0.1.147-py3-none-any.whl.metadata (14 kB)
Collecting openai<2.0.0,>=1.26.0 (from langchain-openai==0.1.8)
  Downloading openai-1.66.3-py3-none-any.whl.metadata (25 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai==0.1.8)
  Downloading tiktoken-0.9.0-cp312-cp312-win_amd6

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-aws 0.1.0 requires langchain-core<0.2,>=0.1, but you have langchain-core 0.2.43 which is incompatible.


In [2]:
import os
import json
import boto3
import PyPDF2
from typing import List, Dict, Any

# AWS Bedrock models
CLAUDE_MODEL_ID = "arn:aws:bedrock:us-east-1:660845012422:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0"
TITAN_EMBED_MODEL_ID = "amazon.titan-embed-text-v2:0"

# Setup AWS Bedrock client
bedrock_runtime = boto3.client(
    service_name="bedrock-runtime",
    region_name="us-east-1",
)

# Simple document class to store text chunks
class Document:
    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        self.metadata = metadata or {}

# Simple in-memory vector store with basic similarity search
class SimpleVectorStore:
    def __init__(self):
        self.documents = []
        self.embeddings = []
    
    def add_documents(self, documents):
        # In a real implementation, you would compute embeddings here
        self.documents.extend(documents)
        return [f"doc_{i}" for i in range(len(documents))]
    
    def similarity_search(self, query, k=4):
        # In a simple implementation, we'll just do basic keyword matching
        # This is a very basic approach and would be replaced with proper vector similarity
        query_words = set(query.lower().split())
        scored_docs = []
        
        for doc in self.documents:
            content_words = set(doc.page_content.lower().split())
            # Score is the count of matching words
            score = len(query_words.intersection(content_words))
            scored_docs.append((doc, score))
        
        # Sort by score descending and take top k
        scored_docs.sort(key=lambda x: x[1], reverse=True)
        return [doc for doc, score in scored_docs[:k]]

# Create a simple vector store
vector_store = SimpleVectorStore()

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file.
    
    Args:
        pdf_path: Path to the PDF file
        
    Returns:
        Extracted text
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text() + "\n\n"
            
    return text

def index_pdf(pdf_path, chunk_size=1000, chunk_overlap=200):
    """
    Index a PDF document for RAG.
    
    Args:
        pdf_path: Path to the PDF file
        chunk_size: Size of text chunks in characters
        chunk_overlap: Overlap between chunks in characters
        
    Returns:
        Number of document chunks indexed
    """
    # Extract text from PDF
    print(f"Extracting text from {pdf_path}...")
    text = extract_text_from_pdf(pdf_path)
    
    # Split text into chunks with overlap
    chunks = []
    start = 0
    
    while start < len(text):
        # Get chunk with specified size
        end = min(start + chunk_size, len(text))
        
        # Find a good break point (newline or space)
        if end < len(text):
            # Try to find a newline
            newline_pos = text.rfind('\n', start, end)
            if newline_pos > start + chunk_size // 2:  # Make sure we don't create too small chunks
                end = newline_pos + 1
            else:
                # Try to find a space
                space_pos = text.rfind(' ', start, end)
                if space_pos > start + chunk_size // 2:
                    end = space_pos + 1
        
        # Extract the chunk
        chunk = text[start:end].strip()
        if chunk:  # Only add non-empty chunks
            chunks.append(chunk)
        
        # Move to next chunk with overlap
        start = end - chunk_overlap
        if start < 0 or start >= len(text):
            break
    
    # Create document objects
    documents = []
    for i, chunk in enumerate(chunks):
        documents.append(Document(
            page_content=chunk,
            metadata={
                "source": pdf_path, 
                "chunk_id": i,
                "start_char": i * (chunk_size - chunk_overlap),
                "end_char": i * (chunk_size - chunk_overlap) + len(chunk)
            }
        ))
    
    # Add documents to vector store
    vector_store.add_documents(documents)
    
    print(f"Indexed {len(documents)} chunks from {pdf_path}")
    return len(documents)

def query_claude(messages):
    """
    Query Claude 3.7 Sonnet model.
    
    Args:
        messages: List of message dictionaries with role and content
        
    Returns:
        Model response text
    """
    # Format messages for Anthropic Claude
    anthropic_messages = []
    for msg in messages:
        anthropic_messages.append({
            "role": msg["role"],
            "content": msg["content"]
        })
    
    try:
        # Invoke the model
        response = bedrock_runtime.invoke_model(
            modelId=CLAUDE_MODEL_ID,
            contentType="application/json",
            accept="application/json",
            body=json.dumps({
                "anthropic_version": "bedrock-2023-05-31",
                "max_tokens": 1000,
                "messages": anthropic_messages,
                "temperature": 0.2
            })
        )
        
        response_body = json.loads(response["body"].read())
        return response_body["content"][0]["text"]
    except Exception as e:
        print(f"Error querying Claude: {e}")
        return "I encountered an error processing your request. Please try again."

class PDFRAGAgent:
    def __init__(self):
        self.conversation_history = []
        self.vector_store = vector_store
        self.pdf_path = None
    
    def add_to_history(self, role, content):
        """Add a message to the conversation history."""
        self.conversation_history.append({"role": role, "content": content})
    
    def retrieve_context(self, query, k=3):
        """Retrieve relevant documents for a query."""
        docs = self.vector_store.similarity_search(query, k=k)
        context_parts = []
        
        for i, doc in enumerate(docs):
            # Add metadata about the document source
            source_info = f"[Excerpt {i+1}, from PDF: {self.pdf_path}, chunk {doc.metadata.get('chunk_id', 'unknown')}]"
            context_parts.append(f"{source_info}\n{doc.page_content}")
        
        return "\n\n" + "\n\n".join(context_parts) if context_parts else ""
    
    def generate_system_prompt(self):
        """Generate the system prompt for the conversation."""
        return f"""You are a helpful AI assistant that answers questions based on the provided PDF document '{self.pdf_path}'.
Follow these guidelines:
1. Base your answers primarily on the information in the context provided.
2. If the context doesn't contain relevant information, acknowledge that you don't find this in the document.
3. Keep your answers informative but concise.
4. When referencing specific information from the document, indicate which excerpt it came from.
5. For questions about topics not in the document, politely explain that you're focused on discussing the document content."""
    
    def answer(self, user_question):
        """Answer a user question using RAG."""
        # Retrieve relevant context
        context = self.retrieve_context(user_question)
        
        # Prepare messages for the model
        messages = [
            {"role": "system", "content": self.generate_system_prompt()},
        ]
        
        # Add conversation history (limit to last 5 exchanges)
        for msg in self.conversation_history[-10:]:
            messages.append(msg)
        
        # Add the current question with context
        user_message = f"""Question: {user_question}

Relevant excerpts from the PDF document:
{context}

Please answer based on the above excerpts from the document."""
        
        messages.append({"role": "user", "content": user_message})
        
        # Get response from Claude
        response = query_claude(messages)
        
        # Update conversation history
        self.add_to_history("user", user_question)
        self.add_to_history("assistant", response)
        
        return response

def main():
    # Create an agent
    agent = PDFRAGAgent()
    
    # Get PDF path
    pdf_path = "Propuestas Técnicas\Propuestas anteriores exitosas"

    
    if not os.path.exists(pdf_path):
        print(f"Error: File '{pdf_path}' not found.")
        return
    
    # Set the PDF path in the agent
    agent.pdf_path = pdf_path
    
    # Index the PDF
    print(f"Indexing {pdf_path}...")
    index_pdf(pdf_path)
    
    print("\nPDF RAG Agent ready. Type 'quit' to exit.")
    
    # Chat loop
    while True:
        user_input = input("\nYou: ")
        if user_input.lower() in ["quit", "exit", "bye"]:
            break
            
        response = agent.answer(user_input)
        print(f"\nAssistant: {response}")

if __name__ == "__main__":
    main()

Indexing Propuestas Técnicas\Propuestas anteriores exitosas...
Extracting text from Propuestas Técnicas\Propuestas anteriores exitosas...


  pdf_path = "Propuestas Técnicas\Propuestas anteriores exitosas"


PermissionError: [Errno 13] Permission denied: 'Propuestas Técnicas\\Propuestas anteriores exitosas'

In [4]:
pip install PyPDF2

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
{ "feature_analysis" : { "M" : { "aumentos" : { "L" : [ { "M" : { "nombre" : { "S" : "Corr_Fase_A" }, "rango" : { "S" : "De 394.29 a 410.16 A (4.02%) - Hora inicio (2025-03-10): 07:59 a Hora fin (2025-03-10): 10:07" } } }, { "M" : { "nombre" : { "S" : "Corr_Fase_B" }, "rango" : { "S" : "De 391.85 a 405.27 A (3.43%) - Hora inicio (2025-03-10): 07:59 a Hora fin (2025-03-10): 10:07" } } }, { "M" : { "nombre" : { "S" : "Corr_Fase_C" }, "rango" : { "S" : "De 390.62 a 405.27 A (3.75%) - Hora inicio (2025-03-10): 07:59 a Hora fin (2025-03-10): 10:07" } } }, { "M" : { "nombre" : { "S" : "presion_LP_bearing" }, "rango" : { "S" : "De 24.18 a 25.01 bar (3.43%) - Hora inicio (2025-03-10): 07:34 a Hora fin (2025-03-10): 09:16" } } }, { "M" : { "nombre" : { "S" : "presion_LP_bearing" }, "rango" : { "S" : "De 24.86 a 25.39 bar (2.13%) - Hora inicio (2025-03-10): 09:49 a Hora fin (2025-03-10): 11:31" } } }, { "M" : { "nombre" : { "S" : "temperatura_LP_antes_enfriamiento" }, "rango" : { "S" : "De 26.24 a 26.97 °C (2.78%) - Hora inicio (2025-03-10): 14:21 a Hora fin (2025-03-10): 15:30" } } }, { "M" : { "nombre" : { "S" : "temperatura_LP_despues_enfriamiento" }, "rango" : { "S" : "De 29.85 a 30.77 °C (3.08%) - Hora inicio (2025-03-10): 13:47 a Hora fin (2025-03-10): 15:30" } } }, { "M" : { "nombre" : { "S" : "carga" }, "rango" : { "S" : "De 428.26 a 448.26 Tonelaje/hora (4.67%) - Hora inicio (2025-03-10): 12:10 a Hora fin (2025-03-10): 14:16" } } }, { "M" : { "nombre" : { "S" : "carga" }, "rango" : { "S" : "De 428.26 a 437.61 Tonelaje/hora (2.18%) - Hora inicio (2025-03-10): 08:00 a Hora fin (2025-03-10): 10:08" } } } ] }, "disminuciones" : { "L" : [ { "M" : { "nombre" : { "S" : "Corr_Fase_A" }, "rango" : { "S" : "De 410.16 a 397.95 A (-2.98%) - Hora inicio (2025-03-10): 10:07 a Hora fin (2025-03-10): 12:10" } } }, { "M" : { "nombre" : { "S" : "Corr_Fase_A" }, "rango" : { "S" : "De 400.39 a 390.62 A (-2.44%) - Hora inicio (2025-03-10): 12:11 a Hora fin (2025-03-10): 14:17" } } }, { "M" : { "nombre" : { "S" : "Corr_Fase_B" }, "rango" : { "S" : "De 405.27 a 394.29 A (-2.71%) - Hora inicio (2025-03-10): 10:07 a Hora fin (2025-03-10): 12:10" } } }, { "M" : { "nombre" : { "S" : "Corr_Fase_B" }, "rango" : { "S" : "De 397.95 a 388.18 A (-2.45%) - Hora inicio (2025-03-10): 14:16 a Hora fin (2025-03-10): 14:17" } } }, { "M" : { "nombre" : { "S" : "Corr_Fase_C" }, "rango" : { "S" : "De 405.27 a 394.29 A (-2.71%) - Hora inicio (2025-03-10): 10:07 a Hora fin (2025-03-10): 12:10" } } }, { "M" : { "nombre" : { "S" : "Corr_Fase_C" }, "rango" : { "S" : "De 397.95 a 386.96 A (-2.76%) - Hora inicio (2025-03-10): 14:16 a Hora fin (2025-03-10): 14:17" } } }, { "M" : { "nombre" : { "S" : "presion_LP_bearing" }, "rango" : { "S" : "De 25.47 a 24.72 bar (-2.94%) - Hora inicio (2025-03-10): 13:47 a Hora fin (2025-03-10): 15:30" } } }, { "M" : { "nombre" : { "S" : "temperatura_aceite_lubricacion" }, "rango" : { "S" : "De 28.73 a 26.33 °C (-8.35%) - Hora inicio (2025-03-10): 07:34 a Hora fin (2025-03-10): 14:21" } } }, { "M" : { "nombre" : { "S" : "temperatura_2_aceite_lubricacion" }, "rango" : { "S" : "De 38.18 a 36.35 °C (-4.79%) - Hora inicio (2025-03-10): 07:34 a Hora fin (2025-03-10): 13:13" } } }, { "M" : { "nombre" : { "S" : "temperatura_LP_antes_enfriamiento" }, "rango" : { "S" : "De 28.10 a 26.21 °C (-6.73%) - Hora inicio (2025-03-10): 07:34 a Hora fin (2025-03-10): 11:31" } } }, { "M" : { "nombre" : { "S" : "temperatura_LP_despues_enfriamiento" }, "rango" : { "S" : "De 31.81 a 30.41 °C (-4.40%) - Hora inicio (2025-03-10): 07:34 a Hora fin (2025-03-10): 09:16" } } }, { "M" : { "nombre" : { "S" : "temperatura_LP_despues_enfriamiento" }, "rango" : { "S" : "De 30.60 a 29.85 °C (-2.45%) - Hora inicio (2025-03-10): 09:49 a Hora fin (2025-03-10): 13:13" } } }, { "M" : { "nombre" : { "S" : "carga" }, "rango" : { "S" : "De 437.61 a 428.26 Tonelaje/hora (-2.14%) - Hora inicio (2025-03-10): 10:08 a Hora fin (2025-03-10): 12:10" } } } ] }, "causas_raiz" : { "L" : [ { "M" : { "descripcion" : { "S" : "Los cambios en la carga del molino han provocado una cadena de variaciones cíclicas en los parámetros eléctricos del motor. El aumento inicial de la carga generó incrementos en las corrientes de fase, seguidos de disminuciones cuando la carga se redujo. Estas fluctuaciones son consistentes con el ciclo normal de operación del molino bajo condiciones variables de alimentación." }, "Variables_involucradas" : { "S" : "carga, Corr_Fase_A, Corr_Fase_B, Corr_Fase_C" }, "porcentaje" : { "N" : "60" }, "razonamiento" : { "S" : "La correlación entre carga y corrientes es clara en los datos (correlación de 0.427 a 0.762). El reporte muestra que los aumentos de carga (4.67% y 2.18%) preceden a los aumentos en corriente de las tres fases (aproximadamente 4.02%, 3.43% y 3.75%), mientras que la disminución de carga (-2.14%) antecede a la reducción de corrientes. Ninguna variable muestra desviaciones estándar actuales por encima de los valores normales de referencia (actual: 5.349, 4.674, 5.083 vs normal: 5.952, 5.700, 5.812), indicando que estas variaciones están dentro del comportamiento esperado del motor." }, "nombre" : { "S" : "Variación cíclica en la carga del motor" } } }, { "M" : { "descripcion" : { "S" : "Se observa una disminución progresiva en las temperaturas del sistema de lubricación que coincide con fluctuaciones en la presión del aceite. Este patrón sugiere un ajuste o cambio en las condiciones de operación del sistema de refrigeración, posiblemente una intervención de mantenimiento o una respuesta automática del sistema para optimizar la temperatura de lubricación." }, "Variables_involucradas" : { "S" : "temperatura_aceite_lubricacion, temperatura_2_aceite_lubricacion, presion_LP_bearing, temperatura_LP_antes_enfriamiento, temperatura_LP_despues_enfriamiento" }, "porcentaje" : { "N" : "40" }, "razonamiento" : { "S" : "Los datos muestran una reducción significativa y sostenida en la temperatura del aceite de lubricación (-8.35%), así como en otros puntos del circuito de aceite (temperatura_2_aceite_lubricacion: -4.79%, temperatura_LP_antes_enfriamiento: -6.73%). Estas disminuciones se correlacionan con variaciones en la presión del aceite (presion_LP_bearing: fluctuaciones de +3.43% a -2.94%), sugiriendo cambios en el flujo del sistema de refrigeración. Las desviaciones estándar actuales (temperatura: 0.795, 0.515 vs normal: 1.137, 1.202; presión: 0.343 vs normal: 0.705) se mantienen por debajo de los valores normales, indicando un comportamiento controlado." }, "nombre" : { "S" : "Optimización del sistema de refrigeración de aceite" } } } ] } } } }

{'feature_analysis': {'M': {'aumentos': {'L': [{'M': {'nombre': {'S': 'Corr_Fase_A'},
       'rango': {'S': 'De 394.29 a 410.16 A (4.02%) - Hora inicio (2025-03-10): 07:59 a Hora fin (2025-03-10): 10:07'}}},
     {'M': {'nombre': {'S': 'Corr_Fase_B'},
       'rango': {'S': 'De 391.85 a 405.27 A (3.43%) - Hora inicio (2025-03-10): 07:59 a Hora fin (2025-03-10): 10:07'}}},
     {'M': {'nombre': {'S': 'Corr_Fase_C'},
       'rango': {'S': 'De 390.62 a 405.27 A (3.75%) - Hora inicio (2025-03-10): 07:59 a Hora fin (2025-03-10): 10:07'}}},
     {'M': {'nombre': {'S': 'presion_LP_bearing'},
       'rango': {'S': 'De 24.18 a 25.01 bar (3.43%) - Hora inicio (2025-03-10): 07:34 a Hora fin (2025-03-10): 09:16'}}},
     {'M': {'nombre': {'S': 'presion_LP_bearing'},
       'rango': {'S': 'De 24.86 a 25.39 bar (2.13%) - Hora inicio (2025-03-10): 09:49 a Hora fin (2025-03-10): 11:31'}}},
     {'M': {'nombre': {'S': 'temperatura_LP_antes_enfriamiento'},
       'rango': {'S': 'De 26.24 a 26.97 °C (2.78