In [22]:
from pinecone import Pinecone, ServerlessSpec
import re
from dotenv import load_dotenv
import os

# Read variables through python-dotenv, then pick up the Pinecone credential
# from the process environment (None when the variable is not set).
load_dotenv()
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

# Client object shared by the cells below (embedding + index access)
pc = Pinecone(api_key=pinecone_api_key)

# --- Index configuration -------------------------------------------------
INDEX_NAME = "knowledge-base"      # index opened via pc.Index(...)
NAMESPACE = "markdown_chunks"      # namespace passed to index.query(...)
DIMENSION = 1024                   # Dimension for multilingual-e5-large
PINECONE_CLOUD = "aws"             # Replace with your cloud provider
PINECONE_REGION = "us-east-1"      # Replace with your region

In [30]:
def determine_file_type(prompt):
    """
    Determines the metadata file type based on the user's input.

    A prompt that begins with the "/tech" command (the long form "/technical"
    is accepted too) is routed to the technical document; anything else goes
    to the organizational document.

    Args:
        prompt: Raw user input string.

    Returns:
        A ``(file_name, query)`` tuple. For technical prompts the leading
        command is removed and the remainder is whitespace-stripped; for all
        other prompts the input is returned unchanged.
    """
    # Tolerate whitespace typed before the slash command.
    candidate = prompt.lstrip()

    # Longest form first so "/technical ..." does not leave "nical ..." behind.
    # Only the leading command is removed; a later "/technical" inside the
    # question text (e.g. in a path) is left intact.
    for command in ("/technical", "/tech"):
        if candidate.startswith(command):
            return "tucuvi_data_technical.md", candidate[len(command):].strip()

    # Default to organizational data
    return "tucuvi_data_organizational.md", prompt


def query_knowledge_base(index, user_input, top_k=3):
    """
    Searches the knowledge base for chunks matching the user's input and
    prints the stored text of each match.

    Args:
        index: Pinecone index handle (as returned by ``pc.Index(...)``).
        user_input: Raw prompt. ``determine_file_type`` decides which source
            file the search is restricted to and returns the query text.
        top_k: Maximum number of matches to retrieve. Defaults to 3.

    Returns:
        None. Results (or "No matches found.") are printed.
    """
    # Step 1: Determine file type based on input
    file_type, query = determine_file_type(user_input)

    # Nothing to embed (e.g. the prompt was only a command): skip the API calls.
    if not query.strip():
        print("No matches found.")
        return

    # Step 2: Embed the query.
    # multilingual-e5-large is an asymmetric model: text being searched *for*
    # must be embedded with input_type "query" ("passage" is for the stored
    # documents), otherwise query and document vectors are not comparable
    # in the way the model was trained for.
    embeddings = pc.inference.embed(
        model="multilingual-e5-large",
        inputs=[query],
        parameters={"input_type": "query"}
    )
    query_embedding = embeddings[0]["values"]  # Extract the vector for querying

    # Step 3: Query Pinecone, restricted to chunks from the selected file
    sample = index.query(
        vector=query_embedding,
        namespace=NAMESPACE,
        top_k=top_k,
        include_values=False,
        include_metadata=True,
        filter={"file": file_type}
    )

    # Step 4: Process results
    matches = sample.get("matches", [])
    if not matches:
        print("No matches found.")
        return

    for match in matches:
        metadata = match.get("metadata", {})
        print("Full Text:")
        print(metadata.get("text", ""))

In [27]:
# Ad-hoc check: search the technical document directly and show the raw
# metadata of each match.
index = pc.Index(INDEX_NAME)

# Embed the question. It is a search query, so use input_type "query"
# ("passage" is meant for the documents stored in the index).
embeddings = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=["What is a phone visit?"],
    parameters={"input_type": "query"}
)
query_embedding = embeddings[0]["values"]  # Extract the vector for querying

# Retrieve the closest chunks that came from the technical file
sample = index.query(
    vector=query_embedding,
    namespace=NAMESPACE,
    top_k=3,
    include_values=False,
    include_metadata=True,
    filter={"file": "tucuvi_data_technical.md"}
)

for match in sample.get("matches", []):
    # .get keeps the loop going if a match was stored without metadata
    print("Match Metadata:", match.get("metadata", {}))

Match Metadata: {'file': 'tucuvi_data_technical.md', 'section': '# Dataset: `tucuvi_data`', 'text': '# Dataset: `tucuvi_data`\n\nThe `tucuvi_data` dataset stores domain events related to core operational entities such as **conversations, calls, patients, and practitioners**.\n\n### ETL-Populated Tables\n\nThese tables are populated through the **Domain Events Transformer** (Cloud Run function `domain-event-transformers`):\n\n- `actions`\n- `alerts`\n- `calls`\n- `care_plans`\n- `clinical_notes`\n- `comments`\n- `conversations`\n- `patients`\n- `phone_visit_summaries`\n- `phone_visits`\n- `practitioners`\n\n### Firestore-Synchronized Tables\n\nThese tables are populated by the **Tucuvi Data Quality** pipeline (Cloud Run function `domain-event-tucuvi_data_quality`), executed daily:\n\n- `conversations_classification`\n- `origins`\n- `protocols`\n- `protocols_display`\n- `sms`\n- `work_units`'}
Match Metadata: {'file': 'tucuvi_data_technical.md', 'section': '# Pipelines: Tucuvi Data', 'te

In [35]:
# Run a prompt without a slash command through the helper
# (such prompts are routed to the organizational document).
index = pc.Index(INDEX_NAME)
user_input = 'Number of calls in Tucuvi segregated per year'
query_knowledge_base(index, user_input)

Full Text:
6a-4a6d-9b60-66ea49b75b09/page/RmQ4D) | Dashboard tracking the use of the following Tucuvi Dashboard features: Care plans, Phone Visits and Clinical notes. The dashboard shows data for all clients and projects, and allows segregating. | November 22, 2024 |
    | SMS analysis | [https://lookerstudio.google.com/reporting/24597db6-3b5c-4aa3-b65a-5e30ed7e81a2/page/haqFE/edit](https://lookerstudio.google.com/reporting/24597db6-3b5c-4aa3-b65a-5e30ed7e81a2/page/haqFE/edit) | [https://lookerstudio.google.com/embed/reporting/24597db6-3b5c-4aa3-b65a-5e30ed7e81a2/page/haqFE/edit](https://lookerstudio.google.com/embed/reporting/24597db6-3b5c-4aa3-b65a-5e30ed7e81a2/page/haqFE/edit) | This dashboard tracks the SMS notifications sent in all projects, and has metrics to asses the impact of SMS notifications on reach and engagement. |  |
Full Text:
reporting/da65b5f1-56ad-47d3-86f7-083e90714f2d/page/p_lpkoccdzfd) | Used to retrieve historical data at Tucuvi. Aggregated for all clients, and a