In [1]:
import os
import dotenv

# Load environment variables before anything reads os.environ
# (the Cohere API key is looked up from os.environ when the embedding function is built).
dotenv.load_dotenv()

True

In [2]:
import uuid

from llama_index.core.schema import BaseNode

import chromadb
from chromadb.utils import embedding_functions

# Embedding function backed by Cohere; the API key comes from the environment.
cohere_ef = embedding_functions.CohereEmbeddingFunction(
    api_key=os.environ["COHERE_API_KEY"],
    model_name="embed-english-v3.0",
)

# One collection holds every node type (title / grouping / value); the
# "node_type" metadata field is what tells them apart at query time.
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(
    name="grouping_db",
    embedding_function=cohere_ef,
    metadata={"hnsw:space": "cosine"},
)
# Uncomment to remove every record currently stored in the collection:
# collection.delete(ids=collection.get()["ids"])

def insert_node(node: BaseNode, title_node_id: str, max_distance: float = 0.4):
    """Store ``node`` as a value node linked to a title node and a grouping node.

    The collection is queried for the single nearest grouping node whose
    ``source_id`` differs from ``title_node_id``. If such a node exists within
    ``max_distance`` it is reused; otherwise a new grouping node is created
    from this node's text and tagged with ``title_node_id`` as its source.
    A value node carrying the same text is then added, with metadata pointing
    at both the title node and the chosen grouping node.

    Args:
        node: Node whose ``text`` attribute is embedded and stored.
        title_node_id: Id of the title node this node belongs to.
        max_distance: Largest query distance at which an existing grouping
            node is still reused. Defaults to 0.4, the previously hard-coded
            cutoff.

    Returns:
        None. The function only writes to the module-level ``collection``.
    """
    text = node.text

    # Nearest grouping node that originated from a *different* title.
    results = collection.query(
        query_texts=[text],
        n_results=1,
        where={"$and": [{"node_type": "grouping"}, {"source_id": {"$ne": title_node_id}}]},
    )

    # Tolerate every "no hit" shape (None, [], [[]]) instead of only [[]].
    nearest_ids = (results["ids"] or [[]])[0]
    nearest_distances = (results["distances"] or [[]])[0]
    has_close_match = (
        bool(nearest_ids)
        and bool(nearest_distances)
        and nearest_distances[0] <= max_distance
    )

    if has_close_match:
        grouping_node_id = nearest_ids[0]
    else:
        # Nothing similar enough: this node's text seeds a new grouping node.
        grouping_node_id = str(uuid.uuid4())
        collection.add(
            ids=[grouping_node_id],
            documents=[text],
            metadatas=[{"node_type": "grouping", "source_id": title_node_id}],
        )

    # The value node links back to both its title node and its grouping node.
    collection.add(
        ids=[str(uuid.uuid4())],
        documents=[text],
        metadatas=[{"node_type": "value", "title_node": title_node_id, "grouping_node": grouping_node_id}],
    )

In [3]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import MarkdownNodeParser

# Markdown source files to ingest, one per school.
source_paths = [
    "datasets/cds/md/bama.md",
    "datasets/cds/md/mississippi-state.md",
    "datasets/cds/md/nyu.md",
    "datasets/cds/md/uw-madison.md",
    "datasets/cds/md/penn-state.md",
]

reader = SimpleDirectoryReader(input_files=source_paths)
documents = reader.load_data()

# The only transformation is the markdown parser, which turns documents into nodes.
pipeline = IngestionPipeline(transformations=[MarkdownNodeParser()])
nodes = pipeline.run(documents=documents)

In [4]:
# Distinct source file names across all parsed nodes.
file_names = {node.metadata["file_name"] for node in nodes}
file_names

{'bama.md', 'mississippi-state.md', 'nyu.md', 'penn-state.md', 'uw-madison.md'}

In [5]:
# One title node per source file; the file name is both the id and the document text.
# NOTE(review): the saved output of this cell is a ValueError about embeddings being
# tuples rather than lists - presumably raised by the embedding function; confirm
# the installed chromadb / Cohere client versions are compatible before re-running.
title_ids = list(file_names)
collection.add(
    ids=title_ids,
    documents=list(title_ids),
    metadatas=[{"node_type": "title"} for _ in title_ids],
)

# Every parsed node is attached to the title node named after its source file.
for node in nodes:
    source_file = node.metadata["file_name"]
    insert_node(node, source_file)

ValueError: Expected each embedding in the embeddings to be a list, got ['tuple']

In [53]:
def query_nodes(query_text: str, n_results: int = 8):
    """Retrieve value-node text relevant to ``query_text``, merged per title.

    Steps:
      1. Query the collection for the ``n_results`` nearest grouping/title nodes.
      2. If any title nodes matched, fetch for each title its top-k value nodes
         (k = number of matched grouping nodes, but at least 1).
         Otherwise, if grouping nodes matched, fetch every value node attached
         to any of those grouping nodes.
      3. Concatenate the value-node documents per title into one synthetic
         text block per title.

    Args:
        query_text: Natural-language query to embed and search with.
        n_results: How many grouping/title nodes to retrieve in step 1.
            Defaults to 8, the previously hard-coded value.

    Returns:
        A list of strings, one per title, each formatted as
        ``"# <title>:\\n\\n<doc>\\n<doc>..."``. Matched titles with no value
        nodes still produce an entry with an empty body. Returns an empty list
        when nothing matches.
    """
    queryable_nodes = collection.query(
        query_texts=[query_text],
        n_results=n_results,
        where={"$or": [{"node_type": "grouping"}, {"node_type": "title"}]},
    )

    grouping_ids = []
    title_ids = []
    for node_id, metadata in zip(queryable_nodes["ids"][0], queryable_nodes["metadatas"][0]):
        if metadata["node_type"] == "grouping":
            grouping_ids.append(node_id)
        elif metadata["node_type"] == "title":
            title_ids.append(node_id)

    # Seed with every matched title so it shows up in the output even when no
    # value node is found for it.
    docs_by_title = {tid: [] for tid in title_ids}

    documents = []
    metadatas = []

    if title_ids:
        # k tracks the number of grouping hits, but the vector store rejects
        # n_results < 1, so fall back to 1 when only title nodes matched.
        per_title_k = max(len(grouping_ids), 1)

        value_node_ids = []
        for tid in title_ids:
            title_value_nodes = collection.query(
                query_texts=[query_text],
                n_results=per_title_k,
                where={"$and": [{"node_type": "value"}, {"title_node": tid}]},
            )
            value_node_ids.extend(title_value_nodes["ids"][0])

        # Skip the lookup entirely when there is nothing to fetch; an empty id
        # list is not a meaningful request.
        if value_node_ids:
            value_nodes = collection.get(ids=value_node_ids)
            documents = value_nodes["documents"]
            metadatas = value_nodes["metadatas"]
    elif grouping_ids:
        # A where-clause must have exactly one top-level operator, and "$or"
        # needs at least two operands, so a single grouping id is expressed as
        # a plain equality nested under "$and".
        if len(grouping_ids) == 1:
            grouping_filter = {"grouping_node": grouping_ids[0]}
        else:
            grouping_filter = {"$or": [{"grouping_node": gid} for gid in grouping_ids]}

        value_nodes = collection.get(
            where={"$and": [{"node_type": "value"}, grouping_filter]}
        )
        documents = value_nodes["documents"]
        metadatas = value_nodes["metadatas"]

    # Combine value-node texts by title to create the synthetic nodes.
    for doc, metadata in zip(documents, metadatas):
        docs_by_title.setdefault(metadata["title_node"], []).append(doc)

    return [f"# {title}:\n\n" + "\n".join(docs) for title, docs in docs_by_title.items()]

# Quick manual check: print each per-title synthetic node returned for a sample query.
sample_results = query_nodes("which schools are in the south?")
for synthetic_node in sample_results:
    print(synthetic_node)

# mississippi-state.md:

Mississippi State University
Office of Institutional Research and Effectiveness
A0. Respondent Information

| Field | Value |
|-------|-------|
| Office: | Office of Institutional Research and Effectiveness |
| Address: | P.O. Drawer EY |
| City: | Mississippi State |
| State: | Mississippi |
| Zip: | 39762 |
| Country: | United States |
| Phone Number: | 662-325-3920 |
| Email Address: | oire@ir.msstate.edu |

Are your responses to the CDS posted for reference on your institution's website?
Yes

If yes, please provide a direct link to the posted CDS responses:
https://ir.msstate.edu/cdsets.php
A1. Address Information
Please enter general institution information below:

| Field | Value |
|-------|-------|
| Name of College or University | Mississippi State University |
| Street Address: | 75 B.S. Hood Road |
| City: | Mississippi State |
| State: | Mississippi |
| Zip: | 39762 |
| Country: | United States |
| Main Institution Phone Number: | 662-325-2323 |
| In

In [55]:
from llama_index.llms.bedrock import Bedrock
from llama_index.core.prompts import PromptTemplate
from llama_index.core.llms import ChatMessage

# Chat model accessed through Bedrock using a named AWS profile.
llm = Bedrock(
    model="anthropic.claude-3-5-sonnet-20240620-v1:0",
    profile_name="collega-prod",
    context_size=4096,
)

# Prompt with two slots: the user's message and the retrieved context.
template = PromptTemplate(
    """
Respond to the following message given this context:
{message}

Context:
{context}
""")

message = "which schools are male only?"

# Retrieve per-title synthetic nodes for the message and join them into one context string.
retrieved_context = "\n".join(query_nodes(message))
prompt = template.format(message=message, context=retrieved_context)

response = llm.chat([ChatMessage.from_str(prompt)])

print(response)

assistant: Based on the information provided, there are no schools mentioned that are male-only. The data shows:

- NYU and Penn State both have coed residence halls and admit both male and female students.
- Penn State specifically lists having women's residence halls, but does not mention any male-only residence halls.
- The enrollment data for both schools shows they admit and enroll both male and female students.

So there is no indication of any male-only schools in this context. Both NYU and Penn State appear to be coeducational institutions that admit students of all genders.
