In [1]:
import os
import dotenv

# Load settings from a local .env file into the process environment
# (presumably the Cohere API key used by the cells below — confirm).
# As the cell's last expression, the call's return value is echoed as output.
dotenv.load_dotenv()

True

In [2]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import MarkdownNodeParser, SemanticSplitterNodeParser

from llama_index.llms.cohere import Cohere
from llama_index.embeddings.cohere import CohereEmbedding

# Embedding model shared by every later cell that embeds text or builds an index.
embed_model = CohereEmbedding(model_name="embed-english-v3.0")

# load markdown nodes
# group together semantically similar nodes
# TODO: make sure there are enough nodes per PDF
cds_markdown_files = [
    "datasets/cds/md/bama.md",
    "datasets/cds/md/mississippi-state.md",
    "datasets/cds/md/nyu.md",
    "datasets/cds/md/uw-madison.md",
    "datasets/cds/md/penn-state.md",
]

# One document set for all listed files.
reader = SimpleDirectoryReader(input_files=cds_markdown_files)
documents = reader.load_data()

# The only transformation is the markdown parser; no embedding happens in this pipeline.
pipeline = IngestionPipeline(transformations=[MarkdownNodeParser()])
nodes = pipeline.run(documents=documents)

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

import nest_asyncio; nest_asyncio.apply();

# group together nodes by file
nodes_by_file = {}

for node in nodes:
    file_name = node.metadata['file_name']

    if file_name not in nodes_by_file:
        nodes_by_file[file_name] = [node]
    else:
        nodes_by_file[file_name].append(node)

for file in nodes_by_file:
    print(f"{file}: {len(nodes_by_file[file])}")

embeddings_by_file = {
    filename: [await embed_model.aget_text_embedding(node.text) for node in nodes_by_file[filename]]
    for filename in nodes_by_file
}
        
# add first file nodes to groups
# for each node in next file, compare to nodes in group
# node_groups = []

# for node in nodes_by_file['bama.md']:
#     node_groups.append(node)

# for node in 

bama.md: 198
mississippi-state.md: 5
nyu.md: 74
uw-madison.md: 262
penn-state.md: 93


In [4]:
# Each group is a list of {"node", "embedding"} dicts. The first member of a group
# is its representative: later files are compared against that member only.
node_groupings = []

for file_name, incoming_dataset_embeddings in embeddings_by_file.items():
    file_nodes = nodes_by_file[file_name]
    # TODO: take vector average instead of first node
    # TODO: may need to include relevant preceding context to build summary node
    # Snapshot of the representatives *before* this file is processed, so nodes
    # of the same file are never compared against each other.
    existing_dataset_embeddings = [group[0]["embedding"] for group in node_groupings]

    if not existing_dataset_embeddings:
        # Nothing to compare against yet: every node seeds its own group.
        node_groupings.extend(
            [{"node": seed_node, "embedding": seed_embedding}]
            for seed_node, seed_embedding in zip(file_nodes, incoming_dataset_embeddings)
        )
        continue

    # Rows: incoming nodes; columns: existing group representatives.
    similarities = cosine_similarity(incoming_dataset_embeddings, existing_dataset_embeddings)

    for current_node, current_embedding, incoming_node_sims in zip(
        file_nodes, incoming_dataset_embeddings, similarities
    ):
        member = {"node": current_node, "embedding": current_embedding}

        # Find the first best-scoring group whose similarity is strictly above 0.5.
        best_index = -1
        best_score = 0.5
        for group_index, score in enumerate(incoming_node_sims):
            if score > best_score:
                best_index, best_score = group_index, score

        if best_index == -1:
            # No sufficiently similar group: start a new one.
            node_groupings.append([member])
        else:
            node_groupings[best_index].append(member)

# print("node groups collected:", len(node_groupings))
# print("non singular groups collected:", len([ng for ng in node_groupings if len(ng) > 1]))

In [114]:
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode

# Start from an empty vector index that embeds with the shared Cohere model.
index = VectorStoreIndex(nodes=[], embed_model=embed_model)

# upsert title node for each document
title_nodes = [TextNode(text=filename) for filename in embeddings_by_file]
index.insert_nodes(title_nodes)

# One node per group, built from the group's first member and prefixed with its file name.
group_nodes = [
    TextNode(text=f"# {first.metadata['file_name']}:\n{first.text}")
    for first in (group[0]["node"] for group in node_groupings)
]
index.insert_nodes(group_nodes)

In [115]:
# Fetch the 8 nodes most similar to the question and print their text.
retriever = index.as_retriever(similarity_top_k=8)

# Keep the hits under their own name. Rebinding `nodes` would overwrite the parsed
# document nodes from the ingestion cell, and re-running the grouping cell would
# then fail because these nodes carry no 'file_name' metadata.
retrieved_nodes = retriever.retrieve("which schools have more men than women?")

for retrieved in retrieved_nodes:
    print(retrieved.text + "\n")

# bama.md: TOTAL

|                          | Men    | Women  | Another Gender | Total  |
| ------------------------ | ------ | ------ | -------------- | ------ |
| Total all undergraduates | 15,105 | 18,530 | 1,428          | 35,063 |
| Total all graduate       | 3,483  | 4,060  | 814            | 8,357  |
| GRAND TOTAL ALL STUDENTS | 18,588 | 22,590 | 2,242          | 43,420 |

# bama.md: B1 Institutional Enrollment - Men and Women

Provide numbers of students for each of the following categories as of the institution's official fall reporting date or as of October 19, 2023.

- Note: Report students formerly designated as "first professional" in the graduate cells.
- For information on reporting study abroad students please see: This Document at NCES.GOV
- If your institution collects and reports non-binary gender data, please use the "Another Gender" category.
  - In cases where gender information is not provided, please distribute across the two-binary categories.

# bama.md: C1 F

In [94]:
from llama_index.core import PromptTemplate
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor
from llama_index.core.program import LLMTextCompletionProgram

from llama_index.llms.cohere import Cohere

from typing import List, Literal, Union, Optional, Dict, Any
from pydantic import BaseModel

# TODO: try with more obedient model
# Cohere LLM client configured with the `command-r` model.
llm = Cohere(model="command-r")

class GroupInfo(BaseModel):
    """Structured output of `generate_fields`: a subject and its field list."""

    # Common subject of the node group (the prompt asks for snake_case).
    subject: str
    # Field descriptors; the prompt asks for the form `field_name (field_datatype)`.
    fields: List[str]
    
    # possible_values: List[str]

def generate_fields(node_group, num_fields=20):
    """Ask the LLM for a common subject and extractable fields for a node group.

    Args:
        node_group: list of dicts, each holding a parsed node under the "node" key.
        num_fields: upper bound on the number of fields requested in the prompt.

    Returns:
        The parsed `GroupInfo` (subject plus field descriptors). It is also printed.
    """
    prompt = """
Given these nodes, determine a common subject.
Next, determine up to {num_fields} fields that can be extracted from the nodes and a datatype for each field.
If the subject represent categorical data with a finite amount of possible values, express each value as a separate field.
Use snake_case for the subject and field names.
Each field should be in the form `field_name (field_datatype)`.
field_datatype should be one of the following: STRING, BOOLEAN, NUMBER.
Categorical data should be boolean datatype.
Avoid creating different fields for similar information, such as "phone" and "phone_number".
Do not make up fields if they are not present in any of the nodes.
Avoid stopwords.

Nodes:
{nodes}
"""

    # Render the nodes as one text block. Passing the list itself would drop its
    # Python repr (brackets, quotes, escaped newlines) into the prompt.
    nodes_text = "\n".join(
        f"Node {i+1}:\n {n['node'].text}\n" for i, n in enumerate(node_group)
    )

    program = LLMTextCompletionProgram.from_defaults(
        output_cls=GroupInfo,
        prompt=PromptTemplate(prompt),
        # Use the notebook's Cohere model explicitly instead of the global default LLM.
        llm=llm,
        verbose=True,
    )

    output = program(num_fields=num_fields, nodes=nodes_text)
    print(output)
    return output

class NodeAnswer(BaseModel):
    """One extracted answer: the field that was asked about and the value found."""

    # Field name / question the value answers.
    question: str
    # Extracted value; None is allowed by the type (callers treat it as "no value").
    value: Union[str, bool, int, float, None]

class NodeAnswers(BaseModel):
    """Container for all answers extracted from a single node."""

    # One entry per answered field.
    answers: List[NodeAnswer]

def extract_single_answers(node, fields):
    """Extract answers from a node one field at a time.

    Args:
        node: parsed node whose `.text` is shown to the LLM.
        fields: iterable of field names / questions to fill in.

    Returns:
        `NodeAnswers` holding one `NodeAnswer` per field that was extracted
        successfully. Fields whose extraction raised are skipped.
    """
    prompt = """
Given this node, attempt to fill in the following field:
{question}
Node:
{node}
"""

    # The program does not depend on the field, so build it once rather than per field.
    program = LLMTextCompletionProgram.from_defaults(
        prompt=PromptTemplate(prompt),
        output_cls=NodeAnswer,
        # Use the notebook's Cohere model explicitly instead of the global default LLM.
        llm=llm,
        verbose=True,
    )

    answers = []
    for f in fields:
        try:
            output = program(question=f, node=node.text)
        except Exception as e:
            # Best effort: report the failure and move on to the next field.
            print(e)
            continue
        print(output)
        answers.append(output)

    return NodeAnswers(answers=answers)

def extract_answers(node, subject, fields):
    """Extract answers for all fields from a node in a single LLM call.

    If the batch call fails for any reason, the error is printed and extraction
    falls back to `extract_single_answers`, which asks one field at a time.

    Args:
        node: parsed node whose `.text` is shown to the LLM.
        subject: subject of the node group, interpolated into the prompt.
        fields: list of field names, rendered as numbered questions.

    Returns:
        `NodeAnswers` with the extracted answers. It is also printed.
    """
    prompt = """
Given this node, which is about {subject}, attempt to answer the following questions:
{fields}
Node:
{node}
"""

    program = LLMTextCompletionProgram.from_defaults(
        prompt=PromptTemplate(prompt),
        output_cls=NodeAnswers,
        # Use the notebook's Cohere model explicitly instead of the global default LLM.
        llm=llm,
        verbose=True,
    )

    try:
        answers = program(
            subject=subject,
            fields="\n".join([f"Question {i+1}. {f}" for i, f in enumerate(fields)]),
            node=node.text,
        )
    except Exception as e:
        # Report why the batch extraction failed before falling back,
        # so the failure is not silently hidden.
        print(e)
        answers = extract_single_answers(node, fields)

    print(answers)
    return answers

def get_group_kg_extractor(group):
    """Generate a field schema for a node group and extract answers from each member.

    Args:
        group: list of dicts, each holding a parsed node under the "node" key.

    Returns:
        A list with one extraction result per group member, in group order.
    """
    info = generate_fields(group)

    # entity = triples.subject.upper()
    fields = [f.upper() for f in info.fields]

    # Iterate the `group` argument. The notebook-global `ng` only exists if another
    # cell's loop happened to run first, and may refer to a different group.
    group_answers = []
    for member in group:
        group_answers.append(extract_answers(member["node"], info.subject, fields))
        # TODO: add entity relations

    return group_answers

# for ng in node_groupings[:10]:
#     if len(ng) > 0:
#         get_group_kg_extractor(ng)

In [None]:
# setup duckdb
# import schema
# index, index keys, values, m2m index relationship, m2m value relationship to index keys
# chunk storage, link index keys and values to chunks, delete values if chunks change or deleted
# NOTE(review): `cursor` is not defined anywhere in this notebook; presumably it is a
# DuckDB connection — create it before running this cell.
# The CONSTRAINT clause must sit inside the column-list parentheses; closing the
# parenthesis before it is a SQL syntax error.
cursor.sql('create table "indexes" ("id" TEXT NOT NULL, "name" TEXT NOT NULL, CONSTRAINT "indexes_pkey" PRIMARY KEY ("id"))')
cursor.sql('create table "index_keys" ("id" TEXT NOT NULL, "index_id" TEXT NOT NULL, "key" TEXT, "key_embedding" FLOAT[1024], CONSTRAINT "index_keys_pkey" PRIMARY KEY ("id"))')
cursor.sql('create table "values" ("id" TEXT NOT NULL, "value" TEXT NOT NULL, CONSTRAINT "values_pkey" PRIMARY KEY ("id"))')
# add vector extension
# NOTE(review): [1,2,3] looks like a placeholder query vector; a 3-element list
# presumably cannot be cast to FLOAT[1024] — replace with a real 1024-dim embedding.
cursor.sql('select * from index_keys order by array_distance(key_embedding, [1,2,3]::FLOAT[1024])')

# Document API:

# insert generic chunk
# create collection
# add document w/ document name, document hash, document text
# add chunks for document with hash, name, text, parent doc
# add queryable embedding node with link to parent chunk
# add non-queryable embedding group node for each chunk

# when adding a new document, get possible groups with existing document nodes w/threshold
# create node group should un-index original embedding nodes in group, create embedded primary field nodes and secondary title nodes for each document
# all nodes in group need to be added back as value nodes with primary and secondary filters with a link to the originally embedded node
# adding a new document to an existing group should add the 
# if the group changes and originally grouped nodes are removed, put the node back on the index
# deleting an index node should delete all values under the index

# re-indexing document -> diff check chunks, return existing nodes that are outdated, new chunks from updated document

# delete document should delete all child documents, all embeddings for document


# insert grouped semi-structured document
# insert document + chunk like normal in grouping index -> do not return individual nodes from grouping index
# get document groupings from grouping index

In [97]:
from llama_index.core.graph_stores.types import Relation, EntityNode, KG_NODES_KEY, KG_RELATIONS_KEY
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode

def insert_node_answers(node, answers):
    """Attach extracted answers to a node as knowledge-graph entities and relations.

    For every answer with a non-None value, an `EntityNode` named after the value is
    created and linked from `node` by a `Relation` labelled with the answer's question.
    Both are stored in `node.metadata` under `KG_NODES_KEY` / `KG_RELATIONS_KEY`,
    appended to whatever was already stored there.

    Args:
        node: node whose metadata is updated in place.
        answers: iterable of objects with `.question` and `.value` attributes.
    """
    existing_relations = node.metadata.pop(KG_RELATIONS_KEY, [])
    existing_nodes = node.metadata.pop(KG_NODES_KEY, [])

    # Snapshot taken after the pops, so the KG lists are not nested into the properties.
    metadata = node.metadata.copy()

    for answer in answers:
        if answer.value is None:
            continue

        # Values may be bool/int/float, but an entity name must be a string.
        answer_node = EntityNode(name=str(answer.value), properties=dict(metadata))
        # Store the entity itself; otherwise the relation targets a node that is never kept.
        existing_nodes.append(answer_node)
        existing_relations.append(
            Relation(
                source_id=node.node_id,
                target_id=answer_node.id,
                label=answer.question,
                properties=dict(metadata),
            )
        )

    node.metadata[KG_RELATIONS_KEY] = existing_relations
    node.metadata[KG_NODES_KEY] = existing_nodes

# Two fresh indexes: `index` receives titles, schemas and keys; `title_index` titles only.
index = VectorStoreIndex(nodes=[], embed_model=embed_model)
title_index = VectorStoreIndex(nodes=[], embed_model=embed_model)

# upsert title node for each document
for target_index in (index, title_index):
    target_index.insert_nodes([TextNode(text=filename) for filename in embeddings_by_file])

# Only the first 20 groups are processed, and only those with more than one member.
for ng in node_groupings[:20]:
    if len(ng) <= 1:
        continue

    info = generate_fields(ng)

    # insert node for fields in node group -> if this is hit + fields in this subject just return all fields under subject
    info_node_text = f"{info.subject}\n" + "".join(f"* {field}\n" for field in info.fields)
    index.insert_nodes([TextNode(text=info_node_text)])

    # entity = triples.subject.upper()
    fields = [f.upper() for f in info.fields]
    # Insert nodes for each key (with aggregate info for all keys under it)
    key_nodes = [TextNode(text=field) for field in fields]
    index.insert_nodes(key_nodes)

    for node in ng:
        node_answers = extract_answers(node["node"], info.subject, fields)
        # insert value node with node metadata for key and title
        # index.insert_nodes([TextNode(text=f"{node['node'].metadata['file_name']} {answer.question}: {answer.value}") for answer in node_answers.answers])
        # if node_answers is not None:
        #     insert_node_answers(node["node"], node_answers.answers)


# TODO: save schemas
# save answer chunks
# title nodes
# key nodes
# cross reference key and title nodes
# if key and title node both hit, generate custom aggregate value node and return
# if key node(s) hit (and some value nodes in column), return synthetic node with all values in columns

subject='general_information' fields=['annual_expenses (boolean)']
answers=[NodeAnswer(question='ANNUAL_EXPENSES', value=True)]
answers=[NodeAnswer(question='ANNUAL_EXPENSES', value=True)]
answers=[NodeAnswer(question='ANNUAL_EXPENSES', value=True)]
answers=[NodeAnswer(question='ANNUAL_EXPENSES', value=True)]
answers=[NodeAnswer(question='ANNUAL_EXPENSES', value=True)]
answers=[NodeAnswer(question='ANNUAL_EXPENSES', value=True)]
subject='respondent_information' fields=['name (string)', 'title (string)', 'office (string)', 'mailing_address (string)', 'city (string)', 'state (string)', 'zip (string)', 'country (string)', 'phone (string)', 'email_address (string)', 'responses_posted_on_website (boolean)', 'url_of_web_page (string)', 'first_name (string)', 'last_name (string)', 'address (string)', 'phone_number (string)', 'extension (string)', 'url_of_cds_responses (string)']
answers=[NodeAnswer(question='NAME', value='Lorne Kuffel'), NodeAnswer(question='TITLE', value='Executive Director,

In [96]:
# Query `index` for the 8 most similar nodes to a cross-school comparison question.
# The retrieve call is the cell's last expression, so the raw result list is displayed.
retriever = index.as_retriever(similarity_top_k=8)
retriever.retrieve("are there more women at nyu or penn state?")
# Alternative queries kept for manual experimentation:
# retriever.retrieve("which schools are in the south?")
# retriever = title_index.as_retriever(similarity_top_k=8)
# retriever.retrieve("how many women are at alabama, mississippi, nyu, penn, and madison?")

[NodeWithScore(node=TextNode(id_='57024bb0-11af-4ead-b9c4-daf33ef47835', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='penn-state.md FULL_TIME_ENROLLMENT_WOMEN: 2970', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.4362364717190708),
 NodeWithScore(node=TextNode(id_='66249e2b-5c17-4790-9e7a-11e4d2c8c9b0', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='penn-state.md PART_TIME_ENROLLMENT_WOMEN: 205', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.4178824872177374),
 NodeWithScore(node=TextNode(id_='d1f9b713-c2a9-4ab0-8e84-6ea0c533e123', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relations