In [1]:
import os
import dotenv

dotenv.load_dotenv()

True

In [2]:
import s3fs

s3_url = "https://seskderhisdikbcyrwcw.supabase.co/storage/v1/s3"
# s3_region = "us-west-1"
s3_access_key_id = os.environ["S3_ACCESS_KEY_ID"]
s3_secret_access_key = os.environ["S3_SECRET_ACCESS_KEY"]

cds_files = s3fs.S3FileSystem(endpoint_url=s3_url, key=s3_access_key_id, secret=s3_secret_access_key)

cds_files.ls("llm-training-data-bucket")
# ['my-file.txt']
# with s3.open('my-bucket/my-file.txt', 'rb') as f:
#     print(f.read())

['llm-training-data-bucket/cds-files']

In [3]:
# load PDFs from S3
import json

from llama_index.core import SimpleDirectoryReader

with open("./datasets.json", "r") as fp:
    datasets = json.load(fp)

# NOTE: may need better injestion for PDF tables and excel files (trim PDF, convert to CSV with power BI automate?)
input_dir = "llm-training-data-bucket/cds-files"

reader = SimpleDirectoryReader(
    input_dir=input_dir,
    input_files=(os.path.join(input_dir, doc['filename']) for doc in datasets['cds-files']),
    fs=cds_files,
)

datasets_by_filename = {doc['filename']: doc for doc in datasets['cds-files']}
documents = []

for pages in reader.iter_data():
    for page in pages:
        dataset = datasets_by_filename[page.metadata['file_name']]

        # TODO: parser for XLSX format (it is just gibberish)
        page.doc_id = f"{dataset['id']}-{page.metadata['page_label']}"
        page.metadata['dataset_id'] = dataset['id']
        page.metadata['dataset_group'] = "cds-data"

    documents.extend(pages)

print("documents:", len(documents))

documents: 1943


In [4]:
# extract features
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter

# TODO: add more metadata parsers to pipeline: https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/usage_metadata_extractor/
pipeline = IngestionPipeline(transformations=[TokenTextSplitter()])

nodes = pipeline.run(documents=documents)

nodes
# print("nodes:", len(nodes))

[TextNode(id_='1a04007b-505e-4b69-8ab4-61a24d95c838', embedding=None, metadata={'page_label': '1', 'file_name': 'Arizona State University CDS_2023-2024.pdf', 'file_path': 'llm-training-data-bucket/cds-files/Arizona State University CDS_2023-2024.pdf', 'file_type': 'application/pdf', 'file_size': 785342, 'dataset_id': 'arizona-state', 'dataset_group': 'cds-data'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='arizona-state-1', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'Arizona State University CDS_2023-2024.pdf', 'file_path': 'llm-training-data-bucket/cds-files/Arizona State University CDS_2023-2024.pdf', 'file_type': 'application/pdf', 'file_size': 785342, 'dataset_id

In [5]:
# index documents into pinecone
import time

# from llama_index.core import Settings
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.embeddings.cohere import CohereEmbedding
from pinecone import Pinecone, ServerlessSpec

pinecone_api_key = os.environ["PINECONE_API_KEY"]
pinecone_index_name = "cds-index-test"

cohere_api_key = os.environ["COHERE_API_KEY"]

pc = Pinecone(api_key=pinecone_api_key)

print("existing indexes:", pc.list_indexes())

if pinecone_index_name in (index['name'] for index in pc.list_indexes()):
    pc_index = pc.Index(pinecone_index_name)

    # if pc_index.describe_index_stats()['total_vector_count'] > 0:
    #     pc_index.delete(ids=[vector_id for vector_id in pc_index.list()], namespace="")
else:
    pc.create_index(
        name=pinecone_index_name,
        dimension=1024, # Replace with your model dimensions
        metric="cosine", # Replace with your model metric
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ) 
    )

    pc_index = pc.Index(pinecone_index_name)

pinecone_vector_store = PineconeVectorStore(pc_index)

# construct vector store and customize storage context
storage_context = StorageContext.from_defaults(
    vector_store=pinecone_vector_store,
    docstore=SimpleDocumentStore()
)

embed_model = CohereEmbedding(
    cohere_api_key=cohere_api_key,
    model_name="embed-english-v3.0",
    input_type="search_query",
)

# batch insert nodes so we don't hit the embed API too many times
# Settings.chunk_size = 32
# Settings.chunk_overlap = 50

batch_size = 40
index = VectorStoreIndex(nodes=[], storage_context=storage_context, embed_model=embed_model)

# NOTE: to load vector store
# index = VectorStoreIndex.from_vector_store(vector_store=PineconeVectorStore(pc.Index(pinecone_index_name)), embed_model=embed_model)

for i in range(0, len(nodes), batch_size):
    batch = nodes[i:i+batch_size]
    index.insert_nodes(batch)
    time.sleep(5)

# TODO: wait for nodes to be indexed
# TODO: create in-memory index of student info and make chat index a super index of the cds data + student info
# TODO: rerank cds index on in-memory index so it prioritizes the schools relevant to the student

  from tqdm.autonotebook import tqdm


existing indexes: {'indexes': []}


Upserted vectors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 74.53it/s]
Upserted vectors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 77.91it/s]
Upserted vectors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 89.44it/s]
Upserted vectors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 114.25it/s]
Upserted vectors: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 121.40it/s]
Upserted vectors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 56.02it/s]
Upserted vectors: 100%|█████████████████████████████████████████

In [31]:
# create index that is just the University of Nebraska - Lincoln docs
from llama_index.core.tools import QueryEngineTool
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

from llama_index.llms.cohere import Cohere

# Necessary to use the latest OpenAI models that support function calling API
llm = Cohere(model="command-r-plus")

query_engine_tools = {}

for dataset in datasets['cds-files']:
    vector_filter = MetadataFilters(
        filters=[
            MetadataFilter(
                key="dataset_id", operator=FilterOperator.EQ, value=dataset['id']
            ),
        ]
    )

    query_engine_tools[dataset['id']] = QueryEngineTool.from_defaults(
        query_engine=index.as_query_engine(filters=vector_filter, chat_mode="best", llm=llm, verbose=True),
        description=dataset['description'],
    )

asu_query_engine = query_engine_tools['brigham-young']
asu_query_engine.query_engine.query("does brigham young consider class rank important for applications?")

Response(response='Yes, class rank is considered important for applications to Brigham Young.', source_nodes=[NodeWithScore(node=TextNode(id_='522141af-d3c5-4c4c-8620-3b77f968eea9', embedding=[0.0310821533, 0.00146865845, -0.00971221924, -0.0444030762, -0.00289726257, -0.00885772705, -0.0248413086, -0.00413513184, -0.0458679199, 0.016418457, -0.00702667236, 0.0502624512, -0.0280303955, -0.0145111084, 0.073425293, -0.00771713257, 0.0226135254, -0.012008667, 0.0181884766, -0.0205078125, -0.0274658203, 0.0132217407, -0.029296875, -0.0170898438, 0.0306549072, 0.03074646, -0.0340271, 0.00326156616, 0.037902832, 0.0331115723, 0.0242156982, -0.0188293457, 0.0291748047, -0.0281677246, 0.0280151367, 0.0610656738, -0.0135574341, -0.00374794, -0.0114822388, 0.0238647461, 0.00555419922, 0.00504684448, -0.0781860352, 0.0280456543, -0.0491027832, -0.0578918457, 0.0554504395, -0.0187225342, 0.0190429688, 0.00386619568, -0.00282478333, -0.0175323486, 0.00785827637, 0.0101394653, -0.0136032104, 0.03628

In [2]:
# attempt inference on the whole index
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.chat_engine import CondenseQuestionChatEngine, SimpleChatEngine, CondensePlusContextChatEngine
from llama_index.core.selectors import LLMMultiSelector
from llama_index.core.tools import QueryEngineTool
from llama_index.core.memory import ChatMemoryBuffer

vector_tool = QueryEngineTool.from_defaults(
    query_engine=index.as_query_engine(chat_mode="best", llm=llm, verbose=True),
    description="For questions about all school admissions data",
)

query_engine = RouterQueryEngine(
    selector=LLMMultiSelector.from_defaults(llm=llm),
    query_engine_tools=[
        *query_engine_tools.values(),
        vector_tool
    ],
    llm=llm,
    verbose=True
)

memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

# Chat engine that uses query engine
chat_engine = CondensePlusContextChatEngine.from_defaults(
    memory=memory,
    query_engine=query_engine,
    llm=llm,
    verbose=True,
)
# chat_engine = SimpleChatEngine.from_defaults(llm=llm)

# University of Nebraska Lincoln: 25th% = 1090, 50th% = 1200, 75th% = 1310
chat_engine.chat(
    "If I got a 900 on my sat should I apply to nebraska?"
)

NameError: name 'index' is not defined