In [1]:
import os
import dotenv

dotenv.load_dotenv()

True

In [2]:
import s3fs

s3_url = "https://seskderhisdikbcyrwcw.supabase.co/storage/v1/s3"
# s3_region = "us-west-1"
s3_access_key_id = os.environ["S3_ACCESS_KEY_ID"]
s3_secret_access_key = os.environ["S3_SECRET_ACCESS_KEY"]

cds_files = s3fs.S3FileSystem(endpoint_url=s3_url, key=s3_access_key_id, secret=s3_secret_access_key)
# https://seskderhisdikbcyrwcw.supabase.co/storage/v1/s3
# us-west-1

cds_files.ls("llm-training-data-bucket")
# ['my-file.txt']
# with s3.open('my-bucket/my-file.txt', 'rb') as f:
#     print(f.read())

['llm-training-data-bucket/cds-files']

In [3]:
# load PDFs from S3
from llama_index.core import SimpleDirectoryReader

# NOTE: may need better injestion for PDF tables and excel files (trim PDF, convert to CSV with power BI automate?)
# TODO: make node name based on file name + page number
# TODO: add metadata for filtering on school instead of file name
# TODO: add more metadata parsers to pipeline: https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/usage_metadata_extractor/
reader = SimpleDirectoryReader(input_dir="llm-training-data-bucket/cds-files", fs=cds_files, recursive=True)

documents = reader.load_data()
print("len docs:", documents)

[Document(id_='baba12a1-6d35-45c7-a681-47501e87b28e', embedding=None, metadata={'page_label': '1', 'file_name': 'Arizona State University CDS_2023-2024.pdf', 'file_path': 'llm-training-data-bucket/cds-files/Arizona State University CDS_2023-2024.pdf', 'file_type': 'application/pdf', 'file_size': 785342}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text="Arizona State University Campus Immersion\nCommon Data Set 2023-2024\nA0 Respondent Information (Not for Publication)\nName: Amy Sonke\nTitle: Data and Reporting Analyst\nOffice: University Office of Institutional Analysis\nMailing Address: P.O. Box 875304\nCity/State/Zip/Country: Tempe, AZ 85287-5304\nPhone: 480-965-2318\nFax: 480-965-1559\nE-mail Address: amy.sonke@asu.edu\nXYes\nNo\nIf yes, please provid

In [4]:
# extract features
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter

pipeline = IngestionPipeline(transformations=[TokenTextSplitter()])

nodes = pipeline.run(documents=documents)

print("nodes:", len(nodes))

nodes: 2680


In [31]:
# index documents into pinecone
import time

from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.embeddings.cohere import CohereEmbedding
from pinecone import Pinecone, ServerlessSpec

pinecone_api_key = os.environ["PINECONE_API_KEY"]
pinecone_index_name = "cds-index-test"

cohere_api_key = os.environ["COHERE_API_KEY"]

pc = Pinecone(api_key=pinecone_api_key)

print("existing indexes:", pc.list_indexes())

if pinecone_index_name in (index['name'] for index in pc.list_indexes()):
    pc_index = pc.Index(pinecone_index_name)

    # if vectors != []:
    #     pc_index.delete(ids=pc_index.list(), namespace="default")
else:
    pc.create_index(
        name=pinecone_index_name,
        dimension=1024, # Replace with your model dimensions
        metric="cosine", # Replace with your model metric
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ) 
    )

    pc_index = pc.Index(pinecone_index_name)

pinecone_vector_store = PineconeVectorStore(pc_index)

# construct vector store and customize storage context
storage_context = StorageContext.from_defaults(
    vector_store=pinecone_vector_store
)

embed_model = CohereEmbedding(
    cohere_api_key=cohere_api_key,
    model_name="embed-english-v3.0",
    input_type="search_query",
)

# batch insert nodes so we don't hit the embed API too many times
# Settings.chunk_size = 512
# Settings.chunk_overlap = 50

batch_size = 50
index = VectorStoreIndex(nodes=[], storage_context=storage_context, embed_model=embed_model)

# NOTE: to load vector store
# index = VectorStoreIndex.from_vector_store(vector_store=PineconeVectorStore(pc.Index(pinecone_index_name)), embed_model=embed_model)

# for i in range(0, len(nodes), batch_size):
#     batch = nodes[i:i+batch_size]
#     index.insert_nodes(batch)
#     time.sleep(5)
# TODO: wait for nodes to be indexed
# TODO: create in-memory index of student info and make chat index a super index of the cds data + student info
# TODO: rerank cds index on in-memory index so it prioritizes the schools relevant to the student

existing indexes: {'indexes': [{'dimension': 1024,
              'host': 'cds-index-test-2leo7ru.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'cds-index-test',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}


In [38]:
# create index that is just the University of Nebraska - Lincoln docs
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

from llama_index.llms.cohere import Cohere

# Necessary to use the latest OpenAI models that support function calling API
llm = Cohere(model="command-r")

unl_filter = MetadataFilters(
    filters=[
        MetadataFilter(
            key="file_name", operator=FilterOperator.EQ, value="University of Nebraska-Lincoln CDS_2023-2024.pdf"
        ),
    ]
)

unl_query_engine = index.as_query_engine(filters=unl_filter, chat_mode="best", llm=llm, verbose=True)
unl_query_engine.query("does university of nebraska lincoln consider class rank important for applications?")

Response(response="According to the University of Nebraska Lincoln's website, class rank is considered important for applications. The university states that the rigor of secondary school records, academic grade point average (GPA) and standardized test scores are also important.", source_nodes=[NodeWithScore(node=TextNode(id_='aee7b744-58a0-4177-b00d-c7ddbc5fae2c', embedding=[-0.0303497314, -0.0144882202, -0.00577163696, 0.0104522705, -0.0669555664, 0.0112762451, -0.0482788086, -0.00869751, -0.00704956055, 0.00011998415, -0.0148468018, 0.0190429688, -0.00475692749, -0.017288208, 0.0626220703, -0.00214576721, 0.00888061523, 0.0151443481, -0.00320625305, -0.0331726074, -0.0158538818, -0.0273132324, -0.0414428711, -0.0170593262, 0.0473327637, -0.0342712402, -0.0612182617, 0.018661499, 0.0547180176, 0.0409240723, 0.0216522217, -0.0169830322, 0.00560379028, -0.0111160278, 0.0229797363, 0.0502624512, 0.0120162964, 0.007396698, 0.0086517334, 0.0121383667, 0.032409668, -0.018737793, -0.078796

In [50]:
# attempt inference on the whole index
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.chat_engine import CondenseQuestionChatEngine
from llama_index.core.selectors import LLMMultiSelector
from llama_index.core.tools import QueryEngineTool

unl_query_tool = QueryEngineTool.from_defaults(
    query_engine=unl_query_engine,
    description="For questions about university of nebraska lincoln admissions",
)

vector_tool = QueryEngineTool.from_defaults(
    query_engine=index.as_query_engine(chat_mode="best", llm=llm, verbose=True),
    description="For questions about all school admissions data",
)

query_engine = RouterQueryEngine(
    selector=LLMMultiSelector.from_defaults(llm=llm),
    query_engine_tools=[
        unl_query_tool,
        vector_tool,
    ],
    llm=llm,
    verbose=True
)

# Chat engine that uses query engine
chat_engine = CondenseQuestionChatEngine.from_defaults(
    query_engine=query_engine,
    chat_mode="best",
    llm=llm,
    verbose=True,
)

# University of Nebraska Lincoln: 25th% = 1090, 50th% = 1200, 75th% = 1310
chat_engine.chat(
    "What was the 75th percentile SAT score for admitted applicants for university of nebraska?"
)

Querying with: What was the 75th percentile SAT score for admitted applicants for university of nebraska?
[1;3;38;5;200mSelecting query engine 0: The question specifically asks for University of Nebraska Lincoln admissions information, so choice 1 is the most relevant..
[0m

AgentChatResponse(response='The 75th percentile SAT score for admitted applicants to the University of Nebraska is 1310.', sources=[ToolOutput(content='The 75th percentile SAT score for admitted applicants to the University of Nebraska is 1310.', tool_name='query_engine', raw_input={'query': 'What was the 75th percentile SAT score for admitted applicants for university of nebraska?'}, raw_output=Response(response='The 75th percentile SAT score for admitted applicants to the University of Nebraska is 1310.', source_nodes=[NodeWithScore(node=TextNode(id_='4dbb37cd-b1d7-47d8-8c16-6de975a08be3', embedding=[-0.0157928467, -0.0246124268, 0.00720977783, 0.00830078125, -0.0643310547, 0.00396347046, -0.0286712646, -0.0120315552, 0.0228424072, -0.0191802979, -0.0204467773, -0.0254516602, -0.0189971924, -0.0296325684, 0.0427246094, -0.015296936, 0.0212860107, 0.0107727051, 0.00714111328, -0.0241088867, -0.0261077881, -0.000580787659, -0.0197753906, -0.00148963928, 0.0341186523, 0.00590133667, -0.