In [31]:
import os
import dotenv

dotenv.load_dotenv()

True

In [5]:
import s3fs

s3_url = "https://seskderhisdikbcyrwcw.supabase.co/storage/v1/s3"
# s3_region = "us-west-1"
s3_access_key_id = os.environ["S3_ACCESS_KEY_ID"]
s3_secret_access_key = os.environ["S3_SECRET_ACCESS_KEY"]

cds_files = s3fs.S3FileSystem(endpoint_url=s3_url, key=s3_access_key_id, secret=s3_secret_access_key)
# https://seskderhisdikbcyrwcw.supabase.co/storage/v1/s3
# us-west-1

cds_files.ls("llm-training-data-bucket")
# ['my-file.txt']
# with s3.open('my-bucket/my-file.txt', 'rb') as f:
#     print(f.read())

['llm-training-data-bucket/cds-files']

In [9]:
# load PDFs from S3
from llama_index.core import SimpleDirectoryReader

reader = SimpleDirectoryReader(input_dir="llm-training-data-bucket/cds-files", fs=cds_files, recursive=True)

documents = reader.load_data()
documents

[Document(id_='72b5be20-c69b-49f5-b9e3-5ee1b1ee3985', embedding=None, metadata={'page_label': '1', 'file_name': 'University of Nebraska-Lincoln CDS_2023-2024.pdf', 'file_path': 'llm-training-data-bucket/cds-files/University of Nebraska-Lincoln CDS_2023-2024.pdf', 'file_type': 'application/pdf', 'file_size': 669674}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text=" \xa0\nA1. Address Information\nPlease\xa0enter\xa0general\xa0institution \xa0information \xa0below:\nName\xa0of\xa0College\xa0or\xa0University University \xa0of\xa0Nebraska \xa0‐\xa0Lincoln\nStreet\xa0Address: 1400\xa0R\xa0Street\nCity: Lincoln\nState: Nebraska\nZip: 68588‐0419\nCountry:\xa0 United\xa0States\nMain\xa0Institution \xa0Phone\xa0Number: (402)\xa0472‐7211\nMain\xa0Institution \xa0We

In [10]:
# extract features
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter

pipeline = IngestionPipeline(transformations=[TokenTextSplitter()])

nodes = pipeline.run(documents=documents)



[TextNode(id_='fce92a7f-c5d9-4b11-8a1c-979554c8deb5', embedding=None, metadata={'page_label': '1', 'file_name': 'University of Nebraska-Lincoln CDS_2023-2024.pdf', 'file_path': 'llm-training-data-bucket/cds-files/University of Nebraska-Lincoln CDS_2023-2024.pdf', 'file_type': 'application/pdf', 'file_size': 669674}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='72b5be20-c69b-49f5-b9e3-5ee1b1ee3985', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'University of Nebraska-Lincoln CDS_2023-2024.pdf', 'file_path': 'llm-training-data-bucket/cds-files/University of Nebraska-Lincoln CDS_2023-2024.pdf', 'file_type': 'application/pdf', 'file_size': 669674}, hash='d6632f4d2570701d6cd

In [32]:
# index documents into pinecone
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.embeddings.cohere import CohereEmbedding
from pinecone import Pinecone, ServerlessSpec

pinecone_api_key = os.environ["PINECONE_API_KEY"]
pinecone_index_name = "cds-index-test"

cohere_api_key = os.environ["COHERE_API_KEY"]

pc = Pinecone(api_key=pinecone_api_key)

print(pc.list_indexes())

if pinecone_index_name not in (index['name'] for index in pc.list_indexes()):
    pc.create_index(
        name=pinecone_index_name,
        dimension=1024, # Replace with your model dimensions
        metric="euclidean", # Replace with your model metric
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ) 
    )

# construct vector store and customize storage context
storage_context = StorageContext.from_defaults(
    vector_store=PineconeVectorStore(pc.Index(pinecone_index_name))
)

from llama_index.embeddings.cohere import CohereEmbedding

# with input_typ='search_query'
embed_model = CohereEmbedding(
    cohere_api_key=cohere_api_key,
    model_name="embed-english-v3.0",
    input_type="search_query",
)

embeddings = embed_model.get_text_embedding("Hello CohereAI!")

index = VectorStoreIndex(nodes, storage_context=storage_context, embed_model=embed_model)
index

{'indexes': [{'dimension': 1024,
              'host': 'cds-index-test-2leo7ru.svc.aped-4627-b74a.pinecone.io',
              'metric': 'euclidean',
              'name': 'cds-index-test',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}


Upserted vectors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 61/61 [00:01<00:00, 43.07it/s]


<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x28dafed10>

In [35]:
# attempt inference
from llama_index.llms.cohere import Cohere

# Necessary to use the latest OpenAI models that support function calling API
llm = Cohere(model="command-r")

chat_engine = index.as_chat_engine(chat_mode="best", llm=llm, verbose=True)

# 25th% = 1090, 50th% = 1200, 75th% = 1300
chat_engine.chat(
    "What was the 75th percentile SAT score for admitted applicants for University of Nebraska - Lincoln?"
)

[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: tool
Action Input: {'input': 'What was the 75th percentile SAT score for admitted applicants for University of Nebraska - Lincoln?'}
[0m[1;3;34mObservation: Error: No such tool named `tool`.
[0m[1;3;38;5;200mThought: It looks like I need to use the query engine tool to answer this question. I'll run the user's question as an input and use the tool's response to formulate an answer.
Action: query_engine_tool
Action Input: {'input': 'What was the 75th percentile SAT score for admitted applicants for University of Nebraska - Lincoln?'}
[0m[1;3;34mObservation: The 75th percentile SAT score for admitted applicants to the University of Nebraska - Lincoln was 1310.
[0m[1;3;38;5;200mThought: I can answer without using any more tools. I'll use the user's language to formulate the answer.
Answer: The 75th percentile SAT score for admitted applicants to the Un

AgentChatResponse(response='The 75th percentile SAT score for admitted applicants to the University of Nebraska - Lincoln is 1310.', sources=[ToolOutput(content='Error: No such tool named `tool`.', tool_name='tool', raw_input={'kwargs': {'input': 'What was the 75th percentile SAT score for admitted applicants for University of Nebraska - Lincoln?'}}, raw_output='Error: No such tool named `tool`.', is_error=True), ToolOutput(content='The 75th percentile SAT score for admitted applicants to the University of Nebraska - Lincoln was 1310.', tool_name='query_engine_tool', raw_input={'input': 'What was the 75th percentile SAT score for admitted applicants for University of Nebraska - Lincoln?'}, raw_output=Response(response='The 75th percentile SAT score for admitted applicants to the University of Nebraska - Lincoln was 1310.', source_nodes=[NodeWithScore(node=TextNode(id_='c0fc85cf-649d-43c1-b0fb-e3846271dbef', embedding=[-0.0157928467, -0.0246124268, 0.00720977783, 0.00830078125, -0.06433