In [1]:
# imports

import os
import re
import nest_asyncio
nest_asyncio.apply()

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader, Document

In [2]:
# check for llamaparse key
LLAMAPARSE_API_KEY = os.environ.get('LLAMAPARSE_API_KEY')
if LLAMAPARSE_API_KEY is not None:
    print('API key found')
else:
    print('Check for API key in environment variable')

API key found


In [3]:
# get list of files over which to prep
file_list = [file for file in os.listdir('../data') if file.startswith('ufr')]
file_list


['ufr-q1fy24.pdf', 'ufr-q3fy24.pdf', 'ufr-q2fy24.pdf']

In [4]:
# positive loobehind assertion and positive lookahead assertion in regex to get filename

pattern = r"(?<=-)\w+(?=\.)"

re_dash_to_dot = re.compile(r"-([^\.]+)\.")
print(re.search(re_dash_to_dot, file_list[0]))
print(re.search(pattern, file_list[0]))
re.search(pattern, file_list[0]).group()

<re.Match object; span=(3, 11), match='-q1fy24.'>
<re.Match object; span=(4, 10), match='q1fy24'>


'q1fy24'

In [5]:
# list of variables over which to construct the indexes
# positive loobehind assertion and positive lookahead assertion in regex to get filename
pattern = r"(?<=-)\w+(?=\.)"
doc_names = ['axis-'+re.search(pattern, file).group() for file in file_list]
doc_names

['axis-q1fy24', 'axis-q3fy24', 'axis-q2fy24']

In [29]:
# for separate index for each document
# list of variables over which to construct the indexes
# positive loobehind assertion and positive lookahead assertion in regex to get filename
pattern = r"(?<=-)\w+(?=\.)"
vec_index_list = ['axis-'+re.search(pattern, file).group()+'-index' for file in file_list]
vec_index_list

['axis-q1fy24-index', 'axis-q3fy24-index', 'axis-q2fy24-index']

In [7]:
pattern = r"\w{4}(?=\.)"
year = [re.search(pattern, file).group() for file in file_list]
year

['fy24', 'fy24', 'fy24']

In [8]:
pattern = r"(?<=-)\w{2}"
quarter = [re.search(pattern, file).group() for file in file_list]
quarter

['q1', 'q3', 'q2']

In [9]:
# instantiate parser
parser = LlamaParse(
    api_key=LLAMAPARSE_API_KEY,
    result_type="markdown", # or text
    num_workers=4, # for multiple files
    verbose=True,
    language="en", # default is english
)

In [10]:
# filename as metadata
file_extractor = {".pdf": parser}
filename_fn = lambda filename: {"file_name": filename}

In [13]:
for file, doc_name, qtr, yr in zip(file_list, doc_names, quarter, year):
    print(file, doc_name, qtr, yr)

ufr-q1fy24.pdf axis-q1fy24 q1 fy24
ufr-q3fy24.pdf axis-q3fy24 q3 fy24
ufr-q2fy24.pdf axis-q2fy24 q2 fy24


In [14]:
# read in docs
doc_dict = {}

for file, doc_name, qtr, yr in zip(file_list, doc_names, quarter, year):
    reader = SimpleDirectoryReader(
        input_files=['../data/' + file],
        file_extractor=file_extractor,
        # filename_as_id=True,
        file_metadata=filename_fn,
        )
    doc = reader.load_data()
    for i in doc:
        i.metadata={
            'file_descr':f'Axis bank quarterly earnings report for {re.search(pattern, file).group()} ',
            'financial_year': yr,
            'quarter':qtr,
                }
        # i.metadata['file_descr']=str(f'Axis bank quarterly earnings report for {re.search(pattern, file).group()} '),
        # i.metadata['financial year']=str(yr),
        # i.metadata['quarter']=str(qtr),
    doc_dict[doc_name] = doc

Started parsing the file under job_id 33b8b6ab-ad0e-40c6-b694-193d234ef428
Started parsing the file under job_id 912f88ab-12f0-48bf-b386-5858e266a739
Started parsing the file under job_id 461a237d-42b9-4096-a34f-69bace366987


In [16]:
# doc_dict['axis-q1fy24']

In [17]:
def check_length(variable, expected_length):
    try:
        if len(variable)==len(expected_length):
            print('All files parsed')
        else:
            raise ValueError(f'Number of files parse is not equal to {len(expected_length)}')
    except ValueError as e:
        print(f'Caught Exception: {e}')

In [18]:
check_length(doc_dict, file_list)

All files parsed


In [19]:
# more imports
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.core.node_parser import MarkdownElementNodeParser

from pathlib import Path

In [39]:
# instantiate llm and markdown parser
llm = OpenAI(model='gpt-3.5-turbo-0125', temperature=0.1)
node_parser=MarkdownElementNodeParser(llm=llm, num_workers=4)
storage_context = StorageContext.from_defaults()

##### Local Indexing

In [None]:
data_path = Path(f'../data/axis_qr_index')

if not data_path.exists():
    Path.mkdir(data_path)
    print(data_path)
    print('data path created')
else:
    print(data_path)
    print('data path exists')

In [None]:
# loop over the doc_list dictionary to parse the documents, construct vector index and store to disk
for doc in doc_dict.keys():
    # print(doc)
    documents = doc_dict[doc]
    # run parser and get nodes for text and summary for tables
    nodes = node_parser.get_nodes_from_documents(documents)
    base_nodes, objects = node_parser.get_nodes_and_objects(nodes)
    index = VectorStoreIndex(nodes=base_nodes+objects, storage_context=storage_context)
    index.set_index_id(doc)
    index.storage_context.persist(persist_dir=f'../data/axis_qr_index')
    print(f'{doc} indexed and stored successfully to disk')

In [None]:
# load stored index 
local_index= {}
for name in doc_names:   
    storage_context=StorageContext.from_defaults(
        persist_dir=f'../data/axis_qr_index')
    cur_index=load_index_from_storage(storage_context, index_id=name)
    local_index[name]=cur_index
    print(f'loaded index {name} from local storage')
    

In [None]:
# # https://docs.llamaindex.ai/en/stable/module_guides/storing/customization/
# # loop over the variable list and document dictionary to parse the nodes and construct indexes
# for file_list_item, vec_index_list_item in zip(file_list, vec_index_list):
    
#     # store vector index to disk
#     data_path = Path(f'../data/axis_qr_index/{vec_index_list_item}')
#     # print(data_path)
#     if not data_path.exists():
#         Path.mkdir(data_path, parents=True, exist_ok=True)
#         # print(data_path)
    
#     documents = doc_list[file_list_item]
#     # run parser and get nodes for text and summary for tables
#     nodes = node_parser.get_nodes_from_documents(documents)
#     base_nodes, objects = node_parser.get_nodes_and_objects(nodes)
#     index = VectorStoreIndex(nodes=base_nodes+objects, storage_context=storage_context)
    
#     index.storage_context.persist(persist_dir=data_path)
#     print(f'{file_list_item} indexed and stored successfully to disk')

##### Indexing on Pinecone

In [21]:
# vector store imports
# !pip install llama-index-vector-stores-pinecone
# !pip install pinecone-client
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [22]:
# initialize pinecone
pc = Pinecone()

# dictionary of indexes
pc_index_dict = {}

In [25]:
# pc.list_indexes()

In [27]:
# Get a list of all existing indexes
indexes = pc.list_indexes()
print(indexes)

index_names = [i['name'] for i in pc.list_indexes()]

# Check if a specific index exists
index_name = "axis-q1fy24-index"
if index_name not in index_names:
    print(f"Index '{index_name}' does not exist.")
else:
    print(f"Index '{index_name}' exists.")

{'indexes': []}
Index 'axis-q1fy24-index' does not exist.


In [None]:
# Get a list of all existing indexes
# indexes = pc.list_indexes()
# pc.list_indexes().names()# 
# index_names = [i['name'] for i in pc.list_indexes()]

In [30]:
index_names = [i['name'] for i in pc.list_indexes()]

# create separate serverless pinecone index for each index in index_set; i.e. quarterly result
for item in vec_index_list:
    # print(item)
    if item not in index_names:
        # print(item)
        # create index
        pc.create_index(
            name=item,
            dimension=1536,
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1",
            )
        )
        pc_index_dict[item] = pc.Index(item)
        print(f'{item} created successfully')
        print(pc.describe_index(name=item))
    else:
        pc_index_dict[item] = pc.Index(item)
        print(f'{item} exists and is loaded successfully')
        print(pc.describe_index(name=item))


    
    

axis-q1fy24-index created successfully
{'dimension': 1536,
 'host': 'axis-q1fy24-index-a0ad14b.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'axis-q1fy24-index',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}
axis-q3fy24-index created successfully
{'dimension': 1536,
 'host': 'axis-q3fy24-index-a0ad14b.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'axis-q3fy24-index',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}
axis-q2fy24-index created successfully
{'dimension': 1536,
 'host': 'axis-q2fy24-index-a0ad14b.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'axis-q2fy24-index',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}


In [None]:
# if 'fin_index' not in pc.list_indexes():
#     # create index
#     pc.create_index(
#         name="fin-index",
#         dimension=1536,
#         metric="cosine",
#         spec=ServerlessSpec(
#             cloud="aws",
#             region="us-east-1",
#         )
#     )
#     print('finIndex created successfully')
#     print(pc.describe_index(name="fin-index"))
#     # initialize index
#     fin_index = pc.Index('fin-index')
# else:
#     fin_index = pc.Index('fin-index')
#     print('finIndex loaded successfully')
#     print(pc.describe_index(name="fin-index"))

In [31]:
doc_names, doc_dict.keys()

(['axis-q1fy24', 'axis-q3fy24', 'axis-q2fy24'],
 dict_keys(['axis-q1fy24', 'axis-q3fy24', 'axis-q2fy24']))

In [33]:
try:
    # delete index 
    pc.delete_index("test")
except Exception as e:
    print(f'caught exception: {e}')
    print('index not found')
    

caught exception: (404)
Reason: Not Found
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'x-pinecone-api-version': '2024-04', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'X-Cloud-Trace-Context': '1d46e995f7565f84d62cee92aa5fc303', 'Date': 'Fri, 17 May 2024 08:02:30 GMT', 'Server': 'Google Frontend', 'Content-Length': '79', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"NOT_FOUND","message":"Resource test not found"},"status":404}

index not found


In [36]:
new_index = 'test'
index_names = [i['name'] for i in pc.list_indexes()]

# create single serverless pinecone index for each index in index_set
if new_index not in index_names:
    # print(item)
    # create index
    pc.create_index(
        name=new_index,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        )
    )
    test_index = pc.Index(new_index)
    # pc_index_dict[new_index] = pc.Index(new_index)
    print(f'Index "{new_index}" created and initialised successfully')
    print(pc.describe_index(name=new_index))
else:
    # pc_index_dict[new_index] = pc.Index(new_index)
    test_index = pc.Index(new_index)
    print(f'Index "{new_index}" exists; loaded and initialised successfully as {new_index}_index')
    print(pc.describe_index(name=new_index))

Index "test" exists; loaded and initialised successfully as test_index
{'dimension': 1536,
 'host': 'test-a0ad14b.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'test',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}


In [40]:
# initialize index
# temp_index = pc.Index('test')

# initialize pinecone vector store
# metadata_filters = {
#         'quarter': 'q1',
#         'fiscal': 'fy24',
#     }

documents = doc_dict['axis-q1fy24']
nodes = node_parser.get_nodes_from_documents(documents)
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

vector_store = PineconeVectorStore(
    pinecone_index=test_index,
    namespace = 'axis bank')
    # metadata_filters = metadata_filters)
# create storage context with pinecone vector store
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# create vector store index with documents and storage context
item = VectorStoreIndex(
    nodes=base_nodes+objects,
    storage_context=storage_context,
    show_progress=True)

Embeddings have been explicitly disabled. Using MockEmbedding.


15it [00:00, 2530.86it/s]
100%|██████████| 15/15 [00:16<00:00,  1.11s/it]
Generating embeddings: 100%|██████████| 25/25 [00:01<00:00, 14.50it/s]
Upserted vectors: 100%|██████████| 25/25 [00:03<00:00,  7.25it/s]


In [41]:

# use namespaces and metadata in a single index for a bank

index_names = [i['name'] for i in pc.list_indexes()]

new_index = 'axis-bank'
# create single serverless pinecone index for each index in index_set
if new_index not in index_names:
    # print(item)
    # create index
    pc.create_index(
        name=new_index,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        )
    )
    axis_index = pc.Index(new_index)
    # pc_index_dict[new_index] = pc.Index(new_index)
    print(f'Index "{new_index}" created and initialised successfully')
    print(pc.describe_index(name=new_index))
else:
    # pc_index_dict[new_index] = pc.Index(new_index)
    axis_index = pc.Index(new_index)
    print(f'Index "{new_index}" exists; loaded and initialised successfully')
    print(pc.describe_index(name=new_index))

Index "axis-bank" created and initialised successfully
{'dimension': 1536,
 'host': 'axis-bank-a0ad14b.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'axis-bank',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}


In [42]:
for doc in doc_names:
    print(doc)
    documents = doc_dict[doc]
    nodes = node_parser.get_nodes_from_documents(documents)
    base_nodes, objects = node_parser.get_nodes_and_objects(nodes)
    
    vector_store = PineconeVectorStore(
        pinecone_index=axis_index,
        namespace=doc)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    axis_index = VectorStoreIndex(nodes=base_nodes+objects,
                            storage_context=storage_context,
                            show_progress=True)
    print(f'"{doc}" upserted into a separate namespace')
    
    

axis-q1fy24
Embeddings have been explicitly disabled. Using MockEmbedding.


15it [00:00, 6157.23it/s]
100%|██████████| 15/15 [00:15<00:00,  1.04s/it]
Generating embeddings: 100%|██████████| 25/25 [00:01<00:00, 17.39it/s]
Upserted vectors: 100%|██████████| 25/25 [00:03<00:00,  7.18it/s]


"axis-q1fy24" upserted into a separate namespace
axis-q3fy24
Embeddings have been explicitly disabled. Using MockEmbedding.


21it [00:00, 17342.07it/s]
100%|██████████| 21/21 [00:18<00:00,  1.13it/s]
Generating embeddings: 100%|██████████| 34/34 [00:01<00:00, 17.82it/s]


AttributeError: 'VectorStoreIndex' object has no attribute 'upsert'

##### Query engine with reranking

In [None]:
# imports
from llama_index.postprocessor.flag_embedding_reranker import (
    FlagEmbeddingReranker,
    )
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine

In [None]:
# initialize reranker
reranker = FlagEmbeddingReranker(
    top_n=15,
    model="BAAI/bge-reranker-large
)

In [None]:
print(type(doc_names))
doc_names

In [None]:
# query engine tools with namespaces
query_engine_tools =  [
    QueryEngineTool(
        query_engine=axis_index.as_query_engine(
            similarity_top_k=15,
            node_postprocessors=[reranker]),
        metadata=ToolMetadata(
            name=doc,
            description=f'useful for when you want to answer queries about Axis bank quarterly earnings report for {re.search(pattern, file).group()}',
            namespace=doc,
        )
    )
    for doc in doc_names
]

In [None]:
# subquestion query engine
query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    similarity_top_k=15,
    use_async=True,
    llm=llm,
)