In [1]:
# imports

import os
import re
import nest_asyncio
nest_asyncio.apply()

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader, Document

In [2]:
# check for llamaparse key
LLAMAPARSE_API_KEY = os.environ.get('LLAMAPARSE_API_KEY')
if LLAMAPARSE_API_KEY is not None:
    print('API key found')
else:
    print('Check for API key in environment variable')

API key found


In [4]:
# positive loobehind assertion and positive lookahead assertion in regex to get filename

pattern = r"(?<=-)\w+(?=\.)"

re_dash_to_dot = re.compile(r"-([^\.]+)\.")
print(re.search(re_dash_to_dot, file_list[0]))
print(re.search(pattern, file_list[0]))
re.search(pattern, file_list[0]).group()

<re.Match object; span=(3, 11), match='-q1fy24.'>
<re.Match object; span=(4, 10), match='q1fy24'>


'q1fy24'

In [3]:
# get list of files over which to prep
file_list = [file for file in os.listdir('../data') if file.startswith('ufr')]
file_list


['ufr-q1fy24.pdf', 'ufr-q3fy24.pdf', 'ufr-q2fy24.pdf']

In [5]:
# list of variables over which to construct the indexes
# positive loobehind assertion and positive lookahead assertion in regex to get filename
pattern = r"(?<=-)\w+(?=\.)"
doc_names = ['axis-'+re.search(pattern, file).group() for file in file_list]
doc_names

['axis-q1fy24', 'axis-q3fy24', 'axis-q2fy24']

In [6]:
# list of variables over which to construct the indexes
# positive loobehind assertion and positive lookahead assertion in regex to get filename
pattern = r"(?<=-)\w+(?=\.)"
vec_index_list = ['axis-'+re.search(pattern, file).group()+'-index' for file in file_list]
vec_index_list

['axis-q1fy24-index', 'axis-q3fy24-index', 'axis-q2fy24-index']

In [94]:
pattern = r"\w{4}(?=\.)"
year = [re.search(pattern, file).group() for file in file_list]
year

['fy24', 'fy24', 'fy24']

In [97]:
pattern = r"(?<=-)\w{2}"
quarter = [re.search(pattern, file).group() for file in file_list]
quarter

['q1', 'q3', 'q2']

In [7]:
# instantiate parser
parser = LlamaParse(
    api_key=LLAMAPARSE_API_KEY,
    result_type="markdown", # or text
    num_workers=4, # for multiple files
    verbose=True,
    language="en", # default is english
)

In [8]:
# filename as metadata
file_extractor = {".pdf": parser}
filename_fn = lambda filename: {"file_name": filename}

In [10]:
doc_names

['axis-q1fy24', 'axis-q3fy24', 'axis-q2fy24']

In [11]:
file_list

['ufr-q1fy24.pdf', 'ufr-q3fy24.pdf', 'ufr-q2fy24.pdf']

In [12]:
# read in docs
doc_dict = {}

for file, doc_name, qtr, yr in zip(file_list, doc_names, quarter, year):
    reader = SimpleDirectoryReader(
        input_files=['../data/' + file],
        file_extractor=file_extractor,
        # filename_as_id=True,
        file_metadata=filename_fn,
        )
    doc = reader.load_data()
    for i in doc:
        i.metadata['file_descr']=f'Axis bank quarterly earnings report for {re.search(pattern, file).group()} '
        i.metadata['financial year']=yr,
        i.metadata['quarter']=qtr,
    doc_dict[doc_name] = doc

Started parsing the file under job_id 9796c9f8-fa86-478e-ada4-060ff0a77314
..Started parsing the file under job_id 161c555f-c8a1-49af-82bd-11021f780e5d
..Started parsing the file under job_id b5d71922-a164-4298-9b6f-3b70910d8fdb
...

In [15]:
def check_length(variable, expected_length):
    try:
        if len(variable)==len(expected_length):
            print('All files parsed')
        else:
            raise ValueError(f'Number of files parse is not equal to {len(expected_length)}')
    except ValueError as e:
        print(f'Caught Exception: {e}')

In [16]:
check_length(doc_dict, file_list)

All files parsed


In [21]:
# more imports
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.core.node_parser import MarkdownElementNodeParser

from llama_index.postprocessor.flag_embedding_reranker import (
    FlagEmbeddingReranker,
    )

from pathlib import Path

In [22]:
# instantiate llm and markdown parser
llm = OpenAI(model='gpt-3.5-turbo-0125', temperature=0.1)
node_parser=MarkdownElementNodeParser(llm=llm)
storage_context = StorageContext.from_defaults()

In [23]:
data_path = Path(f'../data/axis_qr_index')

if not data_path.exists():
    Path.mkdir(data_path)
    print(data_path)
    print('data path created')
else:
    print(data_path)
    print('data path exists')

../data/axis_qr_index
data path exists


In [24]:
# loop over the doc_list dictionary to parse the documents, construct vector index and store to disk
for doc in doc_dict.keys():
    # print(doc)
    documents = doc_dict[doc]
    # run parser and get nodes for text and summary for tables
    nodes = node_parser.get_nodes_from_documents(documents)
    base_nodes, objects = node_parser.get_nodes_and_objects(nodes)
    index = VectorStoreIndex(nodes=base_nodes+objects, storage_context=storage_context)
    index.set_index_id(doc)
    index.storage_context.persist(persist_dir=f'../data/axis_qr_index')
    print(f'{doc} indexed and stored successfully to disk')

Embeddings have been explicitly disabled. Using MockEmbedding.


15it [00:00, 3953.66it/s]
100%|██████████| 15/15 [00:16<00:00,  1.13s/it]


axis-q1fy24 indexed and stored successfully to disk
Embeddings have been explicitly disabled. Using MockEmbedding.


21it [00:00, 19127.12it/s]
100%|██████████| 21/21 [00:17<00:00,  1.18it/s]


axis-q3fy24 indexed and stored successfully to disk
Embeddings have been explicitly disabled. Using MockEmbedding.


25it [00:00, 10157.67it/s]
100%|██████████| 25/25 [00:20<00:00,  1.19it/s]


axis-q2fy24 indexed and stored successfully to disk


In [26]:
# load stored index 
local_index= {}
for name in doc_names:   
    storage_context=StorageContext.from_defaults(
        persist_dir=f'../data/axis_qr_index')
    cur_index=load_index_from_storage(storage_context, index_id=name)
    local_index[name]=cur_index
    print(f'loaded index {name} from local storage')
    

loaded index axis-q1fy24 from local storage
loaded index axis-q3fy24 from local storage
loaded index axis-q2fy24 from local storage


In [None]:
# # https://docs.llamaindex.ai/en/stable/module_guides/storing/customization/
# # loop over the variable list and document dictionary to parse the nodes and construct indexes
# for file_list_item, vec_index_list_item in zip(file_list, vec_index_list):
    
#     # store vector index to disk
#     data_path = Path(f'../data/axis_qr_index/{vec_index_list_item}')
#     # print(data_path)
#     if not data_path.exists():
#         Path.mkdir(data_path, parents=True, exist_ok=True)
#         # print(data_path)
    
#     documents = doc_list[file_list_item]
#     # run parser and get nodes for text and summary for tables
#     nodes = node_parser.get_nodes_from_documents(documents)
#     base_nodes, objects = node_parser.get_nodes_and_objects(nodes)
#     index = VectorStoreIndex(nodes=base_nodes+objects, storage_context=storage_context)
    
#     index.storage_context.persist(persist_dir=data_path)
#     print(f'{file_list_item} indexed and stored successfully to disk')

In [27]:
# vector store imports
# !pip install llama-index-vector-stores-pinecone
# !pip install pinecone-client
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

  from tqdm.autonotebook import tqdm


In [45]:
# initialize pinecone
pc = Pinecone()

# dictionary of indexes
pc_index_dict = {}

In [69]:
pc.list_indexes()

{'indexes': [{'dimension': 1536,
              'host': 'axis-q1fy24-index-a0ad14b.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'axis-q1fy24-index',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 1536,
              'host': 'axis-q2fy24-index-a0ad14b.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'axis-q2fy24-index',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 1536,
              'host': 'axis-q3fy24-index-a0ad14b.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'axis-q3fy24-index',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [62]:
# Get a list of all existing indexes
indexes = pc.list_indexes()
print(indexes)
# Check if a specific index exists
index_name = "axis-q1fy24-index"
if index_name in indexes[0]['name']:
    print(f"Index '{index_name}' exists.")
else:
    print(f"Index '{index_name}' does not exist.")

{'indexes': [{'dimension': 1536,
              'host': 'axis-q1fy24-index-a0ad14b.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'axis-q1fy24-index',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 1536,
              'host': 'axis-q2fy24-index-a0ad14b.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'axis-q2fy24-index',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 1536,
              'host': 'axis-q3fy24-index-a0ad14b.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'axis-q3fy24-index',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}
Index 

In [75]:
# Get a list of all existing indexes
# indexes = pc.list_indexes()
pinecone.list_indexes().names()# 
index_names = [i['name'] for i in pc.list_indexes()]

In [77]:
index_names = [i['name'] for i in pc.list_indexes()]

# create separate serverless pinecone index for each index in index_set
for item in vec_index_list:
    # print(item)
    if item not in index_names:
        # print(item)
        # create index
        pc.create_index(
            name=item,
            dimension=1536,
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1",
            )
        )
        pc_index_dict[item] = pc.Index(item)
        print(f'{item} created successfully')
        print(pc.describe_index(name=item))
    else:
        pc_index_dict[item] = pc.Index(item)
        print(f'{item} exists and is loaded successfully')
        print(pc.describe_index(name=item))


    
    

axis-q1fy24-index exists and is loaded successfully
{'dimension': 1536,
 'host': 'axis-q1fy24-index-a0ad14b.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'axis-q1fy24-index',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}
axis-q3fy24-index exists and is loaded successfully
{'dimension': 1536,
 'host': 'axis-q3fy24-index-a0ad14b.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'axis-q3fy24-index',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}
axis-q2fy24-index exists and is loaded successfully
{'dimension': 1536,
 'host': 'axis-q2fy24-index-a0ad14b.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'axis-q2fy24-index',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}


In [96]:

# use namespaces and metadata in a single index

index_names = [i['name'] for i in pc.list_indexes()]

new_index = 'axis-bank'
# create single serverless pinecone index for each index in index_set
if new_index not in index_names:
    # print(item)
    # create index
    pc.create_index(
        name=new_index,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        )
    )
    pc_index = pc.Index(new_index)
    # pc_index_dict[new_index] = pc.Index(new_index)
    print(f'Index {new_index} created and initialised successfully')
    print(pc.describe_index(name=new_index))
else:
    # pc_index_dict[new_index] = pc.Index(new_index)
    pc_index = pc.Index(new_index)
    print(f'Index {new_index} exists; loaded and initialised successfully')
    print(pc.describe_index(name=new_index))

Index axis-bank exists; loaded and initialised successfully
{'dimension': 1536,
 'host': 'axis-bank-a0ad14b.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'axis-bank',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}


In [None]:
# if 'fin_index' not in pc.list_indexes():
#     # create index
#     pc.create_index(
#         name="fin-index",
#         dimension=1536,
#         metric="cosine",
#         spec=ServerlessSpec(
#             cloud="aws",
#             region="us-east-1",
#         )
#     )
#     print('finIndex created successfully')
#     print(pc.describe_index(name="fin-index"))
#     # initialize index
#     fin_index = pc.Index('fin-index')
# else:
#     fin_index = pc.Index('fin-index')
#     print('finIndex loaded successfully')
#     print(pc.describe_index(name="fin-index"))

In [101]:
doc_names, doc_dict.keys()

(['axis-q1fy24', 'axis-q3fy24', 'axis-q2fy24'],
 dict_keys(['axis-q1fy24', 'axis-q3fy24', 'axis-q2fy24']))

In [None]:
# delte index 
pc.delete_index("test")

In [108]:
new_index = 'test'
# create single serverless pinecone index for each index in index_set
if new_index not in index_names:
    # print(item)
    # create index
    pc.create_index(
        name=new_index,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        )
    )
    pc_index = pc.Index(new_index)
    # pc_index_dict[new_index] = pc.Index(new_index)
    print(f'Index {new_index} created and initialised successfully')
    print(pc.describe_index(name=new_index))
else:
    # pc_index_dict[new_index] = pc.Index(new_index)
    pc_index = pc.Index(new_index)
    print(f'Index {new_index} exists; loaded and initialised successfully')
    print(pc.describe_index(name=new_index))

Index test created and initialised successfully
{'dimension': 1536,
 'host': 'test-a0ad14b.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'test',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}


In [109]:
# initialize index
temp_index = pc.Index('test')
# initialize pinecone vector store
# metadata_filters = {
#         'quarter': 'q1',
#         'fiscal': 'fy24',
#     }
vector_store = PineconeVectorStore(
    pinecone_index=temp_index,
    namespace = 'test')
    # metadata_filters = metadata_filters)
# create storage context with pinecone vector store
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# create vector store index with documents and storage context
item = VectorStoreIndex.from_documents(
    doc_dict['axis-q1fy24'],
    storage_context=storage_context,
    show_progress=True)  

Parsing nodes: 100%|██████████| 1/1 [00:00<00:00,  6.09it/s]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  8.91it/s]
Upserted vectors: 100%|██████████| 10/10 [00:03<00:00,  2.70it/s]


In [None]:
for name, qtr, fiscal in zip(doc_names, quarter, year):
    metadata = {
        'quarter': qtr,
        'fiscal': fiscal,
    }
    vector_store = PineconeVectorStore(
        pinecone_index=pc_index,
        metadata)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
query_engine = axis_index.as_query_engine(similarity_top_k=15, node_postprocessors=[reranker])


    rerank = FlagEmbeddingReranker(model='BAAT/bge-reranker-large', top_n=5)
    # reranker = FlagEmbeddingReranker(model='BAAT/bge-reranker-large', top_n=5)

In [None]:
# run parser and get nodes for text and summary for tables
nodes = node_parser.get_nodes_from_documents(documents)
base_nodes, objects = noder_parser.get_nodes_and_objects(nodes)

In [None]:
axis_index = VectorStoreIndex(nodes=base_nodes+objects)

# store vector index to disk

query_engine = axis_index.as_query_engine(similarity_top_k=15, node_postprocessors=[reranker])

In [None]:
rerank = FlagEmbeddingReranker(model='BAAT/bge-reranker-large', top_n=5)