In [2]:
# similar to qa.ipynb but using Llamaindex and Pinecone vector db service instead of Langchain

import sys
import os
from datetime import datetime
import logging

from IPython.display import Markdown, display

from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader, GPTListIndex
from llama_index.vector_stores import PineconeVectorStore
from llama_index import StorageContext
from llama_index import LangchainEmbedding, ServiceContext

import pinecone
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from langchain.embeddings.huggingface import HuggingFaceEmbeddings

import dotenv
dotenv.load_dotenv()


True

## Load HuggingFace embedding model using langchain
- Maybe save a bit since OpenAI API cost is adding up
- OTOH HuggingFace API is maybe slow?


In [3]:
# embed_model = LangchainEmbedding(HuggingFaceEmbeddings())
# service_context = ServiceContext.from_defaults(embed_model=embed_model)

# embed_size = len(embed_model.get_query_embedding('What the heck'))
# embed_size


## Initialize Pinecone vector DB


In [7]:
pinecone.init(api_key=os.getenv("PINECONE_API_KEY"), 
              environment=os.getenv("PINECONE_ENV"))


In [8]:
print(datetime.now())
embed_size=1536
pinecone.create_index("dv-semantic-search", dimension=embed_size, metric="euclidean", pod_type="p1")
print(datetime.now())


In [9]:
pinecone_index = pinecone.Index("dv-semantic-search")


## Load Documents

In [10]:
# load documents
documents = SimpleDirectoryReader('hftc').load_data()


## Index Documents

In [None]:
# local index
# print(datetime.now())
# index = GPTVectorStoreIndex.from_documents(documents, 
# #                                            service_context=service_context,
#                                           )
# print(datetime.now())


In [13]:
print(datetime.now())

vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
#     metadata_filters={"title": "paul_graham_essay"}
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = GPTVectorStoreIndex.from_documents(documents, 
                                           storage_context=storage_context,
#                                            service_context=service_context
                                          )
print(datetime.now())


2023-05-05 12:22:38.128784


INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens


> [build_index_from_nodes] Total LLM token usage: 0 tokens
> [build_index_from_nodes] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 6382738 tokens


> [build_index_from_nodes] Total embedding token usage: 6382738 tokens
> [build_index_from_nodes] Total embedding token usage: 6382738 tokens
2023-05-05 13:53:36.378580


## Answer Questions


In [22]:
## Perform semantic search against index
query_engine = index.as_query_engine(retriever_mode="embedding", 
#                                      service_context=service_context,                                     
                                     verbose=False,
                                    )
query = "what are the names of some mentioned Managed Service Providers or MSPs?"
response = query_engine.query(query)

display(Markdown(f"<b>{response}</b>"))


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 17 tokens


> [retrieve] Total embedding token usage: 17 tokens
> [retrieve] Total embedding token usage: 17 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 248 tokens


> [get_response] Total LLM token usage: 248 tokens
> [get_response] Total LLM token usage: 248 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens


<b>
The names of some mentioned Managed Service Providers or MSPs are AlphaServe, CDI, and Agio.</b>

In [21]:
query = "what are the names of some mentioned endpoint protection or endpoint detection and response or EDR products?"
response = query_engine.query(query)
display(Markdown(f"<b>{response}</b>"))


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 18 tokens


> [retrieve] Total embedding token usage: 18 tokens
> [retrieve] Total embedding token usage: 18 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 805 tokens


> [get_response] Total LLM token usage: 805 tokens
> [get_response] Total LLM token usage: 805 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens


<b>
Some endpoint protection or endpoint detection and response or EDR products include: Carbon Black, Ninite Agent, Mimecast Web Security, Nessus Tenable Agent, Kaseya, Crowdstrike, and DNSFilter.</b>

In [20]:
query = "what are the names of some mentioned MDRs or MSSPs or other cybersecurity-related vendors?"
response = query_engine.query(query)
display(Markdown(f"<b>{response}</b>"))


INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens


> [retrieve] Total LLM token usage: 0 tokens
> [retrieve] Total LLM token usage: 0 tokens


INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 21 tokens


> [retrieve] Total embedding token usage: 21 tokens
> [retrieve] Total embedding token usage: 21 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 616 tokens


> [get_response] Total LLM token usage: 616 tokens
> [get_response] Total LLM token usage: 616 tokens


INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens


> [get_response] Total embedding token usage: 0 tokens
> [get_response] Total embedding token usage: 0 tokens


<b>
Unfortunately, no specific names of MDRs, MSSPs, or other cybersecurity-related vendors are mentioned in the context information provided.</b>