In [17]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.pinecone import Pinecone
from mlflow.deployments import get_deploy_client
from langchain_community.embeddings import MlflowEmbeddings
from domino_data.vectordb import DominoPineconeConfiguration


import pinecone

import os
import random
import warnings
# Silence every warning for the rest of the kernel session: the 'ignore' action is
# installed with no category or module filter.
# NOTE(review): this also hides deprecation warnings raised by the libraries imported
# above — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [18]:

# Load the benefits PDF and split it into chunks with the splitter configured below.
pdf_path = "/mnt/code/data/Northwind_Health_Plus_Benefits_Details.pdf"
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

loader = PyPDFLoader(pdf_path)
texts = loader.load_and_split(splitter)

In [19]:
# Read service credentials/settings from the environment; an unset variable yields None.
OPENAI_API_KEY, PINECONE_API_KEY, PINECONE_ENV = map(
    os.environ.get,
    ("OPENAI_API_KEY", "PINECONE_API_KEY", "PINECONE_API_ENV"),
)

In [20]:
# Report how many entries `texts` holds.
# NOTE(review): `texts` comes from load_and_split() with a text splitter, so this counts
# split chunks, which may not equal the number of PDF pages — confirm the wording.
print(f"There are {len(texts)} pages in the document")

There are 318 pages in the document


In [21]:
# Pick a sample chunk at random and show it.
# random.randint(a, b) includes b, so indexing with randint(0, len(texts)) could land one
# past the end of the list and raise IndexError; random.choice always stays in range.
print(random.choice(texts))

page_content='exceptions. These exceptions include:  \n• Non -FDA approved medications  \n• Non -prescription vitamins and supplements  \n• Drugs for cosmetic or elective purposes  \n• Drugs for fertility treatments  \n• Drugs for weight loss or gain  \nIn addition, Northwind Health Plus does not cover drugs that are considered experimental \nor investigational.' metadata={'source': '/mnt/code/Northwind_Health_Plus_Benefits_Details.pdf', 'page': 45}


In [26]:
# Build the embeddings client used to turn text into vectors.
# The deployments URI must be present in the environment (a missing key raises KeyError).
deployments_uri = os.environ["DOMINO_MLFLOW_DEPLOYMENTS"]
embedding_endpoint = "embedding-ada-002ja2"

embed = MlflowEmbeddings(target_uri=deployments_uri, endpoint=embedding_endpoint)

In [27]:
# Connect to the Pinecone index through the Domino vector-database data source.
datasource_name = "nwh-benefits"
index_name = "nwh-plus-benefits"

# Client configuration object built for the data source named above.
conf = DominoPineconeConfiguration(datasource=datasource_name)

# Use DOMINO_VECTOR_DB_METADATA when it is set; otherwise fall back to the data source name.
api_key = os.environ.get("DOMINO_VECTOR_DB_METADATA", datasource_name)

# Initialize the Pinecone client, then open a handle to the target index.
pinecone.init(api_key=api_key, environment=PINECONE_ENV, openapi_config=conf)
index = pinecone.Index(index_name)

In [28]:
# Display the index statistics as the cell output, to check the index state before loading.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [32]:
# Embed the text of every chunk and store the vectors in the Pinecone index.
# Pass the Embeddings object itself rather than the bound `embed.embed_query` method:
# from_texts is documented to take an Embeddings instance, and the callable form is a
# deprecated fallback that bypasses embed_documents.
docsearch = Pinecone.from_texts(
    [t.page_content for t in texts],
    embed,
    index_name=index_name,
)

In [39]:
# Ask your query
query = "Does my policy cover annual vision exams?"
# Get the closest matches to create some context and information for the answer
docs = docsearch.similarity_search(query)

In [40]:
# Print the raw list of retrieved Document objects to inspect what was matched.
print(docs)

[Document(page_content='• Ask your dentist about any special instructions that you may need to follow before or after \nthe procedure  \n• Make sure that you understand t he risks and benefits of the procedure  \n• Ask your dentist about any follow -up care that may be needed after the procedure  \n• Make sure that you have a plan for transportation in case you need to get to the facility for \nthe procedure.  \nDiagnostic X -Ray, Lab  And Imaging  \nCOVERED SERVICES: Diagnostic X -Ray, Lab and Imaging  \nNorthwind Health Plus covers diagnostic X -ray, lab, and imaging services. This includes \nservices like X -rays, CAT scans, MRIs, ultrasounds, and mammograms. Lab services are \ncovered for tests such as blood tests, urine tests, and other diagnostic tests ordered by a \ndoctor. Coverage for imaging services includes Magnetic Resonance Imaging (MRI), \nComputed Tomography (CT) scans, and Positron Emission Tomography (PET) scans.  \nExceptions'), Document(page_content="When considerin