# libraries

In [4]:
from IPython.display import display
from IPython.display import Markdown
import textwrap
import google.generativeai as genai
from dotenv import load_dotenv
load_dotenv()
import os
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

import urllib
import warnings
from pathlib import Path as p
from pprint import pprint

import pandas as pd
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings

warnings.filterwarnings("ignore")


True

In [10]:

def to_markdown(text):
  text = text.replace('â€¢', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

# Document retrival

In [18]:
model = ChatGoogleGenerativeAI(model="gemini-pro",
                             temperature=0.3)

In [19]:
data_folder = p.cwd() / "data"
p(data_folder).mkdir(parents=True, exist_ok=True)

pdf_url = "https://services.google.com/fh/files/misc/practitioners_guide_to_mlops_whitepaper.pdf"
pdf_file = str(p(data_folder, pdf_url.split("/")[-1]))

urllib.request.urlretrieve(pdf_url, pdf_file)

('/Users/ganeshjadhav/Desktop/ITIL_projects/Document_retrival_gpt/data/practitioners_guide_to_mlops_whitepaper.pdf',
 <http.client.HTTPMessage at 0x14ccf5c60>)

In [23]:
pdf_loader = PyPDFLoader(pdf_file)
pages = pdf_loader.load_and_split()
# print(pages[3].page_content)


In [24]:
context = "\n".join(str(p.page_content) for p in pages[:30])
print("The total words in the context: ", len(context))

The total words in the context:  55545


In [25]:
prompt_template = """Answer the question as precise as possible using the provided context. If the answer is
                    not contained in the context, say "answer not available in context" \n\n
                    Context: \n {context}?\n
                    Question: \n {question} \n
                    Answer:
                  """

prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [32]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=0)
context = "\n\n".join(str(p.page_content) for p in pages)
texts = text_splitter.split_text(context)

7

In [33]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [35]:
vector_index = Chroma.from_texts(texts, embeddings).as_retriever()


In [36]:
question = "Describe data management and feature management systems."
docs = vector_index.get_relevant_documents(question)

In [38]:
stuff_answer = stuff_chain(
    {"input_documents": docs, "question": question}, return_only_outputs=True
)

In [45]:
to_markdown(stuff_answer['output_text'])

> Data management and feature management systems provide a unified repository for ML features and datasets. This repository allows multiple uses in the MLOps environment, including experimentation, batch serving, continuous training, batch prediction, and real-time prediction. Feature management systems help data scientists and researchers discover and reuse available feature sets for their entities instead of re-creating them. This can save time and prevent inconsistent definitions and instances of the same data entities.