In [None]:
# Install the LangChain packages used by this notebook.
# %pip (rather than !pip) installs into the environment of the running kernel,
# not into whichever `pip` happens to be first on the shell PATH.
%pip install --quiet --upgrade langchain-text-splitters langchain-community langchain-openai

In [None]:
import getpass
import os
# Ask for the OpenAI key interactively only when the environment variable is
# missing or empty; getpass keeps the typed value out of the notebook output.
if os.environ.get("OPENAI_API_KEY", "") == "":
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

In [None]:
# Load PDF
# pypdf is installed here, right before PyPDFLoader is imported and used.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install --quiet pypdf

from langchain_community.document_loaders import PyPDFLoader
# Absolute location of the paper on the notebook host; change it when running elsewhere.
file_path = '/home/jovyan/Langchain/CyberGIS_Cloud_PEARC22-3.pdf'

# Read the PDF into `docs`.
# NOTE(review): the web-page cell further down assigns `docs` again, so when the
# notebook is run top to bottom the PDF content loaded here is replaced before
# indexing — confirm which source is meant to be indexed.
loader = PyPDFLoader(file_path)
docs = loader.load()

In [None]:
# Load HTML web page
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing_extensions import List, TypedDict
from langchain_openai import OpenAIEmbeddings
#from langchain_core.embeddings import DeterministicFakeEmbedding

# Page to load. Alternative source kept for reference:
# https://lilianweng.github.io/posts/2023-06-23-agent/
web_source = 'https://i-guide.io/about-i-guide/'

# Fetch the page and keep only the HTML elements carrying the listed CSS classes.
# NOTE(review): these class names look specific to the blog URL above — confirm
# that the i-guide.io page uses them, otherwise the parsed content may be empty.
# NOTE(review): this rebinds `docs`, replacing whatever an earlier cell loaded.
loader = WebBaseLoader(
    web_paths=(web_source,),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header"),
        ),
    },
)
docs = loader.load()

In [None]:
# Index documents

# Embedding model and the in-memory vector store built on top of it.
# (Alternative kept for reference: DeterministicFakeEmbedding(size=4096).)
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = InMemoryVectorStore(embeddings)

# Split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
all_splits = text_splitter.split_documents(docs)

# Add every chunk to the store; the return value is bound to `_` and not used.
_ = vector_store.add_documents(documents=all_splits)

In [None]:
from langchain import hub
from langchain.chat_models import init_chat_model

# Question-answering prompt template, pulled from the hub by its identifier.
prompt = hub.pull("rlm/rag-prompt")

# Chat model that will produce the answer.
llm = init_chat_model(
    "gpt-4.1-nano",
    model_provider="openai",
)

In [None]:
# Question to ask about the indexed content.
# Alternative kept for reference: 'What is Task Decomposition?'
question = 'What is this paper about?'

# Retrieve the stored chunks most similar to the question.
retrieved_docs = vector_store.similarity_search(question)

# Join the retrieved chunk texts, separated by blank lines, to form the context.
docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)

# Fill the template and keep the result under its own name. Assigning it back to
# `prompt` would overwrite the template itself, so this cell could not be re-run
# (for example with a different question) without first re-running the cell that
# pulls the prompt.
messages = prompt.invoke({"question": question, "context": docs_content})
answer = llm.invoke(messages)

In [None]:
# Display the `content` attribute of the reply returned by `llm.invoke`.
# Evaluate bare `answer` instead to inspect the whole returned object.
answer.content
#answer