In [1]:
!pip install langchain langchain-openai pypdf unstructured pdfminer.six pdf2image pillow_heif unstructured_inference pikepdf chromadb lark

import os
os.environ["OPENAI_API_KEY"] = "YOUR API KEY"

Collecting langchain
  Downloading langchain-0.1.9-py3-none-any.whl (816 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.0/817.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-openai
  Downloading langchain_openai-0.0.7-py3-none-any.whl (33 kB)
Collecting pypdf
  Downloading pypdf-4.0.2-py3-none-any.whl (283 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.0/284.0 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unstructured
  Downloading unstructured-0.12.4-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Collecting

In [27]:
import requests
import xml.etree.ElementTree as ET

def extract_pdf_links(url, timeout=30):
    """Return direct PDF URLs for the entries of an arXiv RSS feed.

    Args:
        url: Address of the RSS feed to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        A list of ``https://arxiv.org/pdf/<id>.pdf`` strings, one for each
        feed ``<item>`` that has a non-empty ``<link>``, in feed order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
    """
    base_url = "https://arxiv.org/pdf/"

    # Fetch the content of the URL
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse the XML data
    root = ET.fromstring(response.content)

    pdf_links = []
    # Only <link> elements that belong to an <item> are papers; selecting them
    # by path avoids relying on the channel's own <link> being first.
    for link in root.findall(".//item/link"):
        # An empty <link/> has text None; skip it instead of crashing.
        target = (link.text or "").strip().rstrip("/")
        if not target:
            continue
        # The last path segment of the entry link is used as the paper id.
        pdf_id = target.split("/")[-1]
        pdf_links.append(f"{base_url}{pdf_id}.pdf")

    return pdf_links

# RSS feed whose entries are turned into PDF links for the rest of the notebook.
url = "https://export.arxiv.org/rss/cs.AI"

pdf_links = extract_pdf_links(url=url)

In [26]:
from langchain.document_loaders import OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Upper bound on how many PDFs are fetched per run; each one is a network
# download plus PDF parsing, so keep it small while experimenting.
MAX_PDFS = 10

# Chunk size and overlap are measured by the splitter's length function;
# separators are tried in order: blank line, newline, space, then any position.
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ""]
)

# Load and Split Data
# all_docs is rebuilt from scratch so re-running the cell does not accumulate
# duplicates from a previous run.
all_docs = []
for link in pdf_links[:MAX_PDFS]:
  print(link)
  try:
    loader = OnlinePDFLoader(link)
    data = loader.load_and_split(text_splitter=recursive_splitter)
    all_docs.extend(data)
  except Exception as exc:
    # Best effort: one unreadable PDF should not abort the whole batch, but
    # report why it failed. Catching Exception (not a bare except) lets a
    # kernel interrupt still stop the loop.
    print(f"Failed to load pdf: {link} ({type(exc).__name__}: {exc})")

https://arxiv.org/pdf/2402.15075.pdf
https://arxiv.org/pdf/2402.15140.pdf
https://arxiv.org/pdf/2402.15444.pdf
https://arxiv.org/pdf/2402.15445.pdf
https://arxiv.org/pdf/2402.15506.pdf
https://arxiv.org/pdf/2402.14827.pdf
https://arxiv.org/pdf/2402.14830.pdf
https://arxiv.org/pdf/2402.14833.pdf
https://arxiv.org/pdf/2402.14834.pdf
https://arxiv.org/pdf/2402.14835.pdf


In [28]:
# Number of split documents collected by the load-and-split loop.
len(all_docs)

305

In [29]:
# Embed every chunk with OpenAI embeddings and index them in a Chroma store.
# NOTE(review): re-running this cell may add the same chunks to the existing
# collection again — confirm, and clear the collection first if so.
embedding_model = OpenAIEmbeddings()
db = Chroma.from_documents(documents=all_docs, embedding=embedding_model)

# Sanity check: print how many entries the collection holds.
print(db._collection.count())

305


In [30]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# Chat history buffer, stored under the "chat_history" key as message objects.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

# Temperature 0 to keep answers as repeatable as possible.
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

# Retriever over the vector store built earlier.
retriever = db.as_retriever()

# Wire model, retriever and memory together into one conversational chain.
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [32]:
# First query against the retrieval chain.
question = "What is a factorizing partitioned expression?"
result = qa.invoke(dict(question=question))

# Last expression of the cell: display only the answer text.
result["answer"]

'A factorizing partitioned expression refers to a method of decomposing a complex conditional probabilistic distribution (CPD) specified as a partitioned expression over discrete and continuous variables. This decomposition involves breaking down the statistical or arithmetic functions within the CPD into smaller components, typically sets of size two, to reduce the size of the CPD. This process aims to improve efficiency in Bayesian network models by simplifying the representation of the relationships between variables.'

In [33]:
# Second query; the chain was built with a memory object, so it is asked
# through the same `qa` instance as the previous question.
question = "What is a conditional probabilistic distribution?"
result = qa.invoke(dict(question=question))

# Last expression of the cell: display only the answer text.
result["answer"]

'A conditional probability distribution (CPD) is a probability distribution that gives the probability of a particular outcome given certain conditions or values of other variables. In the context of Bayesian networks, a CPD represents the relationship between a node and its parents in the network, specifying how the probability of the node depends on the values of its parent nodes.'

In [34]:
# Interactive chat loop: read a question, answer it with the retrieval chain,
# repeat until the user types "quit" (or interrupts / closes the prompt).
while True:
  try:
    user_input = input("You: ").strip()
  except (EOFError, KeyboardInterrupt):
    # Interrupting the kernel or closing stdin ends the chat without a traceback.
    break

  # Accept "quit" regardless of case or surrounding whitespace.
  if user_input.lower() == 'quit':
    break

  # Do not spend an LLM call on an empty line.
  if not user_input:
    continue

  # Use .invoke like the earlier cells; calling the chain directly is deprecated.
  result = qa.invoke({"question": user_input})
  print(f"Assistant: {result['answer']}")

You: What does it mean to decompose a CPD over discrete and continuous variables?
Assistant: The process of decomposing a conditional probabilistic distribution over discrete and continuous variables involves using methods like Binary Factorization (BF) and Stacking Factorization (SF). BF factorizes the CPDs into simpler forms to ensure each CPD involves no more than three continuous nodes (two parents and one child). SF introduces intermediate variables to decompose the densities in the partitioned expressions exactly, ensuring each child node connects to no more than two continuous parents. By applying these factorization methods, the size of the CPDs can be reduced, making the inference process more efficient for hybrid Bayesian networks.
You: What was the motivation of this paper?
Assistant: The motivation behind the research on factorizing partitioned expressions in Bayesian networks was to address the issue of the exponential growth in the size of conditional probability distribu