# PyZoBot: A Platform for Conversational Information Extraction and Synthesis from Curated Zotero Reference Libraries through Advanced Retrieval-Augmented Generation.

## File: PyZoBot.ipynb

Description: This module serves as the main entry point for the PyZoBot application. It integrates Zotero's reference management capabilities with OpenAI's advanced language models to streamline and enhance the process of scientific literature review.

## Copyright (2024) Suad Alshammari, Lama Basalelah, Walaa Abu Rukbah, Ali Alsuhibani, Dayanjan S. Wijesinghe

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

In [None]:
#Dependencies:
# Each line below installs one third-party package with pip; the -q / --q flag
# is passed to every install command.
!pip install pyzotero -q
!pip install pandas -q
!pip install requests -q
!pip install openai -q
!pip install pinecone-client -q
!pip install langchain -q
!pip install unstructured -q
!pip install "unstructured[pdf]" -q
!pip install tiktoken -q
# NOTE(review): both `fitz` and `PyMuPDF` are installed in this cell. The
# standalone PyPI package `fitz` is presumably unrelated to PyMuPDF and may
# shadow its module of the same name -- confirm whether this line is needed.
!pip install fitz -q
!pip install PyPDF2 -q
!pip install PyMuPDF -q
!pip install llama_index --q
#!pip install chromadb --q
# chromadb is pinned to an exact version here.
!pip install chromadb==0.4.15 -q

In [None]:
# Remove the PDF download folder left over from a previous run, if there is one.
import os
import shutil

folder_path = "/content/pdfs_output"  # Replace with your folder path

# `os` is imported explicitly: the previous `shutil.os` spelling only worked
# because shutil happens to import os internally, which is not a public API.
if os.path.exists(folder_path):
    # The folder exists: delete it together with everything inside it.
    shutil.rmtree(folder_path)
    print(f"The folder at {folder_path} has been deleted.")
else:
    print(f"The folder at {folder_path} does not exist.")

In [None]:
# Collect the credentials used by the rest of the notebook. getpass() keeps the
# typed values out of the cell output.
import os
from getpass import getpass

# Pasted keys often carry surrounding whitespace, which would make the later
# API calls fail, so every answer is stripped.
os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key:").strip()

zotero_api_key = getpass("Enter your Zotero API key:").strip()

# The prompt asks for lower-case input; normalise instead of relying on it and
# fail right here when the answer is neither of the two accepted values.
library_type = getpass("Enter your Zotero library type (in small letters write group or user):").strip().lower()
if library_type not in ("group", "user"):
    raise ValueError(
        f"Zotero library type must be 'group' or 'user', got {library_type!r}"
    )

library_id = getpass("Enter your Zotero library id:").strip()

In [None]:
import time
# %%time
import os
import openai
import pinecone
import langchain
import tqdm
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.chat_models import ChatOpenAI
from tqdm.autonotebook import tqdm

In [None]:
from pyzotero import zotero

# Build the Zotero client from the credentials entered earlier.
zot = zotero.Zotero(
    library_id=library_id,
    library_type=library_type,
    api_key=zotero_api_key,
)

# Ask for the library's top-level items and let zot.everything() collect them.
top_level_query = zot.top()
items = zot.everything(top_level_query)

In [None]:
import pandas as pd

# Flatten the nested item records into one table; nested keys become
# dot-separated column names such as 'meta.numChildren'.
df = pd.json_normalize(data=items)

In [None]:
# ## Uncomment this cell to restrict the analysis to a single Zotero collection (set collection_ID to its ID):

# collection_ID = '4PBTYQIY'

# # Filter rows based on the presence of the collection_ID
# df = df[df['data.collections'].apply(lambda x: collection_ID in x)]

In [None]:
# Separate the items that report zero children from all the others.
has_no_children = df['meta.numChildren'] == 0
df1 = df[has_no_children]
df2 = df[~has_no_children]

In [None]:
# Point each item that has children at its '/children' endpoint.
# assign() returns a new frame, which avoids writing into a filtered slice of
# `df` (the in-place column assignment raised SettingWithCopyWarning).
df2 = df2.assign(**{'links.self.href': df2['links.self.href'].astype(str) + '/children'})

In [None]:
# Stack the two groups of items back into one table, row-wise.
frames = [df1, df2]
df3 = pd.concat(frames, axis=0)

In [None]:
df4=df3
import requests


def fetch_url_content_as_json(url):
    """Fetch *url* from the Zotero API and return the decoded JSON body.

    Args:
        url: Address to request; the notebook passes the values of the
            'links.self.href' column.

    Returns:
        The parsed JSON on an HTTP 200 answer. Otherwise a dict of the form
        {"error": "Error: <status code or exception text>"}, so a single
        failing row does not interrupt DataFrame.apply().
    """
    headers = {
        'Zotero-API-Key': f'{zotero_api_key}'  # Adjust the header based on your API's requirements
    }
    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            return {"error": f"Error: {response.status_code}"}
        return response.json()  # Parse JSON response
    except (requests.RequestException, ValueError) as e:
        # Only network failures and undecodable bodies are turned into an
        # error record; programming errors (e.g. an undefined API key) are
        # left to propagate instead of being hidden in every row.
        return {"error": f"Error: {str(e)}"}


# Apply the function to fetch JSON content
df4['JSONContent'] = df4['links.self.href'].apply(fetch_url_content_as_json)

In [None]:
def flatten_json(nested_json: dict, exclude: "list | None" = None) -> dict:
    """Flatten nested dicts and lists into a single-level dict.

    Keys of the result are the path to each leaf, joined with '.'; list
    elements contribute their position as a path component, e.g.
    {'a': [{'b': 1}]} becomes {'a.0.b': 1}.

    Args:
        nested_json: The dict (or list) to flatten.
        exclude: Dict keys to skip at every nesting level. Defaults to
            [''], i.e. only empty-string keys are skipped.

    Returns:
        A new flat dict. Empty dicts and lists contribute no entries.
    """
    # None replaces the former mutable default list; same effective default.
    skipped_keys = [''] if exclude is None else exclude
    out = {}

    def flatten(x, name: str = '') -> None:
        # isinstance() also covers dict/list subclasses (e.g. OrderedDict),
        # which the former exact type comparison stored as opaque leaves.
        if isinstance(x, dict):
            for key in x:
                if key not in skipped_keys:
                    flatten(x[key], f'{name}{key}.')
        elif isinstance(x, list):
            for position, element in enumerate(x):
                flatten(element, f'{name}{position}.')
        else:
            # Drop the trailing '.' that was appended while descending.
            out[name[:-1]] = x

    flatten(nested_json)
    return out

In [None]:
# Flatten every fetched JSON payload and build one table from the results.
flattened_records = list(map(flatten_json, df4['JSONContent']))
df_source2 = pd.DataFrame(flattened_records)

In [None]:
import re
df9 = df_source2


def _join_href_cells(row):
    """Return the row's cells as strings, concatenated with '##'."""
    return '##'.join(row.values.astype(str))


# Gather every column whose name ends in '.enclosure.href' and merge them
# row by row into a single '##'-separated column.
href_suffix = '.enclosure.href'
cols_to_join = [column for column in df9.columns if column.endswith(href_suffix)]
df9['enclosure.href'] = df9[cols_to_join].apply(_join_href_cells, axis=1)

In [None]:
df10 = df9


def _join_title_cells(row):
    """Return the row's cells as strings, concatenated with '##'."""
    return '##'.join(row.values.astype(str))


# Same merge as for the hrefs, applied to the '.enclosure.title' columns.
title_suffix = '.enclosure.title'
cols_to_join = [column for column in df10.columns if column.endswith(title_suffix)]
df10['enclosure.title'] = df10[cols_to_join].apply(_join_title_cells, axis=1)

In [None]:
# Keep only the two merged enclosure columns.
df11 = df10.loc[:, ['enclosure.title', 'enclosure.href']]

In [None]:
# Split the rows at '##' and create a list of new rows
df12 = df11
# Break each '##'-separated title string into one row per title. The exploded
# rows keep the index label of the row they came from.
title_pieces = df12['enclosure.title'].str.split('##', expand=True)
stacked_titles = title_pieces.stack().reset_index(level=1, drop=True)
new_df = stacked_titles.to_frame('enclosure.title')
# Swap the combined title column for the exploded one, matching on the index.
without_titles = df12.drop('enclosure.title', axis=1)
df12 = without_titles.join(new_df)
# df12

In [None]:
# Give every exploded title row the href found at the same position of its
# parent's '##'-separated href string.
df13 = df12
# Rows that came from the same parent share an index label; cumcount() numbers
# them 0, 1, 2, ... in order, i.e. the position of the title within its parent.
position_in_parent = df13.groupby(level=0).cumcount()
href_parts = df13['enclosure.href'].str.split('##')

# Pair title i with href i. The previous split/stack/join matched rows on the
# duplicated index, producing every title x href combination for a parent and
# attaching the wrong download link to a file name.
# NOTE(review): assumes titles and hrefs are listed in the same order, since
# both come from the matching '.enclosure.*' columns -- confirm.
paired_hrefs = [
    parts[position] if isinstance(parts, list) and position < len(parts) else None
    for parts, position in zip(href_parts, position_in_parent)
]
new_df2 = pd.DataFrame({'enclosure.href': paired_hrefs}, index=df13.index)

# The list is assigned positionally because the index labels are not unique.
df13 = df13.drop('enclosure.href', axis=1).assign(**{'enclosure.href': paired_hrefs})
df13.dropna(inplace=True)
# df13

In [None]:
# The earlier string conversion turned missing values into the text 'nan';
# mark those cells as missing again and drop every row containing one.
df15 = df13.replace('nan', pd.NA).dropna()
# df15

In [None]:
# Expose the attachment title under the name 'PDF_Names' and keep only that
# column together with the download link.
renamed_columns = df15.rename(columns={'enclosure.title': 'PDF_Names'})
df15 = renamed_columns[['PDF_Names', 'enclosure.href']]


In [None]:
# Keep the first occurrence of every (name, link) row and drop the repeats.
is_repeat = df15.duplicated(keep='first')
df16 = df15[~is_repeat]
# df16

In [None]:
# Keep the rows whose "PDF_Names" value ends with ".pdf". The comparison is
# done on the lower-cased name so that attachments called "X.PDF" are kept too.
df17 = df16[df16['PDF_Names'].str.lower().str.endswith('.pdf')]

# Display the filtered DataFrame
#df17

In [None]:
# Create empty folder to store the pdfs

import os

# Name of the download folder and the Google Colab directory that holds it.
folder_name = 'pdfs_output'
content_path = '/content'

# Absolute location of the download folder.
folder_path = os.path.join(content_path, folder_name)

# Create the folder only when nothing exists at that path yet, and report
# which of the two cases applied.
if os.path.exists(folder_path):
    print(f"Folder '{folder_name}' already exists at {folder_path}")
else:
    os.makedirs(folder_path)
    print(f"Folder '{folder_name}' created successfully at {folder_path}")

Folder 'pdfs_output' created successfully at /content/pdfs_output


In [None]:
df20=df17
import requests
import pandas as pd
import os


# Folder that receives the downloaded PDFs
output_folder = "/content/pdfs_output"


headers = {'Zotero-API-Key': f'{zotero_api_key}'}

# Download every attachment listed in the dataframe
for index, row in df20.iterrows():
    api_url = row['enclosure.href']
    # The attachment title becomes the file name; basename() drops any
    # directory part so a title containing '/' cannot escape the output folder.
    pdf_filename = os.path.basename(row['PDF_Names'])

    try:
        # The timeout keeps one stalled download from blocking the whole run.
        response = requests.get(api_url, headers=headers, timeout=60)
    except requests.RequestException as exc:
        # A network failure on one attachment must not abort the others.
        print(f"Failed to fetch data from the API for URL: {api_url} ({exc})")
        continue

    if response.status_code != 200:
        print(f"Failed to fetch data from the API for URL: {api_url}")
        continue

    # Compare only the media type: servers may append parameters such as
    # '; charset=binary', which an exact string match would reject.
    content_type = response.headers.get("Content-Type") or ""
    media_type = content_type.split(";", 1)[0].strip().lower()
    if media_type != 'application/pdf':
        print(f"Skipped non-PDF content for URL: {api_url}")
        continue

    # Save the PDF to the output folder
    pdf_filepath = os.path.join(output_folder, pdf_filename)
    with open(pdf_filepath, 'wb') as pdf_file:
        pdf_file.write(response.content)

    print(f"Saved PDF: {pdf_filename}")

print("All PDFs processed.")

In [None]:
import time
# %%time
import os
import openai
import pinecone
import langchain
import tqdm
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.chat_models import ChatOpenAI
from tqdm.autonotebook import tqdm

In [None]:
!pip install llama-index-core
!pip install llama-index-llms-openai
!pip install llama-index-llms-replicate
!pip install llama-index-embeddings-huggingface

In [None]:
!pip install -U llama-index llama-index-core

In [None]:
from llama_index.core import download_loader

In [None]:
#! pip install llama_index --q
# PDF loader from LLama index : https://llamahub.ai/l/file-pdf
from pathlib import Path
from llama_index.core import download_loader

# Obtain the reader class by name and create one reader for all files.
PyMuPDFReader = download_loader("PyMuPDFReader")
loader = PyMuPDFReader()

# Every entry of the download folder is handed to the reader.
pdfs_dir = output_folder
pdf_names = os.listdir(pdfs_dir)
pdf_paths = [os.path.join(pdfs_dir, entry) for entry in pdf_names]

# Documents of all files, accumulated in processing order.
all_documents = []

for pdf_file in pdf_paths:
    print(f"Processing file: {pdf_file}")
    documents = loader.load_data(file_path=pdf_file, metadata=True)
    print(f"Number of documents in {pdf_file}: {len(documents)}")

    # Append this file's documents to the overall list.
    all_documents += documents

# Report how many documents were loaded in total.
print(f"Total number of documents: {len(all_documents)}")

In [None]:
import langchain

# Embedding model used to vectorise the text chunks.
embedding_model_name = "text-embedding-ada-002"
embeddings = OpenAIEmbeddings(model=embedding_model_name)

In [None]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
# Assuming you have the 'all_documents' list of objects with 'get_text()' and 'metadata' attributes

# Chunking parameters handed to the splitter as chunk_size and chunk_overlap.
chunk_size_limit, max_chunk_overlap = 500, 200

splitter_settings = {
    'chunk_size': chunk_size_limit,
    'chunk_overlap': max_chunk_overlap,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
# Plain stand-in for a document object exposing 'page_content' and 'metadata'.
class langchainschemadocumentDocument:
    """Hold a piece of text together with its metadata.

    Attributes:
        page_content: The value passed as ``page_content``, stored unchanged.
        metadata: The value passed as ``metadata``, stored unchanged (no copy).
    """

    def __init__(self, page_content, metadata):
        self.page_content, self.metadata = page_content, metadata

# Chunked documents, in the order the source documents were loaded
split_docs = []

# Split every loaded document and wrap each chunk in a Document
for document in all_documents:
    text = document.get_text()
    # Only the file name (last path component) is kept as the chunk's source
    source = document.metadata['file_path'].split('/')[-1]

    # Each chunk gets its own metadata dict naming the file it came from
    split_docs.extend(
        Document(page_content=chunk, metadata={'source': source})
        for chunk in text_splitter.split_text(text)
    )

# One sequential id ("1", "2", ...) per chunk. Built once after the loops:
# rebuilding the list for every chunk was quadratic, and `ids` stayed
# undefined whenever no chunk was produced.
ids = [str(i) for i in range(1, len(split_docs) + 1)]

In [None]:
# Drop the collection created by a previous execution of this cell, if any,
# so that re-running does not add the same ids to an existing collection.
try:
    db.delete_collection()
except NameError:
    # First execution: `db` does not exist yet, so there is nothing to delete.
    pass
except Exception as e:
    # Stay best-effort, but report the problem instead of hiding it.
    print(f"Could not delete the previous Chroma collection: {e}")

# Now, create the Chroma vector store
db = Chroma.from_documents(split_docs, embeddings, ids=ids)

In [None]:

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# System instructions for the model. '{summaries}' is the template placeholder
# at the end of the text; each '\n' escape adds a newline on top of the
# literal line break that follows it.
system_template="""Answer the user query using the relevant content provided in this prompt.
If you don't know the answer, just say that "I don't know", don't try to make up an answer.\n
Take your time and provide as much information as you can in the answer.\n
For each sentence you write provide in-text citation, e.g., [1].\n start with number [1] everytime you generate an answer\n
If the sentence that you write has multiple citation provide them all, e.g., [1],[2],[3]... .\n
By the end of the answer provide References section as Markdown (###References) including the number and the file name\n
e.g., [1] Author et al. - YEAR- file name.pdf\n
Don't combine the References and write each one in new line.\n

----------------
{summaries}"""

# The chat prompt is the system message followed by the user's question.
system_message = SystemMessagePromptTemplate.from_template(system_template)
human_message = HumanMessagePromptTemplate.from_template("{question}")
messages = [system_message, human_message]
prompt = ChatPromptTemplate.from_messages(messages)

In [None]:

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

# Keyword arguments passed on when the chain is built: the custom chat prompt.
chain_type_kwargs = dict(prompt=prompt)

# Chat model used for answering (and reused by the compressor).
# NOTE(review): max_tokens=4000 together with many retrieved chunks may exceed
# the model's context window -- confirm against the model's limits.
llm = ChatOpenAI(
    model_name="gpt-4",
    temperature=0,
    max_tokens=4000,
)

In [None]:
### for more information about the as.retriever() :: https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_qa ::
# %%time
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Compressor built on the chat model.
compressor = LLMChainExtractor.from_llm(llm)

# Vector-store retriever configured for "mmr" search with k=30 and
# lambda_mult=0.25.
mmr_retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 30, 'lambda_mult': 0.25},
)

# Combine both: the base retriever's results are passed through the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=mmr_retriever,
)


In [None]:
# Question passed to the chain further down; replace the placeholder text
# with your own query.
sample_question = "write the question here"

In [None]:
# Question-answering chain: the "stuff" chain type with the custom prompt,
# fed by the compression retriever. The retrieved source documents are
# returned alongside the answer.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    retriever=compression_retriever,
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
    verbose=True,
)

In [None]:
from IPython.display import display, Markdown
def print_result(result):
    """Display a chain result as Markdown.

    Args:
        result: Mapping with the keys 'answer' and 'source_documents'; every
            source document must expose ``metadata['source']``. If the
            mapping also contains 'question', that text is shown as the
            question.

    The question shown is the one stored in *result* when available, so the
    output stays correct for results produced from a question other than the
    global ``sample_question``; the global is only the fallback.
    """
    # NOTE(review): the chain output presumably echoes its 'question' input --
    # confirm; the fallback keeps the previous behaviour otherwise.
    question = result['question'] if 'question' in result else sample_question
    # Unique file names, sorted so that repeated runs print them in the same order.
    unique_sources = sorted({doc.metadata['source'] for doc in result['source_documents']})
    output_text = f"""### Question:
  {question}
  ### Answer:
  {result['answer']}
  ### Sources:
  {result['source_documents']}
  ### All relevant sources:
  {' '.join(unique_sources)}
  """
    display(Markdown(output_text))

In [None]:
# Send the question through the chain, then render the returned result
# as Markdown.
result = chain(sample_question)
print_result(result)