In [None]:
import os
import shutil

def copy_pdfs_from_zotero_storage(zotero_storage_path, target_directory):
    """
    Traverse the Zotero storage directory and copy all PDF files to the target directory.

    :param zotero_storage_path: Path to the Zotero 'storage' directory
    :param target_directory: Path to the directory where PDFs should be copied
    """
    # Ensure the target directory exists
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    # Traverse the Zotero storage directory
    for root, dirs, files in os.walk(zotero_storage_path):
        for file in files:
            if file.endswith('.pdf'):
                source_file_path = os.path.join(root, file)
                target_file_path = os.path.join(target_directory, file)

                # Copy the PDF to the target directory
                try:
                    shutil.copy2(source_file_path, target_file_path)
                    print(f"Copied: {source_file_path} to {target_file_path}")
                except Exception as e:
                    print(f"Error copying {source_file_path}: {e}")

zotero_storage_path = 'C:/Users/dglav/zotero/storage'

target_directory = 'C:/TUE/Thesis/RAG/papers'

copy_pdfs_from_zotero_storage(zotero_storage_path, target_directory)


## Define the OPEN AI API KEY

In [1]:
import os
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

## Get a list of all pdf documents holding papers

In [2]:
import os

def list_pdf_files(directory):
    """
    List all PDF files in the specified directory.

    :param directory: The directory to search for PDF files.
    :return: A list of full paths to PDF files.
    """
    pdf_files = []

    # Walk through the directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.pdf'):
                # Full path of the PDF file
                full_path = os.path.join(root, file)
                pdf_files.append(full_path)

    return pdf_files

# Specify the directory to search
directory_path = 'C:/TUE/Thesis/RAG/papers'

# Get the list of PDF files
pdf_files_list = list_pdf_files(directory_path)

# Print the list of PDF files
for pdf_file in pdf_files_list:
    print(pdf_file)

C:/TUE/Thesis/RAG/papers\Cetin et al. - 2022 - Attri-VAE attribute-based interpretable represent.pdf
C:/TUE/Thesis/RAG/papers\Chen et al. - 2019 - Graph Networks as a Universal Machine Learning Fra.pdf
C:/TUE/Thesis/RAG/papers\Choudhary and DeCost - 2021 - Atomistic Line Graph Neural Network for improved m.pdf
C:/TUE/Thesis/RAG/papers\Court et al. - 2020 - 3-D Inorganic Crystal Structure Generation and Pro.pdf
C:/TUE/Thesis/RAG/papers\Eckart et al. - 2021 - Self-Supervised Learning on 3D Point Clouds by Lea.pdf
C:/TUE/Thesis/RAG/papers\Esmaeili et al. - 2018 - Structured Disentangled Representations.pdf
C:/TUE/Thesis/RAG/papers\Fu et al. - 2023 - MOFDiff Coarse-grained Diffusion for Metal-Organi.pdf
C:/TUE/Thesis/RAG/papers\Gasteiger et al. - 2022 - Directional Message Passing for Molecular Graphs.pdf
C:/TUE/Thesis/RAG/papers\Gasteiger et al. - 2022 - Fast and Uncertainty-Aware Directional Message Pas.pdf
C:/TUE/Thesis/RAG/papers\Gasteiger et al. - 2022 - GemNet Universal Directional G

## Load PDFs with PyPDFLoader

In [3]:
from langchain.document_loaders import PyPDFLoader

def load_documents_from_pdfs(pdf_files):
    """
    Load and extract documents from a list of PDF files using PyPDFLoader.

    :param pdf_files: List of paths to PDF files.
    :return: List of document objects containing text and metadata.
    """
    documents = []

    for pdf_file in pdf_files:
        try:
            # Initialize the PyPDFLoader with the path to the PDF file
            loader = PyPDFLoader(pdf_file)
            # Load the documents
            document = loader.load_and_split()
            
            # Append each loaded document with its metadata
            documents.append({
                 'document_name': pdf_file,
                 'pages': document
            })

            print(f"Loaded {len(document)} pages from {pdf_file}")
        except Exception as e:
            print(f"Error loading document from {pdf_file}: {e}")

    return documents


# Load documents from the PDF files
documents = load_documents_from_pdfs(pdf_files_list)

# Print the documents for verification
for doc in documents:
    print(f"Source: {doc['document_name']}")
    print(f"Length of pages: {len(doc['pages'])}")
    print("-" * 40)

# Calculate total page count
total_page_count = sum(len(doc['pages']) for doc in documents)
print(f"Total page count: {total_page_count}")


Loaded 35 pages from C:/TUE/Thesis/RAG/papers\Cetin et al. - 2022 - Attri-VAE attribute-based interpretable represent.pdf
Loaded 29 pages from C:/TUE/Thesis/RAG/papers\Chen et al. - 2019 - Graph Networks as a Universal Machine Learning Fra.pdf
Loaded 17 pages from C:/TUE/Thesis/RAG/papers\Choudhary and DeCost - 2021 - Atomistic Line Graph Neural Network for improved m.pdf
Loaded 33 pages from C:/TUE/Thesis/RAG/papers\Court et al. - 2020 - 3-D Inorganic Crystal Structure Generation and Pro.pdf
Loaded 19 pages from C:/TUE/Thesis/RAG/papers\Eckart et al. - 2021 - Self-Supervised Learning on 3D Point Clouds by Lea.pdf
Loaded 24 pages from C:/TUE/Thesis/RAG/papers\Esmaeili et al. - 2018 - Structured Disentangled Representations.pdf
Loaded 30 pages from C:/TUE/Thesis/RAG/papers\Fu et al. - 2023 - MOFDiff Coarse-grained Diffusion for Metal-Organi.pdf
Loaded 18 pages from C:/TUE/Thesis/RAG/papers\Gasteiger et al. - 2022 - Directional Message Passing for Molecular Graphs.pdf
Loaded 6 pages from

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 38 0 (offset 0)
Ignoring wrong pointing object 51 0 (offset 0)
Ignoring wrong pointing object 70 0 (offset 0)
Ignoring wrong pointing object 99 0 (offset 0)


Loaded 21 pages from C:/TUE/Thesis/RAG/papers\Jiao et al. - 2024 - Space Group Constrained Crystal Generation.pdf
Loaded 20 pages from C:/TUE/Thesis/RAG/papers\Kaundinya et al. - 2022 - Prediction of the Electron Density of States for C.pdf
Loaded 34 pages from C:/TUE/Thesis/RAG/papers\Li et al. - 2018 - Learning Deep Generative Models of Graphs.pdf
Loaded 31 pages from C:/TUE/Thesis/RAG/papers\Lipman et al. - 2023 - Flow Matching for Generative Modeling.pdf
Loaded 31 pages from C:/TUE/Thesis/RAG/papers\Mathieu et al. - 2019 - Disentangling Disentanglement in Variational Autoe.pdf
Loaded 18 pages from C:/TUE/Thesis/RAG/papers\Perez Rey et al. - 2020 - Diffusion Variational Autoencoders.pdf


Ignoring wrong pointing object 57 0 (offset 0)
Ignoring wrong pointing object 111 0 (offset 0)
Ignoring wrong pointing object 113 0 (offset 0)
Ignoring wrong pointing object 120 0 (offset 0)


Loaded 12 pages from C:/TUE/Thesis/RAG/papers\Petković et al. - 2023 - Equivariant Parameter Sharing for Porous Crystalli.pdf
Loaded 18 pages from C:/TUE/Thesis/RAG/papers\Satorras et al. - 2021 - E(n) Equivariant Graph Neural Networks.pdf
Loaded 17 pages from C:/TUE/Thesis/RAG/papers\Schütt et al. - 2018 - SchNet - a deep learning architecture for molecule.pdf
Loaded 15 pages from C:/TUE/Thesis/RAG/papers\Song and Ermon - 2019 - Generative Modeling by Estimating Gradients of the.pdf
Loaded 38 pages from C:/TUE/Thesis/RAG/papers\Sultanov et al. - 2023 - Data-Driven Score-Based Models for Generating Stab.pdf
Loaded 38 pages from C:/TUE/Thesis/RAG/papers\Tong et al. - 2024 - Improving and generalizing flow-based generative m.pdf
Loaded 20 pages from C:/TUE/Thesis/RAG/papers\Xie and Grossman - 2018 - Crystal Graph Convolutional Neural Networks for an.pdf
Loaded 22 pages from C:/TUE/Thesis/RAG/papers\Xie et al. - 2021 - MARS Markov Molecular Sampling for Multi-objectiv.pdf
Loaded 27 pages 

In [None]:
# Print total page count per document
for doc in documents:
    print(f"Source: {doc['document_name']}")
    print(f"Total page count: {len(doc['pages'])}")
    print("-" * 40)

# Calculate average page count
total_page_count = sum(len(doc['pages']) for doc in documents)
average_page_count = total_page_count / len(documents)
print(f"Average page count: {average_page_count}")

## Generate embeddings with OPENAI API and store them in  vector store locally

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
import psycopg

connection = os.environ['DATABASE_URL']  # Uses psycopg3!
if connection.startswith("postgres://"):
    connection = connection.replace("postgres://", "postgresql://", 1)
collection_name = "papers"

def get_db_connection():
    return psycopg.connect(connection, sslmode='require')

def split_documents(documents):
    """
    Split documents into chunks using RecursiveCharacterTextSplitter.

    :param documents: List of document objects.
    :return: List of text chunks with metadata.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    all_chunks = []

    for doc in documents:
        for page in doc['pages']:
            chunks = splitter.split_text(page.page_content)
            for chunk in chunks:
                all_chunks.append({
                    'text': chunk,
                    'metadata': {
                        'source': doc['document_name']
                    }
                })

    return all_chunks

def clean_chunk(text: str) -> str:
    # Replace null bytes with a space (or any other character)
    return text.replace("\0", " ")

def create_and_save_vector_store(chunks, collection_name):
    """
    Create a PGVector vector store from text chunks.

    :param chunks: List of text chunks with metadata.
    :param save_path: Path to save the FAISS vector store.
    """
    # Initialize the OpenAI embeddings model
    embeddings_model = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model="text-embedding-3-small")

    # Extract texts and metadata
    texts = [chunk['text'] for chunk in chunks]
    clean_texts = [clean_chunk(text) for text in texts]
    metadata = [chunk['metadata'] for chunk in chunks]

    db_connection = get_db_connection()
    # Initialize the FAISS vector store
    vector_store = PGVector.from_texts(embedding=embeddings_model, 
                                       texts=clean_texts, 
                                       metadatas=metadata,
                                       collection_name=collection_name,
                                       pre_delete_collection=False,
                                       use_jsonb=True,
                                       connection=connection)
    
    # Explicitly commit the transaction to ensure changes are saved
    db_connection.commit()

In [5]:
connection

'postgresql://u5sgjn8jfinbts:p7e8428c379a8bd061c31b84df255e44799cc0b4cb088f5c674d85484b37ee2eb@c724r43q8jp5nk.cluster-czz5s0kz4scl.eu-west-1.rds.amazonaws.com:5432/d187f6m4n9r8dp'

In [9]:
# Split documents into manageable text chunks
chunks = split_documents(documents)

# Path to save the FAISS vector store
faiss_save_path = 'C:/TUE/Thesis/RAG/FAISS'

# Create and save the FAISS vector store locally
create_and_save_vector_store(chunks, faiss_save_path)

print(f"FAISS vector store saved at {faiss_save_path}")

FAISS vector store saved at C:/TUE/Thesis/RAG/FAISS


## Load PDFs with MathPixLoader

In [7]:
from langchain.document_loaders import MathpixPDFLoader

def load_documents_from_pdfs_mathpix(pdf_files):
    """
    Load and extract documents from a list of PDF files using PyPDFLoader.

    :param pdf_files: List of paths to PDF files.
    :return: List of document objects containing text and metadata.
    """
    documents = []

    for pdf_file in pdf_files:
        try:
            loader = MathpixPDFLoader(pdf_file)
            document = loader.load()
            
            # Append each loaded document with its metadata
            documents.append({
                 'document_name': pdf_file,
                 'pages': document
            })

            print(f"Loaded {len(document)} pages from {pdf_file}")
        except Exception as e:
            print(f"Error loading document from {pdf_file}: {e}")

    return documents


# Load documents from the PDF files
documents = load_documents_from_pdfs(pdf_files_list)

# Print the documents for verification
for doc in documents:
    print(f"Source: {doc['document_name']}")
    print(f"Length of pages: {len(doc['pages'])}")
    print("-" * 40)

Error loading document from C:/TUE/Thesis/RAG/papers\Cai et al. - 2020 - Machine learning-driven new material discovery.pdf: Did not find mathpix_api_key, please add an environment variable `MATHPIX_API_KEY` which contains it, or pass `mathpix_api_key` as a named parameter.
Error loading document from C:/TUE/Thesis/RAG/papers\Cetin et al. - 2022 - Attri-VAE attribute-based interpretable represent.pdf: Did not find mathpix_api_key, please add an environment variable `MATHPIX_API_KEY` which contains it, or pass `mathpix_api_key` as a named parameter.
Error loading document from C:/TUE/Thesis/RAG/papers\Chen et al. - 2019 - Graph Networks as a Universal Machine Learning Fra.pdf: Did not find mathpix_api_key, please add an environment variable `MATHPIX_API_KEY` which contains it, or pass `mathpix_api_key` as a named parameter.
Error loading document from C:/TUE/Thesis/RAG/papers\Choudhary and DeCost - 2021 - Atomistic Line Graph Neural Network for improved m.pdf: Did not find mathpix_api_k