## 0. Docstring

In [None]:
"""
Automate the ingestion and organization of PDF documents, their pages, and associated metadata, into a Qdrant vector database for later retrieval and analysis.

This script ingests PDF documents from a specified directory, extracts both document-level metadata and page-level content, storing it in a Qdrant vector database using two separate collections:
    - PDF_document: metadata about the PDF files-- such as title, page count, creation date, and document summary-- along with a vectorized version of the summary
    - PDF_document_page: Contains the text content of individual pages along with their page number and a reference to the associated PDF document.

Key functionality includes:
    
**Library Catalog Ingestion**:
    - Loads a library catalog (Excel file) containing metadata for the PDFs into a pandas DataFrame.
    - Processes specific date columns into a standardized format (Zulu time).

**PDF Processing**:
    - Walks through the specified directory, identifying PDF files.
    - For each PDF, computes a unique document ID, retrieves corresponding metadata from the DataFrame, and stores the metadata in the `PDF_document` collection.
    - For each page of the PDF, extracts the content and stores it in the `PDF_document_page` collection with a reference to the PDF document.


Usage:
    1. Ensure that the Qdrant credentials (QDRANT_API_KEY, QDRANT_URL) and OPENAI_API_KEY are set; the notebook reads the Qdrant values from Streamlit secrets.
    2. Place the PDF files in the specified `pdf_source_dir`.
    3. Ensure metadata.xlsx is present at the location given by `metadata_file_path`.
    4. Run the script to upload the PDF metadata and page content to the collections.
"""


## 1. Installs, Imports and Environmental Variables

In [1]:
import os
import sys
import pandas as pd
import streamlit as st
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PayloadSchemaType
from qdrant_client.http import exceptions as qdrant_exceptions
# from dotenv import load_dotenv, find_dotenv

# load_dotenv(find_dotenv())

In [6]:
'''This little code block is used anytime you want to import a local module from within a Jupyter Notebook. This is required because Jupyter treats each cell as a module.'''

# Make the directory one level above the notebook's folder importable.
# Note that '__file__' is a quoted string here, so realpath() resolves it
# relative to the current working directory.
current_dir = os.path.split(os.path.realpath('__file__'))[0]
parent_dir = os.path.split(current_dir)[0]
sys.path.append(parent_dir)

import utils

## 2. Set Configurations and Clients

In [4]:
# Pull the Qdrant credentials from Streamlit's secrets store (key first, then URL).
api_key, url = st.secrets["QDRANT_API_KEY"], st.secrets["QDRANT_URL"]

# Single client instance shared by the cells that follow.
client = QdrantClient(api_key=api_key, url=url)

In [10]:
# Input locations, given relative to the working directory the notebook runs in.
pdf_source_dir = "./docs/pdfs/"  # walked recursively (os.walk) for *.pdf files
metadata_file_path = "./docs/metadata/metadata.xlsx"  # catalog workbook read with pd.read_excel

### Confirm access to the Collections

In [5]:
# Connectivity check: list the collections on the server and print them.
try:
    # NOTE: despite its name, this holds the listing of *all* collections
    # returned by get_collections(), not a handle to a single collection.
    pdfs_collection = client.get_collections()  # "ask_pdf_docs"
    # pages_collection = client.get_collections("ask_pdf_pages")
    print(pdfs_collection)
    # print(pages_collection)
except qdrant_exceptions.UnexpectedResponse as e:
    # Compare the HTTP status code itself; searching the message text for
    # "404" could also match unrelated content in the response body.
    if e.status_code == 404:
        print("The server returned a 404 Not Found error, which indicates the server is active but could not find the requested URL or endpoint. This might be due to a wrong URL, an incorrect path, or a resource that doesn't exist.")
    else:
        # Re-raise the error if it's not a 404
        raise
except Exception as e:
    # Any other failure (e.g. connection problems) is reported, not raised,
    # so the notebook can keep running.
    print(f"An unexpected error occurred: {e}")

collections=[CollectionDescription(name='ask_pdf_pages'), CollectionDescription(name='ask_pdf_docs'), CollectionDescription(name='ASK_vectorstore')]


## Load Metadata from Excel File

In [34]:
# Catalog columns that hold dates; each one present in the workbook is
# rewritten as a UTC ("Zulu") timestamp string.
datetime_cols = ['creation_date', 'effective_date',
                 'upsert_date', 'expiration_date']


def blank_missing_values(frame):
    """Return a new DataFrame in which every missing value is replaced by ''.

    float64 columns are additionally converted to strings. The missing-value
    mask is taken from the original column *before* the conversion, because
    ``astype(str)`` can turn NaN into the literal text 'nan', which a
    subsequent ``fillna('')`` would no longer recognise as missing.

    Args:
        frame: DataFrame to clean; it is not modified in place.

    Returns:
        DataFrame with the same columns as ``frame``.
    """
    def _clean(col):
        if col.dtype == 'float64':
            return col.astype(str).mask(col.isna(), '')
        return col.fillna('')

    return frame.apply(_clean)


try:
    df = pd.read_excel(metadata_file_path)

    for col in datetime_cols:
        if col in df.columns:
            # errors='coerce' turns unparseable cells into missing values,
            # which are blanked out by blank_missing_values() below.
            df[col] = pd.to_datetime(df[col], errors='coerce', utc=True).dt.strftime(
                '%Y-%m-%dT%H:%M:%SZ')

    df = blank_missing_values(df)

    print(f"Successfully imported: {metadata_file_path}")

except Exception as e:
    # The failure is only reported. If read_excel itself failed, `df` is never
    # bound and the cells that use it will raise NameError.
    os.write(1, f"Failed to read the metadata file: {e}\n".encode())

Successfully imported: ./docs/metadata/metadata.xlsx


In [None]:
# starter code that pulls the library catalog into pdfs_collection

import pypdf
import json
# The loaders live in the `langchain_community` package; `langchain.community`
# is not an importable module path.
from langchain_community.document_loaders import PyPDFLoader
import utils  # Ensure utils module with compute_doc_id function is imported

# NOTE(review): `client` is the QdrantClient created earlier, which this
# notebook otherwise uses through `client.get_collections()`. The
# `client.collections.get(...)` accessor and the `.data.insert(...)` calls made
# on these handles in process_pdf look like the Weaviate client API that the
# module docstring still mentions -- TODO confirm and port to Qdrant calls
# before running this cell.
pdfs_collection = client.collections.get("PDF_document")
pdf_pages_collection = client.collections.get("PDF_document_page")


def process_pdf(pdf_path, df):
    """Store one PDF's catalog metadata and its per-page text.

    The document ID is computed with ``utils.compute_doc_id(pdf_path)`` and
    looked up in the ``document_id`` column of ``df``. Exactly one row must
    match. That row is inserted into ``pdfs_collection`` under the document
    ID, then one object per page (title, optional publication number, page
    text, page number, reference to the document) is inserted into
    ``pdf_pages_collection``.

    Args:
        pdf_path: Path of the PDF file to ingest.
        df: Catalog DataFrame. Must contain ``document_id`` (compared as a
            string) and ``title`` columns; ``publication_number`` is optional.

    Returns:
        None. Every failure (missing file, no or duplicate metadata row,
        insert errors) is reported with ``print`` instead of being raised, so
        one bad PDF does not stop the caller's loop.
    """
    try:
        # Compute the document_id for the PDF
        document_id = str(utils.compute_doc_id(pdf_path))

        # Find the metadata row in df that corresponds to this document_id
        matches = df[df['document_id'] == document_id]
        if matches.empty:
            raise ValueError(
                f"No metadata found for document ID: {document_id}")
        if len(matches) > 1:
            raise ValueError(
                f"Found duplicates for document_id: '{document_id}', number of results: {len(matches)}")

        properties = matches.iloc[0].to_dict()

        # publication_number is optional in the catalog. Normalise it to a
        # string once, and remember whether it exists so the page payload
        # below does not fail with KeyError when the column is absent.
        has_publication_number = "publication_number" in properties
        if has_publication_number:
            properties['publication_number'] = str(properties['publication_number'])

        pdfs_collection.data.insert(
            properties=properties,
            uuid=document_id
        )

        # Process PDF pages. I believe this is just the metadata/payload, not the embeddings
        # NOTE(review): `wvc` is not imported anywhere in the visible source;
        # as written this line raises NameError, which the broad handler below
        # reports as "An error occurred" -- TODO confirm the intended import.
        pages_objects = []
        for page in PyPDFLoader(pdf_path).load():
            page_properties = {"title": properties['title']}
            if has_publication_number:
                page_properties["publication_number"] = properties['publication_number']
            page_properties["content"] = page.page_content
            page_properties["page_number"] = page.metadata["page"]

            pages_objects.append(
                wvc.data.DataObject(
                    properties=page_properties,
                    references={
                        "hasPdfDocument": document_id
                    }
                )
            )
        pdf_pages_collection.data.insert_many(pages_objects)

    except FileNotFoundError:
        print(f"Error: Could not find {pdf_path}")
    except Exception as e:
        # Deliberate best-effort: report and move on to the next PDF.
        print(f"An error occurred: {e}")


print("Loading PDFs from directory...")

# Recurse through pdf_source_dir and hand every file whose name ends in
# ".pdf" (case-insensitive) to process_pdf together with the catalog frame.
for dirpath, _dirnames, entries in os.walk(pdf_source_dir):
    pdf_names = (name for name in entries if name.lower().endswith('.pdf'))
    for pdf_name in pdf_names:
        process_pdf(os.path.join(dirpath, pdf_name), df)