## 0. Docstring

In [None]:
"""
Automate the ingestion and organization of PDF documents, their pages, and associated metadata, into a Qdrant vector database for later retrieval and analysis.

This script ingests PDF documents from a specified directory, extracts both document-level metadata and page-level content, storing it in a Qdrant vector database using two separate collections:
    - `ask_pdf_docs` (PDF_document): metadata about the PDF files -- such as title, page count, creation date, and document summary -- along with a vectorized version of the summary
    - `ask_pdf_pages` (PDF_document_page): the text content of individual pages along with their page number and a reference to the associated PDF document.

Key functionality includes:
    
**Metadata Table Ingestion**:
    - Loads an Excel file containing metadata for the PDFs into a pandas DataFrame.

**PDF Processing**:
    - Walks through the specified directory, identifying PDF files.
    - For each PDF, computes a unique document ID, retrieves corresponding metadata from the DataFrame, and stores the metadata in the `ask_pdf_docs` collection.
    - For each page of the PDF, extracts the content and stores it in the `ask_pdf_pages` collection as a vector and as text with a reference to the PDF document, PDF title, and page number stored in the payload.


Usage:
    1. Ensure that the credentials (QDRANT_API_KEY, QDRANT_URL, OPENAI_API_KEY) are set. This notebook reads the Qdrant values from Streamlit secrets (`st.secrets`).
    2. Place the PDF files in the specified `pdf_source_dir`.
    3. Ensure metadata.xlsx is present at `metadata_file_path`.
    4. Run the script to upload the PDF metadata and page content to the collections.
"""


## 1. Installs, Imports and Environmental Variables

In [8]:
import os
import sys
from datetime import datetime, timezone
import pandas as pd
import streamlit as st
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PayloadSchemaType
from qdrant_client.http import exceptions as qdrant_exceptions
# from dotenv import load_dotenv, find_dotenv

# load_dotenv(find_dotenv())

In [9]:
# Make local project modules importable from inside this notebook.
#
# The project root is one level above the notebook's working directory, so that
# directory is appended to sys.path BEFORE any local module is imported.
# '__file__' is intentionally a string literal: notebooks do not define
# __file__, so realpath() resolves it relative to the current working directory.
current_dir = os.path.dirname(os.path.realpath('__file__'))
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(parent_dir)

import utils  # noqa: E402 -- local module; must come after the sys.path update

## 2. Set Configurations and Clients

In [10]:
# Connection settings for Qdrant are pulled from Streamlit's secrets store.
api_key = st.secrets["QDRANT_API_KEY"]
url = st.secrets["QDRANT_URL"]

# Single client instance reused by the cells that follow.
client = QdrantClient(url=url, api_key=api_key)

In [11]:
# Directory that is walked for PDF files to ingest.
pdf_source_dir = "./docs/pdfs/"
# Excel workbook with the per-PDF metadata rows (matched on the 'pdf_id' column).
metadata_file_path = "./docs/metadata/metadata.xlsx"
# Qdrant collection that receives the document-level metadata points.
pdfs_collection_name = "ask_pdf_docs"
# Qdrant collection intended for page-level content.
pages_collection_name = "ask_pdf_pages"

### Confirm access to the Collections

In [12]:
# Connectivity check: list the collections visible to this client and confirm
# that the two collections this notebook targets are among them.
try:
    # NOTE: despite its name, this holds the response listing ALL collections.
    pdfs_collection = client.get_collections()
    print(pdfs_collection)

    available_names = {collection.name for collection in pdfs_collection.collections}
    missing_names = [
        name for name in (pdfs_collection_name, pages_collection_name)
        if name not in available_names
    ]
    if missing_names:
        print(f"Warning: expected collection(s) not found: {missing_names}")
except qdrant_exceptions.UnexpectedResponse as e:
    # Use the structured status code instead of searching the message text,
    # which could contain "404" for unrelated reasons.
    if e.status_code == 404:
        print("The server returned a 404 Not Found error, which indicates the server is active but could not find the requested URL or endpoint. This might be due to a wrong URL, an incorrect path, or a resource that doesn't exist.")
    else:
        # Any other HTTP failure is unexpected here; surface it.
        raise
except Exception as e:
    # Best-effort diagnostic for non-HTTP failures (e.g. connection problems).
    print(f"An unexpected error occurred: {e}")

collections=[CollectionDescription(name='ask_pdf_pages'), CollectionDescription(name='ask_pdf_docs'), CollectionDescription(name='ASK_vectorstore')]


## Load Metadata from Excel File

In [13]:
# Columns holding dates; each one present in the sheet is normalised to an
# ISO-8601 UTC string so every row carries the same textual date format.
datetime_cols = ['creation_date', 'effective_date',
                 'upsert_date', 'expiration_date']

# Timestamp written to every row's 'upsert_date' for this ingestion run.
today_date = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

try:
    df = pd.read_excel(metadata_file_path)

    for col in datetime_cols:
        if col in df.columns:
            # Unparseable values are coerced to NaT and blanked out below.
            df[col] = pd.to_datetime(df[col], errors='coerce', utc=True).dt.strftime(
                '%Y-%m-%dT%H:%M:%SZ')

    # Replace missing values with ''. Float columns are stringified, but the
    # mask is taken from the ORIGINAL column: astype(str) would otherwise turn
    # NaN into the literal text 'nan', which fillna('') can no longer detect.
    df = df.apply(
        lambda col: col.astype(str).where(col.notna(), '')
        if col.dtype == 'float64' else col.fillna(''))

    # Add the 'upsert_date' field and populate every row with today's date.
    df['upsert_date'] = today_date

    print(f"Successfully imported: {metadata_file_path}")

except Exception as e:
    # Report in the notebook output, then re-raise: later cells need `df`, and
    # continuing would fail with a NameError or silently reuse a stale frame.
    print(f"Failed to read the metadata file: {e}")
    raise

Successfully imported: ./docs/metadata/metadata.xlsx


### This one works to input the PDF data into Qdrant. The one below is more modular, but currently results in a Bad Gateway error

## This one results in a Bad Gateway error

In [15]:
def get_pdf_metadata(pdf_path, df):
    """Look up the metadata row that belongs to a PDF file.

    The document id is obtained from ``utils.compute_pdf_id(pdf_path)`` and
    compared, ignoring case and surrounding whitespace, with the ``pdf_id``
    column of ``df``.

    Args:
        pdf_path: Path of the PDF file to look up.
        df: DataFrame of metadata; must contain a ``pdf_id`` column.

    Returns:
        ``(pdf_id, metadata_dict)`` when exactly one row matches, where
        ``metadata_dict`` is that row converted with ``Series.to_dict()``.
        ``(None, None)`` when no row matches, several rows match, or any other
        error occurs. The error is printed so the calling loop can continue.
    """
    try:
        # Compute the document_id for the PDF (using utils to generate the id)
        pdf_id = str(utils.compute_pdf_id(pdf_path)).strip()

        # Cast to str BEFORE using the .str accessor: on a non-string column
        # (e.g. numeric ids) `.str` raises and every lookup would fail.
        known_ids = df['pdf_id'].astype(str).str.strip().str.lower()
        matches = df[known_ids == pdf_id.lower()]

        if matches.empty:
            raise ValueError(f"No metadata found for pdf: {pdf_id}")

        # Ensure no duplicate pdf_ids in the metadata
        if len(matches) > 1:
            raise ValueError(
                f"Found duplicates for pdf_id: '{pdf_id}', number of results: {len(matches)}")

        document_metadata = matches.iloc[0].to_dict()
        print(f"Successfully accessed metadata for pdf: {pdf_id}")
        return pdf_id, document_metadata

    except Exception as e:
        print(f"Error retrieving metadata for {pdf_path}: {e}")
        return None, None  # Return None values if an error occurs to continue with the loop


def process_pdf_docs(pdf_path, df, client, pdfs_collection_name, vector_size=1536):
    """Insert one PDF's document-level metadata into a Qdrant collection.

    The metadata is fetched with ``get_pdf_metadata``. If
    ``utils.check_qdrant_record_exists`` reports that the id is already in the
    collection, nothing is written. Otherwise a point is upserted with the
    metadata as payload and an all-zero placeholder vector.

    Args:
        pdf_path: Path of the PDF file being ingested.
        df: Metadata DataFrame passed through to ``get_pdf_metadata``.
        client: Qdrant client exposing ``upsert``.
        pdfs_collection_name: Name of the target collection.
        vector_size: Length of the placeholder vector. Defaults to 1536, the
            value previously hard-coded here (presumably the collection's
            configured vector dimension -- TODO confirm).

    Returns:
        None. Progress and errors are printed; exceptions are not propagated
        so the calling loop can move on to the next file.
    """
    try:
        # Access metadata for the given PDF
        pdf_id, document_metadata = get_pdf_metadata(pdf_path, df)

        if not (pdf_id and document_metadata):
            print(f"Skipping {pdf_path} due to missing metadata.")
            return

        # Ensure pdf_id isn't already in Qdrant
        if utils.check_qdrant_record_exists(pdf_id, client, pdfs_collection_name):
            print(f"Record with ID {pdf_id} already exists in Qdrant!")
            return

        # No embedding is computed for the document here, so a zero vector of
        # the requested length is stored alongside the payload.
        placeholder_vector = [0.0] * vector_size
        client.upsert(
            collection_name=pdfs_collection_name,
            points=[{
                "id": pdf_id,
                "vector": placeholder_vector,
                "payload": document_metadata  # Metadata as payload
            }]
        )
        print(f"Successfully inserted pdf {pdf_id} into Qdrant.")

    except Exception as e:
        print(f"Error processing PDF {pdf_path}: {e}")


# Main loop: walk the source directory tree and ingest every PDF found.
if not utils.check_directory_exists(pdf_source_dir, create_if_not_exists=False):
    print(f"Exiting: Directory '{pdf_source_dir}' does not exist.")
else:
    for folder_name, sub_folders, filenames in os.walk(pdf_source_dir):
        for file in filenames:
            if not file.lower().endswith('.pdf'):
                continue
            # Join with the folder currently being walked, not the root:
            # os.walk descends into sub-folders, and files found there do not
            # live directly under pdf_source_dir.
            pdf_path = os.path.join(folder_name, file)
            print(f"Processing file: {file}")
            process_pdf_docs(pdf_path, df, client, pdfs_collection_name)

Processing file: CG-CVC_pol18-03_textembed.pdf
Successfully accessed metadata for pdf: b55419db-7e11-5439-9cfb-6abc1733f6af
Record with ID b55419db-7e11-5439-9cfb-6abc1733f6af already exists in Qdrant!
Processing file: AUXMAN.pdf
Successfully accessed metadata for pdf: 2d73553a-1802-527f-a086-54fb6f7db7ef
Record with ID 2d73553a-1802-527f-a086-54fb6f7db7ef already exists in Qdrant!


## This is the pdf pages code pulled over from weaviate as a starter

In [None]:
        # NOTE(review): this cell is an unfinished fragment carried over from a
        # Weaviate-based version. It is indented as if it sat inside a function's
        # `try:` block that is not present here, and it references names that are
        # not defined in this notebook (PyPDFLoader, wvc, properties,
        # pdf_pages_collection), so it cannot run as-is.
        #
        # Process PDF pages: build one object per page carrying the page text and
        # page number plus the parent document's title and publication number.
        # Presumably this is only the metadata/payload, not the embeddings --
        # TODO confirm.
        pages_objects = []
        loader = PyPDFLoader(pdf_path)
        for page in loader.load():
            pages_objects.append(
                wvc.data.DataObject(
                    properties={
                        "title": properties['title'],
                        "publication_number": str(properties['publication_number']),
                        "content": page.page_content,
                        "page_number": page.metadata["page"],
                    },
                    # Link each page back to its parent PDF document.
                    references={
                        "hasPdfDocument": pdf_id
                    }
                )
            )
        # Insert all page objects for this PDF in a single call.
        pdf_pages_collection.data.insert_many(pages_objects)

    except FileNotFoundError:
        print(f"Error: Could not find {pdf_path}")
    except Exception as e:
        print(f"An error occurred: {e}")


