In [1]:
import sys

import pandas as pd
%pip install --pre -U "weaviate-client==4.5.5"
%pip install python-dotenv

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import os

import weaviate
import weaviate.classes as wvc
from dotenv import load_dotenv, find_dotenv

# Load environment variables from the .env file located by find_dotenv().
# The call's return value is displayed as this cell's output.
load_dotenv(find_dotenv())

True

In [3]:
# Make modules that live one directory above the notebook (e.g. `utils`) importable.
# Notebooks do not define `__file__`, so the kernel's current working directory
# is used as the notebook's location.
current_dir = os.path.realpath(os.getcwd())
parent_dir = os.path.dirname(current_dir)

# Guarded so that re-running this cell does not keep appending duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [4]:
import utils

/Users/byronvoorbach/Development/projects/weaviate/ASK/.venv/lib/python3.10/site-packages/pydantic/_internal/_config.py:272: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/


## 2. Set Configurations and Clients

In [5]:
def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty, so a missing setting
            is reported by name instead of surfacing later as an obscure
            connection or authentication error.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; "
            "add it to your .env file or shell environment.")
    return value


url = _require_env("WEAVIATE_URL_COMP")
api_key = _require_env("WEAVIATE_API_KEY_COMP")

# Connect to the Weaviate cloud cluster. The OpenAI key is forwarded as a request
# header; the collections created below are configured with OpenAI modules.
# Call client.close() when finished with the notebook to release the connection.
client = weaviate.connect_to_wcs(
    cluster_url=url,
    auth_credentials=weaviate.auth.AuthApiKey(api_key),
    headers={
        "X-OpenAI-Api-Key": _require_env("OPENAI_API_KEY")
    }
)

In [6]:
# Directory that is walked for PDF files to ingest (relative to the working directory).
pdf_source_directory = "../docs/test_pdfs/"
# Directory searched for the most recent library catalog spreadsheet.
library_catalog_directory = "../docs/library_catalog/"
# strftime pattern for a minute-resolution timestamp with a trailing "Z".
# NOTE(review): not referenced in the visible cells, and it differs from the
# seconds-resolution pattern used when the catalog dates are normalised — confirm usage.
zulu_format = '%Y-%m-%dT%H:%MZ'

## 3. Create the Collections and Define the Properties

In [13]:
# (Re)create the two Weaviate collections used by this notebook:
#   * "PDF_document"      - one object per PDF, holding the catalog metadata
#   * "PDF_document_page" - one object per page, referencing its parent document
# WARNING: any existing collections with these names are deleted first.

for _collection_name in ("PDF_document", "PDF_document_page"):
    if client.collections.exists(_collection_name):
        client.collections.delete(_collection_name)


def _build_properties(specs):
    """Turn (name, data_type, description[, extra_kwargs]) tuples into Property objects."""
    built = []
    for name, data_type, description, *extra in specs:
        extra_kwargs = extra[0] if extra else {}
        built.append(
            wvc.config.Property(
                name=name,
                description=description,
                data_type=data_type,
                **extra_kwargs
            )
        )
    return built


_DT = wvc.config.DataType

_PDF_DOCUMENT_PROPERTIES = [
    ("title", _DT.TEXT,
     "Name of the document. If none, defaults to File Name w/o extension"),
    ("leadership_scope", _DT.TEXT,
     "1_National, 2_District, 3_Division, 3_Sector, 4_Flotilla, 4_Station, 5_Facility"),
    ("page_count", _DT.INT,
     "Number of pages in the document"),
    ("creation_date", _DT.DATE,
     "Uses existing pdf date, else defaults to ingestion date"),
    ("effective_date", _DT.DATE,
     "Date document became effective, else defaults to ingestion date"),
    ("tagged_date", _DT.DATE,
     "Date document metadata was added to the library catalog"),
    ("upsert_date", _DT.DATE,
     "Date uploaded to the vector database"),
    ("expiration_date", _DT.DATE,
     "If no cancellation date given, then defaults to effective date + 10 years per COMDTINST M5215.6I"),
    # NOTE(review): this description is identical to the one on "curator" and looks
    # like a copy-paste leftover; replace it once the intended meaning is confirmed.
    ("lifecycle", _DT.TEXT,
     "Last name of Auxiliarist who curated. Currently blank"),
    ("aux_specific", _DT.BOOL,
     "True if document specifically applies to the Auxiliary"),
    ("public_release", _DT.BOOL,
     "True if document is available on public internet"),
    ("publication_number", _DT.TEXT,
     "Identification number of the directive or document. In the case of Directives, underscores are used for spaces (e.g., COMDTINST_M1000.6A)"),
    ("source", _DT.TEXT,
     "Web domain source of document (e.g.,uscg.mil, cgaux.org)"),
    ("organization", _DT.TEXT,
     "Not currently used, can be used to track CG directive originator using Standard Distribution List (SDL), COMDTNOTE 5605 encl (3) (i.e., CG-BSX-1) or Auxiliary Unit Number (0130510)"),
    ("curator", _DT.TEXT,
     "Last name of Auxiliarist who curated. Currently blank"),
    ("file_name", _DT.TEXT,
     "Name of the PDF file"),
]

_PDF_PAGE_PROPERTIES = [
    ("content", _DT.TEXT,
     "content of the page"),
    ("title", _DT.TEXT,
     "Taken from PDF_document object property of the same name"),
    ("page_number", _DT.INT,
     "Page number"),
    # Excluded from the page vector via skip_vectorization.
    ("publication_number", _DT.TEXT,
     "Taken from PDF_document object property of the same name",
     {"skip_vectorization": True}),
]

# Document-level collection: no vectorizer is configured for it.
pdfs_collection = client.collections.create(
    name="PDF_document",
    vectorizer_config=wvc.config.Configure.Vectorizer.none(),
    generative_config=wvc.config.Configure.Generative.openai(),
    properties=_build_properties(_PDF_DOCUMENT_PROPERTIES)
)

# Page-level collection: vectorized with the OpenAI text2vec module and linked
# back to its parent document through the "hasPdfDocument" reference.
pdf_pages_collection = client.collections.create(
    name="PDF_document_page",
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),
    generative_config=wvc.config.Configure.Generative.openai(),
    properties=_build_properties(_PDF_PAGE_PROPERTIES),
    references=[
        wvc.config.ReferenceProperty(
            name="hasPdfDocument",
            target_collection="PDF_document"
        )
    ]
)


## Load Objects

In [14]:
# Load the most recent library catalog (the per-document metadata) into a DataFrame.

catalog_file_path, last_update_date = utils.get_most_recent_filepath_and_date(
    "library_catalog", library_catalog_directory, "xlsx")

# Catalog columns that are rewritten as UTC timestamp strings below.
datetime_cols = ['creation_date', 'effective_date',
                 'tagged_date', 'upsert_date', 'expiration_date']

try:
    df = pd.read_excel(catalog_file_path)
    for col in datetime_cols:
        # Parse as UTC, then format as e.g. 2024-04-19T00:18:00Z.
        df[col] = pd.to_datetime(df[col], utc=True).dt.strftime('%Y-%m-%dT%H:%M:%SZ')
    # NOTE(review): missing values (including unparseable/empty dates) end up as ''
    # in every column - confirm that is acceptable for the DATE/INT/BOOL properties.
    df = df.fillna('')
    print(f"Successfully imported:  {catalog_file_path}")
except Exception as e:
    print(f"Failed to read the most recent library catalog file: {e}")
    # Re-raise: the cells below cannot work without the catalog, and continuing
    # would either hit a NameError or silently reuse a stale `df` from an earlier run.
    raise

found the following file(s) ['library_catalog_2024-04-19T0018Z.xlsx', 'library_catalog_2024-03-28T0018Z.xlsx', 'library_catalog_2024-02-17T0018Z.xlsx', 'library_catalog_2024-01-30T2033Z.xlsx', 'library_catalog_2023-12-19T1900Z.xlsx']
Successfully imported:  /Users/byronvoorbach/Development/projects/weaviate/ASK/docs/library_catalog/library_catalog_2024-04-19T0018Z.xlsx


In [15]:
# starter code that pulls the library catalog into pdfs_collection

import pypdf
import json
from langchain.document_loaders import PyPDFLoader
import utils  # Ensure utils module with compute_doc_id function is imported

# Look up handles to the two existing collections by name (document-level first,
# page-level second) so this cell does not depend on the creation cell's variables.
pdfs_collection, pdf_pages_collection = (
    client.collections.get(collection_name)
    for collection_name in ("PDF_document", "PDF_document_page")
)


def process_pdf(pdf_path: str, df: "pd.DataFrame") -> None:
    """Insert one PDF and its pages into Weaviate, using metadata from the catalog.

    The document id is computed from ``pdf_path`` with ``utils.compute_doc_id``.
    It is used to find the single matching row of ``df`` (column
    ``document_id``) and as the UUID of the ``PDF_document`` object. Every
    page returned by ``PyPDFLoader`` is then inserted as a page object that
    references that document.

    Errors are printed rather than raised, so a caller looping over many files
    continues with the next one.

    Args:
        pdf_path: Path of the PDF file to ingest.
        df: Library catalog. Must provide a ``document_id`` column; the
            ``title`` and ``publication_number`` values of the matching row
            are copied onto every page object.

    Returns:
        None.
    """
    try:
        # Compute the document_id for the PDF
        document_id = str(utils.compute_doc_id(pdf_path))

        # Compare as strings so the lookup also works when the catalog column
        # was not read as text.
        matches = df[df['document_id'].astype(str) == document_id]

        if matches.empty:
            raise ValueError(
                f"No metadata found for document ID: {document_id}")
        if len(matches) > 1:
            raise ValueError(
                f"Found duplicates for document_id: '{document_id}', number of results: {len(matches)}")

        properties = matches.iloc[0].to_dict()
        if "publication_number" in properties:
            # The schema stores publication_number as text.
            properties['publication_number'] = str(properties['publication_number'])
        pdfs_collection.data.insert(
            properties=properties,
            uuid=document_id
        )

        # Build one page object per loaded page, each pointing at the document.
        pages_objects = [
            wvc.data.DataObject(
                properties={
                    "title": properties['title'],
                    "publication_number": str(properties['publication_number']),
                    "content": page.page_content,
                    "page_number": page.metadata["page"],
                },
                references={
                    "hasPdfDocument": document_id
                }
            )
            for page in PyPDFLoader(pdf_path).load()
        ]

        if not pages_objects:
            # Nothing to send; skip the batch call instead of issuing an empty insert.
            print(f"Warning: no pages extracted from {pdf_path}")
            return

        result = pdf_pages_collection.data.insert_many(pages_objects)

        # Per-object failures are reported in the returned result rather than
        # raised, so surface them explicitly.
        for index, error in result.errors.items():
            message = getattr(error, "message", error)
            print(
                f"Error: failed to insert page object {index} of {pdf_path}: {message}")

    except FileNotFoundError:
        print(f"Error: Could not find {pdf_path}")
    except Exception as e:
        print(f"An error occurred: {e}")


print("Loading PDFs from directory...")

# Walk the source tree and hand every file whose name ends in ".pdf"
# (case-insensitive) to process_pdf together with the catalog DataFrame.
for dirpath, _subdirs, names in os.walk(pdf_source_directory):
    pdf_names = [name for name in names if name.lower().endswith('.pdf')]
    for pdf_name in pdf_names:
        process_pdf(os.path.join(dirpath, pdf_name), df)

Loading PDFs from directory...
An error occurred: No metadata found for document ID: 03105046-8005-5f7b-b304-188d0abfb1a2
