In [5]:
# Install the notebook's dependencies with %pip (rather than !pip).
# --pre allows pre-release builds and -U upgrades an existing install; the
# "==4.*" specifier restricts the Weaviate client to the 4.x series.
# NOTE(review): python-dotenv is unpinned; consider pinning it for reproducible runs.
%pip install --pre -U "weaviate-client==4.*"
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.
Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import weaviate
import weaviate.classes as wvc

from weaviate.util import generate_uuid5
from dotenv import load_dotenv, find_dotenv
# Locate a .env file with find_dotenv() and load its entries, so that settings
# such as WEAVIATE_URL / WEAVIATE_API_KEY can be read with os.getenv.
load_dotenv(find_dotenv())

In [8]:
import os

import weaviate

# Connection settings are read from the environment (e.g. a local .env file
# loaded with python-dotenv) so that secrets never live in the notebook.
# SECURITY: a cluster URL and API key used to be hardcoded in this cell in
# plain text. Treat that key as compromised and rotate it.
url = os.getenv("WEAVIATE_URL")
api_key = os.getenv("WEAVIATE_API_KEY")

# Fail fast with an actionable message instead of an opaque connection error.
if not url or not api_key:
    raise RuntimeError(
        "WEAVIATE_URL and WEAVIATE_API_KEY must be set "
        "(for example in a .env file) before connecting to Weaviate."
    )

# NOTE(review): the collections in this notebook configure OpenAI-backed
# vectorizer/generative modules; presumably an OpenAI key must also be
# supplied (e.g. via connection headers) - confirm before ingesting data.
client = weaviate.connect_to_wcs(
    cluster_url=url,
    auth_credentials=weaviate.AuthApiKey(api_key)
)

In [10]:
# Create the Weaviate collection that stores one object per PDF document
# (document-level metadata only; page text lives in its own collection).
# No vectorizer is configured here, so document-level objects are not
# vectorized by Weaviate.
#
# NOTE(review): some description strings contain apparent typos
# ("Divison", "curently", "COMDINST"). They are stored verbatim in the schema
# and were left unchanged; confirm the intended spelling before editing them.

pdfs_collection = client.collections.create(
    name="PDF_document",
    vectorizer_config=wvc.Configure.Vectorizer.none(),
    generative_config=wvc.Configure.Generative.openai(),
    properties=[
        # --- Identification ---
        wvc.Property(
            name="Title",
            description="Name of the document. If none, defaults to File Name w/o extension",
            data_type=wvc.DataType.TEXT
        ),
        wvc.Property(
            name="FileName",
            description="Name of the PDF file",
            data_type=wvc.DataType.TEXT
        ),
        wvc.Property(
            name="LeadershipScope",
            description="1_National, 2_District, 3_Divison, 3_Sector, 4_Flotilla, 4_Station, 5_Facility",
            data_type=wvc.DataType.TEXT
        ),
        wvc.Property(
            name="PageCount",
            description="Number of pages in the document",
            data_type=wvc.DataType.INT
        ),
        # --- Lifecycle dates ---
        wvc.Property(
            name="CreationDate",
            description="Uses existing pdf date, else defaults to ingestion date",
            data_type=wvc.DataType.DATE
        ),
        wvc.Property(
            name="EffectiveDate",
            description="Date document became effective, else defaults to ingestion date",
            data_type=wvc.DataType.DATE
        ),
        wvc.Property(
            name="IngestionDate",
            # Fixed: this description was a copy of CreationDate's and defined
            # the ingestion date in terms of itself.
            description="Date the document was ingested",
            data_type=wvc.DataType.DATE
        ),
        wvc.Property(
            name="ExpirationDate",
            description="If no cancellation date given, then defaults to effective date + 10 years per COMDINST M5215.6I",
            data_type=wvc.DataType.DATE
        ),
        # --- Provenance and applicability ---
        wvc.Property(
            name="Curator",
            description="Last name of Auxiliarist who curated. Currently blank",
            data_type=wvc.DataType.TEXT
        ),
        wvc.Property(
            name="Source",
            description="Web domain source of document (e.g.,uscg.mil, cgaux.org)",
            data_type=wvc.DataType.TEXT
        ),
        wvc.Property(
            name="AuxSpecific",
            description="True if document specifically applies to the Auxiliary",
            data_type=wvc.DataType.BOOL
        ),
        wvc.Property(
            name="Organization",
            description="Not curently used, can be used to track CG directive originator using Standard Distribution List (SDL), COMDTNOTE 5605 encl (3) (i.e., CG-BSX-1) or Auxiliary Unit Number (0130510)",
            data_type=wvc.DataType.TEXT
        ),
        wvc.Property(
            name="PublicRelease",
            description="True if document is available on public internet",
            data_type=wvc.DataType.BOOL
        ),
        wvc.Property(
            name="PublicationNumber",
            description="Identification number of the directive or document. In the case of Directives, underscores are used for spaces (e.g., COMDTINST_M1000.6A)",
            data_type=wvc.DataType.TEXT
        ),
        # Stored as a regular property, in addition to whatever id the object
        # itself is inserted with.
        wvc.Property(
            name="uuid",
            description="Unique ID for the PDF document",
            data_type=wvc.DataType.UUID
        )
    ]
)

# Schema for the per-page collection: one object per PDF page, vectorized with
# the text2vec-openai module and linked back to its parent PDF_document.

# Scalar properties of a page, in the order they are registered.
page_properties = [
    wvc.Property(
        name="content",
        description="content of the page",
        data_type=wvc.DataType.TEXT
    ),
    wvc.Property(
        name="Title",
        description="Taken from PDF_document object property of the same name",
        data_type=wvc.DataType.TEXT
    ),
    wvc.Property(
        name="uuid",
        description="Unique ID for the PDF page",
        data_type=wvc.DataType.UUID
    ),
    wvc.Property(
        name="page_number",
        description="Page number",
        data_type=wvc.DataType.INT
    ),
    # The publication number is flagged to be skipped during vectorization.
    wvc.Property(
        name="PublicationNumber",
        description="Taken from PDF_document object property of the same name",
        data_type=wvc.DataType.TEXT,
        skip_vectorization=True
    ),
]

# Cross-reference from each page to the document it belongs to.
page_references = [
    wvc.ReferenceProperty(
        name="hasPdfDocument",
        target_collection="PDF_document"
    )
]

pdf_pages_collection = client.collections.create(
    name="PDF_document_page",
    vectorizer_config=wvc.Configure.Vectorizer.text2vec_openai(),
    generative_config=wvc.Configure.Generative.openai(),
    properties=page_properties,
    references=page_references
)


In [None]:
from datetime import datetime, timezone

# Retrieve handles to the existing collections; data operations (insert,
# insert_many, ...) are performed through these handles.
# https://weaviate.io/developers/weaviate/client-libraries/python#data
pdfs_collection = client.collections.get("PDF_document")
pdf_pages_collection = client.collections.get("PDF_document_page")

###
# Intended ingestion flow (the code below is a placeholder for one document):
# Loop over all PDFs in the folder containing the PDFs
# for each individual PDF
#     fetch the metadata (creation date, expiration date, etc.)
#     create a Weaviate data object and insert it into Weaviate (first snippet below)
#     call the text splitter on the PDF, retrieving a list of pages
#     for each page
#         create a Weaviate data object holding the page fields declared in the page schema
#         add a reference to the document object inserted above (by uuid)
#     insert all pages into Weaviate (second snippet below)
###

# --- Insert a single PDF document (not its pages) ---
# Property keys use the names declared when the collection was created.
file_name = "This is the file name"

# generate_uuid5 requires an identifier; deriving the id from the file name
# makes it deterministic, so the same file always maps to the same object id.
pdf_id = generate_uuid5(file_name)

pdfs_collection.data.insert(
    properties={
        "Title": "",
        "FileName": file_name,
        # DATE properties need a real timestamp (RFC 3339); an empty string is
        # not a valid date. Placeholder: use the current time in UTC.
        "CreationDate": datetime.now(timezone.utc).isoformat(),
        #...
    },
    uuid=pdf_id
)

# --- Insert the pages (chunks) of that document ---
# Placeholder page data: each page object carries a reference back to the
# document inserted above through the "hasPdfDocument" reference property.
num_pages = 5  # placeholder; would come from the text splitter
pages = [
    wvc.DataObject(
        properties={
            "Title": "The title of the page",
            "page_number": page_number,
            "content": "The content of the page"
        },
        references={
            "hasPdfDocument": wvc.Reference.to(uuids=pdf_id)
        },
        # Deterministic per-page id derived from the document and page number.
        uuid=generate_uuid5(f"{file_name}:{page_number}")
    )
    for page_number in range(1, num_pages + 1)  # page numbers are 1-based
]

pdf_pages_collection.data.insert_many(pages)

