## 1. Installs

In [None]:
# Install the Weaviate Python client into the notebook kernel's environment.
# The active line pins major version 4 (-U upgrades, --pre allows pre-release
# builds); the commented-out last line is the v3 alternative.
#%pip install --upgrade pip
#%pip list # See what's installed and versions
%pip install --pre -U "weaviate-client==4.*"
#%pip install "weaviate-client==3.*"

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its entries into the process environment so
# the cells below can read WEAVIATE_URL / WEAVIATE_API_KEY / OPENAI_API_KEY
# through os.getenv().
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

In [4]:
import weaviate

In [None]:
import weaviate

# v4-style connection: cluster URL and API key both come from the environment.
wcs_cluster_url = os.getenv("WEAVIATE_URL")
wcs_credentials = weaviate.AuthApiKey(os.getenv("WEAVIATE_API_KEY"))

client = weaviate.connect_to_wcs(
    cluster_url=wcs_cluster_url,
    auth_credentials=wcs_credentials,
)

In [None]:
import weaviate

# v3-style client. The cluster URL is read from WEAVIATE_URL, the same
# variable the connect_to_wcs cell uses (the previous spelling "WEVIATE_URL"
# never matched it, so `url` was None). The OpenAI key is forwarded to
# Weaviate through the X-OpenAI-Api-Key request header.
client = weaviate.Client(
    url=os.getenv("WEAVIATE_URL"),
    auth_credentials=weaviate.AuthApiKey(os.getenv("WEAVIATE_API_KEY")),
    additional_headers={
        "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")
    }
)

In [None]:
import weaviate
import json

# Schema definition for the "PDF_document" class: one object per PDF,
# holding document-level metadata. Every property entry must carry a
# "name" key; several entries previously used "pages" as the key by
# mistake, which left those properties without a name.
pdf_document_class = {
    "class": "PDF_document",
    "description": "Info about the overall PDF",
    "properties": [
        {
            "name": "title",
            "description": "Name of the document. If none, defaults to File Name w/o extension",
            "dataType": ["text"],
        },
        {
            "name": "file_name",
            "description": "Name of the PDF file",
            "dataType": ["text"],
        },
        {
            "name": "leadership_scope",
            "description": "1_National, 2_District, 3_Divison, 3_Sector, 4_Flotilla, 4_Station, 5_Facility",
            "dataType": ["text"],
        },
        {
            "name": "page_count",
            "description": "Number of pages in the document",
            "dataType": ["int"],
        },
        {
            "name": "creation_date",
            "description": "Uses existing pdf date, else defaults to ingestion date",
            "dataType": ["date"],
        },
        {
            "name": "effective_date",
            "description": "Date document became effective",
            "dataType": ["date"],
        },
        {
            # NOTE(review): description is identical to creation_date's --
            # looks like a copy-paste; confirm the intended wording.
            "name": "ingestion_date",
            "description": "Uses existing pdf date, else defaults to ingestion date",
            "dataType": ["date"],
        },
        {
            "name": "expiration_date",
            "description": "Defaults to effective date + 10 years per COMDINST 5215.6I",
            "dataType": ["date"],
        },
        {
            "name": "curator",
            "description": "Last name of Auxiliarist who curated",
            "dataType": ["text"],
        },
        {
            "name": "source",
            "description": "Web domain source of document (e.g.,uscg.mil, cgaux.org)",
            "dataType": ["text"],
        },
        {
            "name": "aux_specific",
            "description": "True if document specifically applies to the Auxiliary",
            "dataType": ["boolean"],
        },
        {
            "name": "organization",
            "description": "Not curently used, can be used to track CG Organizations (i.e., CG-BSX-1) or Auxiliary Unit Number (0130510)",
            "dataType": ["text"],
        },
        {
            "name": "public_release",
            "description": "True if document is available on public internet",
            "dataType": ["boolean"],
        },
        {
            # NOTE(review): description is identical to page_count's --
            # looks like a copy-paste; confirm the intended wording.
            "name": "publication_number",
            "description": "Number of pages in the document",
            "dataType": ["text"],
        },
        {
            "name": "uuid",
            "description": "unique identifier for the doc",
            "dataType": ["uuid"],
        },
        {
            "name": "PDF_document_pages",
            "description": "The pages class",
            "dataType": ["text[]"],
        },
    ],
}

# Schema definition for the "PDF_document_page" class: one object per page,
# configured with the "text2vec-openai" vectorizer. Properties are listed as
# (name, description, data type) rows and expanded into property dicts.
pdf_document_page_class = {
    "class": "PDF_document_page",
    "description": "Information about a specific page in a PDF document",
    "properties": [
        {"name": prop_name, "description": prop_description, "dataType": [prop_type]}
        for prop_name, prop_description, prop_type in (
            ("content", "The content of the page", "text"),
            ("uuid", "unique identifier for the doc", "uuid"),
            ("page_number", "The page number", "int"),
            ("PDF_document", "The document to which the page refers", "text"),
        )
    ],
    "vectorizer": "text2vec-openai",
}

# add the schema
# `create_class` registers one class definition per call, so each class is
# submitted individually instead of passing the whole list at once.
# NOTE(review): `client.schema` is the v3-style API -- this presumably needs
# `client` to be the `weaviate.Client` instance rather than the object
# returned by `connect_to_wcs`; confirm which connection cell was run.
classes = [pdf_document_class, pdf_document_page_class]
for schema_class in classes:
    client.schema.create_class(schema_class)

# get the schema
schema = client.schema.get()

# print the schema
print(json.dumps(schema, indent=4))