In [None]:
# Run the following in a shell or bash script to install the system packages:
# apt install tesseract-ocr libtesseract-dev poppler-utils

In [None]:
pip install -r ../requirements.in

In [None]:
from unstructured.partition.pdf import partition_pdf
from unstructured.documents.elements import Title, NarrativeText, Text
from unstructured.chunking.basic import chunk_elements
from typing import List
import os
import weaviate
from weaviate.util import generate_uuid5
import ollama
from pprint import pprint
import json

In [None]:
# Pull the embedding model used later in this notebook, then show the models
# Ollama reports.
embedding_model_name = "mxbai-embed-large:v1"
ollama.pull(embedding_model_name)
ollama.list()

In [None]:
# Use this for now
# Default parse: partition_pdf is called with only the file name, so no
# strategy or table-structure options are passed.
FILE_PATH = "../docs/Player_s Handbook.pdf"

# `elements` is the list that the later cells filter by category, convert to
# dicts, embed and load into Weaviate.
elements = partition_pdf(filename=FILE_PATH)

In [None]:
# Saved for later: hi_res parsing with table-structure inference.
# This cell is opt-in so that "Restart & Run All" does not silently replace the
# `elements` produced by the default parse with a hi_res re-parse.
FILE_PATH = "../docs/Player_s Handbook.pdf"
USE_HI_RES = False  # set to True to re-parse the PDF with the hi_res strategy

if USE_HI_RES:
    elements = partition_pdf(filename=FILE_PATH, strategy="hi_res", infer_table_structure=True)

In [19]:
import weaviate.classes as wvc
# Open a client against the local Weaviate instance with custom timeouts.
# NOTE(review): the timeout is passed as a two-value tuple; confirm which phase
# each value applies to for the installed weaviate-client version.
connection_options = wvc.init.AdditionalConfig(timeout=(60, 1800))
client = weaviate.connect_to_local(additional_config=connection_options)
client.connect()

In [None]:
import weaviate.classes as wvc
from weaviate.collections import Collection
from weaviate.client import WeaviateClient

def create_collection(client: WeaviateClient, collection_name: str)-> Collection:
    """Drop and recreate the named collection, returning a handle to it.

    Any existing collection called ``collection_name`` is deleted first, so
    every call starts from an empty collection. The new collection uses the
    ``text2vec_transformers`` vectorizer and an HNSW vector index with cosine
    distance.

    Args:
        client: Weaviate client to use; ``connect()`` is called on it to make
            sure it is open, and it is left open for the caller.
        collection_name: Name of the collection to delete and create again.

    Returns:
        The collection object returned by ``client.collections.create``.
    """
    # Deliberately not using `with client:` here: leaving that block closes the
    # client, which would hand the caller a collection bound to a closed client.
    client.connect()

    client.collections.delete(collection_name)
    # `create` already returns the new collection, so no extra `get` is needed.
    return client.collections.create(
        name=collection_name,
        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_transformers(),
        vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
            distance_metric=wvc.config.VectorDistances.COSINE,  # preferred distance metric
        ),
    )

# Start from a fresh, empty 'PHB' collection.
collection = create_collection(client=client, collection_name='PHB')


In [None]:
def _elements_in_category(category: str) -> list:
    """Return the parsed elements whose ``category`` equals ``category``, in their original order."""
    return [candidate for candidate in elements if candidate.category == category]


# Split the parsed elements into the three categories inspected in the next cells.
narrative_texts = _elements_in_category("NarrativeText")
titles = _elements_in_category("Title")
uncat_texts = _elements_in_category("UncategorizedText")


In [None]:

# Preview only the first few narrative elements: printing every element of the
# document floods the cell output. Widen or drop the slice to inspect more.
for text in narrative_texts[:10]:
    pprint(text.to_dict())

In [None]:
# Preview only the first few title elements: printing every one of them floods
# the cell output. Widen or drop the slice to inspect more.
for title in titles[:10]:
    pprint(title.to_dict())

In [None]:
# Preview only the first few uncategorized elements: printing every one of them
# floods the cell output. Widen or drop the slice to inspect more.
for text in uncat_texts[:10]:
    pprint(text.to_dict())

In [None]:
# Preview only the first few parsed elements: printing all of them floods the
# cell output. Widen or drop the slice to inspect more.
for elem in elements[:10]:
    pprint(elem.to_dict())

In [None]:
# Convert every parsed element to its plain-dict form; later cells read the
# 'text', 'type' and 'metadata' keys from these dicts.
element_dicts = [parsed_element.to_dict() for parsed_element in elements]



In [12]:
# Embed each element's text with the Ollama model, one request per element.
# The resulting list is index-aligned with `element_dicts`.
embeddings = []

for element in element_dicts:
    embeddings.append(
        ollama.embeddings(model='mxbai-embed-large:v1', prompt=element['text'])["embedding"]
    )
    

In [None]:
# Show (number of embeddings, length of the first vector) instead of dumping
# every vector into the cell output.
len(embeddings), (len(embeddings[0]) if embeddings else 0)

In [13]:
def _to_chunk_record(element, embedding):
    """Combine one element dict and its embedding into a flat chunk record."""
    kind = element['type']
    source_file = element['metadata']['filename']
    body = element['text']
    return {
        "id":  None,
        "type": kind,
        "title": source_file,
        "url": "None",
        "content": body,
        "label": "No Label",
        "tokens": len(body.split()),  # whitespace-separated word count
        "embedding": embedding,
    }


# Pair each element with its embedding by position; pairing stops at the
# shorter of the two lists.
chunk_embeddings_with_metadata = list(map(_to_chunk_record, element_dicts, embeddings))

In [20]:
from typing import List, Dict 
from weaviate.client import WeaviateClient
import weaviate.classes as wvc
def load_chunks_into_weaviate(chunks: List[Dict], client: WeaviateClient, collection_name: str):
    """Recreate ``collection_name`` and bulk-insert ``chunks`` into it.

    Each chunk must provide the keys ``content``, ``tokens``, ``title``,
    ``type``, ``url`` and ``label``; these become the stored object's
    properties. The collection is recreated through ``create_collection`` on
    every call.

    NOTE(review): an ``embedding`` key on a chunk is not forwarded to Weaviate,
    so stored vectors come from the collection's own vectorizer configuration.
    Confirm whether the precomputed embeddings were meant to be stored.

    Args:
        chunks: Chunk records to insert.
        client: Weaviate client; it is used as a context manager around the
            insert and is therefore closed once the insert finishes.
        collection_name: Name of the collection to recreate and fill.

    Returns:
        None. A summary of loaded (and, if any, failed) chunks is printed.
    """
    property_keys = ("content", "tokens", "title", "type", "url", "label")

    collection = create_collection(client, collection_name)
    chunk_objs = [
        wvc.data.DataObject(properties={key: chunk[key] for key in property_keys})
        for chunk in chunks
    ]

    failed = {}
    if chunk_objs:  # nothing to send for an empty input, so skip the request
        with client:
            result = collection.data.insert_many(chunk_objs)
        # insert_many reports per-object failures on its result; surface them
        # instead of claiming every chunk was loaded.
        if result.has_errors:
            failed = result.errors

    if failed:
        print(f"Failed to load {len(failed)} chunks into Weaviate")
        for index, error in list(failed.items())[:5]:
            print(f"  chunk {index}: {error.message}")

    print(f"Loaded {len(chunks) - len(failed)} chunks into Weaviate")
        
# Rebuild the 'PHB' collection and load every prepared chunk into it.
load_chunks_into_weaviate(
    chunks=chunk_embeddings_with_metadata,
    client=client,
    collection_name='PHB',
)

Loaded 13847 chunks into Weaviate
