In [1]:
from history_book.data_ingestion.book_ingestion import (
    build_history_book_db,
)
import weaviate

from history_book.data_models.book import (
    BookDBModel,
    ChapterDBModel,
    ParagraphDBModel,
)



In [2]:
# client = weaviate.connect_to_local(port=8081, grpc_port=50052)
client = weaviate.connect_to_local(port=8081, grpc_port=50052) # test server

In [3]:
# check existing collections
for collection_name in client.collections.list_all().keys():
    print(f"Collection: {collection_name}")

Collection: Books
Collection: Chapters
Collection: Paragraphs


In [4]:
# First check if collections exist and delete them if they do
for collection_name in client.collections.list_all().keys():
    print(f"Deleting existing collection: {collection_name}")
    client.collections.delete(collection_name)

Deleting existing collection: Books
Deleting existing collection: Chapters
Deleting existing collection: Paragraphs


## Read book data

In [5]:
all_books, all_chapters, all_paragraphs = build_history_book_db()

Opening book file...
Extracting book and chapter titles...
Process book data:


/Users/chris/Desktop/historyBook/history_book/.venv/lib/python3.11/site-packages/weaviate/collections/classes/config.py:1975: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  for cls_field in self.model_fields:


KeyboardInterrupt: 

In [6]:
len(all_books), len(all_chapters), len(all_paragraphs)

NameError: name 'all_books' is not defined

In [7]:
all_books

NameError: name 'all_books' is not defined

## Connect to Weaviate server

In [None]:
# test server connection: local port 8080
import requests


def test_server_connection():
    try:
        response = requests.get("http://localhost:8080")
        if response.status_code == 200:
            print("Server is running and reachable.")
        else:
            print(f"Server responded with status code: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"Error connecting to server: {e}")


test_server_connection()

Server is running and reachable.


In [9]:
# connect to Weaviate server
client = weaviate.connect_to_local()

            Please make sure to close the connection using `client.close()`.
  client = weaviate.connect_to_local()


In [10]:
# check if the client is connected
assert client.is_ready(), "Weaviate client is not connected."

In [11]:
client.collections.list_all()['Paragraphs'].to_dict()

{'name': 'Paragraphs',
 'properties': [_Property(name='text', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs={'text2vec-openai': _PropertyVectorizerConfig(skip=False, vectorize_property_name=True)}),
  _Property(name='page', description=None, data_type=<DataType.INT: 'int'>, index_filterable=True, index_range_filters=False, index_searchable=False, nested_properties=None, tokenization=None, vectorizer_config=None, vectorizer=None, vectorizer_configs={'text2vec-openai': _PropertyVectorizerConfig(skip=False, vectorize_property_name=True)}),
  _Property(name='paragraph_index', description=None, data_type=<DataType.INT: 'int'>, index_filterable=True, index_range_filters=False, index_searchable=False, nested_properties=None, tokenization=None, vectorizer_config=None, vectorizer=None, vector

In [12]:
client.collections.list_all()['Paragraphs']

_CollectionConfigSimple(name='Paragraphs', description=None, generative_config=None, properties=[_Property(name='text', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs={'text2vec-openai': _PropertyVectorizerConfig(skip=False, vectorize_property_name=True)}), _Property(name='page', description=None, data_type=<DataType.INT: 'int'>, index_filterable=True, index_range_filters=False, index_searchable=False, nested_properties=None, tokenization=None, vectorizer_config=None, vectorizer=None, vectorizer_configs={'text2vec-openai': _PropertyVectorizerConfig(skip=False, vectorize_property_name=True)}), _Property(name='paragraph_index', description=None, data_type=<DataType.INT: 'int'>, index_filterable=True, index_range_filters=False, index_searchable=False, nested_properties=None, tokenization

## Test adding items (vectorization) [OBSOLETE -- items are now added during creation]

In [13]:
p_test = all_paragraphs[0]

In [14]:
result = p_test.write_model_to_collection()

In [15]:
result

UUID('7530abac-8974-452d-89e0-6fd743f73296')

In [18]:
paragraph_collection = client.collections.get("Paragraphs")

In [19]:
# get item from collection
retrieved_item = paragraph_collection.query.fetch_object_by_id(result, include_vector=True)

In [20]:
# assume dict with only one item is returned, pull out value
list(retrieved_item.vector.values())[0]

[0.09001196920871735,
 -0.017276706174016,
 -0.02438378892838955,
 0.020122868940234184,
 -0.030841749161481857,
 0.014447187073528767,
 0.0015510336961597204,
 0.028444981202483177,
 -0.037882257252931595,
 -0.037882257252931595,
 0.01942381076514721,
 0.05402715504169464,
 0.022785944864153862,
 0.03089168108999729,
 0.08215589821338654,
 0.03370455652475357,
 -0.09467235207557678,
 -0.02255292423069477,
 0.02040581963956356,
 0.042109888046979904,
 -0.024483654648065567,
 0.05612432584166527,
 -0.08608393371105194,
 0.06571140140295029,
 -0.007668826729059219,
 0.051430657505989075,
 0.07063809037208557,
 -0.02866135537624359,
 0.016577647998929024,
 0.07030520588159561,
 0.05056515708565712,
 0.03878104314208031,
 0.04174371808767319,
 -0.16071663796901703,
 -0.059786055237054825,
 -0.05033213645219803,
 0.007847752422094345,
 -0.014763427898287773,
 -0.00015642971266061068,
 0.07063809037208557,
 0.09327423572540283,
 0.0073817139491438866,
 -0.038048699498176575,
 0.0219037998467

In [None]:
# object has embedding after writing to collection
p_test.__dict__

{'id': '7530abac-8974-452d-89e0-6fd743f73296',
 'client': <weaviate.client.WeaviateClient at 0x128b2d290>,
 'collection': <weaviate.collections.collection.sync.Collection at 0x106497310>,
 'text': 'BEFO RE HI ST O RY When does History begin? It is tempting to reply ‘in the beginning’, but like many obvious answers, this soon turns out to be unhelpful. As a great Swiss historian once pointed out in another connection, history is the one subject where you cannot begin at the beginning. We can trace the chain of human descent back to the appearance of vertebrates, or even to the photosynthetic cells and other basic structures which lie at the start of life itself. We can go back further still, to the almost unimaginable upheavals which formed this planet and even to the origins of the universe. Yet this is not ‘history’.',
 'embedding': [0.09001196920871735,
  -0.017276706174016,
  -0.02438378892838955,
  0.020122868940234184,
  -0.030841749161481857,
  0.014447187073528767,
  0.001551033

## Test/Check

In [10]:
from weaviate.classes.query import Sort

### book

In [5]:
# read book collection
book_collection = client.collections.get("Books")

In [6]:
books_check = book_collection.query.fetch_objects(
    sort=Sort.by_property(name="book_index")
)

In [7]:
[obj.properties for obj in books_check.objects]

[{'end_page': 73,
  'book_index': 0,
  'start_page': 19,
  'title': 'BEFORE HISTORY'},
 {'end_page': 241,
  'book_index': 1,
  'start_page': 74,
  'title': 'CIVILIZATIONS'},
 {'end_page': 473,
  'book_index': 2,
  'start_page': 242,
  'title': 'THE CLASSICAL AGE'},
 {'end_page': 771,
  'book_index': 3,
  'start_page': 474,
  'title': 'THE AGE OF DIVERGING TRADITIONS'},
 {'end_page': 1005,
  'book_index': 4,
  'start_page': 772,
  'title': 'THE MAKING OF THE EUROPEAN AGE'},
 {'end_page': 1233,
  'book_index': 5,
  'start_page': 1006,
  'title': 'THE GREAT ACCELERATION'},
 {'end_page': 1420,
  'book_index': 6,
  'start_page': 1234,
  'title': 'THE END OF THE EUROPEAN AGE'},
 {'end_page': 1699,
  'book_index': 7,
  'start_page': 1421,
  'title': 'OUR OWN TIME'}]

### chapter

In [8]:
# read chapter collection
chapter_collection = client.collections.get("Chapters")

In [9]:
chapters_check = chapter_collection.query.fetch_objects(
    sort=Sort.by_property(name="book_index").by_property(name="chapter_index")
)

In [10]:
[obj.properties for obj in chapters_check.objects]

[{'chapter_index': 0,
  'end_page': 20,
  'book_index': 0,
  'start_page': 19,
  'title': 'Introduction'},
 {'end_page': 41,
  'chapter_index': 1,
  'book_index': 0,
  'start_page': 21,
  'title': 'The Foundations'},
 {'end_page': 58,
  'chapter_index': 2,
  'book_index': 0,
  'start_page': 42,
  'title': 'Homo Sapiens'},
 {'end_page': 73,
  'chapter_index': 3,
  'book_index': 0,
  'start_page': 59,
  'title': 'The Possibility of Civilization'},
 {'end_page': 75,
  'chapter_index': 0,
  'book_index': 1,
  'start_page': 74,
  'title': 'Introduction'},
 {'end_page': 86,
  'chapter_index': 1,
  'book_index': 1,
  'start_page': 76,
  'title': 'Early Civilized Life'},
 {'end_page': 109,
  'chapter_index': 2,
  'book_index': 1,
  'start_page': 87,
  'title': 'Ancient Mesopotamia'},
 {'end_page': 138,
  'chapter_index': 3,
  'book_index': 1,
  'start_page': 110,
  'title': 'Ancient Egypt'},
 {'end_page': 172,
  'chapter_index': 4,
  'book_index': 1,
  'start_page': 139,
  'title': 'Intruders 

### paragraph

In [8]:
# read paragraph collection
paragraph_collection = client.collections.get("Paragraphs")

In [11]:
paragraph_check = paragraph_collection.query.fetch_objects(
    limit=200,
    sort=Sort.by_property(name="book_index")
    .by_property(name="chapter_index")
    .by_property(name="paragraph_index"),
    include_vector=True,
)

In [12]:
paragraph_check.objects[0]

Object(uuid=_WeaviateUUIDInt('63726ece-7e1c-476b-adbc-03c59b060650'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'paragraph_index': 0, 'text': 'BEFO RE HI ST O RY When does History begin? It is tempting to reply ‘in the beginning’, but like many obvious answers, this soon turns out to be unhelpful. As a great Swiss historian once pointed out in another connection, history is the one subject where you cannot begin at the beginning. We can trace the chain of human descent back to the appearance of vertebrates, or even to the photosynthetic cells and other basic structures which lie at the start of life itself. We can go back further still, to the almost unimaginable upheavals which formed this planet and even to the origins of the universe. Yet this is not ‘history’.', 'page': 19, 'chapter_index': 0, 'book_index': 0}, references=None, vector={'text_ve

In [13]:
[obj.properties for obj in paragraph_check.objects]

[{'paragraph_index': 0,
  'text': 'BEFO RE HI ST O RY When does History begin? It is tempting to reply ‘in the beginning’, but like many obvious answers, this soon turns out to be unhelpful. As a great Swiss historian once pointed out in another connection, history is the one subject where you cannot begin at the beginning. We can trace the chain of human descent back to the appearance of vertebrates, or even to the photosynthetic cells and other basic structures which lie at the start of life itself. We can go back further still, to the almost unimaginable upheavals which formed this planet and even to the origins of the universe. Yet this is not ‘history’.',
  'page': 19,
  'chapter_index': 0,
  'book_index': 0},
 {'paragraph_index': 1,
  'text': 'Common sense helps here: history is the story of mankind, of what it has done, suffered or enjoyed. We all know that dogs and cats do not have histories, while human beings do. Even when historians write about a natural process beyond human

# Old

## (OLD) create weaviate schema for books 

In [9]:
from weaviate.collections.classes.config import DataType
from weaviate.collections.classes.config import ReferenceProperty

In [10]:
def pydantic_to_weaviate_schema(
    model_class, collection_name=None, references=None, vectorize_fields=None
):
    """
    Convert a Pydantic model to a Weaviate schema definition

    Args:
        model_class: Pydantic model class
        collection_name: Optional name for the collection (defaults to model class name)
        references: Optional list of references configurations
        vectorize_fields: Optional list of field names to vectorize

    Returns:
        dict: Weaviate schema definition

    TODO: consider creating connection here too
    """
    if collection_name is None:
        collection_name = model_class.__name__.replace("DBModel", "")

    # Get model schema
    schema = model_class.model_json_schema()
    properties = []

    # Map Pydantic/Python types to Weaviate data types
    type_mapping = {
        "string": DataType.TEXT,
        "integer": DataType.INT,
        "number": DataType.NUMBER,
        "boolean": DataType.BOOL,
        "array": DataType.TEXT_ARRAY,
        "object": DataType.OBJECT,
    }

    # Extract properties from the model schema
    for field_name, field_info in schema.get("properties", {}).items():
        # Skip ID field as Weaviate handles this
        if field_name == "id":
            continue

        field_type = field_info.get("type")
        weaviate_type = type_mapping.get(field_type, DataType.TEXT)

        # Skip embedding field as Weaviate handles vectors
        if field_name == "embedding":
            continue

        property_config = {
            "name": field_name,
            "data_type": weaviate_type,
            "description": f"The {field_name} of the {collection_name.lower()}",
        }

        # Add vectorization config if requested
        if vectorize_fields and field_name in vectorize_fields:
            property_config["moduleConfig"] = {
                "text2vec-transformers": {"vectorize": True}
            }

        properties.append(property_config)

    # Create schema definition
    schema_definition = {"class": collection_name, "properties": properties}

    # Add vectorizer if we have fields to vectorize
    if vectorize_fields:
        schema_definition["vectorizer"] = "text2vec-transformers"

    # Add references if provided
    if references:
        schema_definition["references"] = references

    return schema_definition

In [11]:
# Test schema generation for BookDBModel
book_schema = pydantic_to_weaviate_schema(BookDBModel, collection_name="Test_Books")

In [12]:
import json

In [13]:
print(json.dumps(book_schema, indent=2))

{
  "class": "Test_Books",
  "properties": [
    {
      "name": "title",
      "data_type": "text",
      "description": "The title of the test_books"
    },
    {
      "name": "start_page",
      "data_type": "int",
      "description": "The start_page of the test_books"
    },
    {
      "name": "end_page",
      "data_type": "int",
      "description": "The end_page of the test_books"
    },
    {
      "name": "book_index",
      "data_type": "int",
      "description": "The book_index of the test_books"
    }
  ]
}


In [14]:
chapter_schema = pydantic_to_weaviate_schema(
    ChapterDBModel,
    collection_name="Test_Chapters",
    references=[
        ReferenceProperty(
            name="belongsToBook",
            target_collection="Test_Books",  # Single target collection name (not in a list)
        )
    ],
)

In [15]:
chapter_schema

{'class': 'Test_Chapters',
 'properties': [{'name': 'title',
   'data_type': <DataType.TEXT: 'text'>,
   'description': 'The title of the test_chapters'},
  {'name': 'start_page',
   'data_type': <DataType.INT: 'int'>,
   'description': 'The start_page of the test_chapters'},
  {'name': 'end_page',
   'data_type': <DataType.INT: 'int'>,
   'description': 'The end_page of the test_chapters'},
  {'name': 'book_id',
   'data_type': <DataType.TEXT: 'text'>,
   'description': 'The book_id of the test_chapters'},
  {'name': 'chapter_index',
   'data_type': <DataType.INT: 'int'>,
   'description': 'The chapter_index of the test_chapters'}],
 'references': [ReferenceProperty(name='belongsToBook', target_collection='Test_Books', description=None)]}

In [16]:
paragraph_schema = pydantic_to_weaviate_schema(
    ParagraphDBModel,
    collection_name="Test_Paragraphs",
    vectorize_fields=["text"],  # Make the text searchable via embeddings
    references=[
        ReferenceProperty(
            name="belongsToChapter",
            target_collection="Test_Chapters",
            description="The chapter this paragraph belongs to",
        ),
        ReferenceProperty(
            name="belongsToBook",
            target_collection="Test_Books",
            description="The book this paragraph belongs to",
        ),
    ],
)

In [17]:
paragraph_schema

{'class': 'Test_Paragraphs',
 'properties': [{'name': 'text',
   'data_type': <DataType.TEXT: 'text'>,
   'description': 'The text of the test_paragraphs',
   'moduleConfig': {'text2vec-transformers': {'vectorize': True}}},
  {'name': 'page',
   'data_type': <DataType.INT: 'int'>,
   'description': 'The page of the test_paragraphs'},
  {'name': 'paragraph_index',
   'data_type': <DataType.INT: 'int'>,
   'description': 'The paragraph_index of the test_paragraphs'},
  {'name': 'chapter_id',
   'data_type': <DataType.TEXT: 'text'>,
   'description': 'The chapter_id of the test_paragraphs'},
  {'name': 'book_id',
   'data_type': <DataType.TEXT: 'text'>,
   'description': 'The book_id of the test_paragraphs'}],
 'vectorizer': 'text2vec-transformers',
 'references': [ReferenceProperty(name='belongsToChapter', target_collection='Test_Chapters', description='The chapter this paragraph belongs to'),
  ReferenceProperty(name='belongsToBook', target_collection='Test_Books', description='The book

## (OLD) Create collections from schema

In [48]:
# First check if collections exist and delete them if they do
for collection_name in ["Test_Books", "Test_Chapters", "Test_Paragraphs"]:
    if collection_name in client.collections.list_all().keys():
        print(f"Deleting existing collection: {collection_name}")
        client.collections.delete(collection_name)

Deleting existing collection: Test_Books
Deleting existing collection: Test_Chapters
Deleting existing collection: Test_Paragraphs


In [49]:
book_schema

{'class': 'Test_Books',
 'properties': [{'name': 'title',
   'data_type': <DataType.TEXT: 'text'>,
   'description': 'The title of the test_books'},
  {'name': 'start_page',
   'data_type': <DataType.INT: 'int'>,
   'description': 'The start_page of the test_books'},
  {'name': 'end_page',
   'data_type': <DataType.INT: 'int'>,
   'description': 'The end_page of the test_books'},
  {'name': 'book_index',
   'data_type': <DataType.INT: 'int'>,
   'description': 'The book_index of the test_books'}]}

In [50]:
# Create Book collection
books_collection = client.collections.create(
    name=book_schema["class"], properties=book_schema["properties"]
)

/Users/chris/Desktop/historyBook/history_book/.venv/lib/python3.11/site-packages/weaviate/collections/classes/config.py:1975: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  for cls_field in self.model_fields:


In [51]:
# Create Chapter collection with reference to Book
chapters_collection = client.collections.create(
    name=chapter_schema["class"],
    properties=chapter_schema["properties"],
    references=chapter_schema["references"],
)

In [52]:
paragraphs_collection = client.collections.create(
    name=paragraph_schema["class"],
    properties=paragraph_schema["properties"],
    references=paragraph_schema["references"],
)

## Write data to collections

In [53]:
for book in all_books:
    books_collection.data.insert(
        properties={
            "title": book.title,
            "start_page": book.start_page,
            "end_page": book.end_page,
            "book_index": book.book_index,
        },
        uuid=book.id,  # Use existing UUID
    )
print(f"Imported {len(all_books)} books")

Imported 8 books


In [127]:
book.model_dump()

{'id': '1b25746a-4818-4fd9-b26a-62af98476d0e',
 'title': 'OUR OWN TIME',
 'start_page': 1421,
 'end_page': 1699,
 'book_index': 7}

In [54]:
# Import Chapters
success_count = 0
failure_count = 0
unique_ids = set()

for chapter in all_chapters:
    chapter_uuid = chapter.id

    # Check for duplicates
    if chapter_uuid in unique_ids:
        print(f"WARNING: Duplicate chapter UUID found: {chapter_uuid}")
        failure_count += 1
        continue

    unique_ids.add(chapter_uuid)

    try:
        # Create the chapter object
        res = chapters_collection.data.insert(
            properties={
                "title": chapter.title,
                "start_page": chapter.start_page,
                "end_page": chapter.end_page,
                "chapter_index": chapter.chapter_index,
            },
            uuid=chapter_uuid,
            references={"belongsToBook": [chapter.book_id]},
        )
        print(chapter.book_id)
        success_count += 1
    except Exception as e:
        print(f"Error inserting chapter {chapter_uuid}: {e}")
        failure_count += 1
    # print(res)

print(f"Successfully imported {success_count} chapters")
print(f"Failed to import {failure_count} chapters")

bf375278-e28b-459f-9f7d-6e8c16bee197
bf375278-e28b-459f-9f7d-6e8c16bee197
bf375278-e28b-459f-9f7d-6e8c16bee197
bf375278-e28b-459f-9f7d-6e8c16bee197
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
238bffe7-70a1-4827-b6f6-2a533f217f44
238bffe7-70a1-4827-b6f6-2a533f217f44
238bffe7-70a1-4827-b6f6-2a533f217f44
238bffe7-70a1-4827-b6f6-2a533f217f44
238bffe7-70a1-4827-b6f6-2a533f217f44
238bffe7-70a1-4827-b6f6-2a533f217f44
238bffe7-70a1-4827-b6f6-2a533f217f44
238bffe7-70a1-4827-b6f6-2a533f217f44
53962f49-6cba-4afa-9e45-0588a1493bf2
53962f49-6cba-4afa-9e45-0588a1493bf2
53962f49-6cba-4afa-9e45-0588a1493bf2
53962f49-6cba-4afa-9e45-0588a1493bf2
53962f49-6cba-4afa-9e45-0588a1493bf2
53962f49-6cba-4afa-9e45-0588a1493bf2
5

## Read data from collections

In [26]:
# read book data from Weaviate

results = client.collections.get("Test_Books")

In [27]:
books_check = results.query.fetch_objects()

In [28]:
[obj.properties for obj in books_check.objects]

[{'start_page': 1421,
  'book_index': 7,
  'title': 'OUR OWN TIME',
  'end_page': 1699},
 {'book_index': 2,
  'start_page': 242,
  'title': 'THE CLASSICAL AGE',
  'end_page': 473},
 {'start_page': 1006,
  'book_index': 5,
  'title': 'THE GREAT ACCELERATION',
  'end_page': 1233},
 {'start_page': 772,
  'book_index': 4,
  'title': 'THE MAKING OF THE EUROPEAN AGE',
  'end_page': 1005},
 {'start_page': 74,
  'book_index': 1,
  'title': 'CIVILIZATIONS',
  'end_page': 241},
 {'start_page': 474,
  'book_index': 3,
  'title': 'THE AGE OF DIVERGING TRADITIONS',
  'end_page': 771},
 {'start_page': 19,
  'book_index': 0,
  'title': 'BEFORE HISTORY',
  'end_page': 73},
 {'start_page': 1234,
  'book_index': 6,
  'title': 'THE END OF THE EUROPEAN AGE',
  'end_page': 1420}]

In [29]:
[str(obj.uuid) for obj in books_check.objects]

['1b25746a-4818-4fd9-b26a-62af98476d0e',
 '238bffe7-70a1-4827-b6f6-2a533f217f44',
 '27109765-15c7-48b6-a7c4-8baddb55986e',
 '31623345-5c35-42e1-8d30-eb3ee9a8e9c7',
 '4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2',
 '53962f49-6cba-4afa-9e45-0588a1493bf2',
 'bf375278-e28b-459f-9f7d-6e8c16bee197',
 'fa287e1c-4377-4350-88d9-0345dfeb9d60']

In [30]:
len([chapter.id for chapter in all_chapters])

62

In [31]:
len(set([chapter.id for chapter in all_chapters]))

62

In [62]:
# read chapter data from Weaviate

chapter_collection = client.collections.get("Test_Chapters")

In [67]:
from weaviate.classes.query import QueryReference

In [68]:
chapters_check = chapter_collection.query.fetch_objects(
    limit=None,
    return_references=QueryReference(
        link_on="belongsToBook",  # Reference field name
        return_properties=[
            "title",
            "book_index",
        ],  # Properties to return from the referenced object
    ),
)
print(f"Retrieved {len(chapters_check.objects)} chapters")

Retrieved 62 chapters


In [69]:
len(chapters_check.objects)

62

In [34]:
import pandas as pd

In [70]:
books_df = pd.DataFrame([obj.properties for obj in books_check.objects])
books_df["id"] = [str(obj.uuid) for obj in books_check.objects]

In [71]:
books_df

Unnamed: 0,start_page,book_index,title,end_page,id
0,1421,7,OUR OWN TIME,1699,1b25746a-4818-4fd9-b26a-62af98476d0e
1,242,2,THE CLASSICAL AGE,473,238bffe7-70a1-4827-b6f6-2a533f217f44
2,1006,5,THE GREAT ACCELERATION,1233,27109765-15c7-48b6-a7c4-8baddb55986e
3,772,4,THE MAKING OF THE EUROPEAN AGE,1005,31623345-5c35-42e1-8d30-eb3ee9a8e9c7
4,74,1,CIVILIZATIONS,241,4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
5,474,3,THE AGE OF DIVERGING TRADITIONS,771,53962f49-6cba-4afa-9e45-0588a1493bf2
6,19,0,BEFORE HISTORY,73,bf375278-e28b-459f-9f7d-6e8c16bee197
7,1234,6,THE END OF THE EUROPEAN AGE,1420,fa287e1c-4377-4350-88d9-0345dfeb9d60


In [None]:
# get uuid from reference
str(chapters_check.objects[0].uuid)

'07c4d0ea-d249-4467-b9cb-54dc54ec058a'

In [None]:
# get requested reference properties
chapters_df = pd.DataFrame([obj.properties for obj in chapters_check.objects])
chapters_df["id"] = [str(obj.uuid) for obj in chapters_check.objects]

In [91]:
chapter_book_props = [
    chapter.references["belongsToBook"].objects[0].properties
    for chapter in chapters_check.objects
]
chapter_book_props

[{'book_index': 4, 'title': 'THE MAKING OF THE EUROPEAN AGE'},
 {'book_index': 6, 'title': 'THE END OF THE EUROPEAN AGE'},
 {'book_index': 7, 'title': 'OUR OWN TIME'},
 {'book_index': 2, 'title': 'THE CLASSICAL AGE'},
 {'book_index': 5, 'title': 'THE GREAT ACCELERATION'},
 {'book_index': 3, 'title': 'THE AGE OF DIVERGING TRADITIONS'},
 {'book_index': 5, 'title': 'THE GREAT ACCELERATION'},
 {'book_index': 3, 'title': 'THE AGE OF DIVERGING TRADITIONS'},
 {'book_index': 3, 'title': 'THE AGE OF DIVERGING TRADITIONS'},
 {'book_index': 0, 'title': 'BEFORE HISTORY'},
 {'book_index': 1, 'title': 'CIVILIZATIONS'},
 {'book_index': 2, 'title': 'THE CLASSICAL AGE'},
 {'book_index': 7, 'title': 'OUR OWN TIME'},
 {'book_index': 1, 'title': 'CIVILIZATIONS'},
 {'book_index': 3, 'title': 'THE AGE OF DIVERGING TRADITIONS'},
 {'book_index': 6, 'title': 'THE END OF THE EUROPEAN AGE'},
 {'book_index': 6, 'title': 'THE END OF THE EUROPEAN AGE'},
 {'book_index': 4, 'title': 'THE MAKING OF THE EUROPEAN AGE'},

In [100]:
chapter_book_props_df = pd.DataFrame(chapter_book_props).rename(
    columns={"title": "book_title", "book_index": "book_index"}
)

In [101]:
chapters_with_book_df = pd.concat([chapters_df, chapter_book_props_df], axis=1)
chapters_with_book_df.sort_values(by=["book_index", "chapter_index"], inplace=True)

In [102]:
chapters_with_book_df

Unnamed: 0,book_id,chapter_index,start_page,title,end_page,id,book_index,book_title
9,,0,19,Introduction,20,230445a3-f212-4401-995c-eff775f9a5c8,0,BEFORE HISTORY
21,,1,21,The Foundations,41,44fda5ee-6417-41f6-8096-4d7bb6c240e2,0,BEFORE HISTORY
53,,2,42,Homo Sapiens,58,dc706b17-88c8-40a8-993a-b27d33160fe8,0,BEFORE HISTORY
42,,3,59,The Possibility of Civilization,73,a65f6c7e-6807-429c-8fd1-1770162590fa,0,BEFORE HISTORY
44,,0,74,Introduction,75,b821843a-6af6-4e84-a9ec-d82aeb7a859d,1,CIVILIZATIONS
...,...,...,...,...,...,...,...,...
12,,2,1481,The Cold War World,1557,2dacbb93-924b-4864-a182-c26a5a487d24,7,OUR OWN TIME
2,,3,1558,Crises and Détente,1604,0ee516dd-c59d-4f9f-8bb7-238936eda532,7,OUR OWN TIME
54,,4,1605,The Closing of an Era,1635,e04e35ff-21a5-4c03-b2e5-9f53bc3a21a1,7,OUR OWN TIME
55,,5,1636,Openings and Closures,1681,e04f08b8-69c9-42e3-9a07-ca156ec3a56d,7,OUR OWN TIME


## data model -> collection (skip intermediate schema)

In [105]:
client.collections.list_all()

{'Paragraphs': _CollectionConfigSimple(name='Paragraphs', description=None, generative_config=None, properties=[_Property(name='text', description='The text of the paragraphs', data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none', vectorizer_configs=None), _Property(name='page', description='The page of the paragraphs', data_type=<DataType.INT: 'int'>, index_filterable=True, index_range_filters=False, index_searchable=False, nested_properties=None, tokenization=None, vectorizer_config=None, vectorizer='none', vectorizer_configs=None), _Property(name='paragraph_index', description='The paragraph_index of the paragraphs', data_type=<DataType.INT: 'int'>, index_filterable=True, index_range_filters=False, index_searchable=False, nested_properties=None, tokenization=None, vectorizer_config=None, vectorizer='none', vectorizer_confi

In [114]:
def create_collection_from_pydantic(
    client, model_class, collection_name=None, references=None, vectorize_fields=None
):
    """
    Create a Weaviate collection directly from a Pydantic model

    Args:
        client: Weaviate client
        model_class: Pydantic model class
        collection_name: Optional name for the collection (defaults to model class name)
        references: Optional list of references configurations
        vectorize_fields: Optional list of field names to vectorize

    Returns:
        dict: Weaviate schema definition

    TODO: consider creating connection here too
    """
    if collection_name is None:
        collection_name = model_class.__name__.replace("DBModel", "")

    # Get model schema
    schema = model_class.model_json_schema()
    properties = []

    # Map Pydantic/Python types to Weaviate data types
    type_mapping = {
        "string": DataType.TEXT,
        "integer": DataType.INT,
        "number": DataType.NUMBER,
        "boolean": DataType.BOOL,
        "array": DataType.TEXT_ARRAY,
        "object": DataType.OBJECT,
    }

    # Check if collection exists and delete
    if collection_name in client.collections.list_all().keys():
        client.collections.delete(collection_name)

    # Extract properties from the model schema
    for field_name, field_info in schema.get("properties", {}).items():
        # Skip ID field as Weaviate handles this
        if field_name == "id":
            continue

        field_type = field_info.get("type")
        weaviate_type = type_mapping.get(field_type, DataType.TEXT)

        # Skip embedding field as Weaviate handles vectors
        if field_name == "embedding":
            continue

        property_config = {
            "name": field_name,
            "data_type": weaviate_type,
            "description": f"The {field_name} of the {collection_name.lower()}",
        }

        # Add vectorization config if requested
        if vectorize_fields and field_name in vectorize_fields:
            property_config["moduleConfig"] = {
                "text2vec-transformers": {"vectorize": True}
            }

        properties.append(property_config)

    # Create collection
    collection_config = {"name": collection_name, "properties": properties}

    # Add vectorizer if we have fields to vectorize
    if vectorize_fields:
        collection_config["vectorizer"] = "text2vec-transformers"

    # Add references if provided
    if references:
        collection_config["references"] = references

    # Create the collection
    collection = client.collections.create(**collection_config)

    return collection

In [107]:
books_collection = create_collection_from_pydantic(
    client, BookDBModel, collection_name="Test_Books"
)

/Users/chris/Desktop/historyBook/history_book/.venv/lib/python3.11/site-packages/weaviate/collections/classes/config.py:1975: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  for cls_field in self.model_fields:


In [None]:
type(books_collection)

weaviate.collections.collection.sync.Collection

In [108]:
for book in all_books:
    books_collection.data.insert(
        properties={
            "title": book.title,
            "start_page": book.start_page,
            "end_page": book.end_page,
            "book_index": book.book_index,
        },
        uuid=book.id,  # Use existing UUID
    )
print(f"Imported {len(all_books)} books")

Imported 8 books


In [109]:
books_check = results.query.fetch_objects()
[obj.properties for obj in books_check.objects]

[{'book_id': None,
  'chapter_index': 7,
  'start_page': 975,
  'title': 'Ideas Old and New',
  'end_page': 1005},
 {'book_id': None,
  'chapter_index': 1,
  'start_page': 1236,
  'title': 'Strains in the System',
  'end_page': 1262},
 {'book_id': None,
  'chapter_index': 3,
  'start_page': 1558,
  'title': 'Crises and Détente',
  'end_page': 1604},
 {'book_id': None,
  'chapter_index': 1,
  'start_page': 244,
  'title': 'Remaking the Old World',
  'end_page': 248},
 {'book_id': None,
  'chapter_index': 2,
  'start_page': 1036,
  'title': 'Political Change in an Age of Revolution',
  'end_page': 1071},
 {'book_id': None,
  'chapter_index': 0,
  'start_page': 474,
  'title': 'Introduction',
  'end_page': 475},
 {'book_id': None,
  'chapter_index': 3,
  'start_page': 1072,
  'title': 'Political Change: A New Europe',
  'end_page': 1102},
 {'book_id': None,
  'chapter_index': 6,
  'start_page': 634,
  'title': 'Imperial China',
  'end_page': 663},
 {'end_page': 740,
  'chapter_index': 9,


In [115]:
chapters_collection = create_collection_from_pydantic(
    client,
    ChapterDBModel,
    collection_name="Test_Chapters",
    references=[
        ReferenceProperty(
            name="belongsToBook",
            target_collection="Test_Books",  # Single target collection name (not in a list)
        )
    ],
)

In [116]:
# Import Chapters
success_count = 0
failure_count = 0
unique_ids = set()

for chapter in all_chapters:
    chapter_uuid = chapter.id

    # Check for duplicates
    if chapter_uuid in unique_ids:
        print(f"WARNING: Duplicate chapter UUID found: {chapter_uuid}")
        failure_count += 1
        continue

    unique_ids.add(chapter_uuid)

    try:
        # Create the chapter object
        res = chapters_collection.data.insert(
            properties={
                "title": chapter.title,
                "start_page": chapter.start_page,
                "end_page": chapter.end_page,
                "chapter_index": chapter.chapter_index,
            },
            uuid=chapter_uuid,
            references={"belongsToBook": [chapter.book_id]},
        )
        print(chapter.book_id)
        success_count += 1
    except Exception as e:
        print(f"Error inserting chapter {chapter_uuid}: {e}")
        failure_count += 1
    # print(res)

print(f"Successfully imported {success_count} chapters")
print(f"Failed to import {failure_count} chapters")

bf375278-e28b-459f-9f7d-6e8c16bee197
bf375278-e28b-459f-9f7d-6e8c16bee197
bf375278-e28b-459f-9f7d-6e8c16bee197
bf375278-e28b-459f-9f7d-6e8c16bee197
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
4f0cc3cc-9cd9-414e-b958-5ae8dd242fa2
238bffe7-70a1-4827-b6f6-2a533f217f44
238bffe7-70a1-4827-b6f6-2a533f217f44
238bffe7-70a1-4827-b6f6-2a533f217f44
238bffe7-70a1-4827-b6f6-2a533f217f44
238bffe7-70a1-4827-b6f6-2a533f217f44
238bffe7-70a1-4827-b6f6-2a533f217f44
238bffe7-70a1-4827-b6f6-2a533f217f44
238bffe7-70a1-4827-b6f6-2a533f217f44
53962f49-6cba-4afa-9e45-0588a1493bf2
53962f49-6cba-4afa-9e45-0588a1493bf2
53962f49-6cba-4afa-9e45-0588a1493bf2
53962f49-6cba-4afa-9e45-0588a1493bf2
53962f49-6cba-4afa-9e45-0588a1493bf2
53962f49-6cba-4afa-9e45-0588a1493bf2
5

In [117]:
chapter_collection = client.collections.get("Test_Chapters")

In [118]:
chapters_check = chapter_collection.query.fetch_objects(
    limit=None,
    return_references=QueryReference(
        link_on="belongsToBook",  # Reference field name
        return_properties=[
            "title",
            "book_index",
        ],  # Properties to return from the referenced object
    ),
)
print(f"Retrieved {len(chapters_check.objects)} chapters")

Retrieved 62 chapters


In [119]:
# get requested reference properties
chapters_df = pd.DataFrame([obj.properties for obj in chapters_check.objects])
chapters_df["id"] = [str(obj.uuid) for obj in chapters_check.objects]

In [120]:
chapter_book_props = [
    chapter.references["belongsToBook"].objects[0].properties
    for chapter in chapters_check.objects
]
chapter_book_props

[{'book_index': 4, 'title': 'THE MAKING OF THE EUROPEAN AGE'},
 {'book_index': 6, 'title': 'THE END OF THE EUROPEAN AGE'},
 {'book_index': 7, 'title': 'OUR OWN TIME'},
 {'book_index': 2, 'title': 'THE CLASSICAL AGE'},
 {'book_index': 5, 'title': 'THE GREAT ACCELERATION'},
 {'book_index': 3, 'title': 'THE AGE OF DIVERGING TRADITIONS'},
 {'book_index': 5, 'title': 'THE GREAT ACCELERATION'},
 {'book_index': 3, 'title': 'THE AGE OF DIVERGING TRADITIONS'},
 {'book_index': 3, 'title': 'THE AGE OF DIVERGING TRADITIONS'},
 {'book_index': 0, 'title': 'BEFORE HISTORY'},
 {'book_index': 1, 'title': 'CIVILIZATIONS'},
 {'book_index': 2, 'title': 'THE CLASSICAL AGE'},
 {'book_index': 7, 'title': 'OUR OWN TIME'},
 {'book_index': 1, 'title': 'CIVILIZATIONS'},
 {'book_index': 3, 'title': 'THE AGE OF DIVERGING TRADITIONS'},
 {'book_index': 6, 'title': 'THE END OF THE EUROPEAN AGE'},
 {'book_index': 6, 'title': 'THE END OF THE EUROPEAN AGE'},
 {'book_index': 4, 'title': 'THE MAKING OF THE EUROPEAN AGE'},

In [121]:
chapter_book_props_df = pd.DataFrame(chapter_book_props).rename(
    columns={"title": "book_title", "book_index": "book_index"}
)

In [122]:
chapters_with_book_df = pd.concat([chapters_df, chapter_book_props_df], axis=1)
chapters_with_book_df.sort_values(by=["book_index", "chapter_index"], inplace=True)

In [123]:
chapters_with_book_df

Unnamed: 0,book_id,chapter_index,start_page,title,end_page,id,book_index,book_title
9,,0,19,Introduction,20,230445a3-f212-4401-995c-eff775f9a5c8,0,BEFORE HISTORY
21,,1,21,The Foundations,41,44fda5ee-6417-41f6-8096-4d7bb6c240e2,0,BEFORE HISTORY
53,,2,42,Homo Sapiens,58,dc706b17-88c8-40a8-993a-b27d33160fe8,0,BEFORE HISTORY
42,,3,59,The Possibility of Civilization,73,a65f6c7e-6807-429c-8fd1-1770162590fa,0,BEFORE HISTORY
44,,0,74,Introduction,75,b821843a-6af6-4e84-a9ec-d82aeb7a859d,1,CIVILIZATIONS
...,...,...,...,...,...,...,...,...
12,,2,1481,The Cold War World,1557,2dacbb93-924b-4864-a182-c26a5a487d24,7,OUR OWN TIME
2,,3,1558,Crises and Détente,1604,0ee516dd-c59d-4f9f-8bb7-238936eda532,7,OUR OWN TIME
54,,4,1605,The Closing of an Era,1635,e04e35ff-21a5-4c03-b2e5-9f53bc3a21a1,7,OUR OWN TIME
55,,5,1636,Openings and Closures,1681,e04f08b8-69c9-42e3-9a07-ca156ec3a56d,7,OUR OWN TIME
