def create_doc_from_dict(model_name: str, data_dict: Dict[str, Any]) -> Type['T_doc']:
    """
    Create a subclass of BaseDocument based on example data given as a dictionary.

    The type of every field is inferred from the runtime type of its example
    value. In case the example contains None as a value,
    corresponding field will be viewed as the type Any.
    Falsy but non-None examples (``0``, ``''``, ``False``, ``[]``...) keep
    their concrete type.

    :param model_name: Name of the new Document class
    :param data_dict: Dictionary of field names to their corresponding example values.
    :return: the new Document class
    :raises ValueError: if `data_dict` is empty

    EXAMPLE USAGE

    .. code-block:: python

        import numpy as np
        from docarray import BaseDocument
        from docarray.documents import ImageDoc
        from docarray.documents.helper import create_doc_from_dict

        data_dict = {'image': ImageDoc(tensor=np.random.rand(3, 224, 224)), 'author': 'me'}

        MyDoc = create_doc_from_dict(model_name='MyDoc', data_dict=data_dict)

        assert issubclass(MyDoc, BaseDocument)

    """
    if not data_dict:
        raise ValueError('`data_dict` should contain at least one item')

    # Compare against None explicitly instead of relying on truthiness:
    # a truthiness test would downgrade falsy examples such as 0 or '' to Any,
    # and would raise for objects whose truth value is ambiguous
    # (e.g. multi-element numpy arrays).
    # Each field is passed as a `(type, ...)` tuple; presumably `create_doc`
    # forwards it to pydantic, where Ellipsis marks the field as required.
    field_types = {
        field: (type(value) if value is not None else Any, ...)
        for field, value in data_dict.items()
    }
    return create_doc(__model_name=model_name, **field_types)  # type: ignore
def test_create_doc_from_dict():
    """
    Check `create_doc_from_dict` end to end: the generated class derives from
    BaseDocument, fields are validated against the inferred types, an empty
    dictionary is rejected, and a None example value accepts any type.
    """

    def random_image():
        # Fresh ImageDoc with a random (3, 224, 224) tensor.
        return ImageDoc(tensor=np.random.rand(3, 224, 224))

    example = {
        'image': random_image(),
        'text': TextDoc(text='hello'),
        'id': 123,
    }
    MyDoc = create_doc_from_dict(model_name='MyDoc', data_dict=example)
    assert issubclass(MyDoc, BaseDocument)

    doc = MyDoc(image=random_image(), text=TextDoc(text='hey'), id=111)
    assert isinstance(doc, BaseDocument)
    assert isinstance(doc.text, TextDoc)
    assert isinstance(doc.image, ImageDoc)
    assert isinstance(doc.id, int)

    # A value of the wrong type must be rejected by validation
    with pytest.raises(ValidationError):
        MyDoc(
            image=random_image(),
            text=['some', 'text'],  # should be TextDoc
            id=111,
        )

    # An empty data_dict is not allowed
    with pytest.raises(ValueError):
        create_doc_from_dict(model_name='MyDoc', data_dict={})

    # A None example value leaves the field untyped
    with_none = {'text': 'some text', 'other': None}
    MyDoc = create_doc_from_dict(model_name='MyDoc', data_dict=with_none)
    assert issubclass(MyDoc, BaseDocument)

    int_doc = MyDoc(text='txt', other=10)
    str_doc = MyDoc(text='txt', other='also text')
    assert all(isinstance(d, BaseDocument) for d in (int_doc, str_doc))