Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change serialization and add support for pydantic v2 #232

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 78 additions & 66 deletions docs/source/schema_serialization.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@
"source": [
"# Schema serialization\n",
"\n",
"A Kor schema can be serialized and deserialzed to JSON. This lets you store the schema outside of the code.\n",
"\n",
"**ATTENTION** This only works with pydantic v1 at the moment."
"A Kor schema can be serialized and deserialized to JSON. This lets you store the schema outside of the code."
]
},
{
Expand Down Expand Up @@ -41,7 +39,8 @@
},
"outputs": [],
"source": [
"from kor.nodes import Object, Text, Number"
"from kor.nodes import Object, Text, Number\n",
"from kor.serializer import loads, dumps"
]
},
{
Expand All @@ -66,7 +65,71 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{\"id\": \"personal_info\", \"description\": \"Personal information about a given person.\", \"many\": true, \"attributes\": [{\"id\": \"first_name\", \"description\": \"The first name of the person\", \"many\": false, \"examples\": [[\"John Smith went to the store\", \"John\"]], \"$type\": \"Text\"}, {\"id\": \"last_name\", \"description\": \"The last name of the person\", \"many\": false, \"examples\": [[\"John Smith went to the store\", \"Smith\"]], \"$type\": \"Text\"}, {\"id\": \"age\", \"description\": \"The age of the person in years.\", \"many\": false, \"examples\": [[\"23 years old\", \"23\"], [\"I turned three on sunday\", \"3\"]], \"$type\": \"Number\"}], \"examples\": [[\"John Smith was 23 years old. He was very tall. He knew Jane Doe. She was 5 years old.\", [{\"first_name\": \"John\", \"last_name\": \"Smith\", \"age\": 23}, {\"first_name\": \"Jane\", \"last_name\": \"Doe\", \"age\": 5}]]]}\n"
"{\n",
" \"attributes\": [\n",
" {\n",
" \"description\": \"The first name of the person\",\n",
" \"examples\": [\n",
" [\n",
" \"John Smith went to the store\",\n",
" \"John\"\n",
" ]\n",
" ],\n",
" \"id\": \"first_name\",\n",
" \"many\": false,\n",
" \"type\": \"text\"\n",
" },\n",
" {\n",
" \"description\": \"The last name of the person\",\n",
" \"examples\": [\n",
" [\n",
" \"John Smith went to the store\",\n",
" \"Smith\"\n",
" ]\n",
" ],\n",
" \"id\": \"last_name\",\n",
" \"many\": false,\n",
" \"type\": \"text\"\n",
" },\n",
" {\n",
" \"description\": \"The age of the person in years.\",\n",
" \"examples\": [\n",
" [\n",
" \"23 years old\",\n",
" 23\n",
" ],\n",
" [\n",
" \"I turned three on sunday\",\n",
" 3\n",
" ]\n",
" ],\n",
" \"id\": \"age\",\n",
" \"many\": false,\n",
" \"type\": \"number\"\n",
" }\n",
" ],\n",
" \"description\": \"Personal information about a given person.\",\n",
" \"examples\": [\n",
" [\n",
" \"John Smith was 23 years old. He was very tall. He knew Jane Doe. She was 5 years old.\",\n",
" [\n",
" {\n",
" \"age\": 23,\n",
" \"first_name\": \"John\",\n",
" \"last_name\": \"Smith\"\n",
" },\n",
" {\n",
" \"age\": 5,\n",
" \"first_name\": \"Jane\",\n",
" \"last_name\": \"Doe\"\n",
" }\n",
" ]\n",
" ]\n",
" ],\n",
" \"id\": \"personal_info\",\n",
" \"many\": true,\n",
" \"type\": \"object\"\n",
"}\n"
]
}
],
Expand Down Expand Up @@ -103,7 +166,7 @@
" many=True,\n",
")\n",
"\n",
"print(schema.json())"
"print(dumps(schema, sort_keys=True, indent=2))"
]
},
{
Expand All @@ -113,81 +176,31 @@
"source": [
"## Deserialization\n",
"\n",
"Kor lets you define the schema in JSON. The structure of the JSON matches the struture of the `Object` type.\n",
"\n",
"The following attribute types must be annotated with a type descrimintator (`$type`):\n",
"\n",
"- Number\n",
"- Text\n",
"- Bool\n",
"- Selection"
"Kor lets you define the schema in JSON. The structure of the JSON matches the structure of the `Object` type."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3bd33817",
"id": "6346cef2-f73d-47f8-b171-a3b960af6ca4",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"json = \"\"\"\n",
"{\n",
" \"id\": \"personal_info\",\n",
" \"description\": \"Personal information about a given person.\",\n",
" \"attributes\": [\n",
" {\n",
" \"$type\": \"Text\",\n",
" \"id\": \"first_name\",\n",
" \"description\": \"The first name of the person\",\n",
" \"examples\": [[\"John Smith went to the store\", \"John\"]]\n",
" },\n",
" {\n",
" \"$type\": \"Text\",\n",
" \"id\": \"last_name\",\n",
" \"description\": \"The last name of the person\",\n",
" \"examples\": [[\"John Smith went to the store\", \"Smith\"]]\n",
" },\n",
" {\n",
" \"$type\": \"Number\",\n",
" \"id\": \"age\",\n",
" \"description\": \"The age of the person in years.\",\n",
" \"examples\": [[\"23 years old\", \"23\"], [\"I turned three on sunday\", \"3\"]]\n",
" }\n",
" ],\n",
" \"examples\": [\n",
" [\n",
" \"John Smith was 23 years old. He was very tall. He knew Jane Doe. She was 5 years old.\",\n",
" [\n",
" {\"first_name\": \"John\", \"last_name\": \"Smith\", \"age\": 23},\n",
" {\"first_name\": \"Jane\", \"last_name\": \"Doe\", \"age\": 5}\n",
" ]\n",
" ]\n",
" ],\n",
" \"many\": true\n",
"}\n",
"\"\"\""
]
},
{
"cell_type": "markdown",
"id": "3581b713",
"metadata": {},
"source": [
"To deserialize a schema from JSON simply call the `parse_raw()` method."
"json = dumps(schema)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6088c98a",
"id": "f125f314-a9ee-4865-b573-a2b22570e80a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"schema = Object.parse_raw(json)"
"deserialized_object = loads(json)"
]
},
{
Expand Down Expand Up @@ -216,7 +229,6 @@
" model_name=\"gpt-3.5-turbo\",\n",
" temperature=0,\n",
" max_tokens=2000,\n",
" model_kwargs={\"frequency_penalty\": 0, \"presence_penalty\": 0, \"top_p\": 1.0},\n",
")"
]
},
Expand All @@ -234,7 +246,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"id": "193e257b-df01-45ec-af77-076d2070533b",
"metadata": {
"tags": []
Expand All @@ -246,7 +258,7 @@
"{'personal_info': [{'first_name': 'Eugene', 'last_name': '', 'age': '18'}]}"
]
},
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -257,7 +269,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"id": "c8295f36-f986-4db2-97bc-ef2e6cdbcc87",
"metadata": {
"tags": []
Expand Down Expand Up @@ -298,7 +310,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.9.6"
}
},
"nbformat": 4,
Expand Down
62 changes: 9 additions & 53 deletions kor/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,17 @@
from typing import (
Any,
Generic,
Literal,
Mapping,
Optional,
Sequence,
Tuple,
Type,
TypeVar,
Union,
)

from pydantic import BaseModel

from ._pydantic import PYDANTIC_MAJOR_VERSION

# Name of field to store the type discriminator
TYPE_DISCRIMINATOR_FIELD = "$type"

T = TypeVar("T")


Expand Down Expand Up @@ -117,34 +112,6 @@ class ExtractionSchemaNode(AbstractSchemaNode, abc.ABC):
Tuple[str, Union[bool, int, float, str, Sequence[Union[str, int, float, bool]]]]
] = tuple()

def __init__(self, **kwargs: Any) -> None:
"""Initialize."""
super().__init__(**kwargs)
if PYDANTIC_MAJOR_VERSION == 1:
self.__dict__[TYPE_DISCRIMINATOR_FIELD] = type(self).__name__

@classmethod
def parse_obj(cls, data: dict) -> ExtractionSchemaNode:
"""Parse an object."""
if PYDANTIC_MAJOR_VERSION != 1:
raise NotImplementedError("Only supported for pydantic 1.x")
type_ = data.pop(TYPE_DISCRIMINATOR_FIELD, None)
if type_ is None:
raise ValueError(f"Need to specify type ({TYPE_DISCRIMINATOR_FIELD})")
for sub in cls.__subclasses__():
if type_ == sub.__name__:
return sub(**data)
raise TypeError(f"Unknown sub-type: {type_}")

@classmethod
def validate(cls: Type[ExtractionSchemaNode], v: Any) -> ExtractionSchemaNode:
if isinstance(v, dict):
return cls.parse_obj(v)
elif isinstance(v, cls):
return v
else:
raise TypeError(f"Unsupported type: {type(v)}")


class Number(ExtractionSchemaNode):
"""Built-in number input."""
Expand All @@ -153,6 +120,8 @@ class Number(ExtractionSchemaNode):
Tuple[str, Union[int, float, Sequence[Union[float, int]]]]
] = tuple()

type: Literal["number"] = "number"

def accept(self, visitor: AbstractVisitor[T], **kwargs: Any) -> T:
"""Accept a visitor."""
return visitor.visit_number(self, **kwargs)
Expand All @@ -162,6 +131,7 @@ class Text(ExtractionSchemaNode):
"""Built-in text input."""

examples: Sequence[Tuple[str, Union[Sequence[str], str]]] = tuple()
type: Literal["text"] = "text"

def accept(self, visitor: AbstractVisitor[T], **kwargs: Any) -> T:
"""Accept a visitor."""
Expand All @@ -172,6 +142,7 @@ class Bool(ExtractionSchemaNode):
"""Built-in bool input."""

examples: Sequence[Tuple[str, Union[Sequence[bool], bool]]] = tuple()
type: Literal["bool"] = "bool"

def accept(self, visitor: AbstractVisitor[T], **kwargs: Any) -> T:
"""Accept a visitor."""
Expand All @@ -182,6 +153,7 @@ class Option(AbstractSchemaNode):
"""Built-in option input must be part of a selection input."""

examples: Sequence[str] = tuple()
type: Literal["option"] = "option"

def accept(self, visitor: AbstractVisitor[T], **kwargs: Any) -> T:
"""Accept a visitor."""
Expand Down Expand Up @@ -224,6 +196,7 @@ class Selection(AbstractSchemaNode):
options: Sequence[Option]
examples: Sequence[Tuple[str, Union[str, Sequence[str]]]] = tuple()
null_examples: Sequence[str] = tuple()
type: Literal["selection"] = "selection"

def accept(self, visitor: AbstractVisitor[T], **kwargs: Any) -> T:
"""Accept a visitor."""
Expand Down Expand Up @@ -257,7 +230,8 @@ class Object(AbstractSchemaNode):

"""

attributes: Sequence[Union[ExtractionSchemaNode, Selection, Object]]
attributes: Sequence[Union[Selection, Object, Number, Text, Bool]]
type: Literal["object"] = "object"

examples: Sequence[
Tuple[
Expand All @@ -272,21 +246,3 @@ class Object(AbstractSchemaNode):
def accept(self, visitor: AbstractVisitor[T], **kwargs: Any) -> T:
"""Accept a visitor."""
return visitor.visit_object(self, **kwargs)

@classmethod
def parse_raw(cls, *args: Any, **kwargs: Any) -> Object:
"""Parse raw data."""
if PYDANTIC_MAJOR_VERSION != 1:
raise NotImplementedError(
f"parse_raw is not supported for pydantic {PYDANTIC_MAJOR_VERSION}"
)
return super().parse_raw(*args, **kwargs)

@classmethod
def parse_obj(cls, *args: Any, **kwargs: Any) -> Object:
"""Parse an object."""
if PYDANTIC_MAJOR_VERSION != 1:
raise NotImplementedError(
f"parse_obj is not supported for pydantic {PYDANTIC_MAJOR_VERSION}"
)
return super().parse_obj(*args, **kwargs)
26 changes: 26 additions & 0 deletions kor/serializer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import json
from typing import Optional

from ._pydantic import PYDANTIC_MAJOR_VERSION
from .nodes import Object

# PUBLIC API


def loads(string: str) -> Object:
    """Build an ``Object`` schema node from its serialized string form.

    Args:
        string: The serialized schema, as a string.

    Returns:
        The ``Object`` produced by the pydantic parsing entry point.
    """
    # Any major version other than 1 goes through ``model_validate_json``;
    # pydantic 1.x goes through ``parse_raw``.
    if PYDANTIC_MAJOR_VERSION != 1:
        return Object.model_validate_json(string)  # type: ignore[attr-defined]
    return Object.parse_raw(string)  # type: ignore[attr-defined]


def dumps(
    object: Object, *, indent: Optional[int] = None, sort_keys: bool = False
) -> str:
    """Render a schema node as a JSON string.

    Args:
        object: The schema node to serialize.
        indent: Forwarded unchanged to ``json.dumps``.
        sort_keys: Forwarded unchanged to ``json.dumps``.

    Returns:
        The JSON text produced by ``json.dumps`` for the node's dict form.
    """
    # Convert the node to a plain dict first: ``dict()`` on pydantic 1.x,
    # ``model_dump()`` on any other major version.
    as_dict = (
        object.dict()  # type: ignore[attr-defined]
        if PYDANTIC_MAJOR_VERSION == 1
        else object.model_dump()  # type: ignore[attr-defined]
    )
    return json.dumps(as_dict, sort_keys=sort_keys, indent=indent)
Loading