Skip to content

[FIX] Correct scope for new functions in the optimization context (CF-687) #490

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions code_to_optimize/code_directories/circular_deps/constants.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,2 @@
# Default base URLs (note the trailing slash on both values).
# NOTE(review): presumably the Galileo API and web-app endpoints, going by the names — confirm with callers.
DEFAULT_API_URL = "https://api.galileo.ai/"
DEFAULT_APP_URL = "https://app.galileo.ai/"


# function_names: GalileoApiClient.get_console_url
# module_abs_path : /home/mohammed/Work/galileo-python/src/galileo/api_client.py
# preexisting_objects: {('GalileoApiClient', ()), ('_set_destination', ()), ('get_console_url', (FunctionParent(name='GalileoApiClient', type='ClassDef'),))}
# project_root_path: /home/mohammed/Work/galileo-python/src
115 changes: 115 additions & 0 deletions code_to_optimize/code_directories/unstructured_example/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
from __future__ import annotations

import base64
import json
import zlib
from copy import deepcopy
from typing import Any, Iterable
from utils import Point

from coordinates import PixelSpace
from elements import (
TYPE_TO_TEXT_ELEMENT_MAP,
CheckBox,
Element,
ElementMetadata,
)

# ================================================================================================
# SERIALIZATION/DESERIALIZATION (SERDE) RELATED FUNCTIONS
# ================================================================================================
# These serde functions will likely relocate to `unstructured.documents.elements` since they are
# so closely related to elements and this staging "brick" is deprecated.
# ================================================================================================

# == DESERIALIZERS ===============================


def elements_from_base64_gzipped_json(b64_encoded_elements: str) -> list[Element]:
    """Rebuild element objects from a Base64-encoded, compressed JSON string.

    Used when deserializing `ElementMetadata.orig_elements` from the compact form it takes in
    JSON and dict representations; nothing here is specific to that use, though.

    The input is Base64-decoded, decompressed with `zlib.decompress`, decoded as UTF-8, parsed as
    JSON, and the resulting element-dicts are handed to `elements_from_dicts`.
    """
    # -- Base64 str -> compressed bytes -> JSON bytes --
    raw_json = zlib.decompress(base64.b64decode(b64_encoded_elements))
    # -- JSON bytes -> JSON str -> element-dicts -> elements --
    return elements_from_dicts(json.loads(raw_json.decode("utf-8")))


def elements_from_dicts(element_dicts: Iterable[dict[str, Any]]) -> list[Element]:
    """Build element objects from an iterable of element-dicts.

    Each dict's "type" selects the class: types present in `TYPE_TO_TEXT_ELEMENT_MAP` become text
    elements built from "text", and "CheckBox" becomes a `CheckBox` built from "checked". Dicts
    with any other (or missing) type are skipped, so the result may be shorter than the input.
    """
    restored: list[Element] = []

    for entry in element_dicts:
        identifier: str = entry.get("element_id", None)

        # -- metadata is built before the type check, exactly once per dict --
        raw_metadata = entry.get("metadata")
        if raw_metadata is None:
            meta = ElementMetadata()
        else:
            meta = ElementMetadata.from_dict(raw_metadata)

        kind = entry.get("type")
        if kind in TYPE_TO_TEXT_ELEMENT_MAP:
            text_element_cls = TYPE_TO_TEXT_ELEMENT_MAP[kind]
            restored.append(
                text_element_cls(text=entry["text"], element_id=identifier, metadata=meta)
            )
        elif kind == "CheckBox":
            restored.append(
                CheckBox(checked=entry["checked"], element_id=identifier, metadata=meta)
            )

    return restored

def elements_to_base64_gzipped_json(elements: Iterable[Element]) -> str:
    """Serialize `elements` to a Base64-encoded, compressed JSON string.

    Used when serializing `ElementMetadata.orig_elements` so it stays compact when carried inside
    JSON (for example in an HTTP response); the same compressed form appears in element-dicts.
    The function itself is not tied to that purpose.

    Compression is performed with `zlib.compress`, and the JSON is emitted with sorted keys.
    """
    # -- trim numeric precision first so the JSON text is shorter --
    compact_elements = _fix_metadata_field_precision(elements)
    # -- elements -> dicts -> key-sorted JSON text --
    json_text = json.dumps(elements_to_dicts(compact_elements), sort_keys=True)
    # -- JSON text -> UTF-8 bytes -> compressed bytes -> Base64 bytes --
    encoded = base64.b64encode(zlib.compress(json_text.encode("utf-8")))
    # -- Base64 bytes -> str, suitable for use as a JSON string value --
    return encoded.decode("utf-8")


def elements_to_dicts(elements: Iterable[Element]) -> list[dict[str, Any]]:
    """Return the `to_dict()` form of every element, in input order."""
    as_dicts: list[dict[str, Any]] = []
    for element in elements:
        as_dicts.append(element.to_dict())
    return as_dicts


def _fix_metadata_field_precision(elements: Iterable[Element]) -> list[Element]:
    """Return deep copies of `elements` with selected metadata numbers rounded.

    Coordinate points are rounded to 1 decimal place when the coordinate system is a `PixelSpace`
    and to 2 otherwise; a truthy `detection_class_prob` is rounded to 5 decimal places. The input
    elements are never modified because every element is deep-copied first.
    """
    adjusted: list[Element] = []

    for element in elements:
        clone = deepcopy(element)
        meta = clone.metadata

        coords = meta.coordinates
        if coords:
            ndigits = 1 if isinstance(coords.system, PixelSpace) else 2
            points = coords.points
            assert points is not None
            # -- stored back as a tuple of (x, y) tuples --
            coords.points = tuple(
                (round(x, ndigits), round(y, ndigits)) for x, y in points
            )

        if meta.detection_class_prob:
            meta.detection_class_prob = round(meta.detection_class_prob, 5)

        adjusted.append(clone)

    return adjusted
113 changes: 113 additions & 0 deletions code_to_optimize/code_directories/unstructured_example/coordinates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
from __future__ import annotations

from enum import Enum
from typing import Any, Dict, Sequence, Tuple, Union


class Orientation(Enum):
    """Axis directions of a coordinate system, stored as an (x, y) pair of +1/-1 values."""

    # Origin in the top left; y grows downward.
    SCREEN = (1, -1)
    # Origin in the bottom left; y grows upward.
    CARTESIAN = (1, 1)


def convert_coordinate(old_t, old_t_max, new_t_max, t_orientation):
    """Linearly map a coordinate on one axis from one extent to another.

    `old_t` is first expressed as a fraction of `old_t_max`. With `t_orientation == 1` that
    fraction is kept; with `t_orientation == -1` it is mirrored (1 - fraction). The result is
    then scaled by `new_t_max`.
    """
    fraction = old_t / old_t_max
    # Weight of the mirrored term: 1 when t_orientation is -1, 0 when it is 1.
    mirrored_term = (1 - fraction) * (1 - t_orientation) / 2
    # Weight of the direct term: 1 when t_orientation is 1, 0 when it is -1.
    direct_term = fraction * (1 + t_orientation) / 2
    return (mirrored_term + direct_term) * new_t_max


class CoordinateSystem:
    """A bounded 2-D coordinate plane described by its width and height."""

    # Only declared here; the subclasses in this module assign a concrete value.
    orientation: Orientation

    def __init__(self, width: Union[int, float], height: Union[int, float]):
        self.width, self.height = width, height

    def __eq__(self, other: object):
        # Anything that is not a coordinate system is never equal.
        if not isinstance(other, CoordinateSystem):
            return False
        same_class_name = str(self.__class__.__name__) == str(other.__class__.__name__)
        return (
            same_class_name
            and self.width == other.width
            and self.height == other.height
            and self.orientation == other.orientation
        )

    def convert_from_relative(
        self,
        x: Union[float, int],
        y: Union[float, int],
    ) -> Tuple[Union[float, int], Union[float, int]]:
        """Map a point given in relative coordinates into this coordinate system."""
        orient_x, orient_y = self.orientation.value
        mapped_x = convert_coordinate(x, 1, self.width, orient_x)
        mapped_y = convert_coordinate(y, 1, self.height, orient_y)
        return mapped_x, mapped_y

    def convert_to_relative(
        self,
        x: Union[float, int],
        y: Union[float, int],
    ) -> Tuple[Union[float, int], Union[float, int]]:
        """Map a point given in this coordinate system into relative coordinates."""
        orient_x, orient_y = self.orientation.value
        mapped_x = convert_coordinate(x, self.width, 1, orient_x)
        mapped_y = convert_coordinate(y, self.height, 1, orient_y)
        return mapped_x, mapped_y

    def convert_coordinates_to_new_system(
        self,
        new_system: CoordinateSystem,
        x: Union[float, int],
        y: Union[float, int],
    ) -> Tuple[Union[float, int], Union[float, int]]:
        """Map a point from this coordinate system into `new_system`.

        The point goes through relative coordinates as an intermediate step.
        """
        relative_x, relative_y = self.convert_to_relative(x, y)
        return new_system.convert_from_relative(relative_x, relative_y)

    def convert_multiple_coordinates_to_new_system(
        self,
        new_system: CoordinateSystem,
        coordinates: Sequence[Tuple[Union[float, int], Union[float, int]]],
    ) -> Tuple[Tuple[Union[float, int], Union[float, int]], ...]:
        """Map each (x, y) pair from this coordinate system into `new_system`.

        Returns the converted points as a tuple, in input order.
        """
        return tuple(
            self.convert_coordinates_to_new_system(new_system=new_system, x=x, y=y)
            for x, y in coordinates
        )


class RelativeCoordinateSystem(CoordinateSystem):
    """Unit-sized coordinate system: x and y both range from 0 to 1."""

    orientation = Orientation.CARTESIAN

    def __init__(self):
        # Width and height are fixed at 1, so callers pass no arguments.
        super().__init__(width=1, height=1)


class PixelSpace(CoordinateSystem):
    """Pixel-based coordinate system, such as an image, with its origin at the top left."""

    orientation = Orientation.SCREEN


class PointSpace(CoordinateSystem):
    """Point-based coordinate system, such as a PDF, with its origin at the bottom left."""

    orientation = Orientation.CARTESIAN


# Lookup from a coordinate-system class name (as a string) to the class itself.
# NOTE(review): presumably used to resolve a serialized system name back to its class — confirm
# with callers. `RelativeCoordinateSystem` is not listed here.
TYPE_TO_COORDINATE_SYSTEM_MAP: Dict[str, Any] = {
"PixelSpace": PixelSpace,
"PointSpace": PointSpace,
"CoordinateSystem": CoordinateSystem,
}
Loading
Loading