Skip to content

Commit

Permalink
fix: fix edge case when two elements are strings (#37)
Browse files Browse the repository at this point in the history
* fix: fix pr#29

* fix: fix pr#29
  • Loading branch information
hanxiao committed Jan 12, 2022
1 parent e9ce570 commit 9646bed
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 42 deletions.
100 changes: 64 additions & 36 deletions docarray/array/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,18 @@ def __getitem__(
and len(index) == 2
and isinstance(index[0], (slice, Sequence))
):
_docs = self[index[0]]
_attrs = index[1]
if isinstance(_attrs, str):
_attrs = (index[1],)

return _docs._get_attributes(*_attrs)
if isinstance(index[0], str) and isinstance(index[1], str):
# ambiguity only comes from the second string
if index[1] in self._id2offset:
return DocumentArray([self[index[0]], self[index[1]]])
else:
return getattr(self[index[0]], index[1])
elif isinstance(index[0], (slice, Sequence)):
_docs = self[index[0]]
_attrs = index[1]
if isinstance(_attrs, str):
_attrs = (index[1],)
return _docs._get_attributes(*_attrs)
elif isinstance(index[0], bool):
return DocumentArray(itertools.compress(self._data, index))
elif isinstance(index[0], int):
Expand Down Expand Up @@ -231,31 +237,45 @@ def __setitem__(
and len(index) == 2
and isinstance(index[0], (slice, Sequence))
):
_docs = self[index[0]]
_attrs = index[1]

if isinstance(_attrs, str):
# a -> [a]
# [a, a] -> [a, a]
_attrs = (index[1],)
if isinstance(value, (list, tuple)) and not any(
isinstance(el, (tuple, list)) for el in value
):
# [x] -> [[x]]
# [[x], [y]] -> [[x], [y]]
value = (value,)
if not isinstance(value, (list, tuple)):
# x -> [x]
value = (value,)

for _a, _v in zip(_attrs, value):
if _a == 'blob':
_docs.blobs = _v
elif _a == 'embedding':
_docs.embeddings = _v
if isinstance(index[0], str) and isinstance(index[1], str):
# ambiguity only comes from the second string
if index[1] in self._id2offset:
for _d, _v in zip((self[index[0]], self[index[1]]), value):
_d._data = _v._data
self._rebuild_id2offset()
elif hasattr(self[index[0]], index[1]):
setattr(self[index[0]], index[1], value)
else:
for _d, _vv in zip(_docs, _v):
setattr(_d, _a, _vv)
# to avoid accidentally add new unsupport attribute
raise ValueError(
f'`{index[1]}` is neither a valid id nor attribute name'
)
elif isinstance(index[0], (slice, Sequence)):
_docs = self[index[0]]
_attrs = index[1]

if isinstance(_attrs, str):
# a -> [a]
# [a, a] -> [a, a]
_attrs = (index[1],)
if isinstance(value, (list, tuple)) and not any(
isinstance(el, (tuple, list)) for el in value
):
# [x] -> [[x]]
# [[x], [y]] -> [[x], [y]]
value = (value,)
if not isinstance(value, (list, tuple)):
# x -> [x]
value = (value,)

for _a, _v in zip(_attrs, value):
if _a == 'blob':
_docs.blobs = _v
elif _a == 'embedding':
_docs.embeddings = _v
else:
for _d, _vv in zip(_docs, _v):
setattr(_d, _a, _vv)
elif isinstance(index[0], bool):
if len(index) != len(self._data):
raise IndexError(
Expand Down Expand Up @@ -313,12 +333,20 @@ def __delitem__(self, index: 'DocumentArrayIndexType'):
and len(index) == 2
and isinstance(index[0], (slice, Sequence))
):
_docs = self[index[0]]
_attrs = index[1]
if isinstance(_attrs, str):
_attrs = (index[1],)
for _d in _docs:
_d.pop(*_attrs)
if isinstance(index[0], str) and isinstance(index[1], str):
# ambiguity only comes from the second string
if index[1] in self._id2offset:
del self[index[0]]
del self[index[1]]
else:
self[index[0]].pop(index[1])
elif isinstance(index[0], (slice, Sequence)):
_docs = self[index[0]]
_attrs = index[1]
if isinstance(_attrs, str):
_attrs = (index[1],)
for _d in _docs:
_d.pop(*_attrs)
elif isinstance(index[0], bool):
self._data = list(
itertools.compress(self._data, (not _i for _i in index))
Expand Down
16 changes: 13 additions & 3 deletions docs/fundamentals/document/construct.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,19 @@ d = Document()
<Document ('id',) at 5dd542406d3f11eca3241e008a366d49>
```

Every Document will have a unique random `id` that helps you identify this Document. It can be used to {ref}`access this Document inside a DocumentArray<access-elements>`. You can override this `id` or assign your own `id` during construction, as demonstrated below.
Every Document will have a unique random `id` that helps you identify this Document. It can be used to {ref}`access this Document inside a DocumentArray<access-elements>`.

````{tip}
The random `id` is the hex value of [UUID1](https://docs.python.org/3/library/uuid.html#uuid.uuid1). To convert it into the string of UUID:
```python
import uuid
str(uuid.UUID(d.id))
```
````

Though possible, it is not recommended modifying `.id` of a Document frequently, as this will lead to unexpected behavior.


## Construct with attributes

Expand All @@ -25,7 +37,6 @@ This is the most common usage of the constructor: initializing a Document object
from docarray import Document
import numpy

d0 = Document(id='my_id')
d1 = Document(text='hello')
d2 = Document(buffer=b'\f1')
d3 = Document(blob=numpy.array([1, 2, 3]))
Expand All @@ -43,7 +54,6 @@ Don't forget to leverage autocomplete in your IDE.
```

```text
<Document ('id',) at my_id>
<Document ('id', 'mime_type', 'text') at a14effee6d3e11ec8bde1e008a366d49>
<Document ('id', 'buffer') at a14f00986d3e11ec8bde1e008a366d49>
<Document ('id', 'blob') at a14f01a66d3e11ec8bde1e008a366d49>
Expand Down
2 changes: 1 addition & 1 deletion docs/get-started/what-is.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ As one can see, you can convert a DocumentArray into AwkwardArray via `.to_list(

[Zarr](https://zarr.readthedocs.io/en/stable/) is a format for the storage of chunked, compressed, N-dimensional arrays. I know Zarr quite long time ago, to me it is the package when a `numpy.ndarray` is so big to fit into memory. Zarr provides a comprehensive set of functions that allows one to chunk, compress, stream large NdArray. Hence, from that perspective, Zarr like `numpy.ndarray` focuses on numerical representation and computation.

In DocArray, the basic element one would work with is a Document, not `ndarray`. The support of `ndarray` is important, but not the full story: in the context of deep learning engineering, it is often an intermediate representation of Document for computing, then being thrown away. Therefore, having a consistent data structure that can live *long enough* to cover creating, storing, computing, transferring, returning and rendering is a motivation behind DocArray.
In DocArray, the basic element one would work with is a Document, not `ndarray`. The support of `ndarray` is important, but not the full story: in the context of deep learning engineering, `ndarray` is often an intermediate representation of Document for computing, then throw away. Therefore, having a consistent data structure that lives *long enough* to cover creating, storing, computing, transferring, returning and rendering is one of the major motivations of DocArray.

## To Jina Users

Expand Down
40 changes: 38 additions & 2 deletions tests/unit/array/test_advance_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,8 +222,6 @@ def test_advance_selector_mixed():


def test_single_boolean_and_padding():
from docarray import DocumentArray

da = DocumentArray.empty(3)

with pytest.raises(IndexError):
Expand All @@ -237,3 +235,41 @@ def test_single_boolean_and_padding():

assert len(da[True, False]) == 1
assert len(da[False, False]) == 0


def test_edge_case_two_strings():
# getitem
da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
assert da['1', 'id'] == '1'
assert len(da['1', '2']) == 2
assert isinstance(da['1', '2'], DocumentArray)
with pytest.raises(KeyError):
da['hello', '2']
with pytest.raises(AttributeError):
da['1', 'hello']
assert len(da['1', '2', '3']) == 3
assert isinstance(da['1', '2', '3'], DocumentArray)

# delitem
del da['1', '2']
assert len(da) == 1

da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
del da['1', 'id']
assert len(da) == 3
assert not da[0].id

del da['2', 'hello']

# setitem
da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
da['1', '2'] = DocumentArray.empty(2)
assert da[0].id != '1'
assert da[1].id != '2'

da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
da['1', 'text'] = 'hello'
assert da['1'].text == 'hello'

with pytest.raises(ValueError):
da['1', 'hellohello'] = 'hello'

0 comments on commit 9646bed

Please sign in to comment.