
fix(elastic): update offset2ids management #416

Merged
10 commits merged on Jun 27, 2022
16 changes: 16 additions & 0 deletions docarray/array/storage/elastic/backend.py
@@ -179,6 +179,22 @@ def _update_offset2ids_meta(self):
        r = bulk(self._client, requests)
        self._client.indices.refresh(index=self._index_name_offset2id)

        # Clean trailing unused offsets
        offset_count = self._client.count(index=self._index_name_offset2id)
        unused_offsets = range(len(self._offset2ids.ids), offset_count['count'])

        if len(unused_offsets) > 0:
            requests = [
                {
                    '_op_type': 'delete',
                    '_id': offset_,  # the offset serves as the document _id, since it is the key entries are looked up by
                    '_index': self._index_name_offset2id,
                }
                for offset_ in unused_offsets
            ]
            r = bulk(self._client, requests)
            self._client.indices.refresh(index=self._index_name_offset2id)

    def _get_offset2ids_meta(self) -> List:
        """Return the offset2ids stored in elastic

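For context on the hunk above: deleting documents shrinks the in-memory offset2ids list, but the offset documents previously written for the now-unused trailing positions stay in the offset2id index until they are removed explicitly. Below is a minimal standalone sketch of the same count-then-bulk-delete pattern, assuming a local Elasticsearch instance; the host, index name, and ids are illustrative and not taken from the PR.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

client = Elasticsearch(hosts='http://localhost:9200')  # assumed local instance
index_name = 'demo-offset2id'                          # hypothetical offset2id index

ids_in_memory = ['r2', 'r3', 'r4']                     # offsets 0..2 are still valid
indexed = client.count(index=index_name)['count']      # e.g. 8 before the cleanup

# every offset document whose _id is >= len(ids_in_memory) is stale
actions = [
    {'_op_type': 'delete', '_index': index_name, '_id': offset}
    for offset in range(len(ids_in_memory), indexed)
]
if actions:
    bulk(client, actions)
    client.indices.refresh(index=index_name)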
4 changes: 3 additions & 1 deletion docarray/array/storage/elastic/seqlike.py
@@ -72,4 +72,6 @@ def _upload_batch(self, docs: Iterable['Document']):
     def extend(self, docs: Iterable['Document']):
         docs = list(docs)
         self._upload_batch(docs)
-        self._offset2ids.extend([doc.id for doc in docs])
+        self._offset2ids.extend(
+            [doc.id for doc in docs if doc.id not in self._offset2ids.ids]
+        )
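A quick usage sketch of the deduplication added above, assuming a local Elasticsearch instance and an illustrative config: extending with an id that already exists re-uploads that document but does not append a second offset for it, so the length reflects unique ids only.

from docarray import Document, DocumentArray

da = DocumentArray(
    storage='elasticsearch',
    config={'n_dim': 3, 'index_name': 'extend_dedup_demo'},  # illustrative config
)
with da:
    da.extend([Document(id='a', embedding=[0, 0, 0])])
with da:
    da.extend(
        [
            Document(id='a', embedding=[0, 0, 0]),  # duplicate id, offset not re-added
            Document(id='b', embedding=[1, 1, 1]),
        ]
    )
assert len(da) == 2  # 'a' counted once, plus 'b'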
42 changes: 42 additions & 0 deletions tests/unit/array/storage/elastic/test_add.py
@@ -0,0 +1,42 @@
from docarray import Document, DocumentArray


def test_add_ignore_existing_doc_id(start_storage):
    elastic_doc = DocumentArray(
        storage='elasticsearch',
        config={
            'n_dim': 3,
            'columns': [('price', 'int')],
            'distance': 'l2_norm',
            'index_name': 'test_add',
Member: Can you perhaps randomize the index name, or give the index the exact same name as the test function?

Contributor Author: Oh sure, will do.

(A sketch of this suggestion follows this file's diff.)

        },
    )

    with elastic_doc:
        elastic_doc.extend(
            [
                Document(id='r0', embedding=[0, 0, 0]),
                Document(id='r1', embedding=[1, 1, 1]),
                Document(id='r2', embedding=[2, 2, 2]),
                Document(id='r3', embedding=[3, 3, 3]),
                Document(id='r4', embedding=[4, 4, 4]),
            ]
        )

    with elastic_doc:
        elastic_doc.extend(
            [
                Document(id='r0', embedding=[0, 0, 0]),
                Document(id='r2', embedding=[2, 2, 2]),
                Document(id='r4', embedding=[4, 4, 4]),
                Document(id='r5', embedding=[2, 2, 2]),
                Document(id='r6', embedding=[4, 4, 4]),
            ]
        )

    indexed_offset_count = elastic_doc._client.count(
        index=elastic_doc._index_name_offset2id
    )['count']

    assert len(elastic_doc) == len(elastic_doc[:, 'embedding'])
Member: I would also assert that len(elastic_doc) == 7 for extra security; otherwise this test would pass even with wrong behavior.

Contributor Author: Okay, noted.

(Covered in the same sketch after this file's diff.)

    assert len(elastic_doc) == indexed_offset_count
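Putting the two review suggestions on this file together, here is a hedged sketch of how the test could be tightened: a per-run unique index name (randomized with uuid here, which is an assumption rather than code from the PR) and an explicit length assertion next to the existing checks. The loop-built documents are shorthand for the ones listed above.

import uuid

from docarray import Document, DocumentArray


def test_add_ignore_existing_doc_id(start_storage):
    elastic_doc = DocumentArray(
        storage='elasticsearch',
        config={
            'n_dim': 3,
            'columns': [('price', 'int')],
            'distance': 'l2_norm',
            # unique per run, so repeated runs never collide on the same index
            'index_name': f'test_add_ignore_existing_doc_id_{uuid.uuid4().hex}',
        },
    )

    with elastic_doc:
        elastic_doc.extend([Document(id=f'r{i}', embedding=[i, i, i]) for i in range(5)])

    with elastic_doc:
        elastic_doc.extend([Document(id=f'r{i}', embedding=[i, i, i]) for i in (0, 2, 4, 5, 6)])

    indexed_offset_count = elastic_doc._client.count(
        index=elastic_doc._index_name_offset2id
    )['count']

    # 5 initial ids plus the genuinely new 'r5' and 'r6'
    assert len(elastic_doc) == 7
    assert len(elastic_doc) == len(elastic_doc[:, 'embedding'])
    assert len(elastic_doc) == indexed_offset_count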
47 changes: 47 additions & 0 deletions tests/unit/array/storage/elastic/test_del.py
@@ -0,0 +1,47 @@
from docarray import Document, DocumentArray
import pytest


@pytest.mark.parametrize('deleted_elmnts', [[0, 1], ['r0', 'r1']])
def test_delete_offset_success_sync_es_offset_index(deleted_elmnts, start_storage):
    elastic_doc = DocumentArray(
        storage='elasticsearch',
        config={
            'n_dim': 3,
            'columns': [('price', 'int')],
            'distance': 'l2_norm',
            'index_name': 'test_delete',
        },
    )

    with elastic_doc:
        elastic_doc.extend(
            [
                Document(id='r0', embedding=[0, 0, 0]),
                Document(id='r1', embedding=[1, 1, 1]),
                Document(id='r2', embedding=[2, 2, 2]),
                Document(id='r3', embedding=[3, 3, 3]),
                Document(id='r4', embedding=[4, 4, 4]),
                Document(id='r5', embedding=[5, 5, 5]),
                Document(id='r6', embedding=[6, 6, 6]),
                Document(id='r7', embedding=[7, 7, 7]),
            ]
        )

    expected_offset_after_del = ['r2', 'r3', 'r4', 'r5', 'r6', 'r7']

    with elastic_doc:
        del elastic_doc[deleted_elmnts]

    indexed_offset_count = elastic_doc._client.count(
        index=elastic_doc._index_name_offset2id
    )['count']

    assert len(elastic_doc._offset2ids.ids) == indexed_offset_count
Member: Same here, what should the length be?

Contributor Author: Yep, will be updated.

(A sketch of the explicit check follows at the end of this diff.)


    for id in expected_offset_after_del:
        expected_offset = str(expected_offset_after_del.index(id))
        actual_offset_index = elastic_doc._client.search(
            index=elastic_doc._index_name_offset2id, query={'match': {'blob': id}}
        )['hits']['hits'][0]['_id']
        assert actual_offset_index == expected_offset
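On the reviewer's question above about the expected length: eight documents are extended and two are deleted, so a sketch of the explicit checks that could sit next to the existing equality assertion would be:

    # 8 documents extended, 2 deleted, so 6 offsets should remain on both sides
    assert len(elastic_doc._offset2ids.ids) == 6
    assert indexed_offset_count == 6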