-
Notifications
You must be signed in to change notification settings - Fork 1.1k
/
base_chunker.py
87 lines (73 loc) · 3.25 KB
/
base_chunker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import hashlib
import logging
from typing import Optional
from embedchain.config.add_config import ChunkerConfig
from embedchain.helpers.json_serializable import JSONSerializable
from embedchain.models.data_type import DataType
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class BaseChunker(JSONSerializable):
    """Base chunker: loads data through a loader and splits it into chunks.

    Splitting is delegated to the ``text_splitter`` given at construction;
    subclasses may override ``get_chunks`` for custom logic.
    """

    def __init__(self, text_splitter):
        """Initialize the chunker.

        :param text_splitter: Object whose ``split_text(content)`` method is
        called by ``get_chunks`` to split content into chunks.
        """
        self.text_splitter = text_splitter
        # Stays None until `set_data_type` is called; `create_chunks` reads
        # `self.data_type.value` for every loaded record.
        self.data_type = None

    def create_chunks(self, loader, src, app_id=None, config: Optional[ChunkerConfig] = None):
        """
        Loads data and chunks it.

        :param loader: The loader whose `load_data` method is used to create
        the raw data.
        :param src: The data to be handled by the loader. Can be a URL for
        remote sources or local content for local loaders.
        :param app_id: App id used to generate the doc_id. When not None it
        is also prefixed to every chunk id.
        :param config: Optional chunker config. Its `min_chunk_size` is the
        minimum length (in characters) a chunk must have to be kept; when no
        config is given the minimum is 1.
        :return: A dict with the keys `documents` (kept chunks), `ids`
        (one id per kept chunk), `metadatas` (one metadata dict per kept
        chunk) and `doc_id`.
        """
        documents = []
        chunk_ids = []
        metadatas = []
        # Ids already emitted in this call, used to drop duplicate chunks.
        seen_ids = set()

        min_chunk_size = config.min_chunk_size if config is not None else 1
        # Lazy %-formatting: the message is only built if INFO is enabled.
        logger.info("Skipping chunks smaller than %s characters", min_chunk_size)

        data_result = loader.load_data(src)
        data_records = data_result["data"]
        doc_id = data_result["doc_id"]
        # Prefix app_id in the document id if app_id is not None to
        # distinguish between different documents stored in the same
        # elasticsearch or opensearch index
        doc_id = f"{app_id}--{doc_id}" if app_id is not None else doc_id

        for data in data_records:
            content = data["content"]

            # The loader's metadata dict is updated in place.
            metadata = data["meta_data"]
            # add data type to meta data to allow query using data type
            metadata["data_type"] = self.data_type.value
            metadata["doc_id"] = doc_id

            # TODO: Currently defaulting to the src as the url. This is done intentionally since some
            # of the data types like 'gmail' loader doesn't have the url in the meta data.
            url = metadata.get("url", src)

            for chunk in self.get_chunks(content):
                # The id is the SHA-256 of chunk text + url, so the same text
                # from the same url always maps to the same id.
                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
                if app_id is not None:
                    chunk_id = f"{app_id}--{chunk_id}"

                # Drop duplicates and chunks below the minimum size.
                if chunk_id in seen_ids or len(chunk) < min_chunk_size:
                    continue

                seen_ids.add(chunk_id)
                chunk_ids.append(chunk_id)
                documents.append(chunk)
                # All chunks of one record share the same metadata dict object.
                metadatas.append(metadata)

        return {
            "documents": documents,
            "ids": chunk_ids,
            "metadatas": metadatas,
            "doc_id": doc_id,
        }

    def get_chunks(self, content):
        """
        Returns chunks using text splitter instance.

        Override in child class if custom logic.

        :param content: The content passed to the text splitter's
        `split_text` method.
        :return: Whatever `split_text` returns; `create_chunks` iterates
        over it and treats each item as a string chunk.
        """
        return self.text_splitter.split_text(content)

    def set_data_type(self, data_type: DataType):
        """
        set the data type of chunker

        :param data_type: The data type whose `.value` is written into each
        record's metadata by `create_chunks`.
        """
        self.data_type = data_type

        # TODO: This should be done during initialization. This means it has to be done in the child classes.

    @staticmethod
    def get_word_count(documents) -> int:
        """
        Return the total number of whitespace-separated words in documents.

        Uses `str.split()` with no separator so that empty documents count
        as zero words, runs of whitespace do not add empty "words", and
        newlines/tabs separate words just like spaces.

        :param documents: Iterable of strings.
        :return: Sum of the word counts of all documents.
        """
        return sum(len(document.split()) for document in documents)