From 47c3dd16e145815314a04aa067af27c92b3111a7 Mon Sep 17 00:00:00 2001 From: William Albertus Dembo <29192168+walbertus@users.noreply.github.com> Date: Thu, 4 Dec 2025 09:52:27 +0700 Subject: [PATCH 1/7] feat: add data from lark doc and lark wiki --- knowledge server/.gitignore | 3 +- knowledge server/config.example.yaml | 7 ++ knowledge server/config/config.py | 16 ++++ ...atasource.yaml => datasource.example.yaml} | 0 knowledge server/loader/datasource.py | 31 -------- knowledge server/loader/factory.py | 48 ++++++++++++ knowledge server/loader/lark.py | 65 ++++++++++++++++ knowledge server/main.py | 71 ++++++++++------- knowledge server/pyproject.toml | 1 + knowledge server/uv.lock | 78 +++++++++++++++++++ 10 files changed, 260 insertions(+), 60 deletions(-) rename knowledge server/{datasource.yaml => datasource.example.yaml} (100%) delete mode 100644 knowledge server/loader/datasource.py create mode 100644 knowledge server/loader/factory.py create mode 100644 knowledge server/loader/lark.py diff --git a/knowledge server/.gitignore b/knowledge server/.gitignore index a539470..89b33ba 100644 --- a/knowledge server/.gitignore +++ b/knowledge server/.gitignore @@ -1 +1,2 @@ -config.yaml \ No newline at end of file +config.yaml +datasource.yaml diff --git a/knowledge server/config.example.yaml b/knowledge server/config.example.yaml index bd76931..37bc4fa 100644 --- a/knowledge server/config.example.yaml +++ b/knowledge server/config.example.yaml @@ -7,3 +7,10 @@ vector_store: enable_full_text_search: true chunk_size: 1000 chunk_overlap: 200 +embeddings: + source: ollama + model: embeddinggemma:latest +lark: + domain: "https://open.larksuite.com" + app_id: "app_id_here" + app_secret: "app_secret_here" \ No newline at end of file diff --git a/knowledge server/config/config.py b/knowledge server/config/config.py index 9203c5d..da7e011 100644 --- a/knowledge server/config/config.py +++ b/knowledge server/config/config.py @@ -23,6 +23,7 @@ def __init__(self, filepath): 
self.chunk_size = config.get("chunk_size", 1000) self.chunk_overlap = config.get("chunk_overlap", 200) self.embeddings = EmbeddingsConfig(config) + self.lark = LarkConfig(config) class EmbeddingsConfig: @@ -38,6 +39,21 @@ def __init__(self, config: dict): self.model = embeddings_config.get("model", None) +class LarkConfig: + domain: str + app_id: str + app_secret: str + + def __init__(self, config: dict): + lark_config = config.get("lark", None) + if lark_config is None: + raise ValueError("Lark configuration is missing in the config file.") + + self.domain = lark_config.get("domain", None) + self.app_id = lark_config.get("app_id", None) + self.app_secret = lark_config.get("app_secret", None) + + class VectorStoreConfig: type: str url: str diff --git a/knowledge server/datasource.yaml b/knowledge server/datasource.example.yaml similarity index 100% rename from knowledge server/datasource.yaml rename to knowledge server/datasource.example.yaml diff --git a/knowledge server/loader/datasource.py b/knowledge server/loader/datasource.py deleted file mode 100644 index 66b1aea..0000000 --- a/knowledge server/loader/datasource.py +++ /dev/null @@ -1,31 +0,0 @@ -import logging - -from langchain_core.document_loaders.base import BaseLoader -from loader.directory import DirectoryLoader - - -class Datasource: - type: str - path: str - url: str - - def __init__(self, type: str, path: str = "", url: str = ""): - self.type = type - self.path = path - self.url = url - - -class DatasourceLoader(BaseLoader): - loader: BaseLoader - - def __init__(self, datasource: Datasource, logger: logging.Logger): - if datasource.type == "directory": - self.loader = DirectoryLoader(datasource.path, logger) - else: - raise ValueError(f"Unsupported source type: {datasource.type}") - - def lazy_load(self): - return self.loader.lazy_load() - - def load(self): - return self.loader.load() diff --git a/knowledge server/loader/factory.py b/knowledge server/loader/factory.py new file mode 100644 index 
0000000..f43278f --- /dev/null +++ b/knowledge server/loader/factory.py @@ -0,0 +1,48 @@ +import logging + + +from loader.lark import LarkSuiteDocLoader, LarkSuiteWikiLoader + + +from langchain_core.document_loaders.base import BaseLoader +from loader.directory import DirectoryLoader + +import lark_oapi as lark + + +class Datasource: + type: str + path: str + url: str + id: str + + def __init__(self, type: str, path: str = "", url: str = "", id: str = ""): + self.type = type + self.path = path + self.url = url + self.id = id + + +class LoaderFactory: + logger: logging.Logger + lark_client: lark.Client + + def __init__(self, lark_client: lark.Client, logger: logging.Logger) -> None: + self.lark_client = lark_client + self.logger = logger + + def get_loader(self, datasource: Datasource) -> BaseLoader: + if datasource.type == "directory": + return DirectoryLoader(datasource.path, self.logger) + elif datasource.type == "lark-doc": + return LarkSuiteDocLoader( + client=self.lark_client, + document_id=datasource.id, + ) + elif datasource.type == "lark-wiki": + return LarkSuiteWikiLoader( + client=self.lark_client, + wiki_id=datasource.id, + ) + else: + raise ValueError(f"Unsupported source type: {datasource.type}") diff --git a/knowledge server/loader/lark.py b/knowledge server/loader/lark.py new file mode 100644 index 0000000..1d773b3 --- /dev/null +++ b/knowledge server/loader/lark.py @@ -0,0 +1,65 @@ +from langchain_community.document_loaders.base import BaseLoader +from langchain_core.documents import Document +import lark_oapi as lark +from lark_oapi.api.docx.v1 import RawContentDocumentRequest, GetDocumentRequest +from lark_oapi.api.wiki.v2 import GetNodeSpaceRequest + +from typing import Iterator + + +class LarkSuiteDocLoader(BaseLoader): + client: lark.Client + document_id: str + + def __init__(self, client: lark.Client, document_id: str): + self.client = client + self.document_id = document_id + + def lazy_load(self) -> Iterator[Document]: + request_raw = ( + 
RawContentDocumentRequest.builder().document_id(self.document_id).build() + ) + + response_raw = self.client.docx.v1.document.raw_content(request_raw) + if not response_raw.success(): + raise RuntimeError( + f"Failed to fetch document raw content: {response_raw.msg}" + ) + + request_metadata = ( + GetDocumentRequest.builder().document_id(self.document_id).build() + ) + + response_metadata = self.client.docx.v1.document.get(request_metadata) + if not response_metadata.success(): + raise RuntimeError( + f"Failed to fetch document metadata: {response_metadata.msg}" + ) + + metadata = { + "document_id": self.document_id, + "revision_id": response_metadata.data.document.revision_id, + "title": response_metadata.data.document.title, + "source": f"lark-doc://{self.document_id}", + } + + content = response_raw.data.content + + if content is None: + content = "" + + yield Document(page_content=str(content), metadata=metadata) + + +class LarkSuiteWikiLoader(LarkSuiteDocLoader): + def __init__(self, client: lark.Client, wiki_id: str): + request = GetNodeSpaceRequest.builder().token(wiki_id).obj_type("wiki").build() + + response = client.wiki.v2.space.get_node(request) + if not response.success(): + raise RuntimeError(f"Failed to fetch wiki node space: {response.msg}") + + document_id = response.data.node.obj_token + if not document_id: + raise RuntimeError("Wiki node space does not contain a valid document ID.") + super().__init__(client=client, document_id=str(document_id)) diff --git a/knowledge server/main.py b/knowledge server/main.py index ce0d163..0bad0b9 100644 --- a/knowledge server/main.py +++ b/knowledge server/main.py @@ -1,12 +1,15 @@ import logging from config.config import Config -from loader.datasource import Datasource, DatasourceLoader +from loader.factory import Datasource, LoaderFactory from langchain_text_splitters import RecursiveCharacterTextSplitter from mcp.server.fastmcp import FastMCP from model.factory import EmbeddingsFactory from 
vector_store.milvus import MilvusVectorStore +import lark_oapi as lark + + CONFIG_FILE_PATH = "config.yaml" @@ -35,13 +38,24 @@ def read_datasource(logger: logging.Logger) -> list[Datasource]: for source in from_yaml.get("datasource", []): if "type" not in source: raise ValueError("Document source type is missing.") - if source["type"] == "directory" and "path" not in source: - raise ValueError("Directory source path is missing.") - datasources.append( - Datasource( - source["type"], source.get("path", None), source.get("url", None) + if source["type"] == "directory": + if "path" not in source: + raise ValueError("Directory source path is missing.") + datasources.append( + Datasource( + source["type"], source.get("path", None), source.get("url", None) + ) ) - ) + elif source["type"] == "lark-doc": + if "id" not in source: + raise ValueError("Lark document source id is missing.") + datasources.append(Datasource(source["type"], id=source.get("id", None))) + elif source["type"] == "lark-wiki": + if "id" not in source: + raise ValueError("Lark wiki source id is missing.") + datasources.append(Datasource(source["type"], id=source.get("id", None))) + else: + raise ValueError(f"Unsupported document source type: {source['type']}") return datasources @@ -53,8 +67,19 @@ def main(): logger.setLevel(config.log_level.upper()) logger.debug(config) + lark_log_level = getattr(lark.LogLevel, config.log_level.upper(), lark.LogLevel.INFO) + lark_client = ( + lark.Client.builder() + .domain(config.lark.domain) + .app_id(config.lark.app_id) + .app_secret(config.lark.app_secret) + .log_level(lark_log_level) + .build() + ) + datasources = read_datasource(logger) - loaders = [DatasourceLoader(datasource, logger) for datasource in datasources] + loaderFactory = LoaderFactory(lark_client=lark_client, logger=logger) + loaders = [loaderFactory.get_loader(datasource) for datasource in datasources] docs = [] for loader in loaders: @@ -84,27 +109,17 @@ def main(): logger.info("Adding %d document 
chunks to the vector store", len(chunks)) vector_store.add_documents(chunks) - logger.debug( - "Searching for query: %s", "What is Barito project name is inspired from?" - ) - results = vector_store.search( - query="What is Barito project name is inspired from?", top_k=4 - ) - logger.debug("Search results total: %s", len(results)) - for i, result in enumerate(results): - logger.info("Result %d: %s", i + 1, result.page_content[:200]) - - logger.info( - "Searching for query: %s", + queries = [ + "What is Barito project name is inspired from?", "Who is the goto financial head of consumer payment infrastructure?", - ) - results = vector_store.search( - query="Who is the goto financial head of consumer payment infrastructure?", - top_k=4, - ) - logger.debug("Search results total: %s", len(results)) - for i, result in enumerate(results): - logger.info("Result %d: %s", i + 1, result.page_content[:200]) + "How to query pod metrics?", + ] + for query in queries: + logger.debug("Searching for query: %s", query) + results = vector_store.search(query=query, top_k=4) + logger.debug("Search results total: %s", len(results)) + for i, result in enumerate(results): + logger.info("Result %d: %s", i + 1, result.page_content[:200]) mcp_server = FastMCP("KnowledgeServer") diff --git a/knowledge server/pyproject.toml b/knowledge server/pyproject.toml index d47f42d..14b3333 100644 --- a/knowledge server/pyproject.toml +++ b/knowledge server/pyproject.toml @@ -7,6 +7,7 @@ requires-python = ">=3.12" dependencies = [ "langchain-community>=0.4.1", "langchain-ollama>=1.0.0", + "lark-oapi>=1.4.24", "mcp>=1.22.0", "pymilvus>=2.6.4", "pypdf>=6.4.0", diff --git a/knowledge server/uv.lock b/knowledge server/uv.lock index ceba306..dc281de 100644 --- a/knowledge server/uv.lock +++ b/knowledge server/uv.lock @@ -708,6 +708,7 @@ source = { virtual = "." 
} dependencies = [ { name = "langchain-community" }, { name = "langchain-ollama" }, + { name = "lark-oapi" }, { name = "mcp" }, { name = "pymilvus" }, { name = "pypdf" }, @@ -725,6 +726,7 @@ dev = [ requires-dist = [ { name = "langchain-community", specifier = ">=0.4.1" }, { name = "langchain-ollama", specifier = ">=1.0.0" }, + { name = "lark-oapi", specifier = ">=1.4.24" }, { name = "mcp", specifier = ">=1.22.0" }, { name = "pymilvus", specifier = ">=2.6.4" }, { name = "pypdf", specifier = ">=6.4.0" }, @@ -849,6 +851,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/19/23/54ac7435ddd2d03ce638329808f8fc2d3b23a236a3b5afaddf740be9cd61/langsmith-0.4.48-py3-none-any.whl", hash = "sha256:cf4cc3c17696f1ad1212d629409d0f8ce2038709777c22906534a22d5fdf42c6", size = 410629, upload-time = "2025-11-26T01:06:51.001Z" }, ] +[[package]] +name = "lark-oapi" +version = "1.4.24" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "httpx" }, + { name = "pycryptodome" }, + { name = "requests" }, + { name = "requests-toolbelt" }, + { name = "websockets" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/01/25acd41e3f0e67e77397176b333f705927e462cc60545d39ace813858c49/lark_oapi-1.4.24-py3-none-any.whl", hash = "sha256:a253289e9018b65e90a854ff1bc395ba150d6ad49b8218643997074b9d85d087", size = 6614478, upload-time = "2025-11-12T08:31:34.938Z" }, +] + [[package]] name = "lxml" version = "6.0.2" @@ -1417,6 +1434,36 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a0/e3/59cd50310fc9b59512193629e1984c1f95e5c8ae6e5d8c69532ccc65a7fe/pycparser-2.23-py3-none-any.whl", hash = "sha256:e5c6e8d3fbad53479cab09ac03729e0a9faf2bee3db8208a550daf5af81a5934", size = 118140, upload-time = "2025-09-09T13:23:46.651Z" }, ] +[[package]] +name = "pycryptodome" +version = "3.23.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/8e/a6/8452177684d5e906854776276ddd34eca30d1b1e15aa1ee9cefc289a33f5/pycryptodome-3.23.0.tar.gz", hash = "sha256:447700a657182d60338bab09fdb27518f8856aecd80ae4c6bdddb67ff5da44ef", size = 4921276, upload-time = "2025-05-17T17:21:45.242Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/5d/bdb09489b63cd34a976cc9e2a8d938114f7a53a74d3dd4f125ffa49dce82/pycryptodome-3.23.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:0011f7f00cdb74879142011f95133274741778abba114ceca229adbf8e62c3e4", size = 2495152, upload-time = "2025-05-17T17:20:20.833Z" }, + { url = "https://files.pythonhosted.org/packages/a7/ce/7840250ed4cc0039c433cd41715536f926d6e86ce84e904068eb3244b6a6/pycryptodome-3.23.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:90460fc9e088ce095f9ee8356722d4f10f86e5be06e2354230a9880b9c549aae", size = 1639348, upload-time = "2025-05-17T17:20:23.171Z" }, + { url = "https://files.pythonhosted.org/packages/ee/f0/991da24c55c1f688d6a3b5a11940567353f74590734ee4a64294834ae472/pycryptodome-3.23.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4764e64b269fc83b00f682c47443c2e6e85b18273712b98aa43bcb77f8570477", size = 2184033, upload-time = "2025-05-17T17:20:25.424Z" }, + { url = "https://files.pythonhosted.org/packages/54/16/0e11882deddf00f68b68dd4e8e442ddc30641f31afeb2bc25588124ac8de/pycryptodome-3.23.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb8f24adb74984aa0e5d07a2368ad95276cf38051fe2dc6605cbcf482e04f2a7", size = 2270142, upload-time = "2025-05-17T17:20:27.808Z" }, + { url = "https://files.pythonhosted.org/packages/d5/fc/4347fea23a3f95ffb931f383ff28b3f7b1fe868739182cb76718c0da86a1/pycryptodome-3.23.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d97618c9c6684a97ef7637ba43bdf6663a2e2e77efe0f863cce97a76af396446", size = 2309384, upload-time = "2025-05-17T17:20:30.765Z" }, + { url 
= "https://files.pythonhosted.org/packages/6e/d9/c5261780b69ce66d8cfab25d2797bd6e82ba0241804694cd48be41add5eb/pycryptodome-3.23.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9a53a4fe5cb075075d515797d6ce2f56772ea7e6a1e5e4b96cf78a14bac3d265", size = 2183237, upload-time = "2025-05-17T17:20:33.736Z" }, + { url = "https://files.pythonhosted.org/packages/5a/6f/3af2ffedd5cfa08c631f89452c6648c4d779e7772dfc388c77c920ca6bbf/pycryptodome-3.23.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:763d1d74f56f031788e5d307029caef067febf890cd1f8bf61183ae142f1a77b", size = 2343898, upload-time = "2025-05-17T17:20:36.086Z" }, + { url = "https://files.pythonhosted.org/packages/9a/dc/9060d807039ee5de6e2f260f72f3d70ac213993a804f5e67e0a73a56dd2f/pycryptodome-3.23.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:954af0e2bd7cea83ce72243b14e4fb518b18f0c1649b576d114973e2073b273d", size = 2269197, upload-time = "2025-05-17T17:20:38.414Z" }, + { url = "https://files.pythonhosted.org/packages/f9/34/e6c8ca177cb29dcc4967fef73f5de445912f93bd0343c9c33c8e5bf8cde8/pycryptodome-3.23.0-cp313-cp313t-win32.whl", hash = "sha256:257bb3572c63ad8ba40b89f6fc9d63a2a628e9f9708d31ee26560925ebe0210a", size = 1768600, upload-time = "2025-05-17T17:20:40.688Z" }, + { url = "https://files.pythonhosted.org/packages/e4/1d/89756b8d7ff623ad0160f4539da571d1f594d21ee6d68be130a6eccb39a4/pycryptodome-3.23.0-cp313-cp313t-win_amd64.whl", hash = "sha256:6501790c5b62a29fcb227bd6b62012181d886a767ce9ed03b303d1f22eb5c625", size = 1799740, upload-time = "2025-05-17T17:20:42.413Z" }, + { url = "https://files.pythonhosted.org/packages/5d/61/35a64f0feaea9fd07f0d91209e7be91726eb48c0f1bfc6720647194071e4/pycryptodome-3.23.0-cp313-cp313t-win_arm64.whl", hash = "sha256:9a77627a330ab23ca43b48b130e202582e91cc69619947840ea4d2d1be21eb39", size = 1703685, upload-time = "2025-05-17T17:20:44.388Z" }, + { url = 
"https://files.pythonhosted.org/packages/db/6c/a1f71542c969912bb0e106f64f60a56cc1f0fabecf9396f45accbe63fa68/pycryptodome-3.23.0-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:187058ab80b3281b1de11c2e6842a357a1f71b42cb1e15bce373f3d238135c27", size = 2495627, upload-time = "2025-05-17T17:20:47.139Z" }, + { url = "https://files.pythonhosted.org/packages/6e/4e/a066527e079fc5002390c8acdd3aca431e6ea0a50ffd7201551175b47323/pycryptodome-3.23.0-cp37-abi3-macosx_10_9_x86_64.whl", hash = "sha256:cfb5cd445280c5b0a4e6187a7ce8de5a07b5f3f897f235caa11f1f435f182843", size = 1640362, upload-time = "2025-05-17T17:20:50.392Z" }, + { url = "https://files.pythonhosted.org/packages/50/52/adaf4c8c100a8c49d2bd058e5b551f73dfd8cb89eb4911e25a0c469b6b4e/pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67bd81fcbe34f43ad9422ee8fd4843c8e7198dd88dd3d40e6de42ee65fbe1490", size = 2182625, upload-time = "2025-05-17T17:20:52.866Z" }, + { url = "https://files.pythonhosted.org/packages/5f/e9/a09476d436d0ff1402ac3867d933c61805ec2326c6ea557aeeac3825604e/pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8987bd3307a39bc03df5c8e0e3d8be0c4c3518b7f044b0f4c15d1aa78f52575", size = 2268954, upload-time = "2025-05-17T17:20:55.027Z" }, + { url = "https://files.pythonhosted.org/packages/f9/c5/ffe6474e0c551d54cab931918127c46d70cab8f114e0c2b5a3c071c2f484/pycryptodome-3.23.0-cp37-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa0698f65e5b570426fc31b8162ed4603b0c2841cbb9088e2b01641e3065915b", size = 2308534, upload-time = "2025-05-17T17:20:57.279Z" }, + { url = "https://files.pythonhosted.org/packages/18/28/e199677fc15ecf43010f2463fde4c1a53015d1fe95fb03bca2890836603a/pycryptodome-3.23.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:53ecbafc2b55353edcebd64bf5da94a2a2cdf5090a6915bcca6eca6cc452585a", size = 2181853, upload-time = "2025-05-17T17:20:59.322Z" }, + { url 
= "https://files.pythonhosted.org/packages/ce/ea/4fdb09f2165ce1365c9eaefef36625583371ee514db58dc9b65d3a255c4c/pycryptodome-3.23.0-cp37-abi3-musllinux_1_2_i686.whl", hash = "sha256:156df9667ad9f2ad26255926524e1c136d6664b741547deb0a86a9acf5ea631f", size = 2342465, upload-time = "2025-05-17T17:21:03.83Z" }, + { url = "https://files.pythonhosted.org/packages/22/82/6edc3fc42fe9284aead511394bac167693fb2b0e0395b28b8bedaa07ef04/pycryptodome-3.23.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:dea827b4d55ee390dc89b2afe5927d4308a8b538ae91d9c6f7a5090f397af1aa", size = 2267414, upload-time = "2025-05-17T17:21:06.72Z" }, + { url = "https://files.pythonhosted.org/packages/59/fe/aae679b64363eb78326c7fdc9d06ec3de18bac68be4b612fc1fe8902693c/pycryptodome-3.23.0-cp37-abi3-win32.whl", hash = "sha256:507dbead45474b62b2bbe318eb1c4c8ee641077532067fec9c1aa82c31f84886", size = 1768484, upload-time = "2025-05-17T17:21:08.535Z" }, + { url = "https://files.pythonhosted.org/packages/54/2f/e97a1b8294db0daaa87012c24a7bb714147c7ade7656973fd6c736b484ff/pycryptodome-3.23.0-cp37-abi3-win_amd64.whl", hash = "sha256:c75b52aacc6c0c260f204cbdd834f76edc9fb0d8e0da9fbf8352ef58202564e2", size = 1799636, upload-time = "2025-05-17T17:21:10.393Z" }, + { url = "https://files.pythonhosted.org/packages/18/3d/f9441a0d798bf2b1e645adc3265e55706aead1255ccdad3856dbdcffec14/pycryptodome-3.23.0-cp37-abi3-win_arm64.whl", hash = "sha256:11eeeb6917903876f134b56ba11abe95c0b0fd5e3330def218083c7d98bbcb3c", size = 1703675, upload-time = "2025-05-17T17:21:13.146Z" }, +] + [[package]] name = "pydantic" version = "2.12.4" @@ -2239,6 +2286,37 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", size = 11774, upload-time = "2017-04-05T20:21:32.581Z" }, ] +[[package]] +name = "websockets" +version = "15.0.1" +source = { registry 
= "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/21/e6/26d09fab466b7ca9c7737474c52be4f76a40301b08362eb2dbc19dcc16c1/websockets-15.0.1.tar.gz", hash = "sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee", size = 177016, upload-time = "2025-03-05T20:03:41.606Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/6b/4545a0d843594f5d0771e86463606a3988b5a09ca5123136f8a76580dd63/websockets-15.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3e90baa811a5d73f3ca0bcbf32064d663ed81318ab225ee4f427ad4e26e5aff3", size = 175437, upload-time = "2025-03-05T20:02:16.706Z" }, + { url = "https://files.pythonhosted.org/packages/f4/71/809a0f5f6a06522af902e0f2ea2757f71ead94610010cf570ab5c98e99ed/websockets-15.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:592f1a9fe869c778694f0aa806ba0374e97648ab57936f092fd9d87f8bc03665", size = 173096, upload-time = "2025-03-05T20:02:18.832Z" }, + { url = "https://files.pythonhosted.org/packages/3d/69/1a681dd6f02180916f116894181eab8b2e25b31e484c5d0eae637ec01f7c/websockets-15.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2", size = 173332, upload-time = "2025-03-05T20:02:20.187Z" }, + { url = "https://files.pythonhosted.org/packages/a6/02/0073b3952f5bce97eafbb35757f8d0d54812b6174ed8dd952aa08429bcc3/websockets-15.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8b56bdcdb4505c8078cb6c7157d9811a85790f2f2b3632c7d1462ab5783d215", size = 183152, upload-time = "2025-03-05T20:02:22.286Z" }, + { url = "https://files.pythonhosted.org/packages/74/45/c205c8480eafd114b428284840da0b1be9ffd0e4f87338dc95dc6ff961a1/websockets-15.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0af68c55afbd5f07986df82831c7bff04846928ea8d1fd7f30052638788bc9b5", size = 182096, upload-time = "2025-03-05T20:02:24.368Z" }, + { url = 
"https://files.pythonhosted.org/packages/14/8f/aa61f528fba38578ec553c145857a181384c72b98156f858ca5c8e82d9d3/websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dee438fed052b52e4f98f76c5790513235efaa1ef7f3f2192c392cd7c91b65", size = 182523, upload-time = "2025-03-05T20:02:25.669Z" }, + { url = "https://files.pythonhosted.org/packages/ec/6d/0267396610add5bc0d0d3e77f546d4cd287200804fe02323797de77dbce9/websockets-15.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d5f6b181bb38171a8ad1d6aa58a67a6aa9d4b38d0f8c5f496b9e42561dfc62fe", size = 182790, upload-time = "2025-03-05T20:02:26.99Z" }, + { url = "https://files.pythonhosted.org/packages/02/05/c68c5adbf679cf610ae2f74a9b871ae84564462955d991178f95a1ddb7dd/websockets-15.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5d54b09eba2bada6011aea5375542a157637b91029687eb4fdb2dab11059c1b4", size = 182165, upload-time = "2025-03-05T20:02:30.291Z" }, + { url = "https://files.pythonhosted.org/packages/29/93/bb672df7b2f5faac89761cb5fa34f5cec45a4026c383a4b5761c6cea5c16/websockets-15.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3be571a8b5afed347da347bfcf27ba12b069d9d7f42cb8c7028b5e98bbb12597", size = 182160, upload-time = "2025-03-05T20:02:31.634Z" }, + { url = "https://files.pythonhosted.org/packages/ff/83/de1f7709376dc3ca9b7eeb4b9a07b4526b14876b6d372a4dc62312bebee0/websockets-15.0.1-cp312-cp312-win32.whl", hash = "sha256:c338ffa0520bdb12fbc527265235639fb76e7bc7faafbb93f6ba80d9c06578a9", size = 176395, upload-time = "2025-03-05T20:02:33.017Z" }, + { url = "https://files.pythonhosted.org/packages/7d/71/abf2ebc3bbfa40f391ce1428c7168fb20582d0ff57019b69ea20fa698043/websockets-15.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcd5cf9e305d7b8338754470cf69cf81f420459dbae8a3b40cee57417f4614a7", size = 176841, upload-time = "2025-03-05T20:02:34.498Z" }, + { url = 
"https://files.pythonhosted.org/packages/cb/9f/51f0cf64471a9d2b4d0fc6c534f323b664e7095640c34562f5182e5a7195/websockets-15.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ee443ef070bb3b6ed74514f5efaa37a252af57c90eb33b956d35c8e9c10a1931", size = 175440, upload-time = "2025-03-05T20:02:36.695Z" }, + { url = "https://files.pythonhosted.org/packages/8a/05/aa116ec9943c718905997412c5989f7ed671bc0188ee2ba89520e8765d7b/websockets-15.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5a939de6b7b4e18ca683218320fc67ea886038265fd1ed30173f5ce3f8e85675", size = 173098, upload-time = "2025-03-05T20:02:37.985Z" }, + { url = "https://files.pythonhosted.org/packages/ff/0b/33cef55ff24f2d92924923c99926dcce78e7bd922d649467f0eda8368923/websockets-15.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:746ee8dba912cd6fc889a8147168991d50ed70447bf18bcda7039f7d2e3d9151", size = 173329, upload-time = "2025-03-05T20:02:39.298Z" }, + { url = "https://files.pythonhosted.org/packages/31/1d/063b25dcc01faa8fada1469bdf769de3768b7044eac9d41f734fd7b6ad6d/websockets-15.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:595b6c3969023ecf9041b2936ac3827e4623bfa3ccf007575f04c5a6aa318c22", size = 183111, upload-time = "2025-03-05T20:02:40.595Z" }, + { url = "https://files.pythonhosted.org/packages/93/53/9a87ee494a51bf63e4ec9241c1ccc4f7c2f45fff85d5bde2ff74fcb68b9e/websockets-15.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c714d2fc58b5ca3e285461a4cc0c9a66bd0e24c5da9911e30158286c9b5be7f", size = 182054, upload-time = "2025-03-05T20:02:41.926Z" }, + { url = "https://files.pythonhosted.org/packages/ff/b2/83a6ddf56cdcbad4e3d841fcc55d6ba7d19aeb89c50f24dd7e859ec0805f/websockets-15.0.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f3c1e2ab208db911594ae5b4f79addeb3501604a165019dd221c0bdcabe4db8", size = 182496, upload-time = 
"2025-03-05T20:02:43.304Z" }, + { url = "https://files.pythonhosted.org/packages/98/41/e7038944ed0abf34c45aa4635ba28136f06052e08fc2168520bb8b25149f/websockets-15.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:229cf1d3ca6c1804400b0a9790dc66528e08a6a1feec0d5040e8b9eb14422375", size = 182829, upload-time = "2025-03-05T20:02:48.812Z" }, + { url = "https://files.pythonhosted.org/packages/e0/17/de15b6158680c7623c6ef0db361da965ab25d813ae54fcfeae2e5b9ef910/websockets-15.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:756c56e867a90fb00177d530dca4b097dd753cde348448a1012ed6c5131f8b7d", size = 182217, upload-time = "2025-03-05T20:02:50.14Z" }, + { url = "https://files.pythonhosted.org/packages/33/2b/1f168cb6041853eef0362fb9554c3824367c5560cbdaad89ac40f8c2edfc/websockets-15.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:558d023b3df0bffe50a04e710bc87742de35060580a293c2a984299ed83bc4e4", size = 182195, upload-time = "2025-03-05T20:02:51.561Z" }, + { url = "https://files.pythonhosted.org/packages/86/eb/20b6cdf273913d0ad05a6a14aed4b9a85591c18a987a3d47f20fa13dcc47/websockets-15.0.1-cp313-cp313-win32.whl", hash = "sha256:ba9e56e8ceeeedb2e080147ba85ffcd5cd0711b89576b83784d8605a7df455fa", size = 176393, upload-time = "2025-03-05T20:02:53.814Z" }, + { url = "https://files.pythonhosted.org/packages/1b/6c/c65773d6cab416a64d191d6ee8a8b1c68a09970ea6909d16965d26bfed1e/websockets-15.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:e09473f095a819042ecb2ab9465aee615bd9c2028e4ef7d933600a8401c79561", size = 176837, upload-time = "2025-03-05T20:02:55.237Z" }, + { url = "https://files.pythonhosted.org/packages/fa/a8/5b41e0da817d64113292ab1f8247140aac61cbf6cfd085d6a0fa77f4984f/websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f", size = 169743, upload-time = "2025-03-05T20:03:39.41Z" }, +] + [[package]] name = "wrapt" version = "2.0.1" From d039fd7d9c7b38b1e9ed5c1bbd28a99cb200956a Mon Sep 17 00:00:00 2001 
From: William Albertus Dembo <29192168+walbertus@users.noreply.github.com> Date: Thu, 4 Dec 2025 09:58:29 +0700 Subject: [PATCH 2/7] docs: update readme --- knowledge server/README.md | 101 +++++++++++++++++++++-- knowledge server/datasource.example.yaml | 4 + 2 files changed, 97 insertions(+), 8 deletions(-) diff --git a/knowledge server/README.md b/knowledge server/README.md index 3f236de..86fe37b 100644 --- a/knowledge server/README.md +++ b/knowledge server/README.md @@ -4,11 +4,12 @@ A Model Context Protocol (MCP) server that provides AI agents with access to a k ## Features -- **Document Loading**: Supports PDF and Markdown files from local directories +- **Document Loading**: Supports PDF and Markdown files from local directories, Lark Docs, and Lark Wikis - **Vector Storage**: Uses Milvus for efficient vector similarity search with full-text search support - **Embeddings**: Configurable embeddings via Ollama - **Text Chunking**: Recursive character text splitting with configurable chunk size and overlap - **MCP Integration**: Exposes knowledge base queries through FastMCP server +- **Lark Integration**: Direct integration with Lark Suite for loading documents and wikis - **Flexible Configuration**: YAML-based configuration for easy customization ## Architecture @@ -16,7 +17,9 @@ A Model Context Protocol (MCP) server that provides AI agents with access to a k ``` ┌─────────────┐ ┌──────────────┐ ┌────────────┐ │ Datasource │─────▶│ Loader │─────▶│ Splitter │ -│ (YAML) │ │ (PDF/MD) │ │ │ +│ (YAML) │ │ Directory │ │ │ +│ │ │ Lark Doc │ │ │ +│ │ │ Lark Wiki │ │ │ └─────────────┘ └──────────────┘ └────────────┘ │ ▼ @@ -55,19 +58,63 @@ vector_store: chunk_size: 1000 chunk_overlap: 200 embeddings: - provider: ollama - model: nomic-embed-text + source: ollama + model: embeddinggemma:latest +lark: + domain: "https://open.larksuite.com" + app_id: "your_app_id" + app_secret: "your_app_secret" ``` +**Configuration Options:** +- `log_level`: Logging level (DEBUG, INFO, 
WARNING, ERROR) - applies to both application and Lark client +- `vector_store`: Milvus configuration +- `chunk_size`: Size of text chunks for splitting +- `chunk_overlap`: Overlap between chunks +- `embeddings`: Ollama embeddings configuration +- `lark`: Lark Suite API credentials (required only if using Lark datasources) + ### 2. Configure Data Sources -Create `datasource.yaml`: +Create `datasource.yaml` with one or more data sources: + +**Local Directory (PDF and Markdown files):** +```yaml +datasource: + - type: directory + path: ../datasets/ +``` + +**Lark Document:** +```yaml +datasource: + - type: lark-doc + id: "doc-id" +``` + +**Lark Wiki:** +```yaml +datasource: + - type: lark-wiki + id: "wiki-id" +``` + +**Multiple Sources:** ```yaml datasource: - type: directory path: ../datasets/ + - type: lark-doc + id: "doc-id" + - type: lark-wiki + id: "wiki-id" ``` +**Supported Datasource Types:** +- `directory`: Load PDF and Markdown files from a local directory +- `lark-doc`: Load a single Lark document by ID +- `lark-wiki`: Load all pages from a Lark wiki by ID + ### 3. Start Milvus Using Docker Compose: @@ -81,10 +128,16 @@ This will start Milvus on `http://localhost:19530`. ### Running the Server +Using Python directly: ```bash uv run python main.py ``` +Or using the Makefile: +```bash +make run +``` + The server will: 1. Load documents from configured datasources 2. 
Split documents into chunks @@ -110,8 +163,9 @@ query_knowledge_base( ├── config/ │ └── config.py # Configuration loader ├── loader/ -│ ├── datasource.py # Datasource abstraction -│ └── directory.py # Directory loader (PDF/MD) +│ ├── factory.py # Loader factory and datasource abstraction +│ ├── directory.py # Directory loader (PDF/MD) +│ └── lark.py # Lark Suite loaders (Doc/Wiki) ├── model/ │ ├── factory.py # Embeddings factory │ └── model_garden.py # Model configurations @@ -120,6 +174,7 @@ query_knowledge_base( ├── main.py # Application entry point ├── config.yaml # Runtime configuration ├── datasource.yaml # Data source definitions +├── Makefile # Development tasks └── pyproject.toml # Project dependencies ``` @@ -131,15 +186,28 @@ query_knowledge_base( - **pymilvus**: Milvus vector database client - **pypdf**: PDF parsing - **pyyaml**: YAML configuration parsing +- **lark-oapi**: Lark Suite Open API SDK ## Development ### Code Style +Run all checks: +```bash +make check +``` + +Or run individual tasks: +```bash +make lint # Run ruff check --fix +make format # Run ruff format +make type-check # Run ty check +``` + Format code using Ruff: ```bash uv run ruff format . -uv run ruff check . 
+uv run ruff check --fix ``` ### Type Checking @@ -155,6 +223,23 @@ If you encounter `ImportError: cannot import name 'Blob'`, ensure you're using t from langchain_community.document_loaders.blob_loaders import Blob ``` +### Lark API Issues + +**Authentication Errors:** +- Verify `app_id` and `app_secret` in `config.yaml` +- Ensure your Lark app has the required permissions: + - `docx:document` for document access + - `wiki:wiki` for wiki access + +**Document Not Found:** +- Verify the document/wiki ID is correct +- Check that your app has access to the document/wiki +- Ensure the document/wiki hasn't been deleted + +**Getting Document IDs:** +- For Lark Docs: The ID is in the URL: `https://xxx.larksuite.com/docx/{document_id}` +- For Lark Wikis: The ID is in the URL: `https://xxx.larksuite.com/wiki/{wiki_id}` + ### Milvus Connection Issues Verify Milvus is running: diff --git a/knowledge server/datasource.example.yaml b/knowledge server/datasource.example.yaml index 5b4de2f..a12a614 100644 --- a/knowledge server/datasource.example.yaml +++ b/knowledge server/datasource.example.yaml @@ -1,3 +1,7 @@ datasource: - type: directory path: ../datasets/ + - type: lark-doc + id: "some-lark-doc-id" + - type: lark-wiki + id: "some-lark-wiki-id" From 751fbe14bc4adeca05583c03fa3b44191b24a3ce Mon Sep 17 00:00:00 2001 From: William Albertus Dembo <29192168+walbertus@users.noreply.github.com> Date: Thu, 4 Dec 2025 10:05:41 +0700 Subject: [PATCH 3/7] refactor: move the datasource validation to the class constructor --- knowledge server/loader/factory.py | 12 ++++++++++++ knowledge server/main.py | 26 +++++++------------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/knowledge server/loader/factory.py b/knowledge server/loader/factory.py index f43278f..552280a 100644 --- a/knowledge server/loader/factory.py +++ b/knowledge server/loader/factory.py @@ -17,10 +17,22 @@ class Datasource: id: str def __init__(self, type: str, path: str = "", url: str = "", id: 
str = ""): + if not type: + raise ValueError("Document source type is missing.") + self.type = type self.path = path self.url = url self.id = id + + if self.type == "directory" and not self.path: + raise ValueError("Directory source path is missing.") + elif self.type == "lark-doc" and not self.id: + raise ValueError("Lark document source id is missing.") + elif self.type == "lark-wiki" and not self.id: + raise ValueError("Lark wiki source id is missing.") + elif self.type not in ["directory", "lark-doc", "lark-wiki"]: + raise ValueError(f"Unsupported document source type: {self.type}") class LoaderFactory: diff --git a/knowledge server/main.py b/knowledge server/main.py index 0bad0b9..5f10ee1 100644 --- a/knowledge server/main.py +++ b/knowledge server/main.py @@ -36,26 +36,14 @@ def read_datasource(logger: logging.Logger) -> list[Datasource]: datasources = [] for source in from_yaml.get("datasource", []): - if "type" not in source: - raise ValueError("Document source type is missing.") - if source["type"] == "directory": - if "path" not in source: - raise ValueError("Directory source path is missing.") - datasources.append( - Datasource( - source["type"], source.get("path", None), source.get("url", None) - ) + datasources.append( + Datasource( + type=source.get("type", ""), + path=source.get("path", ""), + url=source.get("url", ""), + id=source.get("id", "") ) - elif source["type"] == "lark-doc": - if "id" not in source: - raise ValueError("Lark document source id is missing.") - datasources.append(Datasource(source["type"], id=source.get("id", None))) - elif source["type"] == "lark-wiki": - if "id" not in source: - raise ValueError("Lark wiki source id is missing.") - datasources.append(Datasource(source["type"], id=source.get("id", None))) - else: - raise ValueError(f"Unsupported document source type: {source['type']}") + ) return datasources From cafa5a4bbb402dc3c788270f217be1c2a85dac0d Mon Sep 17 00:00:00 2001 From: William Albertus Dembo 
<29192168+walbertus@users.noreply.github.com> Date: Thu, 4 Dec 2025 13:59:53 +0700 Subject: [PATCH 4/7] fix: add additional metadata for lark source --- knowledge server/loader/factory.py | 4 ++-- knowledge server/loader/lark.py | 19 +++++++++++++++++++ knowledge server/main.py | 6 ++++-- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/knowledge server/loader/factory.py b/knowledge server/loader/factory.py index 552280a..30941fa 100644 --- a/knowledge server/loader/factory.py +++ b/knowledge server/loader/factory.py @@ -19,12 +19,12 @@ class Datasource: def __init__(self, type: str, path: str = "", url: str = "", id: str = ""): if not type: raise ValueError("Document source type is missing.") - + self.type = type self.path = path self.url = url self.id = id - + if self.type == "directory" and not self.path: raise ValueError("Directory source path is missing.") elif self.type == "lark-doc" and not self.id: diff --git a/knowledge server/loader/lark.py b/knowledge server/loader/lark.py index 1d773b3..81c901e 100644 --- a/knowledge server/loader/lark.py +++ b/knowledge server/loader/lark.py @@ -40,6 +40,7 @@ def lazy_load(self) -> Iterator[Document]: "document_id": self.document_id, "revision_id": response_metadata.data.document.revision_id, "title": response_metadata.data.document.title, + "type": "lark-doc", "source": f"lark-doc://{self.document_id}", } @@ -52,6 +53,9 @@ def lazy_load(self) -> Iterator[Document]: class LarkSuiteWikiLoader(LarkSuiteDocLoader): + wiki_metadata: dict + wiki_id: str + def __init__(self, client: lark.Client, wiki_id: str): request = GetNodeSpaceRequest.builder().token(wiki_id).obj_type("wiki").build() @@ -59,7 +63,22 @@ def __init__(self, client: lark.Client, wiki_id: str): if not response.success(): raise RuntimeError(f"Failed to fetch wiki node space: {response.msg}") + self.wiki_id = wiki_id + self.wiki_metadata = { + "owner": response.data.node.owner, + "creator": response.data.node.creator, + } + document_id = 
response.data.node.obj_token if not document_id: raise RuntimeError("Wiki node space does not contain a valid document ID.") super().__init__(client=client, document_id=str(document_id)) + + def lazy_load(self): + document = super().lazy_load() + for doc in document: + doc.metadata["source"] = f"lark-wiki://{self.wiki_id}" + doc.metadata["type"] = "lark-wiki" + doc.metadata["lark_owner"] = self.wiki_metadata["owner"] + doc.metadata["lark_creator"] = self.wiki_metadata["creator"] + yield doc diff --git a/knowledge server/main.py b/knowledge server/main.py index 5f10ee1..5b5813e 100644 --- a/knowledge server/main.py +++ b/knowledge server/main.py @@ -41,7 +41,7 @@ def read_datasource(logger: logging.Logger) -> list[Datasource]: type=source.get("type", ""), path=source.get("path", ""), url=source.get("url", ""), - id=source.get("id", "") + id=source.get("id", ""), ) ) @@ -55,7 +55,9 @@ def main(): logger.setLevel(config.log_level.upper()) logger.debug(config) - lark_log_level = getattr(lark.LogLevel, config.log_level.upper(), lark.LogLevel.INFO) + lark_log_level = getattr( + lark.LogLevel, config.log_level.upper(), lark.LogLevel.INFO + ) lark_client = ( lark.Client.builder() .domain(config.lark.domain) From df9d6f94752ede6876bfa95025a7fd7182c3435c Mon Sep 17 00:00:00 2001 From: William Albertus Dembo <29192168+walbertus@users.noreply.github.com> Date: Thu, 4 Dec 2025 16:49:22 +0700 Subject: [PATCH 5/7] feat: add lark space as datasource --- knowledge server/datasource.example.yaml | 2 + knowledge server/loader/directory.py | 6 ++- knowledge server/loader/factory.py | 15 +++++- knowledge server/loader/lark.py | 67 +++++++++++++++++++++++- knowledge server/main.py | 28 +++++----- 5 files changed, 100 insertions(+), 18 deletions(-) diff --git a/knowledge server/datasource.example.yaml b/knowledge server/datasource.example.yaml index a12a614..ea37e6a 100644 --- a/knowledge server/datasource.example.yaml +++ b/knowledge server/datasource.example.yaml @@ -5,3 +5,5 @@ 
datasource: id: "some-lark-doc-id" - type: lark-wiki id: "some-lark-wiki-id" + - type: lark-space + id: "some-lark-space-id" \ No newline at end of file diff --git a/knowledge server/loader/directory.py b/knowledge server/loader/directory.py index e6854d8..476a892 100644 --- a/knowledge server/loader/directory.py +++ b/knowledge server/loader/directory.py @@ -1,4 +1,5 @@ from collections.abc import Iterator +import logging from langchain_core.document_loaders.base import BaseBlobParser from langchain_community.document_loaders import ( FileSystemBlobLoader, @@ -23,10 +24,11 @@ def parse(self, blob: Blob) -> list[Document]: class DirectoryLoader(BaseLoader): pdf_loader: PyPDFDirectoryLoader md_loader: GenericLoader + logger: logging.Logger - def __init__(self, path: str, logger) -> None: + def __init__(self, path: str, logger: logging.Logger) -> None: self.pdf_loader = PyPDFDirectoryLoader( - path, recursive=False, mode="single", extraction_mode="layout" + path, recursive=True, mode="single", extraction_mode="layout" ) self.md_loader = GenericLoader( blob_loader=FileSystemBlobLoader( diff --git a/knowledge server/loader/factory.py b/knowledge server/loader/factory.py index 30941fa..a0eb4c6 100644 --- a/knowledge server/loader/factory.py +++ b/knowledge server/loader/factory.py @@ -1,7 +1,11 @@ import logging -from loader.lark import LarkSuiteDocLoader, LarkSuiteWikiLoader +from loader.lark import ( + LarkSuiteDocLoader, + LarkSuiteWikiLoader, + LarkSuiteWikiSpaceLoader, +) from langchain_core.document_loaders.base import BaseLoader @@ -31,7 +35,9 @@ def __init__(self, type: str, path: str = "", url: str = "", id: str = ""): raise ValueError("Lark document source id is missing.") elif self.type == "lark-wiki" and not self.id: raise ValueError("Lark wiki source id is missing.") - elif self.type not in ["directory", "lark-doc", "lark-wiki"]: + elif self.type == "lark-space" and not self.id: + raise ValueError("Lark space source id is missing.") + elif self.type not in 
["directory", "lark-doc", "lark-wiki", "lark-space"]: raise ValueError(f"Unsupported document source type: {self.type}") @@ -56,5 +62,10 @@ def get_loader(self, datasource: Datasource) -> BaseLoader: client=self.lark_client, wiki_id=datasource.id, ) + elif datasource.type == "lark-space": + return LarkSuiteWikiSpaceLoader( + client=self.lark_client, + space_id=datasource.id, + ) else: raise ValueError(f"Unsupported source type: {datasource.type}") diff --git a/knowledge server/loader/lark.py b/knowledge server/loader/lark.py index 81c901e..456cc79 100644 --- a/knowledge server/loader/lark.py +++ b/knowledge server/loader/lark.py @@ -2,10 +2,16 @@ from langchain_core.documents import Document import lark_oapi as lark from lark_oapi.api.docx.v1 import RawContentDocumentRequest, GetDocumentRequest -from lark_oapi.api.wiki.v2 import GetNodeSpaceRequest +from lark_oapi.api.wiki.v2 import ( + GetNodeSpaceRequest, + GetSpaceRequest, + ListSpaceNodeRequest, +) from typing import Iterator +### TODO: Restructure the metadata to show the Lark Wiki/Docs hierarchy better. 
+ class LarkSuiteDocLoader(BaseLoader): client: lark.Client @@ -82,3 +88,62 @@ def lazy_load(self): doc.metadata["lark_owner"] = self.wiki_metadata["owner"] doc.metadata["lark_creator"] = self.wiki_metadata["creator"] yield doc + + +class LarkSuiteWikiSpaceLoader(BaseLoader): + client: lark.Client + space_id: str + space_name: str + space_description: str + + def __init__(self, client: lark.Client, space_id: str): + self.client = client + self.space_id = space_id + request = GetSpaceRequest.builder().space_id(self.space_id).build() + + response = self.client.wiki.v2.space.get(request) + if not response.success(): + raise RuntimeError(f"Failed to fetch wiki node space: {response.msg}") + + self.space_name = ( + str(response.data.space.name) if response.data.space.name else "" + ) + self.space_description = ( + str(response.data.space.description) + if response.data.space.description + else "" + ) + + def lazy_load(self) -> Iterator[Document]: + yield from self.__lazy_load_space_node_children(space_id=self.space_id) + + def __lazy_load_space_node_children( + self, space_id: str, parent_node_token: str = "" + ) -> Iterator[Document]: + ### TODO: handle pagination + request = ListSpaceNodeRequest.builder().space_id(space_id).page_size(50) + if parent_node_token != "": + request.parent_node_token(parent_node_token) + request = request.build() + + response = self.client.wiki.v2.space_node.list(request) + if not response.success(): + raise RuntimeError(f"Failed to list wiki space nodes: {response.msg}") + if not response.data.items: + return + + for node in response.data.items: + ### TODO: handle other obj_types + node_token = node.node_token if node.node_token else "" + if node.obj_type == "docx": + loader = LarkSuiteWikiLoader(client=self.client, wiki_id=node_token) + for doc in loader.lazy_load(): + doc.metadata["source"] = f"lark-space://{self.space_id}" + doc.metadata["space_name"] = self.space_name + doc.metadata["space_description"] = self.space_description + yield 
doc + if node.has_child: + yield from self.__lazy_load_space_node_children( + space_id=space_id, parent_node_token=node_token + ) + return diff --git a/knowledge server/main.py b/knowledge server/main.py index 5b5813e..79563eb 100644 --- a/knowledge server/main.py +++ b/knowledge server/main.py @@ -34,6 +34,10 @@ def read_datasource(logger: logging.Logger) -> list[Datasource]: with open("datasource.yaml", "r") as f: from_yaml = yaml.safe_load(f) + if not from_yaml or "datasource" not in from_yaml: + logger.error("No datasource found in datasource.yaml") + return [] + datasources = [] for source in from_yaml.get("datasource", []): datasources.append( @@ -71,23 +75,13 @@ def main(): loaderFactory = LoaderFactory(lark_client=lark_client, logger=logger) loaders = [loaderFactory.get_loader(datasource) for datasource in datasources] - docs = [] - for loader in loaders: - docs.extend(loader.load()) - - for doc in docs: - logger.info("Loaded document from %s", doc.metadata.get("source", "unknown")) - logger.debug("Document content: %s", doc.page_content[:100]) - splitter = RecursiveCharacterTextSplitter( chunk_size=config.chunk_size, chunk_overlap=config.chunk_overlap, length_function=len, is_separator_regex=False, ) - embeddings = EmbeddingsFactory.get_embeddings(config.embeddings) - vector_store = MilvusVectorStore( config.vector_store, chunk_size=config.chunk_size, @@ -95,9 +89,17 @@ def main(): embeddings=embeddings, logger=logger, ) - chunks = splitter.split_documents(docs) - logger.info("Adding %d document chunks to the vector store", len(chunks)) - vector_store.add_documents(chunks) + + for loader in loaders: + for doc in loader.lazy_load(): + logger.debug("Document content: %s", doc.page_content[:20]) + logger.info( + "Loaded document from %s", doc.metadata.get("source", "unknown") + ) + document = doc # make a copy from iterator to single Document + chunks = splitter.split_documents([document]) + logger.info("Adding %d document chunks to the vector store", 
len(chunks)) + vector_store.add_documents(chunks) queries = [ "What is Barito project name is inspired from?", From 9f3db9fd5109605db2f5ace8a32f274291e59914 Mon Sep 17 00:00:00 2001 From: William Albertus Dembo <29192168+walbertus@users.noreply.github.com> Date: Thu, 4 Dec 2025 16:51:56 +0700 Subject: [PATCH 6/7] docs: update readme --- knowledge server/README.md | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/knowledge server/README.md b/knowledge server/README.md index 86fe37b..cead9ae 100644 --- a/knowledge server/README.md +++ b/knowledge server/README.md @@ -4,12 +4,12 @@ A Model Context Protocol (MCP) server that provides AI agents with access to a k ## Features -- **Document Loading**: Supports PDF and Markdown files from local directories, Lark Docs, and Lark Wikis +- **Document Loading**: Supports PDF and Markdown files from local directories, Lark Docs, Lark Wikis, and Lark Wiki Spaces - **Vector Storage**: Uses Milvus for efficient vector similarity search with full-text search support - **Embeddings**: Configurable embeddings via Ollama - **Text Chunking**: Recursive character text splitting with configurable chunk size and overlap - **MCP Integration**: Exposes knowledge base queries through FastMCP server -- **Lark Integration**: Direct integration with Lark Suite for loading documents and wikis +- **Lark Integration**: Direct integration with Lark Suite for loading documents, wikis, and entire wiki spaces - **Flexible Configuration**: YAML-based configuration for easy customization ## Architecture @@ -20,6 +20,7 @@ A Model Context Protocol (MCP) server that provides AI agents with access to a k │ (YAML) │ │ Directory │ │ │ │ │ │ Lark Doc │ │ │ │ │ │ Lark Wiki │ │ │ +│ │ │ Lark Space │ │ │ └─────────────┘ └──────────────┘ └────────────┘ │ ▼ @@ -99,6 +100,13 @@ datasource: id: "wiki-id" ``` +**Lark Wiki Space (loads all documents in a space):** +```yaml +datasource: + - type: lark-space + id: "space-id" +``` + 
**Multiple Sources:** ```yaml datasource: @@ -108,12 +116,15 @@ datasource: id: "doc-id" - type: lark-wiki id: "wiki-id" + - type: lark-space + id: "space-id" ``` **Supported Datasource Types:** - `directory`: Load PDF and Markdown files from a local directory - `lark-doc`: Load a single Lark document by ID -- `lark-wiki`: Load all pages from a Lark wiki by ID +- `lark-wiki`: Load a single wiki page by ID +- `lark-space`: Load documents from a Lark wiki space by space ID (recursively walks child nodes; currently only `docx` nodes are loaded, and at most 50 nodes per level because pagination is not yet handled) ### 3. Start Milvus @@ -165,7 +176,7 @@ query_knowledge_base( ├── loader/ │ ├── factory.py # Loader factory and datasource abstraction │ ├── directory.py # Directory loader (PDF/MD) -│ └── lark.py # Lark Suite loaders (Doc/Wiki) +│ └── lark.py # Lark Suite loaders (Doc/Wiki/Space) ├── model/ │ ├── factory.py # Embeddings factory │ └── model_garden.py # Model configurations @@ -239,6 +250,7 @@ from langchain_community.document_loaders.blob_loaders import Blob **Getting Document IDs:** - For Lark Docs: The ID is in the URL: `https://xxx.larksuite.com/docx/{document_id}` - For Lark Wikis: The ID is in the URL: `https://xxx.larksuite.com/wiki/{wiki_id}` +- For Lark Spaces: The space ID can be found in wiki space settings or via the Lark API ### Milvus Connection Issues From 94fbd98ee6d5bfb82717329b56e949f5ea633529 Mon Sep 17 00:00:00 2001 From: William Albertus Dembo <29192168+walbertus@users.noreply.github.com> Date: Thu, 4 Dec 2025 17:22:00 +0700 Subject: [PATCH 7/7] refactor: reorder some steps --- knowledge server/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/knowledge server/main.py b/knowledge server/main.py index 79563eb..22aa10d 100644 --- a/knowledge server/main.py +++ b/knowledge server/main.py @@ -92,11 +92,11 @@ def main(): for loader in loaders: for doc in loader.lazy_load(): - logger.debug("Document content: %s", doc.page_content[:20]) + document = doc # make a copy from iterator to single Document + 
logger.debug("Document content: %s", document.page_content[:20]) logger.info( - "Loaded document from %s", doc.metadata.get("source", "unknown") + "Loaded document from %s", document.metadata.get("source", "unknown") ) - document = doc # make a copy from iterator to single Document chunks = splitter.split_documents([document]) logger.info("Adding %d document chunks to the vector store", len(chunks)) vector_store.add_documents(chunks)