From e60e373c0a4bf063377ea7efb959384e8a2f23a3 Mon Sep 17 00:00:00 2001 From: LJ Date: Tue, 11 Mar 2025 09:46:19 -0700 Subject: [PATCH] Switch to the `ExtractByMistral` in the `manual_extraction` example. Doesn't work yet - the engine side still needs some debugging. --- .../manual_extraction/manual_extraction.py | 53 +++++-------------- 1 file changed, 12 insertions(+), 41 deletions(-) diff --git a/examples/manual_extraction/manual_extraction.py b/examples/manual_extraction/manual_extraction.py index 2088facf5..39018a54e 100644 --- a/examples/manual_extraction/manual_extraction.py +++ b/examples/manual_extraction/manual_extraction.py @@ -48,48 +48,13 @@ class ClassInfo: methods: cocoindex.typing.List[MethodInfo] @dataclasses.dataclass -class ManualInfo: +class ModuleInfo: title: str description: str classes: cocoindex.typing.Table[ClassInfo] methods: cocoindex.typing.Table[MethodInfo] -class ExtractManual(cocoindex.op.FunctionSpec): - """Extract manual information from a Markdown.""" - -@cocoindex.op.executor_class() -class ExtractManualExecutor: - """Executor for ExtractManual.""" - - spec: ExtractManual - - def __call__(self, _markdown: str) -> ManualInfo: - return ManualInfo( - title="title_placeholder", - description="description_placeholder", - classes=[ - ClassInfo( - name="class_name_placeholder", - description="class_description_placeholder", - methods=[ - MethodInfo( - name="method_name_placeholder", - args=[ArgInfo(name="arg_name_placeholder", description="arg_description_placeholder")], - description="method_description_placeholder" - ) - ] - ) - ], - methods=[ - MethodInfo( - name="method_name_placeholder", - args=[ArgInfo(name="arg_name_placeholder", description="arg_description_placeholder")], - description="method_description_placeholder" - ) - ] - ) - class CleanUpManual(cocoindex.op.FunctionSpec): """Clean up manual information.""" @@ -101,9 +66,9 @@ class CleanUpManualExecutor: spec: CleanUpManual - def __call__(self, manual_info: ManualInfo) -> ManualInfo |
None: + def __call__(self, module_info: ModuleInfo) -> ModuleInfo | None: # TODO: Clean up - return manual_info + return module_info @cocoindex.flow_def(name="ManualExtraction") def manual_extraction_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope): @@ -116,9 +81,15 @@ def manual_extraction_flow(flow_builder: cocoindex.FlowBuilder, data_scope: coco with data_scope["documents"].row() as doc: doc["markdown"] = doc["content"].transform(PdfToMarkdown()) - doc["raw_manual_info"] = doc["markdown"].transform(ExtractManual()) - doc["manual_info"] = doc["raw_manual_info"].transform(CleanUpManual()) - manual_infos.collect(filename=doc["filename"], manual_info=doc["manual_info"]) + doc["raw_module_info"] = doc["markdown"].transform( + cocoindex.functions.ExtractByMistral( + model=cocoindex.functions.MistralModelSpec( + model_id="microsoft/Phi-3.5-mini-instruct", + isq_type="Q8_0"), + output_type=cocoindex.typing.encode_enriched_type(ModuleInfo), + instructions="Please extract Python module information from the manual.")) + doc["module_info"] = doc["raw_module_info"].transform(CleanUpManual()) + manual_infos.collect(filename=doc["filename"], module_info=doc["module_info"]) manual_infos.export( "manual_infos",