In [None]:
!pip install python-docx trafilatura markdown-it-py mdit_plain pypdf python-pptx openpyxl nltk

# SuperComponents

SuperComponents generally behave like any other component: they have init params and the usual `from_dict()` and `to_dict()` methods. The init params typically determine how the internal pipeline is constructed (e.g. which components are used).

In [1]:
from haystack_experimental.super_components.converters.file_converter import MultiFileConverter

In [2]:
# Create the SuperComponent without arguments, i.e. with its default init params.
sc = MultiFileConverter()

In [3]:
# Serialize the component; the resulting dict holds its type and init parameters.
sc.to_dict()

{'type': 'haystack_experimental.super_components.converters.file_converter.MultiFileConverter',
 'init_parameters': {'mime_types': None,
  'split_by': 'word',
  'split_length': 250,
  'split_overlap': 30,
  'split_threshold': 0,
  'splitting_function': None,
  'respect_sentence_boundary': True,
  'language': 'en',
  'use_split_rules': True,
  'extend_abbreviations': True,
  'encoding': 'utf-8',
  'json_content_key': 'content'}}

## Expanding SuperComponents (converting to PipelineWrapper)
What makes SuperComponents special is that they can be expanded by calling their `to_pipeline_wrapper_dict()` method. This converts the component into a generic `PipelineWrapper` that contains the pipeline constructed by the SuperComponent. From there on, the pipeline can be changed in any way.

In [4]:
# Expand the SuperComponent into the serialized form of a generic PipelineWrapper.
sc.to_pipeline_wrapper_dict()

{'type': 'haystack_experimental.components.wrappers.pipeline_wrapper.PipelineWrapper',
 'init_parameters': {'pipeline': {'metadata': {},
   'max_runs_per_component': 100,
   'components': {'CSVToDocument': {'type': 'haystack.components.converters.csv.CSVToDocument',
     'init_parameters': {'encoding': 'utf-8', 'store_full_path': False}},
    'DOCXToDocument': {'type': 'haystack.components.converters.docx.DOCXToDocument',
     'init_parameters': {'table_format': 'csv', 'store_full_path': False}},
    'HTMLToDocument': {'type': 'haystack.components.converters.html.HTMLToDocument',
     'init_parameters': {'extraction_kwargs': {}, 'store_full_path': False}},
    'JSONConverter': {'type': 'haystack.components.converters.json.JSONConverter',
     'init_parameters': {'jq_schema': None,
      'content_key': 'content',
      'extra_meta_fields': None,
      'store_full_path': False}},
    'MarkdownToDocument': {'type': 'haystack.components.converters.markdown.MarkdownToDocument',
     'init_p

In [5]:
from haystack_experimental.components.wrappers.pipeline_wrapper import PipelineWrapper

# Take the expanded dict form of the SuperComponent ...
wrapper_dict = sc.to_pipeline_wrapper_dict()
# ... and rebuild a PipelineWrapper from it; the last expression is displayed.
PipelineWrapper.from_dict(wrapper_dict)

<haystack_experimental.components.wrappers.pipeline_wrapper.PipelineWrapper object at 0x127cf4620>
Inputs:
  - meta: Union[Dict[str, Any], List[Dict[str, Any]]]
  - extraction_kwargs: Optional[Dict[str, Any]]
  - sources: List[Union[str, Path, ByteStream]]
  - documents: List[Document]
  - top_k: Optional[int]
Outputs:
  - documents: List[Document]