In [None]:
from exasol.ai.text.extraction import *
from typing import List

# Default model names passed as ``model_name`` by DefaultExtractor.
# Named-entity step (HftNamedEntitySettings):
NAMED_ENTITY_MODEL = "guishe/nuner-v2_fewnerd_fine_super"
# Topic-classification step (HftTopicSettings):
NLI_MODEL = "tasksource/ModernBERT-large-nli"
# Keyword-search step (PatternRankKeywordSettings).
# TODO: only define once?
FEATURE_EXTRACTION_MODEL = "answerdotai/ModernBERT-large"

def DefaultExtractor(
    ai_lab_config,
    named_entity_recognition_enabled: bool = False,
    topic_classification_enabled: bool = False,
    keyword_search_enabled: bool = False,
    topics: None | List[str] = None,
    parallelism_per_node: int = 1,
    batch_size: int = 100):
    """Build a ``BranchExtractor`` containing one path per enabled step.

    Steps are appended in a fixed order: named entity recognition, topic
    classification, keyword search. Each step uses the module-level default
    model name, and all steps share the same BucketFS connection name and
    sub-directory read from ``ai_lab_config``.

    Args:
        ai_lab_config: Object exposing the attributes ``te_bfs_connection``
            and ``te_models_bfs_dir``.
        named_entity_recognition_enabled: Add a ``NamedEntityExtractor`` path.
        topic_classification_enabled: Add a ``TopicClassifierExtractor`` path.
        keyword_search_enabled: Add a ``KeywordSearchExtractor`` path.
        topics: Topics handed to ``HftTopicSettings``; must be non-empty
            when topic classification is enabled.
        parallelism_per_node: Forwarded unchanged to every extractor.
        batch_size: Forwarded unchanged to every extractor.

    Returns:
        ``BranchExtractor`` whose ``paths`` are the enabled extractors; the
        list is empty when no step is enabled.

    Raises:
        RuntimeError: Topic classification is enabled but ``topics`` is
            ``None`` or empty.
    """
    # Where the models live in BucketFS; identical for every settings object.
    model_location = {
        "bucketfs_conn_name": ai_lab_config.te_bfs_connection,
        "sub_dir": ai_lab_config.te_models_bfs_dir,
    }
    # Execution options; identical for every extractor.
    run_options = {
        "parallelism_per_node": parallelism_per_node,
        "batch_size": batch_size,
    }

    branches = []

    if named_entity_recognition_enabled:
        ner_settings = HftNamedEntitySettings(
            model_name=NAMED_ENTITY_MODEL, **model_location
        )
        branches.append(
            NamedEntityExtractor(named_entity_settings=ner_settings, **run_options)
        )

    if topic_classification_enabled:
        if not topics:
            raise RuntimeError("Topic Classification selected, but no topics are specified.")
        classifier_settings = HftTopicSettings(
            model_name=NLI_MODEL, **model_location, topics=topics
        )
        branches.append(
            TopicClassifierExtractor(topic_settings=classifier_settings, **run_options)
        )

    if keyword_search_enabled:
        search_settings = PatternRankKeywordSettings(
            model_name=FEATURE_EXTRACTION_MODEL, **model_location
        )
        branches.append(
            KeywordSearchExtractor(keyword_settings=search_settings, **run_options)
        )

    return BranchExtractor(paths=branches)