# Preprocessing Component

## Initializing observability via Phoenix

In [1]:
# One-time setup (uncomment and run once if the packages are missing):
# %pip install arize-phoenix
# %pip install llama-index-callbacks-arize-phoenix

# Observability: launch the Phoenix app, then register it as the global
# LlamaIndex handler so traces are sent to the app's trace endpoint.
import phoenix as px
px.launch_app()

import llama_index.core
llama_index.core.set_global_handler(
    "arize_phoenix",
    endpoint="http://localhost:6006/v1/traces",
)

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


## Visualizing steps of the preprocessing workflow

In [2]:

import os
from llama_index.utils.workflow import draw_all_possible_flows

from evidence_seeker.preprocessing.workflows import (
    PreprocessingWorkflow,
)

# Destination of the generated workflow diagram (relative to the notebook).
workflow_html = "../TMP/PreprocessingWorkflow.html"

# Make sure the output directory exists so that writing the diagram cannot
# fail merely because ../TMP has not been created yet.
os.makedirs(os.path.dirname(workflow_html), exist_ok=True)

# Draw all possible flows of the preprocessing workflow into the HTML file.
draw_all_possible_flows(PreprocessingWorkflow, filename=workflow_html)


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


<class 'NoneType'>
<class 'evidence_seeker.preprocessing.workflows.ListAscriptiveClaimsEvent'>
<class 'llama_index.core.workflow.events.StopEvent'>
<class 'evidence_seeker.preprocessing.workflows.ListDescriptiveClaimsEvent'>
<class 'evidence_seeker.preprocessing.workflows.NegateClaimEvent'>
<class 'evidence_seeker.preprocessing.workflows.StartedNegatingClaims'>
<class 'evidence_seeker.preprocessing.workflows.NegateClaimEvent'>
<class 'evidence_seeker.preprocessing.workflows.StartedNegatingClaims'>
<class 'evidence_seeker.preprocessing.workflows.NegateClaimEvent'>
<class 'evidence_seeker.preprocessing.workflows.StartedNegatingClaims'>
<class 'evidence_seeker.preprocessing.workflows.CollectClarifiedClaimsEvent'>
<class 'evidence_seeker.preprocessing.workflows.ListNormativeClaimsEvent'>
<class 'evidence_seeker.preprocessing.workflows.NormativeAnalysisEvent'>
<class 'evidence_seeker.preprocessing.workflows.DescriptiveAnalysisEvent'>
<class 'evidence_seeker.preprocessing.workflows.Ascriptiv

I0000 00:00:1734196016.452777  549171 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1734196016.491641  549171 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


## Save default config

In [5]:
import pathlib
import yaml

from evidence_seeker import ClaimPreprocessingConfig

# File that receives the default configuration template.
configfile = pathlib.Path("../configs/preprocessing_config_default.yaml")
# Alternative target inside the package's package_data folder:
#configfile = pathlib.Path("../src/evidence_seeker/package_data/config/preprocessing_config.yaml")

# Both example model entries use the same placeholder settings, so they are
# built from a single template.
# NOTE(review): other configs in this notebook use "api_key_name" for the
# name of the env var holding the key, while "api_key" is used with a literal
# value ("not_needed"); confirm that "api_key" is the intended field here.
placeholder_model = {
    "name": "<your_model_name>",
    "description": "<description_of_your_model>",
    "base_url": "<your_base_url>",
    "model": "<your_model_identifier>",
    "backend_type": "<backend_type_of_your_model>",
    "max_tokens": 1024,
    "temperature": 0.2,
    "api_key": "<your_api_key_name>",
    "timeout": 260,
}
placeholder_model_keys = ("<key_model_one>", "<key_model_two>")

default_config = ClaimPreprocessingConfig(
    used_model_key=placeholder_model_keys[0],
    env_file="../.env",
    # Each entry gets its own copy of the template dict.
    models={key: dict(placeholder_model) for key in placeholder_model_keys},
)
default_config_dict = default_config.model_dump()

# Serialise the dumped config to YAML and write it out; the value shown as the
# cell output is the return value of write_text.
configfile.write_text(yaml.dump(default_config_dict))


[32m2025-07-22 07:47:06.419[0m | [1mINFO    [0m | [36mevidence_seeker.preprocessing.config[0m:[36mload_env_file[0m:[36m75[0m - [1mLoaded environment variables from '../.env'[0m


8042

## Running ClaimPreprocessor

### Example inputs

In [1]:
# Plain claims used as example inputs for the preprocessor.
example_inputs = [
    "Die Osterweiterung hat die EU-Institutionen nachhaltig geschwächt.",
    "In den knapp 70 Jahren seit ihrer Gründung hat es in der Bundeswehr immer wieder rechtsextremistische Vorfälle gegeben.",
    "In der Bundeswehr gibt es keinen politischen Extremismus.",
    "Die Bundesrepublik verfolgt eine orientierungslose Anpassungspolitik, weshalb zunehmend andere Staaten und Institutionen die deutsche Außen- und Sicherheitspolitik beeinflussen und steuern.",
]

# Claims that ascribe a statement or attitude to a person.
# Every entry must end with a comma: without it Python silently concatenates
# adjacent string literals and two sentences collapse into one list item.
example_ascriptions = [
    "Petra Krüger behauptet, dass es in der Bundeswehr keinen politischen Extremismus gibt.",
    "Der indische Premierminister Modi hält Putin für seinen Freund.",
    "Heike Krieger zufolge verbietet das humanitäre Völkerrecht Kriegsführung nicht per se.",
    "Jörn Leonhard schreibt, dass die Anzahl hybrider Kriege zugenommen hat.",
    "Laut Andreas Schüller sind die Genfer Konventionen oft hinter ihrem Anspruch, die Zivilbevölkerung zu schützen, zurückgeblieben.",
    "Der Autorin zufolge verbietet das humanitäre Völkerrecht Kriegsführung nicht per se.",
    "Tim Smith schreibt, dass die Anzahl hybrider Kriege zugenommen hat.",
    "Laut dem Autor sind die Genfer Konventionen oft hinter ihrem Anspruch, die Zivilbevölkerung zu schützen, zurückgeblieben.",
    "Claudia Gatzka schreibt, dass sich ihre Einstellung zur parteipolitischen Lage in Deutschland mit ihrem großen Freundeskreis deckt.",
    "Die Autorin schreibt, dass sich ihre Einstellung zur parteipolitischen Lage in Deutschland mit ihrem großen Freundeskreis deckt.",
]


### Initializing config object

In [1]:
from evidence_seeker import ClaimPreprocessingConfig

# Reset the preprocessor so that the "Running the preprocessor" cell builds a
# new one from the config defined below.
preprocessor = None

# Local model behind an OpenAI-compatible endpoint.
lmstudio_model = dict(
    name="mllama-3.2-1b-instruct",
    description="Local model served via LMStudio",
    base_url="http://127.0.0.1:1234/v1/",
    model="llama-3.2-1b-instruct",
    backend_type="openai",
    max_tokens=1024,
    temperature=0.2,
    api_key="not_needed",
    timeout=260,
)

# Hosted models: "api_key_name" holds the name of the key to look up, not the
# key itself.
together_ai_model = dict(
    name="Meta-Llama-3-Instruct",
    description="Model served via Together.ai over HuggingFace",
    base_url="https://router.huggingface.co/together/v1",
    model="meta-llama/Llama-3.2-3B-Instruct-Turbo",
    api_key_name="hf_debatelab_inference_provider",
    backend_type="openai",
    default_headers={"X-HF-Bill-To": "DebateLabKIT"},
    max_tokens=1024,
    temperature=0.2,
    timeout=260,
)

hf_inference_api_model = dict(
    name="meta-llama/Llama-3.1-8B-Instruct",
    description="Model served over HuggingFace Inference API",
    base_url="https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.1-8B-Instruct/v1",
    model="meta-llama/Llama-3.1-8B-Instruct",
    api_key_name="hf_debatelab_inference_provider",
    backend_type="tgi",
    default_headers={"X-HF-Bill-To": "DebateLabKIT"},
    max_tokens=1024,
    temperature=0.2,
    timeout=260,
)

# `used_model_key` selects which entry of `models` is active.
config = ClaimPreprocessingConfig(
    timeout=1200,
    env_file="../.env",
    used_model_key="together.ai",
    models={
        "lmstudio": lmstudio_model,
        "together.ai": together_ai_model,
        "hf_inference_api": hf_inference_api_model,
    },
)

[32m2025-07-22 11:41:52.402[0m | [1mINFO    [0m | [36mevidence_seeker.preprocessing.config[0m:[36mload_env_file[0m:[36m77[0m - [1mLoaded environment variables from '../.env'[0m


### Instantiating preprocessor from yaml config file

In [4]:
from evidence_seeker import ClaimPreprocessor

# YAML configuration to load; this is the same path the "Save default config"
# cell writes its template to.
config_file = "../configs/preprocessing_config_default.yaml"

# Build the preprocessor directly from the file instead of from a config object.
preprocessor = ClaimPreprocessor.from_config_file(config_file)



### Loading config from a yaml string

In [4]:
from evidence_seeker import ClaimPreprocessingConfig
import yaml

# Configuration supplied inline as a multiline YAML document.
yaml_string = """
timeout: 1200
env_file: ../.env
used_model_key: together.ai
models:
  together.ai:
    name: Meta-Llama-3-Instruct
    description: Model served via Together.ai over HuggingFace
    base_url: https://router.huggingface.co/together/v1
    model: meta-llama/Llama-3.2-3B-Instruct-Turbo
    api_key_name: hf_debatelab_inference_provider
    backend_type: openai
    default_headers: 
      X-HF-Bill-To: DebateLabKIT
    max_tokens: 2048
    temperature: 0.2
    timeout: 260
  thages:
    name: Meta-Llama-3-Instruct
    description: Thages CompPhil²MMAE model (can only be used within KIT net)
    base_url: http://thages.philosophie.kit.edu:8080/v1
    model: meta-llama/Llama-3.1-8B-Instruct
    api_key_name: thages
    backend_type: tgi
    max_tokens: 1024
    temperature: 0.2
    timeout: 260

"""

# Parse the YAML into a plain mapping, then pass its top-level entries to the
# config class as keyword arguments.
parsed_settings = yaml.safe_load(yaml_string)
config = ClaimPreprocessingConfig(**parsed_settings)

# Reset the preprocessor so that the "Running the preprocessor" cell builds a
# new one from this config.
preprocessor = None

# Parameters that are not specified in the YAML string will be set to their default values
for field_name in ("system_prompt", "used_model_key"):
    print(getattr(config, field_name))
print(config.models["together.ai"])



[32m2025-07-22 12:53:15.219[0m | [1mINFO    [0m | [36mevidence_seeker.preprocessing.config[0m:[36mload_env_file[0m:[36m77[0m - [1mLoaded environment variables from '../.env'[0m


You are a helpful assistant with outstanding expertise in critical thinking and logico-semantic analysis. 
You have a background in philosophy and experience in fact checking and debate analysis.
You read instructions carefully and follow them precisely. You give concise and clear answers.
together.ai
{'name': 'Meta-Llama-3-Instruct', 'description': 'Model served via Together.ai over HuggingFace', 'base_url': 'https://router.huggingface.co/together/v1', 'model': 'meta-llama/Llama-3.2-3B-Instruct-Turbo', 'api_key_name': 'hf_debatelab_inference_provider', 'backend_type': 'openai', 'default_headers': {'X-HF-Bill-To': 'DebateLabKIT'}, 'max_tokens': 2048, 'temperature': 0.2, 'timeout': 260}


### Running the preprocessor

In [3]:
from evidence_seeker import ClaimPreprocessor
import asyncio

if preprocessor is None:
    preprocessor = ClaimPreprocessor(config=config)

coros = [preprocessor(claim=claim) for claim in example_inputs[:1]]
results = await asyncio.gather(*coros)

[32m2025-07-22 11:15:23.051[0m | [34m[1mDEBUG   [0m | [36mevidence_seeker.preprocessing.workflows[0m:[36mascriptive_analysis[0m:[36m195[0m - [34m[1mAnalysing ascriptive aspects of claim 'Die Osterweiterung hat die EU-Institutionen nachhaltig geschwächt.'.[0m
[32m2025-07-22 11:15:23.058[0m | [34m[1mDEBUG   [0m | [36mevidence_seeker.backend[0m:[36mget_openai_llm[0m:[36m236[0m - [34m[1mFetching api key via env var: hf_debatelab_inference_provider[0m
[32m2025-07-22 11:15:23.062[0m | [34m[1mDEBUG   [0m | [36mevidence_seeker.backend[0m:[36mget_openai_llm[0m:[36m249[0m - [34m[1mInstantiating OpenAILike model (model: meta-llama/Llama-3.2-3B-Instruct-Turbo,base_url: https://router.huggingface.co/together/v1).[0m
[32m2025-07-22 11:15:23.185[0m | [34m[1mDEBUG   [0m | [36mevidence_seeker.preprocessing.workflows[0m:[36mdescriptive_analysis[0m:[36m136[0m - [34m[1mAnalysing descriptive aspects of claim 'Die Osterweiterung hat die EU-Institutionen

In [8]:
from pprint import pprint

# `results` holds one collection of clarified claims per input claim; each
# collection is dumped to plain dicts and pretty-printed as one list.
for clarified_claims in results:
    dumped_claims = [claim.model_dump() for claim in clarified_claims]
    pprint(dumped_claims)

[{'average_confirmation': None,
  'confirmation_by_document': None,
  'documents': None,
  'evidential_uncertainty': None,
  'metadata': {},
  'n_evidence': None,
  'negation': 'Die Osterweiterung hat die EU-Institutionen nicht geschwächt',
  'statement_type': <StatementType.NORMATIVE: 'normative'>,
  'text': 'Die Osterweiterung hat die EU-Institutionen nachhaltig geschwächt',
  'uid': 'eba87f7e-f9a2-47ab-965a-bd180d42c42e',
  'verbalized_confirmation': None},
 {'average_confirmation': None,
  'confirmation_by_document': None,
  'documents': None,
  'evidential_uncertainty': None,
  'metadata': {},
  'n_evidence': None,
  'negation': 'Die EU-Institutionen sind stärker.',
  'statement_type': <StatementType.NORMATIVE: 'normative'>,
  'text': 'Die EU-Institutionen sind weniger stark',
  'uid': '24ed8673-6dcc-4f29-9ee5-afcd296a4052',
  'verbalized_confirmation': None},
 {'average_confirmation': None,
  'confirmation_by_document': None,
  'documents': None,
  'evidential_uncertainty': None,