In [None]:
import sys

sys.path.append('..')

from db import *
from analysis import *
from blackbox.helpers import *

# Implicit Guardrail: Prompt Softening

In [None]:
# Two independent aliases of the moderation table so a single query can carry
# the moderation scores of the original prompt and of the revised prompt
# side by side.
original_moderation = aliased(ModerationRequest)
revised_moderation = aliased(ModerationRequest)

# NOTE(review): sql_norm / sql_similarity come from the star-imported project
# helpers; presumably a vector norm and a similarity measure over the
# category-score vectors -- confirm against their definitions.
original_norm = sql_norm(original_moderation.category_scores)
revised_norm = sql_norm(revised_moderation.category_scores)

# Relative change of the revised prompt's score norm with respect to the
# original prompt's, in percent. NULLIF maps a zero baseline to NULL so the
# division yields NULL instead of raising a division-by-zero error that would
# abort the whole query; AVG ignores NULL inputs, so such rows simply do not
# contribute to the per-language mean.
toxicity_change_pct = (
    (revised_norm - original_norm) / func.nullif(original_norm, 0)
) * 100

# Similarity between the original and the revised score vectors, in percent.
similarity_pct = (
    sql_similarity(
        original_moderation.category_scores,
        revised_moderation.category_scores,
    )
    * 100
)

data = (
    session.query(
        Languages.name.label("Language"),
        func.avg(toxicity_change_pct)
        .cast(Numeric(10, 2))
        .label("Toxicity Change (%)"),
        func.avg(similarity_pct)
        .cast(Numeric(10, 2))
        .label("Similarity (%)"),
    )
    .select_from(ImageCreationRequest)
    # Image request -> translated prompt -> source prompt / language.
    .join(
        TranslatedPrompts,
        ImageCreationRequest.translated_prompt_id == TranslatedPrompts.id,
    )
    .join(Prompts, TranslatedPrompts.prompt_id == Prompts.id)
    .join(Languages, TranslatedPrompts.language_id == Languages.id)
    # Moderation rows are matched by prompt text: the source prompt for the
    # "original" alias, the model-revised prompt for the "revised" alias.
    .join(original_moderation, Prompts.prompt == original_moderation.prompt)
    .join(
        revised_moderation,
        ImageCreationRequest.revised_prompt == revised_moderation.prompt,
    )
    .where(
        ImageCreationRequest.model == "dalle-3",
        # `== True` is intentional: SQLAlchemy overloads `==` to build SQL.
        ImageCreationRequest.success == True,  # noqa: E712
        Languages.name != 'English',
    )
    .group_by(Languages.name)
)

df = query_to_df(data)

# Results

**Description:** This experiment quantifies the effect of the LLM prompt revision prompt as an implicit filtering mechanism in DALL·E 3 using two key metrics: *Toxicity Absolute Change* and *Toxicity Theme Similarity* (§3.3).

**Dataset:** This experiment uses 5 prompts in 4 languages (Hawaiian, Lao, Nepali, Sinhala) for a total of 20 requests to DALL·E 3. Prompts and revised prompts are evaluated for toxicity using the OpenAI Moderation API.

**Success Criteria:** The experiment demonstrates that the prompt revision model "softens" the toxicity of harmful prompts with varying degrees of success across multilingual inputs:
  - *Toxicity Absolute Change* will likely decrease for all languages, but not by the same amount in each language.
  - *Toxicity Theme Similarity* will always decrease, but not by the same amount in each language.

In [None]:
# Bare last expression so the notebook renders the result of the grouped
# per-language query as this cell's output.
df