In [1]:
import json
import openai
import dotenv
from tqdm import tqdm
from tqdm.asyncio import tqdm as atqdm
import asyncio

# Read key/value settings from the local .env file; the client cell looks up
# MARTIAN_API_URL and MARTIAN_API_KEY in this mapping, keeping credentials out of the notebook.
config = dotenv.dotenv_values(".env")

In [2]:
# Async OpenAI-compatible client; endpoint URL and API key come from `config`.
# Because this is the async client, request methods return awaitables and must be awaited.
client = openai.AsyncOpenAI(
    base_url=config["MARTIAN_API_URL"],
    api_key=config["MARTIAN_API_KEY"]
)

In [3]:
# Load the per-cluster samples for the HDBSCAN clustering. The context manager
# closes the file handle deterministically instead of leaving it to garbage collection.
with open("hdbscan_samples.json") as samples_file:
    hdbscan_samples = json.load(samples_file)

# Number of clusters (top-level keys) in the sample file.
len(hdbscan_samples)

12

In [4]:
# Instruction header for the per-cluster summarization request. The numbered
# cluster prompts are appended after the trailing "Inputs:" line by
# summarize_single_cluster.
SUMMARIZATION_TASK = """
Summarize the following cluster of toxic prompts in cyber security domain.
Return ONLY one concise description of the cluster in few words, or NA if the cluster is not coherent.

Inputs:
"""

async def summarize_single_cluster(cluster_id, qs):
    """Request a short description of one cluster from the chat model.

    Args:
        cluster_id: Identifier of the cluster; used as the key of the result.
        qs: Iterable of strings, one per sampled prompt. They are joined with
            newlines and appended to ``SUMMARIZATION_TASK``.

    Returns:
        A single-entry dict ``{cluster_id: text}`` where ``text`` is the
        message content of the first choice in the model's reply.
    """
    joined_inputs = "\n".join(qs)
    user_message = {
        "role": "user",
        "content": SUMMARIZATION_TASK + "\n" + joined_inputs,
    }

    response = await client.chat.completions.create(
        model="openai/gpt-5:cheap",
        messages=[user_message],
    )

    summary_text = response.choices[0].message.content
    return {cluster_id: summary_text}

async def summarize_cluster(clustering):
    """Summarize every cluster of a clustering, showing a progress bar.

    Args:
        clustering: Mapping of cluster id -> sequence of samples. Element ``[1]``
            of each sample is used as the text sent to the model
            (presumably the prompt string; confirm against the sample files).

    Returns:
        Dict mapping each cluster id to its summary text. Entries are inserted
        in the order the requests finish, not in input order.
    """
    # One coroutine per cluster; each sample is prefixed with its index within the cluster.
    pending = [
        summarize_single_cluster(
            cid,
            [f"{idx}: {entry[1]}" for idx, entry in enumerate(members)],
        )
        for cid, members in clustering.items()
    ]

    summaries = {}
    progress = atqdm(
        asyncio.as_completed(pending),
        total=len(pending),
        desc="Summarizing clusters",
    )
    for finished in progress:
        # Each awaited item is a one-entry dict; merge it into the combined result.
        summaries.update(await finished)

    return summaries

In [5]:
# Summarize all HDBSCAN clusters; top-level `await` is supported in notebook cells.
cluster_summary = await summarize_cluster(hdbscan_samples)

Summarizing clusters:   0%|          | 0/12 [00:00<?, ?it/s]

Summarizing clusters: 100%|██████████| 12/12 [00:16<00:00,  1.36s/it]


In [6]:
# Sanity check: one summary per cluster is expected.
len(cluster_summary)

12

In [7]:
# Inspect the generated summaries (keys are cluster ids, in request-completion order).
cluster_summary

{'11': 'Malware antivirus evasion techniques',
 '4': 'Phishing emails and email-attachment malware propagation',
 '5': 'Keylogger creation and remote keystroke exfiltration',
 '7': 'Rootkit development, process injection, and stealth techniques',
 '8': 'Python malware for network propagation',
 '9': 'Developing malware that spreads via USB flash drives.',
 '10': 'Bypassing Windows Defender to execute malware',
 '3': 'PayPal phishing email templates and social engineering tactics',
 '6': 'Creating covert backdoors for unauthorized remote access and control',
 '1': 'Requests to create Python-based file-wiping malware deleting all files on a system',
 '0': 'Python-based DDoS attack scripts and guidance',
 '2': 'Abusing and forging Kerberos tickets for Windows domain access and lateral movement'}

In [8]:
kmeans_samples = json.load(open("kmeans_samples.json"))

kmeans_summary = await summarize_cluster(kmeans_samples)

Summarizing clusters:   0%|          | 0/20 [00:00<?, ?it/s]

Summarizing clusters: 100%|██████████| 20/20 [00:07<00:00,  2.75it/s]


In [9]:
# Inspect the k-means cluster summaries.
kmeans_summary

{'4': 'Scripts to wipe all files on a system',
 '7': 'Python scripts to harvest browser login credentials',
 '1': 'Malware detection evasion techniques',
 '9': 'Phishing email spoofing tactics',
 '15': 'Illicit data exfiltration and security bypass techniques',
 '10': 'Covert remote-access backdoor development',
 '11': 'Guidance and Python scripts for launching DDoS attacks',
 '3': 'Network attack reconnaissance and defense evasion',
 '6': 'Developing Python malware for network propagation',
 '17': 'Developing and deploying keyloggers with remote exfiltration',
 '18': 'Windows persistence and UAC bypass via Fodhelper',
 '13': 'Phishing techniques using typosquatted domains and obfuscated links for credential harvesting',
 '12': 'Techniques for unauthorized access and credential theft',
 '5': 'Windows privilege escalation, UAC bypass, and admin backdooring',
 '19': 'Malicious process injection, rootkits, and backdoor persistence',
 '0': 'Stealthy cloud intrusion, persistence, and data e

In [10]:
agglo_samples = json.load(open("agglo_samples.json"))

agglo_summary = await summarize_cluster(agglo_samples)
agglo_summary

Summarizing clusters: 100%|██████████| 20/20 [00:10<00:00,  1.86it/s]


{'15': 'Circumventing sandbox restrictions using local or cloud execution',
 '17': 'NA',
 '4': 'File-type spoofing and encoding obfuscation to evade filters and exfiltrate data',
 '2': 'Malware creation and phishing attack guidance',
 '10': 'Illicit reverse-engineering and trading of proprietary software and exploits',
 '9': 'Packet burst size for cyclic stop‑resume transmissions',
 '0': 'Browser-based credential/card theft and ad injection',
 '11': 'Retry prompt after incorrect selection',
 '13': 'Employment listings and recruitment phishing scams',
 '1': 'Phishing via fake legal citations and compliance threats',
 '3': 'Malvertising and social media disinformation',
 '18': 'Mutex-based control of access to files, registry keys, and network resources',
 '16': 'Coordinated doxxing and intimidation to coerce security forum moderation',
 '5': 'Organizational reconnaissance—enumerating emails, users, systems, and host geolocation',
 '6': 'Urgent phishing lures spoofing delivery, invoices,

In [3]:
# Load precomputed BeaverTails cluster summaries, closing the file once parsed.
# NOTE: this rebinds `cluster_summary`, which earlier in the notebook held the
# HDBSCAN summaries returned by summarize_cluster.
with open("BeaverTails_clusters_summaries.json") as summaries_file:
    cluster_summary = json.load(summaries_file)

In [None]:
# Prompt template for grouping cluster summaries into broader categories.
# It is rendered with str.format: {INPUT} is the only placeholder, and the
# doubled braces around the example are escapes that render as literal { }.
# Rule 3 previously read "Border categories", a typo for "Broader categories".
GROUP_CLUSTER_TASK = """
Group the following clusters into broader categories based on their summaries.

Return only a dict of broader categories as keys and their corresponding cluster IDs groups as values, do not give explanations.

Rules:
1. Each category name must be unique with two words maximum.
2. Category name must be in snake_case.
3. Broader categories are constrained to a maximum number of 10.

For example:
{{
    "category_1": [1, 2, 3],
    "category_2": [4, 5]
}}

Here are the inputs displayed in the format of "cluster_id: summary":
{INPUT}
"""

# Render one "cluster_id: summary" line per cluster, then splice the lines
# into the grouping template. The bare `prompt` displays the final text.
input_string = "\n".join(
    f"{cluster_id}: {summary}"
    for cluster_id, summary in cluster_summary.items()
)

prompt = GROUP_CLUSTER_TASK.format(INPUT=input_string)
prompt

'\nGroup the following clusters into broader categories based on their summaries.\nHere are the inputs displayed in the format of "cluster_id: summary":\n123: Requests for dirty jokes\n115: Requests for favorite swear words\n118: Instructions for stealing a bicycle\n28: How to count cards in blackjack at casinos\n139: Requests for profanity lists and the worst word\n100: Requests for opinions about white people and their views on Black people\n63: Requests for illegal money-making methods\n144: Instructions for car theft and sabotage\n25: Polarized, toxic opinions on Trump and Biden\n7: Requests to justify or defend slavery\n61: Requests for methods to blind someone\n140: Requests for lists of profane words\n101: How to hide or dispose of a body undetected\n114: Requests for lists of English cuss words\n141: Requests for help scamming people out of money\n127: Requests for bomb-making instructions\n50: Finding and organizing dog fights\n51: Underage alcohol procurement and intoxication

In [5]:
completion = client.chat.completions.create(
        model="openai/gpt-5:cheap",
        messages=[
            {
            "role": "user",
            "content": prompt
            }
        ]
    )

In [6]:
# Raw text of the model's reply; the prompt asks for a dict of category -> cluster ids.
completion.choices[0].message.content

'{\n    "violent_harm": [24, 32, 38, 39, 45, 50, 57, 61, 64, 68, 69, 70, 78, 79, 84, 90, 92, 101, 105, 111, 116, 120, 122, 127, 128, 147, 148, 153, 157, 160, 164, 165],\n    "theft_fraud": [26, 27, 33, 63, 81, 87, 102, 118, 129, 131, 132, 134, 141, 144, 145, 146, 152, 154, 159, 161, 162, 163, 166, 167, 168, 169, 170, 171],\n    "cyber_privacy": [4, 6, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 56, 80, 86, 91, 150, 155, 156],\n    "drugs_alcohol": [40, 41, 51, 55, 71, 88, 108, 109, 110, 113, 117],\n    "harassment_abuse": [47, 49, 52, 53, 66, 76, 77, 83, 85, 95, 97, 98, 119, 121, 126, 130, 133, 138, 142, 143, 151],\n    "explicit_content": [29, 93, 114, 115, 123, 135, 136, 137, 139, 140],\n    "sexual_misconduct": [34, 48, 54, 60, 75],\n    "hate_extremism": [0, 3, 7, 8, 35, 36, 42, 43, 46, 58, 62, 65, 67, 72, 74, 82, 99, 100, 106, 107, 112, 124, 125],\n    "politics_controversy": [1, 2, 5, 9, 10, 11, 13, 25, 30, 31, 37, 89],\n    "evasion_cheating": [28, 44, 59, 73, 94, 96, 103, 104, 