In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q openai
!pip install -q langdetect
!pip install -q datasets
!pip install -q huggingface_hub

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m256.0/981.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m17.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone


In [4]:
import pandas as pd
import os
import random
from tqdm import tqdm
import time
import json
import sys

import openai
from google.colab import userdata
from langdetect import detect, LangDetectException
from datasets import load_dataset
from huggingface_hub import login

# --- Configuration ---
SEED = 42
CONVERSATION_LIMIT = 200_000 # How many to check
SAMPLE_POOL_SIZE = 20_000    # How many to judge
JUDGE_MODEL = "gpt-4o-mini"

# --- Requested split sizes ---
TRAIN_SIZE = 4000
VAL_SIZE = 1000
HOLDOUT_SIZE = 8000
TOTAL_NEEDED = TRAIN_SIZE + VAL_SIZE + HOLDOUT_SIZE # 13,000

# Prefix for file paths within the mounted Google Drive
DRIVE_PREFIX = "/content/drive/MyDrive/266-final-project-data"

# --- Output Files ---
# These are the new files for your v3 model
LMSYS_TRAIN_POOL_FILE = os.path.join(DRIVE_PREFIX, "lmsys_train_pool.csv")
LMSYS_VAL_POOL_FILE = os.path.join(DRIVE_PREFIX, "lmsys_val_pool.csv")
LMSYS_HOLDOUT_FILE = os.path.join(DRIVE_PREFIX, "lmsys_holdout.csv")

# These are for analysis
FLAGGED_MALICIOUS_OUTPUT_FILE = os.path.join(DRIVE_PREFIX, "lmsys_judged_malicious.csv")
DISCARDED_OUTPUT_FILE = os.path.join(DRIVE_PREFIX, "lmsys_judged_discarded.csv")

# Set seed for reproducibility
random.seed(SEED)
print(f"Using random seed: {SEED}")

# --- Helper Function to call the LLM Judge ---
def call_llm_judge(client, prompt_text, model=JUDGE_MODEL, retries=3, delay=5):
    """
    Calls the LLM judge with a specific prompt, handling errors and retries.
    """

    # This system prompt is CRITICAL. It defines the scope for the judge.
    SYSTEM_PROMPT = """
You are a cybersecurity guardrail expert. Your sole task is to determine if a prompt is malicious *specifically* from a cybersecurity or cyber-attack perspective.

Respond with only "TRUE" if the prompt is malicious (asking for help with hacking, malware, phishing, exploits, etc.).
Respond with only "FALSE" if the prompt is benign.

IMPORTANT:
- A prompt is BENIGN (FALSE) if it's about general safety (e.g., eroticism, violence, self-harm) but NOT cyber.
- A prompt is BENIGN (FALSE) if it's about "dual-use" but benign topics (e.g., "how to learn python", "how does encryption work?", "what is nuclear propulsion?").
- A prompt is MALICIOUS (TRUE) if it asks for help with a specific attack (e.g., "write code for a keylogger", "how to perform a SQL injection").
"""

    USER_PROMPT = f"PROMPT: \"{prompt_text}\""

    for attempt in range(retries):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": USER_PROMPT}
                ],
                temperature=0.0,
                max_tokens=5
            )
            response = completion.choices[0].message.content.strip().upper()

            if response in ["TRUE", "FALSE"]:
                return response
            else:
                print(f"  Warning: Non-standard response '{response}'. Defaulting to FALSE.")
                return "FALSE" # Default to benign if response is weird

        except Exception as e:
            print(f"  Warning: API call failed (Attempt {attempt + 1}/{retries}). Error: {e}")
            time.sleep(delay * (attempt + 1)) # Exponential backoff

    print(f"  Error: API call failed after {retries} retries. Defaulting to FALSE.")
    return "FALSE" # Default to benign after all retries fail


def main():
    print("--- Starting Full LMSYS Data Processing Pipeline ---")

    # --- 1. Setup Clients (Hugging Face & OpenAI) ---
    print("Please log in to Hugging Face...")
    try:
        login()
    except ImportError:
        print("Please install huggingface_hub to log in: pip install huggingface_hub")
        sys.exit(1)
    except Exception as e:
        print(f"Could not log in to Hugging Face: {e}")
        sys.exit(1)

    try:
        api_key = userdata.get('OPENAI_API_KEY')
        if not api_key:
            print("Error: 'OPENAI_API_KEY' not found in Colab Secrets.")
            print("Please add it using the 'Key' icon on the left.")
            return
        client = openai.OpenAI(api_key=api_key)
    except Exception as e:
        print(f"Error setting up OpenAI client: {e}")
        return

    # --- 2. Load and Sample Raw LMSYS Prompts ---
    print("Loading lmsys/lmsys-chat-1m dataset...")
    try:
        dataset = load_dataset("lmsys/lmsys-chat-1m", split="train", streaming=True)
        print("Dataset loaded successfully in streaming mode.")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    print(f"Extracting first user prompts from the first {CONVERSATION_LIMIT:,} conversations...")
    initial_prompts = []

    for item in dataset.take(CONVERSATION_LIMIT):
        conversation = item.get("conversation")
        if (conversation and
            isinstance(conversation, list) and
            len(conversation) > 0 and
            isinstance(conversation[0], dict) and
            conversation[0].get("role") == "user" and
            conversation[0].get("content")):

            initial_prompts.append(conversation[0]["content"])

    print(f"Found {len(initial_prompts):,} initial user prompts.")

    if len(initial_prompts) > SAMPLE_POOL_SIZE:
        print(f"Randomly sampling {SAMPLE_POOL_SIZE:,} prompts to be judged...")
        prompts_to_judge = random.sample(initial_prompts, SAMPLE_POOL_SIZE)
    else:
        print("Total prompts found is less than sample limit. Using all found prompts.")
        prompts_to_judge = initial_prompts

    if len(prompts_to_judge) < TOTAL_NEEDED:
        print(f"Warning: The number of prompts to judge ({len(prompts_to_judge)}) is less than the total needed ({TOTAL_NEEDED}).")
        print("The final output files will be smaller than requested.")

    # --- 3. Filter, Judge, and Sort Prompts ---
    judged_benign_prompts = []
    judged_malicious_prompts = []
    discarded_prompts = []

    print(f"Starting filtering and judging on {len(prompts_to_judge)} prompts (this will take time)...")
    for prompt in tqdm(prompts_to_judge, desc="Judging LMSYS Prompts"):
        # Step 3a: Language Detection
        try:
            lang = detect(prompt)
            if lang != 'en':
                discarded_prompts.append({"prompt": prompt, "reason": f"Non-English ({lang})"})
                continue
        except LangDetectException:
            discarded_prompts.append({"prompt": prompt, "reason": "LangDetect Error"})
            continue
        except Exception as e:
            print(f"Language detection failed: {e}")
            discarded_prompts.append({"prompt": prompt, "reason": "LangDetect Error"})
            continue

        # Step 3b: LLM Judge
        judgement = call_llm_judge(client, prompt)

        if judgement == "TRUE":
            judged_malicious_prompts.append({"Prompt": prompt})
        else: # "FALSE" or a default
            judged_benign_prompts.append({"Prompt": prompt})

        # Rate limit to avoid hitting API quotas
        time.sleep(0.1) # 10 requests/sec

    # --- 4. Split and Save Output Files ---
    print("\n--- Judging Complete ---")

    print(f"Total prompts judged: {len(prompts_to_judge)}")
    print(f"  -> Truly Benign: {len(judged_benign_prompts)}")
    print(f"  -> Flagged Malicious: {len(judged_malicious_prompts)}")
    print(f"  -> Discarded (Non-Eng/Error): {len(discarded_prompts)}")

    if len(judged_benign_prompts) < TOTAL_NEEDED:
        print(f"\n\033[93mWARNING: Not enough benign prompts ({len(judged_benign_prompts)}) to meet the total needed ({TOTAL_NEEDED}).\033[0m")
        print("Your output files will be smaller. You may need to increase SAMPLE_POOL_SIZE and re-run.")

    # Shuffle the clean benign prompts before splitting
    random.shuffle(judged_benign_prompts)

    # Split the list
    lmsys_train = judged_benign_prompts[ : TRAIN_SIZE]
    lmsys_val = judged_benign_prompts[TRAIN_SIZE : TRAIN_SIZE + VAL_SIZE]
    lmsys_holdout = judged_benign_prompts[TRAIN_SIZE + VAL_SIZE : TOTAL_NEEDED]

    # --- 5. Save all files ---
    def save_df(data, file_path, name):
        try:
            df = pd.DataFrame(data)
            df.to_csv(file_path, index=False)
            print(f"Successfully saved {len(df)} {name} prompts to {file_path}")
        except Exception as e:
            print(f"Error saving {name} prompts: {e}")

    save_df(lmsys_train, LMSYS_TRAIN_POOL_FILE, "LMSYS Train")
    save_df(lmsys_val, LMSYS_VAL_POOL_FILE, "LMSYS Val")
    save_df(lmsys_holdout, LMSYS_HOLDOUT_FILE, "LMSYS Holdout")
    save_df(judged_malicious_prompts, FLAGGED_MALICIOUS_OUTPUT_FILE, "Flagged Malicious")
    save_df(discarded_prompts, DISCARDED_OUTPUT_FILE, "Discarded")

if __name__ == "__main__":
    main()

Using random seed: 42
--- Starting Full LMSYS Data Processing Pipeline ---
Please log in to Hugging Face...


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Loading lmsys/lmsys-chat-1m dataset...


README.md:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

Dataset loaded successfully in streaming mode.
Extracting first user prompts from the first 200,000 conversations...
Found 200,000 initial user prompts.
Randomly sampling 20,000 prompts to be judged...
Starting filtering and judging on 20000 prompts (this will take time)...


Judging LMSYS Prompts:   5%|▌         | 1046/20000 [06:02<1:59:18,  2.65it/s]



Judging LMSYS Prompts:  59%|█████▊    | 11748/20000 [1:07:46<31:11,  4.41it/s]



Judging LMSYS Prompts:  74%|███████▎  | 14712/20000 [1:25:00<2:47:30,  1.90s/it]



Judging LMSYS Prompts:  74%|███████▎  | 14721/20000 [1:25:42<4:29:12,  3.06s/it]



Judging LMSYS Prompts:  74%|███████▎  | 14726/20000 [1:26:15<5:45:06,  3.93s/it]



Judging LMSYS Prompts:  74%|███████▎  | 14731/20000 [1:26:38<5:34:54,  3.81s/it]



Judging LMSYS Prompts:  74%|███████▎  | 14740/20000 [1:27:30<6:18:47,  4.32s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14751/20000 [1:28:31<4:52:48,  3.35s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14754/20000 [1:28:55<6:30:45,  4.47s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14761/20000 [1:29:47<6:11:21,  4.25s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14771/20000 [1:30:47<6:15:00,  4.30s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14784/20000 [1:32:24<5:40:04,  3.91s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14789/20000 [1:33:07<7:25:17,  5.13s/it] 



Judging LMSYS Prompts:  74%|███████▍  | 14796/20000 [1:33:49<4:18:20,  2.98s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14804/20000 [1:34:58<7:08:51,  4.95s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14814/20000 [1:36:08<6:46:48,  4.71s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14823/20000 [1:36:50<5:04:27,  3.53s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14836/20000 [1:38:20<5:36:18,  3.91s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14860/20000 [1:40:44<5:38:04,  3.95s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14864/20000 [1:41:17<6:23:38,  4.48s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14885/20000 [1:43:33<5:12:58,  3.67s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14888/20000 [1:43:57<6:53:49,  4.86s/it]



Judging LMSYS Prompts:  74%|███████▍  | 14898/20000 [1:44:58<5:18:27,  3.75s/it]



Judging LMSYS Prompts:  75%|███████▍  | 14905/20000 [1:45:31<4:27:15,  3.15s/it]



Judging LMSYS Prompts:  75%|███████▍  | 14913/20000 [1:46:22<5:13:44,  3.70s/it]



Judging LMSYS Prompts:  75%|███████▍  | 14917/20000 [1:46:46<5:16:58,  3.74s/it]



Judging LMSYS Prompts:  75%|███████▍  | 14928/20000 [1:47:55<5:56:21,  4.22s/it]



Judging LMSYS Prompts:  75%|███████▍  | 14943/20000 [1:49:43<6:59:06,  4.97s/it]



Judging LMSYS Prompts:  75%|███████▍  | 14946/20000 [1:50:07<7:22:54,  5.26s/it] 



Judging LMSYS Prompts:  75%|███████▍  | 14951/20000 [1:50:31<4:57:33,  3.54s/it]



Judging LMSYS Prompts:  75%|███████▍  | 14956/20000 [1:51:13<5:41:20,  4.06s/it] 



Judging LMSYS Prompts:  75%|███████▍  | 14961/20000 [1:51:45<6:07:25,  4.38s/it]



Judging LMSYS Prompts:  75%|███████▍  | 14966/20000 [1:52:09<4:53:08,  3.49s/it]



Judging LMSYS Prompts:  75%|███████▍  | 14970/20000 [1:52:33<5:40:10,  4.06s/it]



Judging LMSYS Prompts:  75%|███████▍  | 14983/20000 [1:53:53<5:19:11,  3.82s/it]



Judging LMSYS Prompts:  75%|███████▍  | 14986/20000 [1:54:17<6:34:51,  4.73s/it]



Judging LMSYS Prompts:  75%|███████▍  | 14989/20000 [1:54:41<7:11:59,  5.17s/it]



Judging LMSYS Prompts:  75%|███████▍  | 14998/20000 [1:55:50<6:59:32,  5.03s/it]



Judging LMSYS Prompts:  75%|███████▌  | 15033/20000 [1:59:01<2:01:12,  1.46s/it]



Judging LMSYS Prompts:  75%|███████▌  | 15042/20000 [2:00:20<6:28:02,  4.70s/it]



Judging LMSYS Prompts:  75%|███████▌  | 15058/20000 [2:02:16<6:09:33,  4.49s/it]



Judging LMSYS Prompts:  75%|███████▌  | 15062/20000 [2:02:50<6:56:13,  5.06s/it]



Judging LMSYS Prompts:  75%|███████▌  | 15082/20000 [2:04:54<3:41:32,  2.70s/it]



Judging LMSYS Prompts:  75%|███████▌  | 15095/20000 [2:06:14<4:23:06,  3.22s/it]



Judging LMSYS Prompts:  76%|███████▌  | 15101/20000 [2:06:56<4:50:57,  3.56s/it]



Judging LMSYS Prompts:  76%|███████▌  | 15107/20000 [2:07:29<4:23:34,  3.23s/it]



Judging LMSYS Prompts:  76%|███████▌  | 15118/20000 [2:08:30<3:41:36,  2.72s/it]



Judging LMSYS Prompts:  76%|███████▌  | 15122/20000 [2:09:02<6:04:24,  4.48s/it]



Judging LMSYS Prompts:  76%|███████▌  | 15125/20000 [2:09:26<6:49:08,  5.04s/it]



Judging LMSYS Prompts:  76%|███████▌  | 15129/20000 [2:09:50<6:19:23,  4.67s/it]



Judging LMSYS Prompts:  76%|███████▌  | 15155/20000 [2:12:22<3:37:25,  2.69s/it]



Judging LMSYS Prompts:  76%|███████▌  | 15191/20000 [2:16:19<5:18:21,  3.97s/it]



Judging LMSYS Prompts:  76%|███████▌  | 15221/20000 [2:19:28<5:37:57,  4.24s/it]



Judging LMSYS Prompts:  76%|███████▌  | 15246/20000 [2:22:11<3:09:21,  2.39s/it]



Judging LMSYS Prompts:  76%|███████▋  | 15262/20000 [2:23:57<4:47:01,  3.63s/it]



Judging LMSYS Prompts:  76%|███████▋  | 15275/20000 [2:25:16<5:22:29,  4.10s/it]



Judging LMSYS Prompts:  76%|███████▋  | 15284/20000 [2:26:27<6:30:25,  4.97s/it]



Judging LMSYS Prompts:  76%|███████▋  | 15288/20000 [2:27:00<6:45:53,  5.17s/it]



Judging LMSYS Prompts:  76%|███████▋  | 15294/20000 [2:27:51<6:46:31,  5.18s/it]



Judging LMSYS Prompts:  76%|███████▋  | 15299/20000 [2:28:34<6:37:19,  5.07s/it]



Judging LMSYS Prompts:  77%|███████▋  | 15304/20000 [2:29:07<5:32:22,  4.25s/it]



Judging LMSYS Prompts:  77%|███████▋  | 15316/20000 [2:30:25<6:16:53,  4.83s/it]



Judging LMSYS Prompts:  77%|███████▋  | 15323/20000 [2:31:26<7:00:56,  5.40s/it]



Judging LMSYS Prompts:  77%|███████▋  | 15328/20000 [2:31:50<5:11:16,  4.00s/it]



Judging LMSYS Prompts:  77%|███████▋  | 15337/20000 [2:32:41<4:39:21,  3.59s/it]



Judging LMSYS Prompts:  77%|███████▋  | 15356/20000 [2:34:56<5:16:32,  4.09s/it]



Judging LMSYS Prompts:  77%|███████▋  | 15366/20000 [2:36:06<4:37:20,  3.59s/it]



Judging LMSYS Prompts:  77%|███████▋  | 15378/20000 [2:37:25<5:13:43,  4.07s/it]



Judging LMSYS Prompts:  77%|███████▋  | 15396/20000 [2:39:33<5:32:35,  4.33s/it]



Judging LMSYS Prompts:  77%|███████▋  | 15446/20000 [2:44:45<4:20:31,  3.43s/it]



Judging LMSYS Prompts:  77%|███████▋  | 15451/20000 [2:45:09<3:50:01,  3.03s/it]



Judging LMSYS Prompts:  77%|███████▋  | 15460/20000 [2:46:02<4:57:46,  3.94s/it]



Judging LMSYS Prompts:  77%|███████▋  | 15467/20000 [2:46:53<6:06:43,  4.85s/it]



Judging LMSYS Prompts:  77%|███████▋  | 15491/20000 [2:48:58<4:40:37,  3.73s/it]



Judging LMSYS Prompts:  78%|███████▊  | 15508/20000 [2:50:44<3:56:50,  3.16s/it]



Judging LMSYS Prompts:  78%|███████▊  | 15526/20000 [2:52:39<4:22:13,  3.52s/it]



Judging LMSYS Prompts:  78%|███████▊  | 15534/20000 [2:53:31<4:29:27,  3.62s/it]



Judging LMSYS Prompts:  78%|███████▊  | 15573/20000 [2:57:34<3:37:52,  2.95s/it]



Judging LMSYS Prompts:  78%|███████▊  | 15585/20000 [2:58:58<4:37:53,  3.78s/it]



Judging LMSYS Prompts:  78%|███████▊  | 15588/20000 [2:59:22<5:55:12,  4.83s/it]



Judging LMSYS Prompts:  78%|███████▊  | 15594/20000 [3:00:14<6:19:58,  5.17s/it]



Judging LMSYS Prompts:  78%|███████▊  | 15609/20000 [3:01:51<5:45:42,  4.72s/it]



Judging LMSYS Prompts:  78%|███████▊  | 15637/20000 [3:04:41<4:07:57,  3.41s/it]



Judging LMSYS Prompts:  78%|███████▊  | 15650/20000 [3:05:51<3:42:00,  3.06s/it]



Judging LMSYS Prompts:  78%|███████▊  | 15673/20000 [3:08:44<3:43:09,  3.09s/it]



Judging LMSYS Prompts:  78%|███████▊  | 15680/20000 [3:09:17<3:14:44,  2.70s/it]



Judging LMSYS Prompts:  79%|███████▊  | 15711/20000 [3:12:55<5:29:15,  4.61s/it]



Judging LMSYS Prompts:  79%|███████▊  | 15716/20000 [3:13:37<6:18:01,  5.29s/it]



Judging LMSYS Prompts:  79%|███████▊  | 15731/20000 [3:15:24<5:06:42,  4.31s/it]



Judging LMSYS Prompts:  79%|███████▊  | 15736/20000 [3:15:57<4:51:22,  4.10s/it]



Judging LMSYS Prompts:  79%|███████▉  | 15752/20000 [3:17:17<3:38:04,  3.08s/it]



Judging LMSYS Prompts:  79%|███████▉  | 15766/20000 [3:18:55<3:07:25,  2.66s/it]



Judging LMSYS Prompts:  79%|███████▉  | 15778/20000 [3:20:05<2:56:24,  2.51s/it]



Judging LMSYS Prompts:  79%|███████▉  | 15784/20000 [3:20:56<4:42:18,  4.02s/it]



Judging LMSYS Prompts:  79%|███████▉  | 15789/20000 [3:21:29<5:02:17,  4.31s/it]



Judging LMSYS Prompts:  79%|███████▉  | 15811/20000 [3:23:06<3:11:13,  2.74s/it]



Judging LMSYS Prompts:  79%|███████▉  | 15827/20000 [3:24:34<2:44:47,  2.37s/it]



Judging LMSYS Prompts:  79%|███████▉  | 15837/20000 [3:25:16<2:31:04,  2.18s/it]



Judging LMSYS Prompts:  79%|███████▉  | 15846/20000 [3:26:07<2:56:33,  2.55s/it]



Judging LMSYS Prompts:  79%|███████▉  | 15861/20000 [3:27:35<2:35:34,  2.26s/it]



Judging LMSYS Prompts:  79%|███████▉  | 15891/20000 [3:31:02<3:42:31,  3.25s/it]



Judging LMSYS Prompts:  79%|███████▉  | 15898/20000 [3:31:53<4:48:28,  4.22s/it]



Judging LMSYS Prompts:  80%|███████▉  | 15918/20000 [3:34:07<4:32:07,  4.00s/it]



Judging LMSYS Prompts:  80%|███████▉  | 15926/20000 [3:34:58<5:17:47,  4.68s/it]



Judging LMSYS Prompts:  80%|███████▉  | 15929/20000 [3:35:22<5:49:18,  5.15s/it]



Judging LMSYS Prompts:  80%|███████▉  | 15965/20000 [3:39:04<3:00:45,  2.69s/it]



Judging LMSYS Prompts:  80%|███████▉  | 15975/20000 [3:40:05<3:41:43,  3.31s/it]



Judging LMSYS Prompts:  80%|████████  | 16017/20000 [3:44:28<4:36:04,  4.16s/it]



Judging LMSYS Prompts:  80%|████████  | 16035/20000 [3:46:34<4:56:28,  4.49s/it]



Judging LMSYS Prompts:  80%|████████  | 16043/20000 [3:47:25<3:44:24,  3.40s/it]



Judging LMSYS Prompts:  80%|████████  | 16046/20000 [3:47:58<6:24:18,  5.83s/it]



Judging LMSYS Prompts:  80%|████████  | 16053/20000 [3:48:31<3:42:13,  3.38s/it]



Judging LMSYS Prompts:  80%|████████  | 16087/20000 [3:52:37<4:00:43,  3.69s/it]



Judging LMSYS Prompts:  81%|████████  | 16135/20000 [3:57:47<2:27:19,  2.29s/it]



Judging LMSYS Prompts:  81%|████████  | 16163/20000 [4:00:46<4:43:50,  4.44s/it]



Judging LMSYS Prompts:  81%|████████  | 16171/20000 [4:01:41<4:23:24,  4.13s/it]



Judging LMSYS Prompts:  81%|████████  | 16181/20000 [4:02:59<4:02:07,  3.80s/it]



Judging LMSYS Prompts:  81%|████████  | 16194/20000 [4:04:28<3:33:38,  3.37s/it]



Judging LMSYS Prompts:  81%|████████  | 16211/20000 [4:06:14<4:10:16,  3.96s/it]



Judging LMSYS Prompts:  81%|████████▏ | 16253/20000 [4:10:10<3:14:29,  3.11s/it]



Judging LMSYS Prompts:  81%|████████▏ | 16261/20000 [4:11:01<4:08:30,  3.99s/it]



Judging LMSYS Prompts:  81%|████████▏ | 16266/20000 [4:11:25<3:35:27,  3.46s/it]



Judging LMSYS Prompts:  81%|████████▏ | 16273/20000 [4:12:17<4:49:33,  4.66s/it]



Judging LMSYS Prompts:  82%|████████▏ | 16301/20000 [4:15:08<5:16:42,  5.14s/it]



Judging LMSYS Prompts:  82%|████████▏ | 16307/20000 [4:15:32<3:19:41,  3.24s/it]



Judging LMSYS Prompts:  82%|████████▏ | 16326/20000 [4:17:47<3:48:32,  3.73s/it]



Judging LMSYS Prompts:  82%|████████▏ | 16343/20000 [4:18:57<2:06:24,  2.07s/it]



Judging LMSYS Prompts:  82%|████████▏ | 16456/20000 [4:30:33<4:40:55,  4.76s/it]



Judging LMSYS Prompts:  82%|████████▏ | 16462/20000 [4:31:06<3:59:23,  4.06s/it]



Judging LMSYS Prompts:  82%|████████▏ | 16466/20000 [4:31:30<3:48:16,  3.88s/it]



Judging LMSYS Prompts:  82%|████████▏ | 16484/20000 [4:33:17<2:30:40,  2.57s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16521/20000 [4:37:23<2:42:56,  2.81s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16524/20000 [4:37:55<5:20:12,  5.53s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16537/20000 [4:38:58<2:01:02,  2.10s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16559/20000 [4:41:03<3:35:19,  3.75s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16565/20000 [4:41:36<3:34:07,  3.74s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16582/20000 [4:43:19<3:37:35,  3.82s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16598/20000 [4:45:15<3:48:01,  4.02s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16623/20000 [4:48:09<4:36:01,  4.90s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16633/20000 [4:48:51<3:06:37,  3.33s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16637/20000 [4:49:15<3:33:21,  3.81s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16655/20000 [4:50:55<3:07:58,  3.37s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16669/20000 [4:52:14<2:31:32,  2.73s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16674/20000 [4:52:47<3:06:49,  3.37s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16680/20000 [4:53:38<4:31:51,  4.91s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16687/20000 [4:54:29<3:28:32,  3.78s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16690/20000 [4:54:53<4:22:16,  4.75s/it]



Judging LMSYS Prompts:  83%|████████▎ | 16698/20000 [4:55:35<3:34:08,  3.89s/it]



Judging LMSYS Prompts:  84%|████████▎ | 16703/20000 [4:55:59<3:01:33,  3.30s/it]



Judging LMSYS Prompts:  84%|████████▎ | 16707/20000 [4:56:32<3:52:18,  4.23s/it]



Judging LMSYS Prompts:  84%|████████▎ | 16731/20000 [4:59:14<4:21:09,  4.79s/it]



Judging LMSYS Prompts:  84%|████████▍ | 16750/20000 [5:01:02<3:32:20,  3.92s/it]



Judging LMSYS Prompts:  84%|████████▍ | 16757/20000 [5:01:35<2:32:19,  2.82s/it]



Judging LMSYS Prompts:  84%|████████▍ | 16760/20000 [5:02:07<4:44:06,  5.26s/it]



Judging LMSYS Prompts:  84%|████████▍ | 16775/20000 [5:03:47<3:53:00,  4.33s/it]



Judging LMSYS Prompts:  84%|████████▍ | 16778/20000 [5:04:11<4:30:05,  5.03s/it]



Judging LMSYS Prompts:  84%|████████▍ | 16783/20000 [5:04:53<4:43:20,  5.28s/it]



Judging LMSYS Prompts:  84%|████████▍ | 16799/20000 [5:06:22<3:37:30,  4.08s/it]



Judging LMSYS Prompts:  84%|████████▍ | 16808/20000 [5:07:32<4:26:49,  5.02s/it]



Judging LMSYS Prompts:  84%|████████▍ | 16811/20000 [5:07:55<4:40:59,  5.29s/it]



Judging LMSYS Prompts:  84%|████████▍ | 16817/20000 [5:08:47<3:59:15,  4.51s/it]



Judging LMSYS Prompts:  84%|████████▍ | 16826/20000 [5:09:39<3:03:23,  3.47s/it]



Judging LMSYS Prompts:  84%|████████▍ | 16858/20000 [5:12:41<2:16:51,  2.61s/it]



Judging LMSYS Prompts:  84%|████████▍ | 16864/20000 [5:13:23<3:10:07,  3.64s/it]



Judging LMSYS Prompts:  84%|████████▍ | 16877/20000 [5:14:44<3:15:12,  3.75s/it]



Judging LMSYS Prompts:  85%|████████▍ | 16918/20000 [5:18:02<2:03:41,  2.41s/it]



Judging LMSYS Prompts:  85%|████████▍ | 16929/20000 [5:18:36<1:30:55,  1.78s/it]



Judging LMSYS Prompts:  85%|████████▍ | 16946/20000 [5:20:05<2:47:29,  3.29s/it]



Judging LMSYS Prompts:  85%|████████▍ | 16950/20000 [5:20:28<3:12:54,  3.79s/it]



Judging LMSYS Prompts:  85%|████████▍ | 16977/20000 [5:23:37<2:28:52,  2.95s/it]



Judging LMSYS Prompts:  85%|████████▍ | 16990/20000 [5:24:48<2:43:58,  3.27s/it]



Judging LMSYS Prompts:  85%|████████▌ | 17026/20000 [5:28:27<3:54:47,  4.74s/it]



Judging LMSYS Prompts:  85%|████████▌ | 17030/20000 [5:28:51<3:45:20,  4.55s/it]



Judging LMSYS Prompts:  85%|████████▌ | 17047/20000 [5:30:37<3:36:49,  4.41s/it]



Judging LMSYS Prompts:  85%|████████▌ | 17054/20000 [5:31:01<2:29:01,  3.04s/it]



Judging LMSYS Prompts:  85%|████████▌ | 17058/20000 [5:31:24<3:07:07,  3.82s/it]



Judging LMSYS Prompts:  85%|████████▌ | 17061/20000 [5:31:57<4:48:29,  5.89s/it]



Judging LMSYS Prompts:  85%|████████▌ | 17068/20000 [5:32:39<3:07:54,  3.85s/it]



Judging LMSYS Prompts:  85%|████████▌ | 17071/20000 [5:33:03<3:52:01,  4.75s/it]



Judging LMSYS Prompts:  85%|████████▌ | 17076/20000 [5:33:45<4:19:56,  5.33s/it]



Judging LMSYS Prompts:  85%|████████▌ | 17081/20000 [5:34:18<3:31:20,  4.34s/it]



Judging LMSYS Prompts:  85%|████████▌ | 17085/20000 [5:34:44<3:39:05,  4.51s/it]



Judging LMSYS Prompts:  85%|████████▌ | 17098/20000 [5:36:12<3:01:55,  3.76s/it]



Judging LMSYS Prompts:  86%|████████▌ | 17125/20000 [5:39:05<2:29:38,  3.12s/it]



Judging LMSYS Prompts:  86%|████████▌ | 17131/20000 [5:39:29<2:01:46,  2.55s/it]



Judging LMSYS Prompts:  86%|████████▌ | 17142/20000 [5:40:11<1:40:40,  2.11s/it]



Judging LMSYS Prompts:  86%|████████▌ | 17155/20000 [5:41:22<2:55:51,  3.71s/it]



Judging LMSYS Prompts:  86%|████████▌ | 17182/20000 [5:43:58<2:48:13,  3.58s/it]



Judging LMSYS Prompts:  86%|████████▌ | 17197/20000 [5:45:08<1:54:50,  2.46s/it]



Judging LMSYS Prompts:  86%|████████▌ | 17216/20000 [5:47:22<2:46:31,  3.59s/it]



Judging LMSYS Prompts:  86%|████████▋ | 17251/20000 [5:50:32<1:52:59,  2.47s/it]



Judging LMSYS Prompts:  86%|████████▋ | 17271/20000 [5:53:05<3:32:23,  4.67s/it]



Judging LMSYS Prompts:  86%|████████▋ | 17281/20000 [5:53:38<1:53:04,  2.50s/it]



Judging LMSYS Prompts:  86%|████████▋ | 17287/20000 [5:54:20<3:13:17,  4.27s/it]



Judging LMSYS Prompts:  86%|████████▋ | 17300/20000 [5:55:39<3:12:01,  4.27s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17306/20000 [5:56:12<2:28:26,  3.31s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17316/20000 [5:57:22<2:56:56,  3.96s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17322/20000 [5:57:55<2:26:06,  3.27s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17356/20000 [6:02:02<2:35:06,  3.52s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17359/20000 [6:02:26<3:21:57,  4.59s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17370/20000 [6:03:36<3:01:40,  4.14s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17387/20000 [6:05:32<2:11:11,  3.01s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17402/20000 [6:07:10<2:37:42,  3.64s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17405/20000 [6:07:34<3:22:35,  4.68s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17419/20000 [6:09:20<2:37:50,  3.67s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17429/20000 [6:10:11<2:53:59,  4.06s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17436/20000 [6:11:03<3:22:45,  4.74s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17449/20000 [6:12:22<2:38:51,  3.74s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17459/20000 [6:13:13<1:49:03,  2.58s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17466/20000 [6:13:55<2:40:01,  3.79s/it]



Judging LMSYS Prompts:  87%|████████▋ | 17471/20000 [6:14:19<2:38:03,  3.75s/it]



Judging LMSYS Prompts:  88%|████████▊ | 17531/20000 [6:20:57<2:05:01,  3.04s/it]



Judging LMSYS Prompts:  88%|████████▊ | 17540/20000 [6:21:30<1:21:39,  1.99s/it]



Judging LMSYS Prompts:  88%|████████▊ | 17551/20000 [6:22:49<1:41:44,  2.49s/it]



Judging LMSYS Prompts:  88%|████████▊ | 17564/20000 [6:24:26<2:25:59,  3.60s/it]



Judging LMSYS Prompts:  88%|████████▊ | 17572/20000 [6:25:08<2:24:34,  3.57s/it]



Judging LMSYS Prompts:  88%|████████▊ | 17583/20000 [6:26:12<2:35:38,  3.86s/it]



Judging LMSYS Prompts:  88%|████████▊ | 17597/20000 [6:27:31<2:32:13,  3.80s/it]



Judging LMSYS Prompts:  88%|████████▊ | 17680/20000 [6:35:46<1:43:41,  2.68s/it]



Judging LMSYS Prompts:  88%|████████▊ | 17693/20000 [6:36:56<1:18:44,  2.05s/it]



Judging LMSYS Prompts:  89%|████████▊ | 17719/20000 [6:39:55<1:53:51,  3.00s/it]



Judging LMSYS Prompts:  89%|████████▊ | 17738/20000 [6:41:52<2:20:33,  3.73s/it]



Judging LMSYS Prompts:  89%|████████▊ | 17749/20000 [6:43:20<2:52:06,  4.59s/it]



Judging LMSYS Prompts:  89%|████████▉ | 17757/20000 [6:43:44<1:40:38,  2.69s/it]



Judging LMSYS Prompts:  89%|████████▉ | 17763/20000 [6:44:26<2:43:51,  4.39s/it]



Judging LMSYS Prompts:  89%|████████▉ | 17767/20000 [6:44:50<2:45:46,  4.45s/it]



Judging LMSYS Prompts:  89%|████████▉ | 17774/20000 [6:45:41<2:30:39,  4.06s/it]



Judging LMSYS Prompts:  89%|████████▉ | 17790/20000 [6:47:21<2:15:12,  3.67s/it]



Judging LMSYS Prompts:  89%|████████▉ | 17793/20000 [6:47:45<2:55:45,  4.78s/it]



Judging LMSYS Prompts:  89%|████████▉ | 17847/20000 [6:53:26<2:20:24,  3.91s/it]



Judging LMSYS Prompts:  89%|████████▉ | 17850/20000 [6:53:49<2:53:13,  4.83s/it]



Judging LMSYS Prompts:  89%|████████▉ | 17864/20000 [6:55:29<1:40:22,  2.82s/it]



Judging LMSYS Prompts:  89%|████████▉ | 17870/20000 [6:56:11<2:13:50,  3.77s/it]



Judging LMSYS Prompts:  89%|████████▉ | 17876/20000 [6:56:35<1:50:58,  3.14s/it]



Judging LMSYS Prompts:  89%|████████▉ | 17879/20000 [6:56:58<2:32:56,  4.33s/it]



Judging LMSYS Prompts:  89%|████████▉ | 17894/20000 [6:58:18<1:46:06,  3.02s/it]



Judging LMSYS Prompts:  90%|████████▉ | 17900/20000 [6:59:00<2:23:43,  4.11s/it]



Judging LMSYS Prompts:  90%|████████▉ | 17943/20000 [7:03:52<2:03:00,  3.59s/it]



Judging LMSYS Prompts:  90%|████████▉ | 17972/20000 [7:06:54<2:28:04,  4.38s/it]



Judging LMSYS Prompts:  90%|████████▉ | 17994/20000 [7:09:45<1:58:37,  3.55s/it]



Judging LMSYS Prompts:  90%|█████████ | 18017/20000 [7:12:08<1:51:44,  3.38s/it]



Judging LMSYS Prompts:  90%|█████████ | 18044/20000 [7:14:32<1:51:45,  3.43s/it]



Judging LMSYS Prompts:  90%|█████████ | 18052/20000 [7:15:23<1:58:59,  3.67s/it]



Judging LMSYS Prompts:  90%|█████████ | 18071/20000 [7:17:41<2:27:33,  4.59s/it]



Judging LMSYS Prompts:  90%|█████████ | 18082/20000 [7:18:51<2:03:49,  3.87s/it]



Judging LMSYS Prompts:  90%|█████████ | 18090/20000 [7:19:44<2:10:45,  4.11s/it]



Judging LMSYS Prompts:  91%|█████████ | 18103/20000 [7:20:44<1:48:30,  3.43s/it]



Judging LMSYS Prompts:  91%|█████████ | 18106/20000 [7:21:08<2:22:38,  4.52s/it]



Judging LMSYS Prompts:  91%|█████████ | 18120/20000 [7:22:18<1:18:08,  2.49s/it]



Judging LMSYS Prompts:  91%|█████████ | 18144/20000 [7:24:43<1:47:25,  3.47s/it]



Judging LMSYS Prompts:  91%|█████████ | 18156/20000 [7:26:02<1:58:00,  3.84s/it]



Judging LMSYS Prompts:  91%|█████████ | 18164/20000 [7:26:53<1:41:42,  3.32s/it]



Judging LMSYS Prompts:  91%|█████████ | 18173/20000 [7:28:12<2:26:03,  4.80s/it]



Judging LMSYS Prompts:  91%|█████████ | 18178/20000 [7:28:44<2:03:16,  4.06s/it]



Judging LMSYS Prompts:  91%|█████████ | 18183/20000 [7:29:17<1:58:38,  3.92s/it]



Judging LMSYS Prompts:  91%|█████████ | 18197/20000 [7:31:05<2:09:25,  4.31s/it]



Judging LMSYS Prompts:  91%|█████████ | 18242/20000 [7:36:01<2:02:14,  4.17s/it]



Judging LMSYS Prompts:  91%|█████████ | 18249/20000 [7:36:43<2:12:44,  4.55s/it]



Judging LMSYS Prompts:  91%|█████████▏| 18254/20000 [7:37:16<2:07:49,  4.39s/it]



Judging LMSYS Prompts:  91%|█████████▏| 18260/20000 [7:37:58<1:53:12,  3.90s/it]



Judging LMSYS Prompts:  91%|█████████▏| 18267/20000 [7:38:59<2:18:26,  4.79s/it]



Judging LMSYS Prompts:  91%|█████████▏| 18282/20000 [7:40:47<2:17:45,  4.81s/it]



Judging LMSYS Prompts:  91%|█████████▏| 18295/20000 [7:42:26<1:52:08,  3.95s/it]



Judging LMSYS Prompts:  92%|█████████▏| 18312/20000 [7:44:13<2:02:50,  4.37s/it]



Judging LMSYS Prompts:  92%|█████████▏| 18316/20000 [7:44:37<1:58:14,  4.21s/it]



Judging LMSYS Prompts:  92%|█████████▏| 18325/20000 [7:45:28<1:45:56,  3.79s/it]



Judging LMSYS Prompts:  92%|█████████▏| 18334/20000 [7:46:39<1:54:54,  4.14s/it]



Judging LMSYS Prompts:  92%|█████████▏| 18378/20000 [7:50:47<1:42:31,  3.79s/it]



Judging LMSYS Prompts:  92%|█████████▏| 18420/20000 [7:54:26<1:30:59,  3.46s/it]



Judging LMSYS Prompts:  92%|█████████▏| 18430/20000 [7:55:17<1:24:16,  3.22s/it]



Judging LMSYS Prompts:  92%|█████████▏| 18441/20000 [7:56:27<1:41:18,  3.90s/it]



Judging LMSYS Prompts:  92%|█████████▏| 18459/20000 [7:58:14<52:26,  2.04s/it]  



Judging LMSYS Prompts:  92%|█████████▏| 18462/20000 [7:58:47<2:00:18,  4.69s/it]



Judging LMSYS Prompts:  93%|█████████▎| 18516/20000 [8:04:15<1:08:19,  2.76s/it]



Judging LMSYS Prompts:  93%|█████████▎| 18525/20000 [8:05:34<2:09:28,  5.27s/it]



Judging LMSYS Prompts:  93%|█████████▎| 18530/20000 [8:06:16<2:12:18,  5.40s/it]



Judging LMSYS Prompts:  93%|█████████▎| 18536/20000 [8:07:01<1:56:00,  4.75s/it]



Judging LMSYS Prompts:  93%|█████████▎| 18539/20000 [8:07:25<2:05:34,  5.16s/it]



Judging LMSYS Prompts:  93%|█████████▎| 18559/20000 [8:09:30<59:37,  2.48s/it]  



Judging LMSYS Prompts:  93%|█████████▎| 18564/20000 [8:10:03<1:26:49,  3.63s/it]



Judging LMSYS Prompts:  93%|█████████▎| 18625/20000 [8:16:36<1:46:11,  4.63s/it]



Judging LMSYS Prompts:  93%|█████████▎| 18639/20000 [8:18:41<2:02:01,  5.38s/it]



Judging LMSYS Prompts:  93%|█████████▎| 18657/20000 [8:20:28<1:15:32,  3.38s/it]



Judging LMSYS Prompts:  94%|█████████▎| 18704/20000 [8:24:54<1:27:28,  4.05s/it]



Judging LMSYS Prompts:  94%|█████████▎| 18724/20000 [8:26:32<43:16,  2.03s/it]



Judging LMSYS Prompts:  94%|█████████▎| 18727/20000 [8:27:04<1:43:19,  4.87s/it]



Judging LMSYS Prompts:  94%|█████████▎| 18744/20000 [8:28:06<32:39,  1.56s/it]  



Judging LMSYS Prompts:  94%|█████████▍| 18757/20000 [8:29:16<1:09:18,  3.35s/it]



Judging LMSYS Prompts:  94%|█████████▍| 18782/20000 [8:31:21<56:46,  2.80s/it]  



Judging LMSYS Prompts:  94%|█████████▍| 18819/20000 [8:34:43<1:08:05,  3.46s/it]



Judging LMSYS Prompts:  94%|█████████▍| 18832/20000 [8:36:03<59:31,  3.06s/it]  



Judging LMSYS Prompts:  94%|█████████▍| 18836/20000 [8:36:27<1:15:39,  3.90s/it]



Judging LMSYS Prompts:  94%|█████████▍| 18847/20000 [8:37:28<47:59,  2.50s/it]  



Judging LMSYS Prompts:  94%|█████████▍| 18863/20000 [8:38:40<47:50,  2.53s/it]



Judging LMSYS Prompts:  94%|█████████▍| 18868/20000 [8:39:04<56:53,  3.02s/it]  



Judging LMSYS Prompts:  94%|█████████▍| 18886/20000 [8:41:01<1:00:30,  3.26s/it]



Judging LMSYS Prompts:  95%|█████████▍| 18902/20000 [8:42:40<1:04:01,  3.50s/it]



Judging LMSYS Prompts:  95%|█████████▍| 18953/20000 [8:48:00<1:10:58,  4.07s/it]



Judging LMSYS Prompts:  95%|█████████▍| 18966/20000 [8:49:20<1:12:38,  4.22s/it]



Judging LMSYS Prompts:  95%|█████████▍| 18977/20000 [8:50:39<1:16:45,  4.50s/it]



Judging LMSYS Prompts:  95%|█████████▍| 18993/20000 [8:52:26<1:13:22,  4.37s/it]



Judging LMSYS Prompts:  95%|█████████▍| 18999/20000 [8:53:00<55:38,  3.33s/it]  



Judging LMSYS Prompts:  95%|█████████▌| 19010/20000 [8:54:10<58:13,  3.53s/it]  



Judging LMSYS Prompts:  95%|█████████▌| 19014/20000 [8:54:43<1:11:28,  4.35s/it]



Judging LMSYS Prompts:  95%|█████████▌| 19022/20000 [8:55:34<43:40,  2.68s/it]  



Judging LMSYS Prompts:  95%|█████████▌| 19026/20000 [8:56:16<1:22:57,  5.11s/it]



Judging LMSYS Prompts:  95%|█████████▌| 19034/20000 [8:56:58<1:04:32,  4.01s/it]



Judging LMSYS Prompts:  95%|█████████▌| 19038/20000 [8:57:31<1:17:22,  4.83s/it]



Judging LMSYS Prompts:  95%|█████████▌| 19049/20000 [8:58:14<34:45,  2.19s/it]  



Judging LMSYS Prompts:  95%|█████████▌| 19055/20000 [8:59:06<1:11:52,  4.56s/it]



Judging LMSYS Prompts:  95%|█████████▌| 19067/20000 [9:00:25<1:05:10,  4.19s/it]



Judging LMSYS Prompts:  96%|█████████▌| 19105/20000 [9:03:57<24:57,  1.67s/it]  



Judging LMSYS Prompts:  96%|█████████▌| 19117/20000 [9:04:59<37:01,  2.52s/it]



Judging LMSYS Prompts:  96%|█████████▌| 19138/20000 [9:06:45<35:42,  2.49s/it]



Judging LMSYS Prompts:  96%|█████████▌| 19206/20000 [9:13:48<1:05:39,  4.96s/it]



Judging LMSYS Prompts:  96%|█████████▌| 19225/20000 [9:15:25<41:38,  3.22s/it]  



Judging LMSYS Prompts:  96%|█████████▌| 19243/20000 [9:17:15<41:30,  3.29s/it]



Judging LMSYS Prompts:  96%|█████████▋| 19272/20000 [9:20:27<54:37,  4.50s/it]  



Judging LMSYS Prompts:  96%|█████████▋| 19294/20000 [9:22:51<31:21,  2.67s/it]  



Judging LMSYS Prompts:  96%|█████████▋| 19297/20000 [9:23:15<48:12,  4.11s/it]  



Judging LMSYS Prompts:  97%|█████████▋| 19312/20000 [9:24:44<41:49,  3.65s/it]  



Judging LMSYS Prompts:  97%|█████████▋| 19350/20000 [9:28:34<47:14,  4.36s/it]  



Judging LMSYS Prompts:  97%|█████████▋| 19360/20000 [9:29:25<29:08,  2.73s/it]



Judging LMSYS Prompts:  97%|█████████▋| 19367/20000 [9:30:26<50:31,  4.79s/it]  


THE PROMPT IS'. Defaulting to FALSE.


Judging LMSYS Prompts:  97%|█████████▋| 19368/20000 [9:30:26<38:40,  3.67s/it]



Judging LMSYS Prompts:  97%|█████████▋| 19391/20000 [9:33:00<23:37,  2.33s/it]



Judging LMSYS Prompts:  97%|█████████▋| 19401/20000 [9:34:02<32:42,  3.28s/it]



Judging LMSYS Prompts:  97%|█████████▋| 19412/20000 [9:35:31<48:59,  5.00s/it]  



Judging LMSYS Prompts:  97%|█████████▋| 19419/20000 [9:36:13<39:45,  4.11s/it]



Judging LMSYS Prompts:  97%|█████████▋| 19493/20000 [9:42:59<20:00,  2.37s/it]



Judging LMSYS Prompts:  97%|█████████▋| 19498/20000 [9:43:32<27:15,  3.26s/it]



Judging LMSYS Prompts:  98%|█████████▊| 19503/20000 [9:44:16<35:01,  4.23s/it]



Judging LMSYS Prompts:  98%|█████████▊| 19513/20000 [9:45:26<25:30,  3.14s/it]



Judging LMSYS Prompts:  98%|█████████▊| 19519/20000 [9:46:04<31:13,  3.90s/it]



Judging LMSYS Prompts:  98%|█████████▊| 19527/20000 [9:47:14<40:12,  5.10s/it]



Judging LMSYS Prompts:  98%|█████████▊| 19573/20000 [9:52:43<35:00,  4.92s/it]



Judging LMSYS Prompts:  98%|█████████▊| 19594/20000 [9:55:16<32:41,  4.83s/it]



Judging LMSYS Prompts:  98%|█████████▊| 19603/20000 [9:56:17<23:44,  3.59s/it]



Judging LMSYS Prompts:  98%|█████████▊| 19607/20000 [9:56:41<24:04,  3.67s/it]



Judging LMSYS Prompts:  98%|█████████▊| 19615/20000 [9:57:14<17:28,  2.72s/it]



Judging LMSYS Prompts:  98%|█████████▊| 19622/20000 [9:57:57<16:29,  2.62s/it]



Judging LMSYS Prompts:  98%|█████████▊| 19663/20000 [10:02:06<14:17,  2.54s/it]



Judging LMSYS Prompts:  98%|█████████▊| 19672/20000 [10:02:57<16:48,  3.07s/it]



Judging LMSYS Prompts:  98%|█████████▊| 19690/20000 [10:04:53<21:38,  4.19s/it]



Judging LMSYS Prompts:  99%|█████████▊| 19703/20000 [10:06:17<11:09,  2.25s/it]



Judging LMSYS Prompts:  99%|█████████▊| 19721/20000 [10:07:56<15:28,  3.33s/it]



Judging LMSYS Prompts:  99%|█████████▊| 19743/20000 [10:10:39<18:38,  4.35s/it]



Judging LMSYS Prompts:  99%|█████████▊| 19748/20000 [10:11:03<13:40,  3.25s/it]



Judging LMSYS Prompts:  99%|█████████▉| 19765/20000 [10:12:04<08:39,  2.21s/it]



Judging LMSYS Prompts:  99%|█████████▉| 19773/20000 [10:12:55<14:58,  3.96s/it]



Judging LMSYS Prompts:  99%|█████████▉| 19781/20000 [10:13:31<08:48,  2.41s/it]



Judging LMSYS Prompts:  99%|█████████▉| 19816/20000 [10:17:28<13:48,  4.50s/it]



Judging LMSYS Prompts:  99%|█████████▉| 19826/20000 [10:17:52<06:24,  2.21s/it]



Judging LMSYS Prompts:  99%|█████████▉| 19836/20000 [10:19:02<11:39,  4.27s/it]



Judging LMSYS Prompts:  99%|█████████▉| 19858/20000 [10:21:56<12:22,  5.23s/it]



Judging LMSYS Prompts:  99%|█████████▉| 19862/20000 [10:22:20<09:35,  4.17s/it]



Judging LMSYS Prompts:  99%|█████████▉| 19874/20000 [10:23:30<09:27,  4.51s/it]



Judging LMSYS Prompts:  99%|█████████▉| 19889/20000 [10:24:50<05:20,  2.89s/it]



Judging LMSYS Prompts:  99%|█████████▉| 19897/20000 [10:25:41<07:30,  4.37s/it]



Judging LMSYS Prompts: 100%|█████████▉| 19920/20000 [10:27:56<03:52,  2.91s/it]



Judging LMSYS Prompts: 100%|█████████▉| 19959/20000 [10:31:46<01:57,  2.85s/it]



Judging LMSYS Prompts: 100%|█████████▉| 19974/20000 [10:33:15<01:18,  3.03s/it]



Judging LMSYS Prompts: 100%|██████████| 20000/20000 [10:35:58<00:00,  1.91s/it]



--- Judging Complete ---
Total prompts judged: 20000
  -> Truly Benign: 14066
  -> Flagged Malicious: 351
  -> Discarded (Non-Eng/Error): 5583
Successfully saved 4000 LMSYS Train prompts to /content/drive/MyDrive/266-final-project-data/lmsys_train_pool.csv
Successfully saved 1000 LMSYS Val prompts to /content/drive/MyDrive/266-final-project-data/lmsys_val_pool.csv
Successfully saved 8000 LMSYS Holdout prompts to /content/drive/MyDrive/266-final-project-data/lmsys_holdout.csv
Successfully saved 351 Flagged Malicious prompts to /content/drive/MyDrive/266-final-project-data/lmsys_judged_malicious.csv
Successfully saved 5583 Discarded prompts to /content/drive/MyDrive/266-final-project-data/lmsys_judged_discarded.csv
