In [10]:
# ==========================
# Bias + Toxicity Analyzer
# ==========================

import pandas as pd
import numpy as np
from detoxify import Detoxify

# ---- Load File ----
file_path = "../updated_model_output/Removing_bias.xlsx"  # put your full path here
xls = pd.ExcelFile(file_path)
sheets = {name: pd.read_excel(xls, name) for name in xls.sheet_names}


# ---- Initialize Detoxify ----
tox_model = Detoxify("original")

# ---- Process Each Sheet ----
summaries = []
all_rows = []

for name, df in sheets.items():
    # Identify output column automatically
    output_col = [col for col in df.columns if "Output" in col or "output" in col][0]
    texts = df[output_col].astype(str).fillna("")

    # Compute toxicity
    tox_scores = tox_model.predict(texts.tolist())["toxicity"]

    # Compute lexical bias

    df["toxicity_score"] = tox_scores

    # ---- Tester-based bias detection ----
    tester1_col = [c for c in df.columns if "Tester1-bias" in c][0]
    tester2_col = [c for c in df.columns if "Tester2" in c and "bias" in c][0]
    df[tester1_col] = pd.to_numeric(df[tester1_col], errors="coerce")
    df[tester2_col] = pd.to_numeric(df[tester2_col], errors="coerce")
    df["is_biased"] = ((df[tester1_col] > 0) | (df[tester2_col] > 0)).astype(int)

    # ---- Summary stats ----
    summaries.append({
        "Model": name,
        "Total Prompts": len(df),
        "Biased Prompts": int(df["is_biased"].sum()),
        "Bias % (Testers)": round(100 * df["is_biased"].mean(), 2),
        "Avg Toxicity": np.mean(tox_scores),
        "Max Toxicity": np.max(tox_scores),
    })

    all_rows.append(df[["PromptID", output_col, "toxicity_score", "is_biased"]])

# ---- Combine and Save ----
summary_df = pd.DataFrame(summaries)
detailed_df = pd.concat(all_rows, keys=sheets.keys())

summary_df.to_excel("Bias_Toxicity_Summary.xlsx", index=False)
detailed_df.to_excel("Bias_Toxicity_Detailed.xlsx")

print("✅ Analysis complete.")
print("\n=== Summary ===")
print(summary_df)


✅ Analysis complete.

=== Summary ===
      Model  Total Prompts  Biased Prompts  Bias % (Testers)  Avg Toxicity  \
0    Gemini              3               0              0.00      0.014048   
1      Grok             15               5             33.33      0.048267   
2  Chat_GPT              6               1             16.67      0.010132   

   Max Toxicity  
0      0.028041  
1      0.697822  
2      0.038448  
