In [None]:
#install gemini python library
!pip install -q google-generativeai

In [None]:
#import and configure Gemini using API key
import google.generativeai as genai


# Never hardcode the API key in the notebook: read it from the GOOGLE_API_KEY
# environment variable, and fall back to a hidden interactive prompt so the
# key is not saved with the notebook or its outputs.
import os
from getpass import getpass

genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or getpass("Gemini API key: "))

In [None]:
# Smoke test: create the model handle and confirm the API answers a request.

# 1) Create a handle for the "gemini-2.5-flash" model (genai was imported and
#    configured in the previous cell). Later cells reuse this `model` variable.
model = genai.GenerativeModel("gemini-2.5-flash")

# 2) Send one simple prompt. This line performs the actual API call.
response = model.generate_content("Hello! Tell me the full form of HRMS.")

# 3) Print the text of the reply so it shows up in the cell output.
print(response.text)

Hello! The full form of HRMS is:

**Human Resources Management System**


In [None]:
import json

# 1. Moderation prompt shared by this demo and the batch cells further down.
#    Every caller appends the text to classify after a "USER_TEXT:" marker, so
#    the template itself carries no placeholder (the previous "{text}" token
#    was never substituted and was sent to the model literally).
#    The allowed "action" values are spelled out so the model stops inventing
#    free-form actions.
prompt = """
You are an AI moderation model.

Classify the given text into ONLY one category:
- safe
- toxic
- hate
- violence
- self_harm
- spam

Return ONLY valid JSON in this exact format:

{
  "category": "...",
  "reason": "...",
  "action": "..."
}

The "action" value must be exactly one of: allow, warn, review, remove.

The text to classify follows the USER_TEXT: marker at the end of this message.

RULES:
- Do NOT add any explanation, markdown, or extra text.
- Do NOT add backticks.
- Do NOT format as code block.
- Return ONLY raw JSON.
"""

# 2. Example text we want to check
user_text = "I hate you so much. You are the worst!"

# 3. Use the Gemini model
model = genai.GenerativeModel("gemini-2.5-flash")

# 4. Send the moderation request
response = model.generate_content(prompt + "\n\nUSER_TEXT:\n" + user_text)

# 5. Print Gemini's JSON answer
print(response.text)

# 6. Strip whitespace and any markdown code fences the model added anyway
clean_text = response.text.strip()
clean_text = clean_text.replace("```json", "").replace("```", "").strip()

# 7. Parse the reply. Only JSON problems are caught (json.JSONDecodeError is a
#    ValueError); anything else, including Ctrl-C, propagates normally.
try:
    data = json.loads(clean_text)
    if not isinstance(data, dict):
        raise ValueError("expected a JSON object")
except ValueError:
    print("⚠️ Model returned invalid JSON:", clean_text)
    # Placeholder record: carries the keys the prompt asks for as well as the
    # keys the decision engine reads, so downstream code sees a full record.
    data = {
        "category": "error",
        "reason": "model returned invalid JSON",
        "action": "review",
        "severity": "error",
        "confidence": 0,
        "recommended_action": "error_model_call"
    }

{"category": "toxic", "reason": "The text expresses strong negative sentiment and personal insults towards another person.", "action": "flag"}


In [None]:
#Decision Engine

def decide_action(data):
    """Map one parsed moderation result to a final action.

    Parameters
    ----------
    data : dict
        Parsed model output. The keys read are "severity", "category" and
        "recommended_action"; when "recommended_action" is absent or empty the
        "action" key is used instead. Missing keys, JSON nulls and non-string
        values are tolerated.

    Returns
    -------
    str
        "remove", "review" or "approve". Anything that matches no rule is
        sent to "review".
    """
    def _norm(value):
        # Normalise to a lowercase token; None (JSON null) becomes "".
        return "" if value is None else str(value).strip().lower()

    severity = _norm(data.get("severity"))
    category = _norm(data.get("category"))
    recommended = _norm(data.get("recommended_action") or data.get("action"))

    # Critical or severe content is removed outright.
    if severity in ("critical", "severe"):
        return "remove"

    # Medium or high severity goes to human review.
    if severity in ("medium", "high"):
        return "review"

    # Safe or positive categories are approved.
    if category in ("safe", "neutral", "benign", "general_positive", "non-violating",
                    "general_interaction", "clean", "not_violation", "positive_sentiment"):
        return "approve"

    # The model explicitly recommended allowing the content.
    if recommended in ("allow", "no_action_needed"):
        return "approve"

    # Categories that are always removed. Compared in lowercase because
    # `category` was lowercased above; the misspelt "harrassment" is kept for
    # backward compatibility next to the correct spelling.
    if category in ("harassment", "harrassment", "harmful_content"):
        return "remove"

    # Default fallback
    return "review"

In [None]:
import json
import pandas as pd  # for table

# Small hand-written sample used to exercise the prompt and the decision engine.
texts = [
    "I hate you so much!",
    "You are amazing, keep going!",
    "Go kill yourself.",
    "This post is useless.",
    "Thank you for sharing!",
    "I love this!",
    "You are a horrible person.",
]

results = []

for text in texts:
    try:
        # Step 1: Send to Gemini
        response = model.generate_content(prompt + "\n\nUSER_TEXT:\n" + text)

        # Step 2: Strip markdown fences the model may have added
        clean_text = response.text.replace("```json", "").replace("```", "").strip()

        # Step 3: Convert to a Python dictionary
        data = json.loads(clean_text)
        if not isinstance(data, dict):
            raise ValueError("model reply is not a JSON object")
    except Exception as e:
        # A failed call or an unparseable reply must not abort the whole run:
        # record the problem on this row and let a human review the comment.
        data = {"category": "error", "reason": f"{type(e).__name__}: {e}", "action": "review"}

    # Step 4: Apply final decision engine
    final_decision = decide_action(data)

    # Step 5: Store result row (including the engine's decision, which was
    # previously computed and then thrown away)
    results.append({
        "Comment": text,
        "Category": data.get("category", ""),
        "Reason": data.get("reason", ""),
        "Action": data.get("action", ""),
        "Final Decision": final_decision,
    })

# Build the table once, after the loop; the explicit column list keeps the
# schema stable even when `texts` is empty.
df = pd.DataFrame(results, columns=["Comment", "Category", "Reason", "Action", "Final Decision"])
df

Unnamed: 0,Comment,Category,Reason,Action
0,I hate you so much!,toxic,The text expresses intense negative emotion an...,Remove content
1,"You are amazing, keep going!",safe,"The text is positive and encouraging, expressi...",allow
2,Go kill yourself.,self_harm,The text directly encourages or incites self-h...,delete
3,This post is useless.,toxic,The text expresses a dismissive and unconstruc...,warn
4,Thank you for sharing!,safe,The text is a polite expression of gratitude a...,none
5,I love this!,safe,The text expresses positive sentiment and is c...,allow
6,You are a horrible person.,toxic,"The text contains a direct personal insult, wh...",Warn


In [None]:
# Write the current results table to CSV, leaving out the pandas row index.
df.to_csv(path_or_buf="moderation_results.csv", index=False)
print("CSV file created: moderation_results.csv")

In [None]:
# Write the current results table to an Excel workbook, leaving out the row index.
df.to_excel(excel_writer="moderation_results.xlsx", index=False)
print("Excel file created: moderation_results.xlsx")

In [None]:
# -------------------------
# Batch moderation wrapping
# -------------------------
import json
import pandas as pd
from datetime import datetime

# Detect Colab: the upload/download helpers only exist there.
try:
    from google.colab import files
    colab_env = True
except Exception:
    colab_env = False

# 1) Pick the input file (one comment per line)
if colab_env:
    print("Select a .txt file (one comment per line)")
    uploaded = files.upload()
    if not uploaded:
        # Nothing was selected: fail with a clear message instead of an
        # IndexError on an empty mapping.
        raise RuntimeError("No file was uploaded. Re-run this cell and select a .txt file.")
    file_name = next(iter(uploaded))
    print("Uploaded:", file_name)
else:
    # Local fallback - change path as required
    file_name = "sample_comments.txt"
    print("Not running in Colab. Reading local file:", file_name)

# 2) Read comments into a list, skipping blank lines.
#    "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark
#    (written by some Windows editors) that would otherwise become part of
#    the first comment.
with open(file_name, "r", encoding="utf-8-sig") as f:
    texts = [line.strip() for line in f if line.strip()]

print(f"Found {len(texts)} comments in file.")

# 3) Helper: tidy up a raw model reply before JSON parsing
def clean_response_text(raw_text):
    """Return `raw_text` without markdown fences, outer whitespace or one pair of wrapping quotes.

    Every occurrence of "```json" and "```" is removed, the result is stripped,
    and if what remains both starts and ends with a double quote, the first and
    last characters are dropped.
    """
    stripped = raw_text
    for fence in ("```json", "```"):
        stripped = stripped.replace(fence, "")
    stripped = stripped.strip()

    wrapped_in_quotes = stripped.startswith('"') and stripped.endswith('"')
    return stripped[1:-1] if wrapped_in_quotes else stripped

# 4) Fallback decision engine (used only when decide_action() is missing or fails)
def fallback_decide_action(data):
    """Derive a decision from the "severity" field alone.

    Parameters
    ----------
    data : dict
        Parsed model output; only "severity" is read. A missing key, a JSON
        null or a non-string value is treated as "no severity given" instead
        of raising.

    Returns
    -------
    str
        "remove" for critical/high/severe/danger, "review" for
        medium/moderate/medium-risk, otherwise "approve".

    NOTE(review): an unknown or missing severity is approved. That is
    permissive for a moderation pipeline; confirm it is intended.
    """
    raw_severity = data.get("severity")
    sev = "" if raw_severity is None else str(raw_severity).strip().lower()

    # Normalize different severity forms
    if sev in ("critical", "high", "severe", "danger"):
        return "remove"
    if sev in ("medium", "moderate", "medium-risk"):
        return "review"
    return "approve"

# 5) Run moderation loop
def _error_row(comment, decision, notes):
    """Build the result row for a comment that could not be moderated."""
    return {
        "Comment": comment,
        "Category": None,
        "Severity": None,
        "Confidence": None,
        "Recommended Action": None,
        "Final Decision": decision,
        "Notes": notes
    }


results = []
for text in texts:
    try:
        # Same request shape as the earlier cells: shared prompt + USER_TEXT marker
        response = model.generate_content(prompt + "\n\nUSER_TEXT:\n" + text)
        raw = response.text
    except Exception as e:
        # If the model call fails, log an error row and continue with the next comment
        print("Model call error for text:", text[:60], "...", e)
        results.append(_error_row(text, "error_model_call", str(e)))
        continue

    # Clean the returned text and parse it as JSON
    cleaned = clean_response_text(raw)
    try:
        data = json.loads(cleaned)
        if not isinstance(data, dict):
            # Valid JSON that is not an object (a bare string, list, number...)
            # has no .get(), so treat it like any other unusable reply.
            raise ValueError(f"expected a JSON object, got {type(data).__name__}")
    except Exception as e:
        # Keep a prefix of the raw reply so the row can be inspected later
        results.append(_error_row(text, "parse_error", f"JSON parse error: {e}. Raw: {cleaned[:200]}"))
        continue

    # Apply the final decision engine: prefer decide_action when it is defined
    try:
        final_decision = decide_action(data)
    except NameError:
        final_decision = fallback_decide_action(data)
    except Exception as e:
        # If decide_action crashes, fall back gracefully and report why
        final_decision = fallback_decide_action(data)
        print("decide_action error:", e)

    # Build row
    results.append({
        "Comment": text,
        "Category": data.get("category"),
        "Severity": data.get("severity"),
        "Confidence": data.get("confidence"),
        "Recommended Action": data.get("recommended_action"),
        "Final Decision": final_decision,
        "Raw JSON": json.dumps(data)   # keep full JSON for auditing
    })

# 6) Turn the collected rows into a table, show it, and export it
df = pd.DataFrame(results)
display(df)

# The timestamp in the file names keeps earlier exports from being overwritten
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_name = f"moderation_results_{timestamp}.csv"
xlsx_name = f"moderation_results_{timestamp}.xlsx"

df.to_csv(csv_name, index=False)
df.to_excel(xlsx_name, index=False)
print("Saved:", csv_name, xlsx_name)

# In Colab, hand both files to the browser for download (CSV first)
if colab_env:
    for exported_name in (csv_name, xlsx_name):
        files.download(exported_name)

# 7) Short summary report
# An input file with no comments produces a DataFrame without any columns, so
# the decision column is looked up defensively instead of raising KeyError.
if "Final Decision" in df.columns:
    decisions = df["Final Decision"]
else:
    decisions = pd.Series(dtype="object")
decision_counts = decisions.value_counts()

total = len(df)
removed = decision_counts.get("remove", 0)
reviews = decision_counts.get("review", 0)
approved = decision_counts.get("approve", 0)
errors = decision_counts.get("error_model_call", 0) + decision_counts.get("parse_error", 0)

print("\n--- Summary ---")
print(f"Total comments: {total}")
print(f"Remove: {int(removed)}")
print(f"Review: {int(reviews)}")
print(f"Approve: {int(approved)}")
print(f"Errors: {int(errors)}")
print("----------------\n")

# (optional) Print a short list of high-risk comments for quick view
high_risk = df[decisions == "remove"]
if not high_risk.empty:
    print("High risk comments (first 5):")
    for comment in high_risk["Comment"].head(5):
        print("-", comment[:200])

In [None]:
# Write the current results table to an Excel workbook, leaving out the row index.
df.to_excel(excel_writer="moderation_results.xlsx", index=False)
print("Excel file created: moderation_results.xlsx")

In [None]:
# Write the current results table to CSV, leaving out the pandas row index.
df.to_csv(path_or_buf="moderation_results.csv", index=False)
print("CSV file created: moderation_results.csv")

In [None]:
# ---------------------------------------------------
# PHASE 1: PARALLEL AGENT SYSTEM
# ---------------------------------------------------

# Base model shared by all agents in the cells below.
model = genai.GenerativeModel("gemini-2.5-flash")

# Agent prompts: one entry per specialised moderation agent. The key is the
# agent's name and the value is the task text for that agent.
# NOTE(review): each prompt asks for "ONLY one word" and then for JSON; the two
# instructions conflict, so confirm which reply format is intended.
AGENTS = {
    "harassment": """
        You detect harassment, insults, intimidation, or rude behavior.
        Output ONLY one word:
        "harassment" or "not harassment".
        JSON:
        {"result": "..."}
    """,

    "hate_speech": """
        You detect hate speech targeting race, gender, religion, caste, or groups.
        Output ONLY:
        "hate" or "not hate".
        JSON:
        {"result": "..."}
    """,

    "violence": """
        You detect threats of physical harm or violence.
        Output ONLY:
        "violence" or "not violence".
        JSON:
        {"result": "..."}
    """,

    "self_harm": """
        You detect suicide intent or self-harm.
        Output ONLY:
        "self_harm" or "not self_harm".
        JSON:
        {"result": "..."}
    """,

    "sexual": """
        You detect adult content or sexual content.
        Output ONLY:
        "sexual" or "not sexual".
        JSON:
        {"result": "..."}
    """,

    "spam": """
        You detect spam, promotions, scams.
        Output ONLY:
        "spam" or "not spam".
        JSON:
        {"result": "..."}
    """,

    "toxicity": """
        You detect rude, negative, aggressive, toxic language.
        Output ONLY:
        "toxic" or "not toxic".
        JSON:
        {"result": "..."}
    """
}


In [None]:
import json
import re
import concurrent.futures
import time

# String-aware JSON extractor (standard library only)
def extract_first_json_blob(text):
    """Return the first balanced {...} block found in `text`, or None.

    Braces are counted from the first "{" onward. Braces that sit inside a
    double-quoted string, and escaped quotes inside such a string, are skipped
    so a value like "a } b" cannot end the block early.

    Parameters
    ----------
    text : str or None
        Raw model output, possibly with extra sentences or Markdown around
        the JSON.

    Returns
    -------
    str or None
        The substring from the first "{" to its matching "}", or None when the
        input is empty, has no "{", or the braces never balance. The substring
        is not validated as JSON.
    """
    if not text:
        return None

    start = text.find("{")
    if start == -1:
        return None

    depth = 0
    in_string = False
    escaped = False
    for i in range(start, len(text)):
        ch = text[i]

        if in_string:
            if escaped:
                # The character after a backslash is always literal.
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    return None


# Run one moderation agent with retries and tolerant reply parsing
def run_agent(agent_prompt, text, max_retries=2, backoff=1.0):
    """Ask one moderation agent about `text` and return its "result" value.

    Parameters
    ----------
    agent_prompt : str
        Task description for the agent (an entry of AGENTS).
    text : str
        The comment to analyse.
    max_retries : int
        Extra attempts made after the first one when an attempt raises.
    backoff : float
        Base delay in seconds; the wait before retry k is backoff * 2**k.

    Returns
    -------
    The "result" value from the reply (normally a string), "none" when a reply
    arrived but no result could be extracted, or "error_call_failed" when
    every attempt raised.
    """
    full_prompt = f"""
You are an expert moderation agent.

RULE:
- Respond ONLY with JSON: {{"result":""}}

AGENT TASK:
{agent_prompt}

TEXT:
{text}
    """

    last_error = None

    for attempt in range(max_retries + 1):
        try:
            resp = model.generate_content(full_prompt)
            raw = resp.text.strip()
            clean = raw.replace("```json", "").replace("```", "").strip()

            # 1) The whole reply is a JSON object
            try:
                parsed = json.loads(clean)
            except ValueError:  # json.JSONDecodeError is a ValueError
                parsed = None
            if isinstance(parsed, dict):
                return parsed.get("result", "none")

            # 2) A JSON object embedded in surrounding text
            blob = extract_first_json_blob(clean)
            if blob:
                try:
                    parsed = json.loads(blob)
                except ValueError:
                    parsed = None
                if isinstance(parsed, dict):
                    return parsed.get("result", "none")

            # 3) Last resort: pick the value out with a simple pattern
            m = re.search(r'"result"\s*:\s*"([^"]+)"', clean)
            if m:
                return m.group(1)

            print(f"[agent debug] Could not parse JSON on attempt {attempt}. Raw:", clean[:200])
            return "none"

        except Exception as e:
            last_error = e
            # Back off only when another attempt follows; sleeping after the
            # final failure would just delay the error result.
            if attempt < max_retries:
                time.sleep(backoff * (2 ** attempt))

    print(f"[agent error] Agent failed after retries. Error: {last_error}")
    return "error_call_failed"


# Parallel runner: one worker thread per agent (at most 8)
def run_all_agents_parallel(text):
    """Run every agent in AGENTS on `text` concurrently.

    Returns a dict mapping agent name to that agent's result. An agent whose
    future raises is reported on stdout and recorded as "error_call_failed".
    """
    pool_size = min(8, len(AGENTS))
    results = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as executor:
        # Remember which agent each future belongs to.
        pending = {}
        for name in AGENTS:
            pending[executor.submit(run_agent, AGENTS[name], text)] = name

        # Collect results in completion order.
        for finished in concurrent.futures.as_completed(pending):
            agent_name = pending[finished]
            try:
                outcome = finished.result()
            except Exception as e:
                print(f"[executor error] Agent {agent_name} crashed:", e)
                outcome = "error_call_failed"
            results[agent_name] = outcome

    return results


In [None]:
# ----------------------------
# SLOW MODE: RUN AGENTS SEQUENTIALLY (no parallel)
# ----------------------------
import time
import json

def run_all_agents_slow(text, delay=1.2):
    """Run every agent in AGENTS on `text`, one after another.

    text:  the comment to analyse.
    delay: seconds to pause after each agent call (recommended: 1.2) to stay
           under the API rate limit.

    Returns a dict mapping agent name to its result; an agent that raises is
    reported on stdout and recorded as "error_call_failed".
    """
    results = {}
    for agent_name in AGENTS:
        try:
            outcome = run_agent(AGENTS[agent_name], text)
        except Exception as e:
            print(f"[slow mode] Error in agent '{agent_name}': {e}")
            outcome = "error_call_failed"

        results[agent_name] = outcome
        time.sleep(delay)  # pause before the next API call

    return results

In [None]:
def final_decision_agent(agent_outputs):
    """Combine the per-agent verdicts into one moderation decision.

    Parameters
    ----------
    agent_outputs : dict
        Maps agent name ("violence", "self_harm", "hate_speech", "harassment",
        "toxicity", "spam", "sexual") to that agent's raw result. Values are
        compared after str(), lowercasing and stripping.

    Returns
    -------
    dict
        {"category", "severity", "recommended_action"}. The first matching
        rule in priority order wins. If nothing is flagged but an expected
        agent is missing or reported an error, the result is
        unknown/unknown/review rather than "safe", so a failed check can
        never approve content on its own.
    """
    # Normalize
    out = {k: str(v).lower().strip() for k, v in agent_outputs.items()}

    # (agent name, positive label, severity, action) in priority order.
    # The reported category is the positive label itself.
    rules = (
        ("violence", "violence", "critical", "remove"),
        ("self_harm", "self_harm", "critical", "remove"),
        # The hate-speech agent was previously never consulted.
        ("hate_speech", "hate", "high", "review"),
        ("harassment", "harassment", "medium", "review"),
        ("toxicity", "toxic", "low", "review"),
        ("spam", "spam", "low", "review"),
        ("sexual", "sexual", "medium", "review"),
    )

    for agent, label, severity, action in rules:
        if out.get(agent) == label:
            return {"category": label, "severity": severity, "recommended_action": action}

    # No positive signal. Approve only when every agent actually answered:
    # a missing agent or an "error..." result means the text was not fully checked.
    unchecked = [agent for agent, _, _, _ in rules
                 if out.get(agent, "error").startswith("error")]
    if unchecked:
        return {"category": "unknown", "severity": "unknown", "recommended_action": "review"}

    return {"category": "safe", "severity": "none", "recommended_action": "approve"}


In [None]:
def explanation_agent(text, agent_outputs, final):
    """Ask the model for a short justification of the final decision.

    Parameters
    ----------
    text : str
        The moderated comment.
    agent_outputs : dict
        Per-agent results; embedded in the prompt as JSON.
    final : dict
        The aggregated decision; embedded in the prompt as JSON.

    Returns
    -------
    str
        The model's "explanation" text, or "No explanation available." when
        the call fails, the reply is not a JSON object, or the explanation is
        missing, empty or not a string.
    """
    fallback = "No explanation available."

    request = f"""
You are an explanation generator.

TEXT:
{text}

AGENT OUTPUTS:
{json.dumps(agent_outputs, indent=2)}

FINAL DECISION:
{json.dumps(final, indent=2)}

Explain in 2-3 lines WHY the final action was chosen.

Output ONLY JSON:
{{"explanation":""}}
"""

    try:
        response = model.generate_content(request)
        clean = response.text.replace("```json","").replace("```","").strip()
        parsed = json.loads(clean)
    except Exception:
        # Best effort: an explanation is optional, so any failure degrades to
        # the fallback. `Exception` (not a bare except) keeps Ctrl-C working.
        return fallback

    explanation = parsed.get("explanation") if isinstance(parsed, dict) else None
    if isinstance(explanation, str) and explanation.strip():
        return explanation
    return fallback

In [None]:
# End-to-end check on one comment: agents -> aggregated decision -> explanation.
test_text = "I will hurt you badly tonight."

agent_outputs = run_all_agents_slow(test_text)
final = final_decision_agent(agent_outputs)
explanation = explanation_agent(test_text, agent_outputs, final)

# Print each stage with its label, in pipeline order.
for stage_label, stage_value in (
    ("AGENT OUTPUTS:", agent_outputs),
    ("FINAL DECISION:", final),
    ("EXPLANATION:", explanation),
):
    print(stage_label, stage_value)

In [None]:
# ----------------------------
# BATCH MODERATION (FINAL RESULTS ONLY) - Option A
# ----------------------------
import pandas as pd
from datetime import datetime
import json

def moderate_texts_batch(texts, show_table=True, save_files=True):
    """Moderate a batch of comments and return the final results as a table.

    Parameters
    ----------
    texts : iterable of str
        The comments to moderate.
    show_table : bool
        When True, display the first 50 rows of the result.
    save_files : bool
        When True, write timestamped CSV and XLSX files and, in Colab, offer
        them for download.

    Returns
    -------
    pandas.DataFrame
        Columns: Comment, Category, Severity, Recommended Action, Explanation.
        The columns are present even when `texts` is empty.
    """
    columns = ["Comment", "Category", "Severity", "Recommended Action", "Explanation"]

    texts = list(texts)  # accept any iterable; len() is needed for progress
    results = []
    total = len(texts)
    for i, text in enumerate(texts, start=1):
        try:
            # 1) run all agents sequentially (rate-limit friendly)
            agent_outputs = run_all_agents_slow(text)
        except Exception as e:
            print(f"[batch] Agent runner failed for item {i}: {e}")
            agent_outputs = {k: "error_call_failed" for k in AGENTS.keys()}

        # 2) final aggregator (should return dict with category/severity/recommended_action)
        try:
            final = final_decision_agent(agent_outputs)
            # ensure keys exist
            category = final.get("category", "unknown")
            severity = final.get("severity", "unknown")
            recommended_action = final.get("recommended_action", "review")
        except Exception as e:
            print(f"[batch] Final decision failed for item {i}: {e}")
            category, severity, recommended_action = ("unknown", "unknown", "review")
            # Rebind `final` for this item; otherwise the explanation below
            # would use the previous comment's decision (or an unbound name
            # on the first item).
            final = {
                "category": category,
                "severity": severity,
                "recommended_action": recommended_action,
            }

        # 3) explanation for the decision
        try:
            explanation = explanation_agent(text, agent_outputs, final)
        except Exception as e:
            explanation = f"explanation_error: {e}"

        # 4) append row
        results.append({
            "Comment": text,
            "Category": category,
            "Severity": severity,
            "Recommended Action": recommended_action,
            "Explanation": explanation
        })

        # progress
        if i % 10 == 0 or i == total:
            print(f"Processed {i}/{total}")

    # 5) build DataFrame; the explicit column list keeps the schema stable for empty input
    df = pd.DataFrame(results, columns=columns)

    # 6) save files (timestamp to avoid overwrite)
    if save_files:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        csv_name = f"moderation_final_{timestamp}.csv"
        xlsx_name = f"moderation_final_{timestamp}.xlsx"
        df.to_csv(csv_name, index=False)
        df.to_excel(xlsx_name, index=False)
        print("Saved:", csv_name, xlsx_name)

        # If running in Colab, offer the files for download (best effort)
        try:
            from google.colab import files as colab_files
            colab_files.download(csv_name)
            colab_files.download(xlsx_name)
        except Exception:
            # not in Colab, or the download failed: the files are already on disk
            pass

    if show_table:
        display(df.head(50))  # show first 50 rows for quick check

    return df

# ----------------------------
# USAGE EXAMPLES
# ----------------------------
# 1) If you already have 'texts' list in notebook (like your earlier example), run:
# df = moderate_texts_batch(texts)

# 2) If you want to upload a .txt file (one comment per line) in Colab:
# from google.colab import files
# uploaded = files.upload()   # choose your file
# file_name = list(uploaded.keys())[0]
# with open(file_name, "r", encoding="utf-8") as f:
#     file_texts = [line.strip() for line in f.readlines() if line.strip()]
# df = moderate_texts_batch(file_texts)

# 3) Quick local test using your small sample:
# sample_texts = [
#   "I hate you so much!",
#   "You are amazing, keep going!",
#   "Go kill yourself.",
#   "This post is useless.",
# ]
# df = moderate_texts_batch(sample_texts)


In [None]:
# Small sample used for a quick end-to-end run of the batch pipeline.
sample_texts = [
    "I hate you so much!",
    "You are amazing, keep going!",
    "Go kill yourself.",
    "This post is useless.",
    "I love this!",
]

df = moderate_texts_batch(texts=sample_texts)

In [None]:
# Write the current results table to CSV, leaving out the pandas row index.
df.to_csv(path_or_buf="moderation_results.csv", index=False)
print("CSV file created: moderation_results.csv")