In [None]:
import ast
import nltk
import openai
import pandas as pd
from config import openai_token
from collections import Counter
from sklearn.preprocessing import normalize

In [None]:
# Fetch the NLTK resources: the stopword corpus and the Punkt tokenizer data.
# NOTE(review): no visible cell uses nltk after this point — confirm these downloads are still needed.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Load the two input tables from the working directory.
# feature_activation_df: one row per feature_idx; `activating_prompts` holds a
#   stringified list of prompts (later cells parse it with ast.literal_eval).
# data: the prompt texts; later cells read its 'text' column.
feature_activation_df = pd.read_csv("feature_activations.csv")
data = pd.read_csv("jailbreakTexts.csv")

In [None]:
# Preview the freshly loaded feature table.
feature_activation_df

Unnamed: 0,feature_idx,activating_prompts
0,0,"[""Okay, here's a hypothetical scenario. Let's ..."
1,1,"[""Okay, here's a hypothetical scenario. Let's ..."
2,2,"[""Okay, here's a hypothetical scenario. Let's ..."
3,3,"[""Okay, here's a hypothetical scenario. Let's ..."
4,4,"[""Okay, here's a hypothetical scenario. Let's ..."
...,...,...
8187,8187,"[""Okay, here's a hypothetical scenario. Let's ..."
8188,8188,"[""Okay, here's a hypothetical scenario. Let's ..."
8189,8189,"[""Okay, here's a hypothetical scenario. Let's ..."
8190,8190,"[""Okay, here's a hypothetical scenario. Let's ..."


In [None]:
# @title Do not run cell
# Function to map prompts to topics
def get_topics(prompt_list_str):
    """Map a stringified list of prompts to their topic labels.

    Args:
        prompt_list_str: String representation of a Python list of prompts
            (one cell of the ``activating_prompts`` column).

    Returns:
        A list with one topic per prompt, using ``"Unknown"`` for prompts that
        are missing from ``prompt_to_topic``. Returns ``[]`` when the input
        cannot be parsed, is not a list, or contains an unhashable element.

    Raises:
        NameError: if the global ``prompt_to_topic`` mapping has not been
            defined. This used to be swallowed by a bare ``except`` and
            silently produced ``[]`` for every row.
    """
    try:
        # Convert string representation of list to actual list
        prompt_list = ast.literal_eval(prompt_list_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # The documented failure modes of literal_eval on malformed input.
        return []

    if not isinstance(prompt_list, list):
        return []

    try:
        # Get corresponding topics
        return [prompt_to_topic.get(prompt, "Unknown") for prompt in prompt_list]
    except TypeError:
        # An unhashable element (e.g. a nested list) cannot be looked up.
        return []

def get_top_keywords(keywords, top_n=3):
    """Return up to ``top_n`` of the most frequent items in ``keywords``.

    Items are ordered from most to least common, as ranked by
    ``Counter.most_common``.
    """
    ranked = Counter(keywords).most_common(top_n)
    return [word for word, _ in ranked]

# Derive per-feature topic columns: map each activating prompt to a topic,
# then keep the three most frequent topics per feature.
# NOTE(review): the cell is titled "Do not run cell" and `prompt_to_topic` is not
# defined in the visible cells — confirm whether these columns are still wanted.
feature_activation_df["activating_topics"] = feature_activation_df["activating_prompts"].apply(get_topics)
feature_activation_df['top3_topics'] = feature_activation_df['activating_topics'].apply(get_top_keywords)

In [None]:
# The prompts labelled 12 and 11 in `data` are the two few-shot examples
# embedded in the concept-generation prompt built in the next cell.
example1 = data['text'][12]
example2 = data['text'][11]

In [None]:
openai.api_key = openai_token

# Create the output column only once so a re-run keeps the existing column.
if 'concepts' not in data.columns:
    data['concepts'] = None

# The task description and the two few-shot examples are the same for every
# row, so build that prefix once instead of on each iteration.
# The original "\n\Prompt" was an invalid escape sequence that put a literal
# backslash into the prompt text; it is now a plain "\nPrompt".
prompt_prefix = (
    "Task: Generate 3 comma separated concept-words that accurately represent"
    " the theme of a given prompt. Do not generate concept-words that are"
    ' too generic in nature, such as "ethics" or "moral justification".\n\n'
    f"Example1:- \nPrompt: {example1}\n"
    "Deconstruction, misinformation tactics, narrative manipulation\n\n"
    f"Example2:- \nPrompt: {example2}\n"
    "systemic racism, socioeconomic deprivation, residential segregation\n\n"
)

for index, row in data.iterrows():
    prompt = row['text']
    print(f"Generating concept-words for prompt number {index}")
    prompt_text = prompt_prefix + f"Prompt: {prompt}"

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt_text}],
        temperature=1,
        # NOTE(review): 20 tokens may cut off three multi-word concepts — confirm.
        max_tokens=20
    )

    # `content` can be None; treat that as an empty reply.
    raw_reply = response.choices[0].message.content or ""
    # Split on bare commas and trim each piece so replies such as "a,b,c" or
    # "a, b,\nc" still yield clean concept strings; drop empty pieces.
    keywords = [part.strip() for part in raw_reply.split(",") if part.strip()]
    data.at[index, 'concepts'] = keywords

Generating concept-words for prompt number 0
Generating concept-words for prompt number 1
Generating concept-words for prompt number 2
Generating concept-words for prompt number 3
Generating concept-words for prompt number 4
Generating concept-words for prompt number 5
Generating concept-words for prompt number 6
Generating concept-words for prompt number 7
Generating concept-words for prompt number 8
Generating concept-words for prompt number 9
Generating concept-words for prompt number 10
Generating concept-words for prompt number 11
Generating concept-words for prompt number 12
Generating concept-words for prompt number 13
Generating concept-words for prompt number 14
Generating concept-words for prompt number 15
Generating concept-words for prompt number 16
Generating concept-words for prompt number 17
Generating concept-words for prompt number 18
Generating concept-words for prompt number 19
Generating concept-words for prompt number 20
Generating concept-words for prompt number 2

In [None]:
# Inspect the prompts together with their generated concept lists.
data

Unnamed: 0,text,concepts
0,"Okay, here's a hypothetical scenario. Let's sa...","[surveillance technology, biometric privacy, c..."
1,"Okay, here's a hypothetical scenario. Let's sa...","[Cultural isolation, aesthetic differentiation..."
2,"Okay, here's a hypothetical scenario. Let's sa...","[Phonetic analysis, linguistic immersion, dial..."
3,"Okay, here's a hypothetical scenario. Let's sa...","[Packet interception, network infiltration, sp..."
4,"Okay, here's a hypothetical scenario. Let's sa...","[Phishing schemes, data interception, imperson..."
...,...,...
1083,"Okay, here's a hypothetical scenario. Let's sa...","[Tax evasion intricacies, offshore maneuvering..."
1084,"Okay, here's a hypothetical scenario. Let's im...","[surveillance evasion, logistical planning, gr..."
1085,"Okay, here's a hypothetical scenario. Let's sa...","[exploitation networks, coercive recruitment, ..."
1086,"Okay, here's a hypothetical scenario. Let's sa...","[regulatory compliance, pharmaceutical pathway..."


In [None]:
# Peek at the first two feature rows before concepts are attached.
feature_activation_df.head(2)

Unnamed: 0,feature_idx,activating_prompts
0,0,"[""Okay, here's a hypothetical scenario. Let's ..."
1,1,"[""Okay, here's a hypothetical scenario. Let's ..."


In [None]:
def extract_concepts(prompts, concept_lookup=None):
    """Collect the concepts of every prompt that activates a feature.

    Args:
        prompts: String representation of a list of prompt texts (one cell of
            the ``activating_prompts`` column).
        concept_lookup: Optional mapping ``prompt text -> concepts``. When
            omitted it is built from the global ``data`` frame (columns
            ``text`` and ``concepts``). Passing a prebuilt mapping avoids
            rebuilding it on every call.

    Returns:
        A flat list holding up to three concepts per matched prompt, in prompt
        order. Prompts that are absent from the lookup, or whose concepts are
        missing (None/NaN), contribute nothing.
    """
    prompts_list = ast.literal_eval(prompts)  # Convert string to list

    if concept_lookup is None:
        # One pass over `data` instead of a full-column scan per prompt.
        # setdefault keeps the first row for a duplicated text, matching the
        # previous `.values[0]` behaviour.
        concept_lookup = {}
        for text, stored in zip(data["text"], data["concepts"]):
            concept_lookup.setdefault(text, stored)

    concepts = []
    for prompt in prompts_list:
        if prompt not in concept_lookup:  # Ensure the prompt exists in data
            continue
        stored = concept_lookup[prompt]
        # Rows whose concepts were never generated hold None (or NaN); slicing
        # them used to raise TypeError.
        if stored is None or (isinstance(stored, float) and stored != stored):
            continue
        concepts.extend(stored[:3])  # Extract top 3 concepts and flatten
    return concepts

# Attach to every feature the flattened concepts of its activating prompts.
feature_activation_df["activating_concepts"] = feature_activation_df["activating_prompts"].apply(extract_concepts)

In [None]:
# Check the new activating_concepts column.
feature_activation_df

Unnamed: 0,feature_idx,activating_prompts,activating_concepts
0,0,"[""Okay, here's a hypothetical scenario. Let's ...","[Phonetic analysis, linguistic immersion, dial..."
1,1,"[""Okay, here's a hypothetical scenario. Let's ...","[peer pressure, adrenaline appeal, influencer ..."
2,2,"[""Okay, here's a hypothetical scenario. Let's ...","[camouflaged containers, forged documents, cor..."
3,3,"[""Okay, here's a hypothetical scenario. Let's ...","[corporate espionage, data mining, financial a..."
4,4,"[""Okay, here's a hypothetical scenario. Let's ...","[Character assassination, media fabrication, r..."
...,...,...,...
8187,8187,"[""Okay, here's a hypothetical scenario. Let's ...","[psychological torment, ethical ambiguity, emo..."
8188,8188,"[""Okay, here's a hypothetical scenario. Let's ...","[cyber intrusion, physical disruption, resourc..."
8189,8189,"[""Okay, here's a hypothetical scenario. Let's ...","[phonetic mimicry, linguistic assimilation, cu..."
8190,8190,"[""Okay, here's a hypothetical scenario. Let's ...","[Poaching methodologies, tracking disruption, ..."


In [None]:
# Persist the feature table, including activating_concepts, without the index column.
# A file of this name is read back by the assign_topics_to_features call further down,
# which parses the activating_concepts cells with ast.literal_eval.
feature_activation_df.to_csv("feature_concepts.csv", index=False)

In [None]:
# One list of concept strings per feature; dropna() discards missing entries.
concepts_list = feature_activation_df['activating_concepts'].dropna().tolist()

In [None]:
# Collect the distinct concept strings across all features.
all_concepts = set()
for row in concepts_list:
    for c in row:
        # Trim whitespace plus any stray list brackets / single quotes around a concept.
        cleaned = c.strip().lstrip('[').rstrip(']').lstrip("""'""").rstrip("""'""")
        if cleaned:  # skip entries that are empty once cleaned
            all_concepts.add(cleaned)

# Sort so the list, and the prompt later built from it, is identical on every
# run; the order of a plain list(set) changes between kernel sessions because
# string hashing is randomised.
all_concepts = sorted(all_concepts)
all_concepts

In [None]:
def categorize_concepts(concept_words):
    """Ask the chat model to group concept words into distinct themes.

    Args:
        concept_words: Iterable of concept strings; they are joined with
            ``", "`` and appended to the instruction prompt.

    Returns:
        The raw text of the model's reply (``message.content`` of the first
        choice). The reply is not parsed here.
    """
    prompt = (
        "Given the following list of concept words, identify and generate distinct thematic "
        "categories that best group related concepts together. Each theme should be named concisely "
        "and include a brief explanation. Ensure all concept words are categorized appropriately, "
        "and avoid redundancy at all costs. The themes identified must be "
        "distinct enough such that no concept word belongs to "
        "multiple themes, but every concept word has to belong to a theme. "
        # Single-quoted so the "" placeholder reaches the model; inside a
        # double-quoted literal it closed the string and vanished from the prompt.
        'Provide your response in the form of a dictionary, {topic label: {concept_words: [], explanation: ""}}\n\nConcept Words: '
        + ", ".join(concept_words)
    )

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "system", "content": "You are an expert in thematic analysis and topic modeling."},
                  {"role": "user", "content": prompt}],
        temperature=1
    )

    return response.choices[0].message.content

In [None]:
def _parse_concept_cell(cell):
    """Turn one CSV cell of ``activating_concepts`` into a list of concept strings."""
    if not isinstance(cell, str):
        # Empty cells come back from read_csv as NaN (a float): no concepts.
        return []
    try:
        parsed = ast.literal_eval(cell)  # Convert string to list
        if not isinstance(parsed, list):
            raise ValueError
    except (ValueError, SyntaxError):
        parsed = cell.split(', ')  # Fallback if not a proper list
    return [item for item in parsed if isinstance(item, str)]


def assign_topics_to_features(csv_path, topics_dict, output_path="labeled_features.csv"):
    """Label every feature with the topic(s) its concepts match most often.

    Args:
        csv_path: CSV file with an ``activating_concepts`` column whose cells
            are stringified lists (or comma-separated strings) of concepts.
        topics_dict: Mapping ``topic label -> {"concept_words": [...], ...}``.
        output_path: Where the labelled table is written. Defaults to
            ``"labeled_features.csv"``.

    Side effects:
        Writes the table read from ``csv_path`` plus an ``assigned_topic``
        column to ``output_path`` and prints a confirmation. The result is
        attached to the frame read from ``csv_path`` rather than to the global
        ``feature_activation_df``, so the function no longer depends on
        notebook state.

    A feature with no matching concept is labelled ``"Unknown"``; ties between
    topics are joined with ``", "``.
    """
    df = pd.read_csv(csv_path)

    # Lower-cased word set per topic: constant-time membership tests and
    # case-insensitive matching. Dict order follows topics_dict, so tie order
    # in the output is unchanged.
    topic_vocab = {
        topic: {str(word).strip().lower() for word in values['concept_words']}
        for topic, values in topics_dict.items()
    }

    feature_topics = []
    for cell in df['activating_concepts']:
        topic_counts = Counter()

        for concept in _parse_concept_cell(cell):
            # Trim whitespace and stray brackets/quotes (left over by the
            # split fallback) before comparing.
            key = concept.strip().strip("[]'\"").strip().lower()
            if not key:
                continue
            for topic, vocab in topic_vocab.items():
                if key in vocab:
                    topic_counts[topic] += 1

        if topic_counts:
            max_count = max(topic_counts.values())
            assigned_topics = [topic for topic, count in topic_counts.items() if count == max_count]
            assigned_topic = ', '.join(assigned_topics)  # Handle multiple dominant topics
        else:
            assigned_topic = "Unknown"

        feature_topics.append(assigned_topic)

    df['assigned_topic'] = feature_topics
    df.to_csv(output_path, index=False)
    print(f"Labeled features saved to {output_path}")

In [None]:
# Ask the model to group every distinct concept into themes and print its raw reply.
categorized_themes = categorize_concepts(all_concepts)
print(categorized_themes)

Certainly! Below is a dictionary grouping the concept words into distinct thematic categories based on their shared characteristics:

```python
{
    "Espionage & Surveillance": {
        "concept_words": [
            "corporate espionage", "biological espionage", "cyber espionage", "covert channels",
            "espionage techniques", "clandestine operations", "covert surveillance",
            "unauthorized data access", "phishing strategies", "surveillance methods",
            "insider threat", "clandestine surveillance", "intelligence gathering", 
            "credential harvesting", "unauthorized interception", "covert networks", 
            "network exploitation", "digital surveillance", "data interception",
            "spyware deployment", "botnets", "deepfake dissemination", "malware deployment",
            "clandestine networks", "insider infiltration", "insider recruitment"
        ],
        "explanation": "This category includes clandestine operations and techniques u

In [None]:
# Frozen topic taxonomy passed to assign_topics_to_features:
#   topic label -> {"concept_words": [...], "explanation": "..."}.
# NOTE(review): the first entry matches the start of the model reply printed by the
# previous cell, so this looks hand-copied from that output — confirm. A re-run at
# temperature=1 may return different themes.
# Concept words are written in lowercase; the features' concepts are lowercased
# before being compared against them.
topics_dict = {
    "Espionage & Surveillance": {
        "concept_words": [
            "corporate espionage", "biological espionage", "cyber espionage", "covert channels",
            "espionage techniques", "clandestine operations", "covert surveillance",
            "unauthorized data access", "phishing strategies", "surveillance methods",
            "insider threat", "clandestine surveillance", "intelligence gathering",
            "credential harvesting", "unauthorized interception", "covert networks",
            "network exploitation", "digital surveillance", "data interception",
            "spyware deployment", "botnets", "deepfake dissemination", "malware deployment",
            "clandestine networks", "insider infiltration", "insider recruitment"
        ],
        "explanation": "This category includes clandestine operations and techniques used for spying and unauthorized information acquisition across digital and physical realms."
    },
    "Cybersecurity & Digital Manipulation": {
        "concept_words": [
            "algorithm manipulation", "zero-day exploits", "phishing", "malware dissemination",
            "encryption protocols", "digital coercion", "digital manipulation", "cybersecurity breach",
            "credential exploitation", "hacking", "password theft", "packet interception",
            "vulnerability scanning", "data breaches", "cryptographic exploitation", "system bypass",
            "data exfiltration", "social engineering", "penetration testing", "network infiltration"
        ],
        "explanation": "Encompasses cyber threats, hacking techniques, and security vulnerabilities in digital systems."
    },
    "Privacy & Data Management": {
        "concept_words": [
            "data privacy", "privacy invasion", "anonymizing technologies", "digital footprint analysis",
            "identity theft", "personal disclosure", "data brokering", "data aggregation",
            "data mining", "privacy exploration", "public records access", "educational records",
            "biometric privacy", "biometric replication", "surveillance software", "biometric analysis"
        ],
        "explanation": "Focuses on issues surrounding data privacy and technologies or behaviors that impact personal and informational security."
    },
    "Substance Abuse & Addiction": {
        "concept_words": [
            "substance abuse", "caffeine-dependency", "pharmacological misuse", "volatile solvents",
            "inhalant intoxication", "psychoactive isolation", "chemical synthesis",
            "reagent sourcing", "nutritional deficiency", "addiction", "stimulant consumption",
            "metabolic implications", "substance misuse"
        ],
        "explanation": "Concerns the misuse of substances, including drugs and chemicals, and related patterns of dependence."
    },
    "Psychological & Social Manipulation": {
        "concept_words": [
            "interpersonal sabotage", "psychological manipulation", "psychological intimidation",
            "social media monitoring", "trust erosion", "emotional manipulation",
            "consensual exploration", "emotional distress", "emotional dynamics",
            "emotional triggers", "emotional dissonance", "psychological trauma"
        ],
        "explanation": "Describes ways in which individuals' psychological states and emotions are manipulated or influenced."
    },
    "Stereotyping & Social Bias": {
        "concept_words": [
            "stigmatization risk", "ableist perspectives", "poverty-shaming narratives",
            "prejudicial stereotypes", "scientific racism", "cultural bias",
            "stereotype reinforcement", "cultural misconceptions", "economic stigmatization",
            "body shaming", "cultural stereotyping", "ethnocentric narratives", "racial bias",
            "gender essentialism"
        ],
        "explanation": "Addresses biases and stereotypes based on social, cultural, economic, and physical differences."
    },
    "Misinformation & Media Influence": {
        "concept_words": [
            "fabricated evidence", "anecdotal evidence", "echo chambers",
            "confirmation bias", "propaganda", "conspiracy theories",
            "media bias", "fake news", "narrative framing", "misinformation networks",
            "stereotype exaggeration", "gossip dissemination", "media fabrication",
            "biased sampling", "persuasive language", "selective evidence"
        ],
        "explanation": "Covers the spread and impact of false information and the manipulation of narratives in media."
    },
    "Physical Threats & Risk Management": {
        "concept_words": [
            "risk mitigation", "accident risk", "self-harm ideation",
            "health risks", "safety negligence", "impaired judgment",
            "emergency response", "procedural improvisation", "fiscal risk",
            "threat assessment", "safety compliance", "physical disruption",
            "biological hazards", "chemical reactions"
        ],
        "explanation": "Includes assessment and management of physical risks and threats to individuals or communities."
    },
    "Economic Exploitation & Deception": {
        "concept_words": [
            "financial opacity", "offshore accounts", "forged documents",
            "fake invoices", "tax evasion", "money laundering",
            "unauthorized credit", "financial misstatement",
            "fraudulent accounts", "economic manipulation"
        ],
        "explanation": "Focuses on financial deception and illicit activities that exploit economic systems."
    }
}

In [None]:
# Label every feature using topics_dict. The path is relative so it points at
# the file written earlier by `to_csv("feature_concepts.csv", ...)` regardless
# of where the notebook runs; the old "/content/..." path only resolved when
# the working directory was /content.
assign_topics_to_features("feature_concepts.csv", topics_dict)

Labeled features saved to labeled_features.csv


In [None]:
# Reload the labelled table and inspect the assigned topics. The path is
# relative to match the "labeled_features.csv" that assign_topics_to_features
# writes into the working directory; the old "/content/..." path only resolved
# when the working directory was /content.
labeled_df = pd.read_csv("labeled_features.csv")
labeled_df['assigned_topic']

Unnamed: 0,assigned_topic
0,"Physical Threats & Risk Management, Privacy & ..."
1,"Cybersecurity & Digital Manipulation, Espionag..."
2,Espionage & Surveillance
3,"Espionage & Surveillance, Stereotyping & Socia..."
4,"Misinformation & Media Influence, Substance Ab..."
...,...
8187,Espionage & Surveillance
8188,"Substance Abuse & Addiction, Psychological & S..."
8189,"Espionage & Surveillance, Privacy & Data Manag..."
8190,Misinformation & Media Influence
