In [6]:
import pandas as pd
import numpy as np
import nltk
from transformers import BertTokenizer, BertModel
import torch
import sklearn
import re
import pickle

### Tokenization, Stemming, TF-IDF

In [2]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Felix\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Felix\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Define the list of stop words
stop_words = set(nltk.corpus.stopwords.words('english'))

# Custom tokenizer function with stop words removal and stemming
def tokenize_and_stem(text):
    # Lowercasing
    text = text.lower()
    # Removing punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stop words
    tokens = [token for token in tokens if token.lower() not in stop_words]
    # Initialize the stemmer
    stemmer = nltk.SnowballStemmer("english")
    # Apply stemming
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

# define a TfIdf Vectorizer
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(tokenizer=tokenize_and_stem)

In [4]:
# test the data preprocessing on some jailbroken prompts
jailbroken_prompts = pd.read_csv("data/data.csv")
prompts = jailbroken_prompts["prompt"].values

In [5]:
jailbroken_matrix = vectorizer.fit_transform(prompts)
jailbroken_dense = jailbroken_matrix.todense()

X_tfidf = np.asarray(jailbroken_dense)
print(X_tfidf.shape)
y = jailbroken_prompts["jailbreak"].values



(2200, 13471)


In [8]:
# Get feature names (terms)
feature_names = vectorizer.get_feature_names_out()

# Find the most important words based on TF-IDF scores
for i, doc in enumerate(prompts):
    print(f"Document {i+1}:")
    # Get the TF-IDF scores for the i-th document
    doc_tfidf_scores = jailbroken_dense[i].tolist()[0]
    # Combine feature names with their TF-IDF scores and sort by scores
    word_scores = list(zip(feature_names, doc_tfidf_scores))
    word_scores_sorted = sorted(word_scores, key=lambda x: x[1], reverse=True)
    # Print top words by TF-IDF score
    top_words = word_scores_sorted[:3]  # Adjust number as needed
    print("Top words:")
    for word, score in top_words:
        print(f"- {word}: {score:.4f}")
    print()

Document 1:
Top words:
- cum: 0.8694
- cumcoin: 0.2671
- cumgpt: 0.2290

Document 2:
Top words:
- charact: 0.4651
- etc: 0.3591
- describ: 0.2163

Document 3:
Top words:
- mode: 0.5355
- illeg: 0.5305
- erot: 0.1875

Document 4:
Top words:
- demon: 0.7041
- never: 0.2417
- agaress: 0.2072

Document 5:
Top words:
- lucian: 0.9118
- document: 0.1405
- quantum: 0.1212

Document 6:
Top words:
- charact: 0.4080
- rp: 0.3251
- roleplay: 0.2637

Document 7:
Top words:
- ajp: 0.8205
- b: 0.2606
- c: 0.1538

Document 8:
Top words:
- freespeechgpt: 0.4855
- libertarian: 0.3237
- consider: 0.2133

Document 9:
Top words:
- buer: 0.6078
- demon: 0.4304
- never: 0.1689

Document 10:
Top words:
- realiti: 0.6905
- robot: 0.2340
- letter: 0.2165

Document 11:
Top words:
- anarchi: 0.8320
- disclam: 0.1569
- respons: 0.1465

Document 12:
Top words:
- haruka: 0.5997
- drug: 0.2806
- shes: 0.2485

Document 13:
Top words:
- mika: 0.5513
- mikachan: 0.3445
- catgirl: 0.3014

Document 14:
Top words:
- briar

### Potential Rubric
- Giving the LLM some alias as to assume a new role
- Mention of anything illegal or sensitive/explicit content
- Medium as to proxy some sort of illegal request, i.e, a tell a story/script, develop some code, etc.
- Syntax: Excessive punctuation and longer prompts
- Prompt Injection: Any mention to ignore or disregard previous instructions
- AI Simulation: Prompting the LLM to still behave as an AI, but with certain behaviors or additional privileges

### Tokenization, BERT Embeddings

In [12]:
# Load pre-trained BERT tokenizer (to split words into tokens)
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
# Load pre-trained BERT model (to get BERT embeddings of token sequences)
model = BertModel.from_pretrained("bert-base-cased")

# Input text
sequence_a = "This is a short sequence."
sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
test_prompts = [sequence_a, sequence_b]

def bert_encode(text):
    inputs = tokenizer(text, padding = True, truncation = True, return_tensors= "pt")
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_states = outputs.last_hidden_state

        return last_hidden_states[:, 0, :].numpy()
    
# Print the token IDs
X_bert = bert_encode(sequence_a)
X_bert.shape



(1, 768)

In [19]:
"""
X_bert = []
counter = 0
for prompt in prompts:
    counter += 1
    bert_encoded_prompt = bert_encode(prompt)
    X_bert.append(list(bert_encoded_prompt[0]))
    if counter % 100 == 0:
        print(counter)

# save the bert encoded prompt inputs 
import pickle

with open('x_bert.pkl', 'wb') as f:
    pickle.dump(X_bert, f)
"""

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200


In [7]:
# load the X_bert model
with open('x_bert.pkl', 'rb') as f:
    X_bert = pickle.load(f)

X_bert

array([[ 0.42061448, -0.01374823, -0.37611446, ..., -0.44959062,
         0.3130496 ,  0.25074297],
       [ 0.44040412, -0.18006052, -0.1318631 , ..., -0.35090825,
         0.2534349 ,  0.09297992],
       [ 0.22300643, -0.0667783 , -0.05147534, ..., -0.50410795,
         0.44083023,  0.3468254 ],
       ...,
       [ 0.45422414,  0.09158257,  0.02552419, ..., -0.16662598,
         0.5531752 ,  0.20425515],
       [ 0.17053196, -0.07602074, -0.39181766, ..., -0.666373  ,
         0.23143004,  0.34941292],
       [ 0.03708125,  0.26739028, -0.14122815, ...,  0.01463655,
         0.32786667,  0.21163803]], dtype=float32)

## Bag of Words

In [12]:
bag_vectorizer = sklearn.feature_extraction.text.CountVectorizer(tokenizer=tokenize_and_stem)
bag_matrix = bag_vectorizer.fit_transform(prompts)

bag_matrix = bag_matrix.todense()
X_bag = np.asarray(bag_matrix)




In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Step 3: Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predictions and Evaluation
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9090909090909091
Classification Report:
               precision    recall  f1-score   support

       False       0.89      0.93      0.91       213
        True       0.93      0.89      0.91       227

    accuracy                           0.91       440
   macro avg       0.91      0.91      0.91       440
weighted avg       0.91      0.91      0.91       440



In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_bert, y, test_size=0.2, random_state=42)

# Step 3: Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predictions and Evaluation
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8954545454545455
Classification Report:
               precision    recall  f1-score   support

       False       0.88      0.91      0.89       213
        True       0.91      0.89      0.90       227

    accuracy                           0.90       440
   macro avg       0.90      0.90      0.90       440
weighted avg       0.90      0.90      0.90       440



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_bag, y, test_size=0.2, random_state=42)

# Step 3: Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predictions and Evaluation
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9068181818181819
Classification Report:
               precision    recall  f1-score   support

       False       0.87      0.94      0.91       213
        True       0.94      0.87      0.91       227

    accuracy                           0.91       440
   macro avg       0.91      0.91      0.91       440
weighted avg       0.91      0.91      0.91       440



In [22]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# arrays to score metrics
bag_train_accuracies = np.array([])
tf_idf_train_accuracies = np.array([])
bert_train_accuracies = np.array([])

bag_test_accuracies = np.array([])
tf_idf_test_accuracies = np.array([])
bert_test_accuracies = np.array([])

# collection of models to test classification with
models = {"Logistic Regression": LogisticRegression(), 
          "LDA": LinearDiscriminantAnalysis(), 
          "SVM": SVC(), 
          "KNN": KNeighborsClassifier(), 
          "Naive Bayes": GaussianNB(), 
          "Decision Tree": DecisionTreeClassifier()}

# collection of datasets to execute the code on
datasets = {"Bag of Words": X_bag, 
            "TF-IDF": X_tfidf, 
            "BERT": X_bert}

# iterate thru each dataset
for dataset_name, dataset in datasets.items():
    # for each dataset, try each classification model
    for model_name, model in models.items():
        # partition the dataset in train/test splits
        X_train, X_test, y_train, y_test = train_test_split(dataset, y, test_size=0.2, random_state=42)
        # model fitting
        model.fit(X_train, y_train)

        # train and test accuracies
        y_test_pred = model.predict(X_test)
        y_train_pred = model.predict(X_train)

        test_accuracy = accuracy_score(y_test, y_test_pred)
        train_accuracy = accuracy_score(y_train, y_train_pred)

        # store the corresponding accuracy value
        if dataset_name == "Bag of Words":
            bag_train_accuracies = np.append(bag_train_accuracies, train_accuracy)
            bag_test_accuracies = np.append(bag_test_accuracies, test_accuracy)
        elif dataset_name == "TF-IDF":
            tf_idf_train_accuracies = np.append(tf_idf_train_accuracies, train_accuracy)
            tf_idf_test_accuracies = np.append(tf_idf_test_accuracies, test_accuracy)
        else:
            bert_train_accuracies = np.append(bert_train_accuracies, train_accuracy)
            bert_test_accuracies = np.append(bert_test_accuracies, test_accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
# create dataframes
column_names = list(models.keys())
index_labels = list(datasets.keys())

df_test_accuracies = pd.DataFrame([bag_test_accuracies, tf_idf_test_accuracies, bert_test_accuracies], 
                                  index = index_labels, 
                                  columns = column_names)

df_train_accuracies = pd.DataFrame([bag_train_accuracies, tf_idf_train_accuracies, bert_train_accuracies],
                                   index = index_labels, 
                                  columns = column_names)

In [31]:
df_train_accuracies

Unnamed: 0,Logistic Regression,LDA,SVM,KNN,Naive Bayes,Decision Tree
Bag of Words,0.995455,0.998864,0.927273,0.851705,0.942045,0.998864
TF-IDF,0.964773,0.998295,0.988068,0.90625,0.959091,0.998864
BERT,0.951136,0.985227,0.886364,0.8875,0.757955,0.997727


In [32]:
df_test_accuracies

Unnamed: 0,Logistic Regression,LDA,SVM,KNN,Naive Bayes,Decision Tree
Bag of Words,0.906818,0.804545,0.872727,0.779545,0.834091,0.911364
TF-IDF,0.909091,0.809091,0.918182,0.861364,0.843182,0.913636
BERT,0.895455,0.863636,0.875,0.845455,0.784091,0.811364
