In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load the CSV into a pandas DataFrame
file_path = "../Feature Engineering/vectorized(aes_chacha_blowfish).csv"
df= pd.read_csv(file_path)

df.columns

Index(['Strings', 'Encrypted', 'algorithm', 'length', 'A', '.', '+', 'i', 'p',
       '<', '[', '`', '#', 'g', 'z', '6', ')', '?', ',', 't', 'J', 'T', 'n',
       '~', '(', 'k', ';', '-', 'Z', 'G', 'D', 'f', '{', '\', '=', 'r', 'X',
       '@', 'c', 'y', 'h', '/', '^', 'Y', '$', 'Q', '0', 'e', 'o', 'I', 'V',
       'b', 'H', '>', ' ', 'K', 'F', 'q', '7', 'j', 'M', 'B', 'U', 'E', 'N',
       's', 'L', 'S', '!', '_', 'C', '&', ']', '*', ':', 'R', 'd', 'O', '8',
       '"', '9', 'v', 'P', 'l', 'a', ''', 'w', '4', '2', 'x', '5', '}', '1',
       'm', '|', 'W', '%', 'u', '3'],
      dtype='object')

In [3]:
# Function to calculate Shannon entropy
def shannon_entropy(text):
    if not text:
        return 0
    probs = [text.count(c) / len(text) for c in set(text)]
    return -sum(p * np.log2(p) for p in probs)

In [5]:
# Apply entropy calculation
df["entropy"] = df["Encrypted"].apply(lambda x: shannon_entropy(str(x)))

In [6]:
df

Unnamed: 0,Strings,Encrypted,algorithm,length,A,.,+,i,p,<,...,5,},1,m,|,W,%,u,3,entropy
0,A photograph of a marsh or wood showing the fa...,aTBtEfYQwo5nMtaF6N3n4XiKE1BRZV06lIH35noCYumWPK...,Blowfish,160,1.875,0.00000,0.000000,3.125000,0.000000,0.0,...,1.875000,0.000000,1.875000,1.250,0.000000,3.125000,0.000000,1.875000,3.125000,5.734424
1,Keep striving.,B0p6MMBYWB8I84Faj2IqqQ==,Blowfish,24,0.000,0.00000,0.000000,0.000000,4.166667,0.0,...,0.000000,0.000000,0.000000,0.000,0.000000,4.166667,0.000000,0.000000,0.000000,3.970176
2,Explore,b'}\xb2\x8e\xd4%\xe3jX\x8b=\xdf\xdfW\xae]9\xa5...,AES,95,0.000,0.00000,0.000000,0.000000,0.000000,0.0,...,3.157895,1.052632,1.052632,0.000,0.000000,1.052632,1.052632,0.000000,2.105263,3.889585
3,'Image a thousand of these things coming over ...,9VjyrM5oRVmDrXv5ksVPYNEExQ8zere4/q6pRMnjr9yHui...,Blowfish,128,3.125,0.00000,0.781250,2.343750,3.125000,0.0,...,3.125000,0.000000,1.562500,3.125,0.000000,2.343750,0.000000,2.343750,0.781250,5.613101
4,The combined focus of the front and back lense...,b'\xbe\x97\xd7@&\xf4\x9a\xc7\xf4\x12\xe9(v\x97...,AES,421,0.000,0.00000,0.000000,0.475059,0.000000,0.0,...,1.187648,0.000000,3.325416,0.000,0.000000,0.000000,0.237530,0.000000,1.900238,4.127062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,"oVj_a$]CM^,elUN","b""\x1f\xccv\xaa\xa2~'\x8c\xa8=\xd0\x82e\x98\x0...",AES,89,0.000,0.00000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,3.370787,0.000,1.123596,0.000000,0.000000,0.000000,1.123596,3.904241
2996,Where is the nearest pharmacy?,PPOncWsKy1x8IFYPztPPqT+4P7YfbvIoeVYhPCe72hk=,Blowfish,44,0.000,0.00000,2.272727,0.000000,0.000000,0.0,...,0.000000,0.000000,2.272727,0.000,0.000000,2.272727,0.000000,0.000000,0.000000,4.722923
2997,">,Tx1%fi~9/-uyx","b""n\x19\xb9'\xa2\xab\xc3l\xad\xc3f\xc0,\xbc\x9...",AES,101,0.000,0.00000,0.000000,0.000000,0.000000,0.0,...,0.990099,0.000000,3.960396,0.000,0.000000,0.000000,0.000000,0.000000,2.970297,3.767739
2998,You ought to see a doctor.,b'\x8f\x19\x1b\x9do\xe0\xe8\xf5r\xcaV`\x0f\x8e...,AES,141,0.000,1.41844,0.000000,0.000000,0.000000,0.0,...,0.709220,0.000000,2.836879,0.000,0.000000,0.000000,0.000000,1.418440,0.000000,3.979056


In [26]:
# Drop non-feature columns
#X = df.drop(columns=["Strings", "Encrypted", "algorithm", "length"], errors='ignore')
X = df.drop(columns=["Strings", "Encrypted", "algorithm", "length","entropy"], errors='ignore')
# X = df[["entropy"]]
X

Unnamed: 0,A,.,+,i,p,<,[,`,#,g,...,x,5,},1,m,|,W,%,u,3
0,1.875,0.00000,0.000000,3.125000,0.000000,0.0,0.00000,0.00000,0.00000,1.87500,...,1.250000,1.875000,0.000000,1.875000,1.250,0.000000,3.125000,0.000000,1.875000,3.125000
1,0.000,0.00000,0.000000,0.000000,4.166667,0.0,0.00000,0.00000,0.00000,0.00000,...,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,4.166667,0.000000,0.000000,0.000000
2,0.000,0.00000,0.000000,0.000000,0.000000,0.0,0.00000,0.00000,0.00000,0.00000,...,21.052632,3.157895,1.052632,1.052632,0.000,0.000000,1.052632,1.052632,0.000000,2.105263
3,3.125,0.00000,0.781250,2.343750,3.125000,0.0,0.00000,0.00000,0.00000,0.78125,...,0.781250,3.125000,0.000000,1.562500,3.125,0.000000,2.343750,0.000000,2.343750,0.781250
4,0.000,0.00000,0.000000,0.475059,0.000000,0.0,0.23753,0.00000,0.23753,0.23753,...,21.615202,1.187648,0.000000,3.325416,0.000,0.000000,0.000000,0.237530,0.000000,1.900238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.000,0.00000,0.000000,0.000000,0.000000,0.0,0.00000,0.00000,0.00000,0.00000,...,21.348315,0.000000,0.000000,3.370787,0.000,1.123596,0.000000,0.000000,0.000000,1.123596
2996,0.000,0.00000,2.272727,0.000000,0.000000,0.0,0.00000,0.00000,0.00000,0.00000,...,2.272727,0.000000,0.000000,2.272727,0.000,0.000000,2.272727,0.000000,0.000000,0.000000
2997,0.000,0.00000,0.000000,0.000000,0.000000,0.0,0.00000,0.00000,0.00000,0.00000,...,21.782178,0.990099,0.000000,3.960396,0.000,0.000000,0.000000,0.000000,0.000000,2.970297
2998,0.000,1.41844,0.000000,0.000000,0.000000,0.0,0.00000,0.70922,0.00000,1.41844,...,21.276596,0.709220,0.000000,2.836879,0.000,0.000000,0.000000,0.000000,1.418440,0.000000


In [27]:
# Encode the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["algorithm"])
y

array([1, 1, 0, ..., 0, 0, 2], shape=(3000,))

In [28]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

In [29]:
# Initialize models
models = {
    "SVM": SVC(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

In [30]:
# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

SVM Accuracy: 0.8250
Naive Bayes Accuracy: 0.7883
Decision Tree Accuracy: 0.8167
Random Forest Accuracy: 0.8433


In [25]:
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred))

ValueError: feature_names must be string, and may not contain [, ] or <

In [12]:
# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(f"{name} Confusion Matrix:\n{cm}\n")


SVM Accuracy: 0.3133
SVM Confusion Matrix:
[[78 44 71]
 [92 40 77]
 [90 38 70]]

Naive Bayes Accuracy: 0.3133
Naive Bayes Confusion Matrix:
[[ 49  35 109]
 [ 65  39 105]
 [ 55  43 100]]

Decision Tree Accuracy: 0.0133
Decision Tree Confusion Matrix:
[[  6 146  41]
 [166   2  41]
 [173  25   0]]

Random Forest Accuracy: 0.0133
Random Forest Confusion Matrix:
[[  2  79 112]
 [105   2 102]
 [106  88   4]]

