In [1]:
# %%
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications import MobileNet
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import matthews_corrcoef
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
2024-04-29 19:24:51.953542: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-29 19:24:52.086290: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-29 19:24:52.086356: E external/local_xla/xla/stream_executor/cuda/c

In [2]:
# Hide every CUDA device from TensorFlow so the notebook runs on the CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# The three dataset splits live side by side under one root directory.
dataset_root = '/home/btech/2021/abhishek.kumar21b/malware_detection/malimg_dataset'
train_dir, test_dir, val_dir = (
    f'{dataset_root}/{split_name}' for split_name in ('train', 'test', 'val')
)

# Mini-batch size used for training and feature extraction.
batch_size = 32

In [3]:
# %%
# Image shape as (height, width, channels); used both when loading images from
# disk and as the input shape of the MobileNet backbone.
image_size = (224, 224, 3)

# %%
# Function to load images
def load_images(directory):
    """Load every image below ``directory`` into memory with integer class labels.

    ``directory`` is expected to contain one sub-directory per class. Entries
    that are not directories are ignored and do not consume a label index.

    Class sub-directories are sorted by name before labels are assigned.
    ``os.listdir`` returns entries in arbitrary order, so without sorting the
    same class could receive a different label in the train, validation and
    test directories.

    Parameters
    ----------
    directory : str
        Path of the split directory (one sub-directory per class).

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``images`` holds the resized images, one per file, and ``labels`` holds
        the index of each image's class in the sorted list of class names.
        Both arrays are empty when no image files are found.
    """
    class_names = sorted(
        entry
        for entry in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, entry))
    )

    images = []
    labels = []
    for label, class_name in enumerate(class_names):
        class_dir = os.path.join(directory, class_name)
        # Sort file names too so the sample order is reproducible between runs.
        for filename in sorted(os.listdir(class_dir)):
            img_path = os.path.join(class_dir, filename)
            # load_img expects (height, width); drop the channel dimension.
            img = image.load_img(img_path, target_size=image_size[:2])
            images.append(image.img_to_array(img))
            labels.append(label)
    return np.array(images), np.array(labels)

In [4]:
# Load each split from disk and rescale its pixel values from [0, 255] to [0, 1].

# Training split
train_images, train_labels = load_images(train_dir)
train_images = train_images.astype('float32') / 255.0

# Test split
test_images, test_labels = load_images(test_dir)
test_images = test_images.astype('float32') / 255.0

# Validation split
val_images, val_labels = load_images(val_dir)
val_images = val_images.astype('float32') / 255.0

In [5]:
# Data augmentation: random rotations, shifts, horizontal flips and zooms are
# applied on the fly to every training batch.
train_datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    zoom_range=0.2
)
# No train_datagen.fit(...) call is needed: fitting only computes dataset
# statistics for featurewise_center, featurewise_std_normalization and
# zca_whitening, none of which are enabled here. Calling it would just copy the
# whole training array in memory without changing the generator.


In [6]:
# ImageNet-pretrained MobileNet backbone without its classification head.
base_model = MobileNet(input_shape=image_size, include_top=False, weights='imagenet')

# Freeze everything except the last five layers, which stay trainable.
frozen_layers = base_model.layers[:-5]
for layer in frozen_layers:
    layer.trainable = False

2024-04-29 19:25:50.889254: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-04-29 19:25:50.889339: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:129] retrieving CUDA diagnostic information for host: AI-iiitg
2024-04-29 19:25:50.889348: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:136] hostname: AI-iiitg
2024-04-29 19:25:50.889578: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:159] libcuda reported version is: 545.23.8
2024-04-29 19:25:50.889621: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:163] kernel reported version is: 545.23.8
2024-04-29 19:25:50.889627: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:241] kernel version seems to match DSO: 545.23.8


In [7]:
# Classification head stacked on the backbone: global average pooling, an
# L2-regularised 1024-unit ReLU layer and 50% dropout.
head_layers = (
    GlobalAveragePooling2D(),
    Dense(1024, activation='relu', kernel_regularizer='l2'),
    Dropout(0.5),
)
x = base_model.output
for head_layer in head_layers:
    x = head_layer(x)

# 25-way softmax output, one unit per class.
predictions = Dense(25, activation='softmax')(x)

In [8]:
# Full network: MobileNet backbone plus the classification head.
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model with AdamW optimizer. Labels are integer class indices,
# hence the sparse categorical cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer=AdamW(learning_rate=1e-4), metrics=['accuracy'])

# Early stopping callback: stop after 5 epochs without val_loss improvement.
# restore_best_weights=True rolls the model back to the best epoch; without it
# the model keeps the weights of the last (worse) epoch, and those weights are
# what the later feature extraction would use.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [9]:
# Stream augmented mini-batches of the training set.
train_batches = train_datagen.flow(train_images, train_labels, batch_size=batch_size)

# Train for up to 50 epochs; early stopping watches the validation set.
fit_options = dict(
    epochs=50,  # Adjust as needed
    validation_data=(val_images, val_labels),
    callbacks=[early_stopping],
)
history = model.fit(train_batches, **fit_options)


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 conv1 (Conv2D)              (None, 112, 112, 32)      864       
                                                                 
 conv1_bn (BatchNormalizati  (None, 112, 112, 32)      128       
 on)                                                             
                                                                 
 conv1_relu (ReLU)           (None, 112, 112, 32)      0         
                                                                 
 conv_dw_1 (DepthwiseConv2D  (None, 112, 112, 32)      288       
 )                                                               
                                                                 
 conv_dw_1_bn (BatchNormali  (None, 112, 112, 32)      128   

In [10]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50

In [None]:
# Train the model (adjust epochs and batch size as needed)
# NOTE(review): this is a second training pass on the same `model`, using the
# un-augmented arrays and no early-stopping callback, and it overwrites
# `history`. It looks like a leftover from before augmentation was added --
# confirm whether this cell should still run.
history = model.fit(train_images, train_labels, epochs=10, batch_size=32, validation_data=(val_images, val_labels))

In [None]:
# Extract features using MobileNetV1
def extract_features(model, images, batch_size):
    """Run ``model.predict`` over ``images`` in consecutive batches.

    Parameters
    ----------
    model : object
        Any object exposing ``predict(batch)`` that returns an array whose
        first axis matches the batch.
    images : numpy.ndarray
        Input samples stacked along the first axis.
    batch_size : int
        Maximum number of samples per ``predict`` call; must be positive.

    Returns
    -------
    numpy.ndarray
        Per-batch outputs concatenated along the first axis, in input order.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or ``images`` is empty.
    """
    # Validate up front: a bad batch size or an empty input would otherwise
    # surface as an unrelated error from range() or np.concatenate().
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    num_images = images.shape[0]
    if num_images == 0:
        raise ValueError("images is empty; there is nothing to extract features from")

    # Slicing past the end of the array is safe, so the final (possibly
    # shorter) batch needs no special handling.
    features = [
        model.predict(images[start_idx:start_idx + batch_size])
        for start_idx in range(0, num_images, batch_size)
    ]
    return np.concatenate(features)

In [None]:
# Build a feature extractor that stops at the penultimate layer of the trained
# model, i.e. just before the softmax classification layer.
penultimate_output = model.layers[-2].output
feature_extractor = Model(inputs=model.input, outputs=penultimate_output)

batch_size = 32

# Run the train and test images through the extractor in batches.
train_features, test_features = (
    extract_features(feature_extractor, split_images, batch_size)
    for split_images in (train_images, test_images)
)

print("Shape of extracted train features:", train_features.shape)
print("Shape of extracted test features:", test_features.shape)

In [None]:
# %%
# Define the parameter grid for SVM.
# gamma is only used by the 'rbf', 'poly' and 'sigmoid' kernels. The linear
# kernel ignores it, so it gets its own sub-grid without gamma; a single flat
# grid would cross-validate the same linear model once per gamma value.
svm_c_values = [0.1, 1, 10, 100, 1000]
svm_gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]
svm_param_grid = [
    {'C': svm_c_values, 'gamma': svm_gamma_values, 'kernel': ['rbf', 'poly', 'sigmoid']},
    {'C': svm_c_values, 'kernel': ['linear']},
]

# Create SVM classifier
svm_classifier = SVC()

# Use GridSearchCV (3-fold, accuracy, all CPU cores) to find the best parameters for SVM
svm_grid_search = GridSearchCV(svm_classifier, svm_param_grid, cv=3, scoring='accuracy', n_jobs=-1)
svm_grid_search.fit(train_features, train_labels)

# Get the best parameters for SVM
best_svm_params = svm_grid_search.best_params_

# Train the final SVM model with the best parameters, timing only this fit
final_svm_classifier = SVC(**best_svm_params)
start_time_svm = time.time()
final_svm_classifier.fit(train_features, train_labels)
train_time_svm = time.time() - start_time_svm

# Predict labels for the test set using SVM
svm_predictions = final_svm_classifier.predict(test_features)

# Evaluate accuracy for SVM
svm_accuracy = accuracy_score(test_labels, svm_predictions)
print("\nBest SVM Parameters:", best_svm_params)
print("SVM Accuracy:", svm_accuracy)

In [None]:
# %%
# Support-weighted precision, recall and F1 of the SVM on the test set
precision_svm, recall_svm, f1_svm = (
    scorer(test_labels, svm_predictions, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Wall-clock time of the final fit, and the mean CV fit time of the winning
# grid-search candidate
print("Train Time (sec) SVM:", train_time_svm)
best_candidate = svm_grid_search.best_index_
grid_train_time_svm = svm_grid_search.cv_results_['mean_fit_time'][best_candidate]
print("Train Time (sec) Grid SVM:", grid_train_time_svm)

# Matthews Correlation Coefficient (MCC) of the test predictions
mcc_svm = matthews_corrcoef(test_labels, svm_predictions)
print("Matthews Correlation Coefficient (MCC) SVM:", mcc_svm)

# Best mean cross-validated accuracy found by GridSearchCV
optimization_accuracy_svm = svm_grid_search.best_score_
print("Optimization Algorithm Accuracy SVM:", optimization_accuracy_svm)

In [None]:
# %%
# Confusion matrix for SVM (rows: true class, columns: predicted class)
conf_matrix_svm = confusion_matrix(test_labels, svm_predictions)
precision_svm = precision_score(test_labels, svm_predictions, average='weighted')

# Per-class one-vs-rest counts derived from the confusion matrix.
TP_svm = np.diag(conf_matrix_svm)                 # predicted as the class and truly the class
FP_svm = conf_matrix_svm.sum(axis=0) - TP_svm     # predicted as the class but truly another
FN_svm = conf_matrix_svm.sum(axis=1) - TP_svm     # truly the class but predicted as another
# True negatives are every remaining sample. Using the grand total alone would
# count TP, FP and FN as negatives and inflate TNR while deflating FPR and FOR.
TN_svm = conf_matrix_svm.sum() - (TP_svm + FP_svm + FN_svm)

# %%
# Display results for SVM
print("\nResults for SVM:")
print("Precision (SVM):", precision_svm)
print("Recall (SVM):", recall_svm)
print("F1 Score (SVM):", f1_svm)

# Each rate is computed per class as numerator / denominator and then averaged
# over the classes (macro average).
rate_terms = (
    ("True Positive Rate (TPR) (SVM):", TP_svm, TP_svm + FN_svm),
    ("True Negative Rate (TNR) (SVM):", TN_svm, TN_svm + FP_svm),
    ("False Positive Rate (FPR) (SVM):", FP_svm, FP_svm + TN_svm),
    ("False Negative Rate (FNR) (SVM):", FN_svm, TP_svm + FN_svm),
    ("False Discovery Rate (FDR) (SVM):", FP_svm, FP_svm + TP_svm),
    ("False Omission Rate (FOR) (SVM):", FN_svm, FN_svm + TN_svm),
)
for caption, numerator, denominator in rate_terms:
    print(caption, np.mean(numerator / denominator))

print("Matthews Correlation Coefficient (MCC) (SVM):", mcc_svm)

In [None]:
# Plot per-class TPR, TNR, FPR, FNR for the SVM
# The rates are derived here directly from the confusion matrix so that the
# bars show actual rates (not raw counts) and match the legend labels.
tp_counts = np.diag(conf_matrix_svm).astype(float)
fp_counts = conf_matrix_svm.sum(axis=0) - tp_counts
fn_counts = conf_matrix_svm.sum(axis=1) - tp_counts
tn_counts = conf_matrix_svm.sum() - (tp_counts + fp_counts + fn_counts)

tpr_per_class = tp_counts / (tp_counts + fn_counts)
tnr_per_class = tn_counts / (tn_counts + fp_counts)
fpr_per_class = fp_counts / (fp_counts + tn_counts)
fnr_per_class = fn_counts / (tp_counts + fn_counts)

fig, ax = plt.subplots(figsize=(15, 10))

bar_width = 0.2
# One group of four bars per class; the class count follows the confusion
# matrix instead of being hard-coded.
index = np.arange(len(tp_counts))

bar1 = ax.bar(index, tpr_per_class, bar_width, label='TPR')
bar2 = ax.bar(index + bar_width, tnr_per_class, bar_width, label='TNR')
bar3 = ax.bar(index + 2 * bar_width, fpr_per_class, bar_width, label='FPR')
bar4 = ax.bar(index + 3 * bar_width, fnr_per_class, bar_width, label='FNR')

ax.set_xlabel('Class')
ax.set_ylabel('Scores')
ax.set_title('Comparison of TPR, TNR, FPR, FNR for SVM')
# Centre each tick under its group of four bars and label it with the class index.
ax.set_xticks(index + 1.5 * bar_width)
ax.set_xticklabels(index)
ax.legend()

plt.show()

In [None]:
# %%
# Plot Confusion Matrix for SVM
fig, ax = plt.subplots(figsize=(20, 20))
sns.set(font_scale=1.2)  # Adjust font size for better readability
# confusion_matrix builds its rows/columns from the union of true and predicted
# labels, so the tick labels must come from the same union; using only
# test_labels would mismatch if the SVM predicted a class absent from the test set.
class_labels = np.unique(list(test_labels) + list(svm_predictions))
disp = ConfusionMatrixDisplay(conf_matrix_svm, display_labels=class_labels)
disp.plot(cmap='Blues', ax=ax)
# Set the title on the explicit axes rather than on pyplot's "current" axes.
ax.set_title('Confusion Matrix for SVM')
plt.show()
