In [None]:
# Step 1: Mount Google Drive at /content/drive so the dataset path configured
# below (which lives under /content/drive/MyDrive) can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Import necessary libraries
import os
import numpy as np
from PIL import Image
import cv2
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
from collections import Counter

# Step 3: Pipeline configuration.
# IMAGE_SIZE  - size passed to PIL's resize() for every image before feature extraction.
# DATASET_PATH - root folder that holds one sub-directory per class.
IMAGE_SIZE = (128, 128)
DATASET_PATH = '/content/drive/MyDrive/(Copy)malimg_dataset/malimg_paper_dataset_imgs'

# Step 4: Load and preprocess images from dataset
def load_images(path):
    """Load every image under ``path`` as a resized grayscale array.

    ``path`` must contain one sub-directory per class; each sub-directory
    name becomes a class label. Files that cannot be opened or decoded are
    reported on stdout and skipped.

    Args:
        path: Root directory of the dataset.

    Returns:
        A tuple ``(X, y, label_dict)`` where ``X`` is an array of the resized
        grayscale images, ``y`` the matching integer class ids, and
        ``label_dict`` maps each class name to its integer id.
    """
    X, y = [], []
    # Sort the class folders: os.listdir() order is filesystem-dependent, and
    # the label -> id mapping must be identical from run to run.
    labels = sorted(
        f for f in os.listdir(path) if os.path.isdir(os.path.join(path, f))
    )
    label_dict = {label: idx for idx, label in enumerate(labels)}

    for label in labels:
        folder = os.path.join(path, label)
        # Sorted file order keeps the sample order (and therefore any seeded
        # train/test split) reproducible.
        for img_name in tqdm(sorted(os.listdir(folder)), desc=f"Loading {label}"):
            img_path = os.path.join(folder, img_name)
            try:
                # The context manager closes the underlying file handle as
                # soon as the pixels have been copied into the numpy array.
                with Image.open(img_path) as img:
                    img_array = np.array(img.convert('L').resize(IMAGE_SIZE))
            except Exception as e:
                # Best-effort loading: one unreadable file must not abort
                # the whole dataset load.
                print(f"❌ Error loading image {img_path}: {e}")
                continue
            X.append(img_array)
            y.append(label_dict[label])

    return np.array(X), np.array(y), label_dict

# Read the whole dataset into memory and report how much was loaded
X, y, label_dict = load_images(DATASET_PATH)
n_images, n_classes = len(X), len(label_dict)
print("✅ Loaded", n_images, "images across", n_classes, "classes")

# Step 5: Feature extraction using SVD
def extract_svd_features(img, k=40):
    """Return the ``k`` largest singular values of a 2-D image array.

    Args:
        img: 2-D array-like of pixel values.
        k: Maximum number of singular values to keep (default 40).

    Returns:
        1-D array of singular values in descending order. Its length is
        ``min(k, min(img.shape))``, so an image with fewer than ``k`` rows
        or columns yields a shorter vector.
    """
    # Only the singular values are used, so skip computing U and V^T.
    singular_values = np.linalg.svd(img, compute_uv=False)
    return singular_values[:k]

# Step 6: Extract SVD features (leading singular values of every image)
X_svd = np.array([extract_svd_features(img) for img in X])

# Step 7: Keep only classes with at least 2 samples; a stratified split
# cannot place a single-sample class on both sides.
class_counts = Counter(y)
valid_classes = sorted(cls for cls, count in class_counts.items() if count >= 2)
filtered_indices = np.flatnonzero(np.isin(y, valid_classes))
y_filtered = y[filtered_indices]

# Step 8: Split into train and test sets BEFORE fitting PCA. Splitting the
# row indices (rather than the features) lets PCA be fitted on the training
# rows only, so no information from the test set leaks into the projection.
train_idx, test_idx = train_test_split(
    filtered_indices, test_size=0.3, stratify=y_filtered, random_state=42
)

# Step 9: Fit PCA (40 components) on the training rows, then project all rows
pca = PCA(n_components=40)
pca.fit(X_svd[train_idx])
X_pca = pca.transform(X_svd)
X_pca_filtered = X_pca[filtered_indices]
X_train, X_test = X_pca[train_idx], X_pca[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Step 10: Train LDA classifier
lda = LDA()
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)

# Step 11: Evaluate performance
# Pass the label ids explicitly so each row of the report (and of the
# confusion matrix) is tied to the right class name even when a class does
# not appear in y_test or y_pred.
idx_to_name = {idx: name for name, idx in label_dict.items()}
target_names = [idx_to_name[cls] for cls in valid_classes]
print("Classification Report:\n", classification_report(
    y_test,
    y_pred,
    labels=valid_classes,
    target_names=target_names,
    zero_division=0,  # report 0 without a warning for classes never predicted
))

# Expand console output for confusion matrix
np.set_printoptions(linewidth=200)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=valid_classes))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Loading Adialer.C: 100%|██████████| 122/122 [00:01<00:00, 104.82it/s]
Loading Agent.FYI: 100%|██████████| 116/116 [00:00<00:00, 184.36it/s]
Loading Allaple.A: 100%|██████████| 2949/2949 [00:42<00:00, 69.64it/s] 
Loading Allaple.L: 100%|██████████| 1064/1064 [00:12<00:00, 84.38it/s] 
Loading Alueron.gen!J: 100%|██████████| 198/198 [00:01<00:00, 154.98it/s]
Loading Autorun.K: 100%|██████████| 106/106 [00:01<00:00, 55.80it/s]
Loading C2LOP.gen!g: 100%|██████████| 200/200 [00:03<00:00, 65.41it/s]
Loading C2LOP.P: 100%|██████████| 146/146 [00:01<00:00, 91.90it/s] 
Loading Dialplatform.B: 100%|██████████| 177/177 [00:01<00:00, 176.18it/s]
Loading Dontovo.A: 100%|██████████| 162/162 [00:00<00:00, 180.76it/s]
Loading Fakerean: 100%|██████████| 157/157 [00:01<00:00, 143.37it/s]
Loading Instantaccess: 100%|██████████| 431/431 [00:03<00:00, 118.17it/s]
Loading Lolyda.AA1: 100%|██████████| 213/213 [00:01<00:00, 146.19it/s]
Loading Lolyda.AA2: 100%|██████████| 184/184 [00:01<00:00, 160.38it/s]
Load

✅ Loaded 6712 images across 25 classes
Classification Report:
                 precision    recall  f1-score   support

     Adialer.C       0.95      1.00      0.97        37
     Agent.FYI       0.97      0.80      0.88        35
     Allaple.A       0.99      0.99      0.99       885
     Allaple.L       1.00      1.00      1.00       319
 Alueron.gen!J       0.78      0.88      0.83        60
     Autorun.K       0.86      1.00      0.93        32
   C2LOP.gen!g       0.83      0.95      0.88        60
       C2LOP.P       0.83      0.68      0.75        44
Dialplatform.B       1.00      1.00      1.00        53
     Dontovo.A       1.00      1.00      1.00        49
      Fakerean       1.00      1.00      1.00        47
 Instantaccess       0.97      1.00      0.98       129
    Lolyda.AA1       0.87      0.95      0.91        64
    Lolyda.AA2       1.00      0.96      0.98        55
    Lolyda.AA3       1.00      0.97      0.99        37
     Lolyda.AT       0.92      1.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
