### HOG

In [None]:
pip install scikit-image

In [None]:
import cv2
import numpy as np
from skimage.util import view_as_blocks
from sklearn.preprocessing import StandardScaler

In [None]:
# Gabor filter bank construction
def create_filters(scales, orientations):
    """Return a flat list of Gabor kernels, one per (scale, orientation) pair.

    Args:
        scales: inclusive ``(first, last)`` range of integer scales. Each
            scale is passed to OpenCV both as the kernel size and as the
            wavelength.
        orientations: number of equally spaced angles taken from [0, pi).

    Returns:
        List of float32 kernels ordered by scale, then by orientation.

    NOTE(review): the fifth positional argument of ``cv2.getGaborKernel`` is
    ``gamma`` (aspect ratio), not ``psi`` (phase offset). The ``0`` and
    ``0.5 * pi`` values below look like phase offsets -- confirm the intent.
    """
    first_scale, last_scale = scales[0], scales[1]
    bank = []
    for size in range(first_scale, last_scale + 1):
        # Every kernel of this scale is divided by the same constant
        norm = 2.0 * np.pi * size * size
        for theta in np.arange(0, np.pi, np.pi / orientations):
            part_a = cv2.getGaborKernel((size, size), 1, theta, size, 0, ktype=cv2.CV_32F)
            part_b = cv2.getGaborKernel((size, size), 1, theta, size, 0.5 * np.pi, ktype=cv2.CV_32F)
            kernel = part_a + part_b
            kernel /= norm
            bank.append(kernel)
    return bank

# Block-wise Gabor descriptor for a single-channel image
def hog_descriptor_single_channel(image, scales=(8, 8), orientations=8, blocks=(4, 4)):
    """Describe a single-channel image by mean Gabor responses on a tile grid.

    The image is zero-padded at the bottom/right when needed, cut into a
    ``blocks[0] x blocks[1]`` grid of equal tiles, and every tile is filtered
    with each kernel from ``create_filters``; the mean of each response is
    one feature.

    NOTE(review): despite the ``hog`` name, no gradient histogram is computed
    here -- the features come from a Gabor filter bank.

    Args:
        image: 2-D array (``image.shape[:2]`` is used for sizing).
        scales: inclusive ``(first, last)`` scale range forwarded to
            ``create_filters``.
        orientations: number of orientations forwarded to ``create_filters``.
        blocks: ``(rows, cols)`` of the tile grid.

    Returns:
        1-D array with one value per (tile, kernel) pair; tiles are visited
        in column-major grid order.
    """
    filters = create_filters(scales, orientations)

    height, width = image.shape[:2]
    # Round the tile size up so that the padding amounts are never negative
    # and the padded image is an exact multiple of the tile size.
    tile_h = (height + blocks[0] - 1) // blocks[0]
    tile_w = (width + blocks[1] - 1) // blocks[1]
    pad_bottom = blocks[0] * tile_h - height
    pad_right = blocks[1] * tile_w - width

    # Pad with zeros only when the image does not already fit the grid
    if pad_bottom or pad_right:
        image = cv2.copyMakeBorder(image, 0, pad_bottom, 0, pad_right, cv2.BORDER_CONSTANT, value=0)

    # Split into tiles and flatten the grid axes into one leading axis
    tiles = view_as_blocks(image, block_shape=(tile_h, tile_w)).reshape(-1, tile_h, tile_w, order='F')

    features = []
    for tile in tiles:
        # `filters` is a flat list of complete 2-D kernels, so it is iterated
        # directly; a nested loop would walk the individual kernel rows.
        features.append([cv2.filter2D(tile, cv2.CV_64F, kernel).mean() for kernel in filters])

    # Join the per-tile feature lists into a single descriptor vector
    return np.concatenate(features)


def hog_descriptor(image, scales=(8, 8), orientations=8, blocks=(4, 4)):
    """Compute the block descriptor of an image with or without a channel axis.

    A 3-D input is treated as BGR and converted to grayscale before the
    single-channel descriptor is computed; any other input is passed to
    ``hog_descriptor_single_channel`` unchanged.
    """
    if len(image.shape) != 3:
        # Already single-channel: compute the descriptor once, directly
        return hog_descriptor_single_channel(image, scales, orientations, blocks)

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    per_image = [hog_descriptor_single_channel(gray, scales, orientations, blocks)]
    # Only one descriptor is produced, so this yields it as a single vector
    return np.concatenate(per_image)

# Build one label per image file, class folder by class folder
def load_labels(dir, num_samples):
    """Return an array with one label per file found under each sub-folder.

    Sub-folders of ``dir`` are visited in sorted order and every file in a
    sub-folder contributes ``labels_dict[<sub-folder name>]``. Collection
    stops as soon as exactly ``num_samples`` labels have been gathered.

    Args:
        dir: root directory whose sub-directories are the classes.
        num_samples: number of labels at which to stop.

    Returns:
        ``np.ndarray`` of the collected labels.
    """
    collected = []
    for class_name in sorted(os.listdir(dir)):
        class_dir = os.path.join(dir, class_name)
        if os.path.isdir(class_dir):
            # Only the number of entries matters: each one adds the same label
            for _ in range(len(os.listdir(class_dir))):
                collected.append(labels_dict[class_name])
                if len(collected) == num_samples:
                    break
        if len(collected) == num_samples:
            break
    return np.array(collected)

In [None]:
# Compute a descriptor and collect the label for every sample in train_data
def get_hog_descriptors(train_data):
    """Return descriptors and labels for an iterable of samples.

    Args:
        train_data: iterable of ``(_, label, path)`` triples; only ``label``
            and ``path`` are used.

    Returns:
        ``(descriptors, labels)`` as numpy arrays, in input order.

    Raises:
        OSError: if an image cannot be read from its path.
    """
    descriptors = []
    labels = []
    for i, (_, label, path) in enumerate(train_data):
        image = cv2.imread(path)
        # cv2.imread reports failure by returning None instead of raising
        if image is None:
            raise OSError(f"Could not read image: {path}")
        descriptors.append(hog_descriptor(image))
        labels.append(label)
        # Progress report every 100 images
        if i % 100 == 99:
            print("\tProcessed", i + 1, "images")
    return np.array(descriptors), np.array(labels)

In [None]:
# Compute descriptors and labels for the training set, then report their count and shape
descriptors, train_labels = get_hog_descriptors(train_data)
print(len(descriptors))
print('HOG Descriptor Shape:', descriptors.shape)

In [None]:
# Use the training descriptors as the feature matrix
X = descriptors
hog_features = X

# Training features, plus descriptors and labels for add_data (kept as the validation split) and test_data
X_train = X
X_val , val_label= get_hog_descriptors(add_data)
X_test, test_label = get_hog_descriptors(test_data)

In [None]:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix


In [None]:
# Transform set 1: plain resize/normalise for "train", random augmentation for "test".
# NOTE(review): this is the reverse of image_transforms2 further down, which augments
# "train" and keeps "test" deterministic -- confirm that augmenting the test split is intended.
image_transforms1 = {
    "train": transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),        
        transforms.Normalize((0.37306938, 0.3730843, 0.3727943), (0.07064196, 0.0706386, 0.07048721))
    ]),
    "test": transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomRotation((-20, 20)),
        transforms.RandomGrayscale(p=0.8),
        transforms.RandomHorizontalFlip(p=0.8),
        transforms.RandomVerticalFlip(p=0.8),
        transforms.ToTensor(),    
        transforms.Normalize((0.37306938, 0.3730843, 0.3727943), (0.07064196, 0.0706386, 0.07048721))   
    ])
}

# Transform set 2: random rotation/grayscale/flip augmentation for "train",
# plain resize/normalise for "test". Both sets share the same Normalize constants.
image_transforms2 = {
    "train": transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomRotation((-20, 20)),
        transforms.RandomGrayscale(p=0.8),
        transforms.RandomHorizontalFlip(p=0.8),
        transforms.RandomVerticalFlip(p=0.8),
        transforms.ToTensor(),        
        transforms.Normalize((0.37306938, 0.3730843, 0.3727943), (0.07064196, 0.0706386, 0.07048721))
    ]),
    "test": transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.37306938, 0.3730843, 0.3727943), (0.07064196, 0.0706386, 0.07048721))
    ])
}

In [None]:
# Build train and test datasets for both transform sets.
# CustomDataset, malware_list and the train_path*/test_path* variables are not
# defined in this part of the notebook.
train_data1 = CustomDataset(root = train_path1, malware_list=malware_list,
                                  transform = image_transforms1['train'])
train_data2 = CustomDataset(root = train_path2, malware_list=malware_list,
                                  transform = image_transforms2['train'])

test_data1 = CustomDataset(root = test_path1, malware_list=malware_list,
                                  transform = image_transforms1['test'])
test_data2 = CustomDataset(root = test_path2, malware_list=malware_list,
                                  transform = image_transforms2['test'])

## Machine Learning result of HOG descriptor

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Metric name -> scoring callable; each one is later called as func(y_true, y_pred)
# with no extra arguments.
scores = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1': f1_score,
    'Confusion Matrix': confusion_matrix
}

# (name, estimator) pairs that are evaluated one by one and also passed to the
# voting ensemble. 'SMO' is an RBF-kernel SVC and 'J48' a DecisionTreeClassifier;
# some estimators are wrapped in a pipeline that standardises the features first.
models = [
    ('Random Forest', make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=100))),
    ('XGBoost', XGBClassifier(learning_rate=0.01, reg_lambda=0.1)),
    ('Linear SVM', make_pipeline(StandardScaler(), LinearSVC(penalty='l2', dual=False))),
    ('SMO', make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1.0, gamma='scale', probability=True))),
    ('J48', DecisionTreeClassifier())
]
# Heat-map rendering of a confusion matrix
def plot_confusion_matrix(cm, classes, model_name):
    """Show ``cm`` as a blue heat map with class names on both axes.

    Args:
        cm: confusion matrix to display.
        classes: sequence of class names used as tick labels.
        model_name: prefix for the figure title.
    """
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('{} Confusion Matrix'.format(model_name))
    plt.colorbar()

    # One tick per class on each axis
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()
    plt.show()

class_names = ['Malware', 'Benign']
# Hard-voting ensemble over the (name, estimator) pairs in `models`
ensemble_model = VotingClassifier(models, voting='hard')

# Fit each model on the training set and print every metric on the test set
for model_name, model_instance in models:
    print(model_name)
    model_instance.fit(X_train, train_labels)
    y_pred = model_instance.predict(X_test)
    for score_name, score_func in scores.items():
        print(score_name, score_func(test_label, y_pred))

# Fit the ensemble and keep its test-set predictions for later reporting
ensemble_model.fit(X_train, train_labels)
y_pred_ensemble = ensemble_model.predict(X_test)


## ensemble model result

In [None]:
# Report every metric for the ensemble predictions
print('Ensemble Model')
for score_name, score_func in scores.items():
    if score_name != 'Confusion Matrix':
        print(score_name, score_func(test_label, y_pred_ensemble))
        continue
    # The confusion matrix is printed under its own heading and also plotted
    cm = score_func(test_label, y_pred_ensemble)
    print(score_name)
    print(cm)
    plot_confusion_matrix(cm, class_names, 'Ensemble Model')

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

# Heat-map rendering of a confusion matrix with the count written in every cell
def plot_confusion_matrix(cm, classes, model_name):
    """Show ``cm`` as a blue heat map annotated with its integer counts.

    Args:
        cm: 2-D confusion matrix of integer counts (cells are formatted
            with ``'d'``).
        classes: sequence of class names used as tick labels.
        model_name: prefix for the figure title.
    """
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('{} Confusion Matrix'.format(model_name))
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)
    plt.xlabel('Predicted')
    plt.ylabel('True')

    # Cells above half of the maximum get white text, the others black
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            count = cm[row, col]
            plt.text(col, row, format(count, 'd'),
                     horizontalalignment="center",
                     color="white" if count > half_max else "black")

    plt.tight_layout()
    plt.show()

class_names = ['Malware', 'Benign']

# Report every metric for the ensemble predictions
print('Ensemble Model')
for score_name, score_func in scores.items():
    if score_name != 'Confusion Matrix':
        print(score_name, score_func(test_label, y_pred_ensemble))
        continue
    # The confusion matrix is printed under its own heading and also plotted
    cm = score_func(test_label, y_pred_ensemble)
    print(score_name)
    print(cm)
    plot_confusion_matrix(cm, class_names, 'Ensemble Model')


## validation accuracy, loss

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import matplotlib.pyplot as plt

# (name, estimator) pairs compared in the cross-validation run; this rebinds `models`.
# Here the estimators are not wrapped in pipelines and 'SMO' is a degree-3 polynomial SVC.
models = [
    ('XGBoost', xgb.XGBClassifier()),
    ('Linear SVM', LinearSVC()),
    ('SMO', SVC(kernel='poly', coef0=1.0, C=1.0, degree=3)),
    ('Random Forest', RandomForestClassifier(n_estimators=100)),
    ('J48', DecisionTreeClassifier(max_depth=6))
]

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

def run_cross_validation(X_train, train_labels, models, n_splits=5):
    """Evaluate each model with stratified k-fold cross-validation.

    For every fold the features are standardised with a scaler fitted on the
    fold's training part, each model is fitted on it and scored on the held
    out part.

    Args:
        X_train: feature matrix, indexable by integer index arrays (lists and
            tuples are converted to arrays).
        train_labels: labels aligned with ``X_train``; converted to an array.
        models: iterable of ``(name, estimator)`` pairs. The estimators are
            fitted in place.
        n_splits: number of folds.

    Returns:
        Dict mapping model name to ``{'Validation Accuracy': [...],
        'Validation Loss': [...]}`` with one entry per fold. The loss is the
        RMSE between true and predicted labels.
    """
    # Fold selection uses integer index arrays, which plain lists do not support
    if isinstance(X_train, (list, tuple)):
        X_train = np.asarray(X_train)
    train_labels = np.asarray(train_labels)

    scores = {model_name: {'Validation Accuracy': [], 'Validation Loss': []} for model_name, _ in models}

    cv = StratifiedKFold(n_splits=n_splits)
    for train_index, val_index in cv.split(X_train, train_labels):
        X_cv_train, X_cv_val = X_train[train_index], X_train[val_index]
        y_cv_train, y_cv_val = train_labels[train_index], train_labels[val_index]

        # The scaled fold data is identical for every model, so scale once per fold
        sc = StandardScaler()
        X_cv_train_std = sc.fit_transform(X_cv_train)
        X_cv_val_std = sc.transform(X_cv_val)

        for model_name, model in models:
            print(f"Processing {model_name}...")
            model.fit(X_cv_train_std, y_cv_train)
            y_val_pred = model.predict(X_cv_val_std)
            scores[model_name]['Validation Accuracy'].append(accuracy_score(y_cv_val, y_val_pred))
            scores[model_name]['Validation Loss'].append(root_mean_squared_error(y_cv_val, y_val_pred))

    return scores

# Number of folds, shared by the cross-validation call and the plots so the
# x-axis always matches the number of recorded scores.
n_splits = 5  # or any positive integer value
scores_per_model = run_cross_validation(X_train, train_labels, models, n_splits=n_splits)

# Plot validation accuracy per model (one point per cross-validation fold)
plt.figure(figsize=(10, 6))
plt.xlabel('Fold')
plt.ylabel('Validation Accuracy')
for model_name, model_scores in scores_per_model.items():
    plt.plot(range(1, n_splits + 1), model_scores['Validation Accuracy'], label=model_name)
plt.legend()
plt.show()

# Plot validation loss per model (one point per cross-validation fold)
plt.figure(figsize=(10, 6))
plt.xlabel('Fold')
plt.ylabel('Validation Loss')
for model_name, model_scores in scores_per_model.items():
    plt.plot(range(1, n_splits + 1), model_scores['Validation Loss'], label=model_name)
plt.legend()
plt.show()


### GIST

In [None]:
import cv2
import numpy as np
from skimage.util import view_as_blocks

In [None]:
# Gabor filter bank construction
def create_filters(scales, orientations):
    """Return a flat list of Gabor kernels, one per (scale, orientation) pair.

    Args:
        scales: inclusive ``(first, last)`` range of integer scales. Each
            scale is passed to OpenCV both as the kernel size and as the
            wavelength.
        orientations: number of equally spaced angles taken from [0, pi).

    Returns:
        List of float32 kernels ordered by scale, then by orientation.

    NOTE(review): the fifth positional argument of ``cv2.getGaborKernel`` is
    ``gamma`` (aspect ratio), not ``psi`` (phase offset). The ``0`` and
    ``0.5 * pi`` values below look like phase offsets -- confirm the intent.
    """
    first_scale, last_scale = scales[0], scales[1]
    bank = []
    for size in range(first_scale, last_scale + 1):
        # Every kernel of this scale is divided by the same constant
        norm = 2.0 * np.pi * size * size
        for theta in np.arange(0, np.pi, np.pi / orientations):
            part_a = cv2.getGaborKernel((size, size), 1, theta, size, 0, ktype=cv2.CV_32F)
            part_b = cv2.getGaborKernel((size, size), 1, theta, size, 0.5 * np.pi, ktype=cv2.CV_32F)
            kernel = part_a + part_b
            kernel /= norm
            bank.append(kernel)
    return bank

# GIST-style block descriptor for a single-channel image
def gist_descriptor_single_channel(image, scales=(8, 8), orientations=8, blocks=(4, 4)):
    """Describe a single-channel image by mean Gabor responses on a tile grid.

    The image is zero-padded at the bottom/right when needed, cut into a
    ``blocks[0] x blocks[1]`` grid of equal tiles, and every tile is filtered
    with each kernel from ``create_filters``; the mean of each response is
    one feature.

    Args:
        image: 2-D array (``image.shape[:2]`` is used for sizing).
        scales: inclusive ``(first, last)`` scale range forwarded to
            ``create_filters``.
        orientations: number of orientations forwarded to ``create_filters``.
        blocks: ``(rows, cols)`` of the tile grid.

    Returns:
        1-D array with one value per (tile, kernel) pair; tiles are visited
        in column-major grid order.
    """
    filters = create_filters(scales, orientations)

    height, width = image.shape[:2]
    # Round the tile size up so that the padding amounts are never negative
    # and the padded image is an exact multiple of the tile size.
    tile_h = (height + blocks[0] - 1) // blocks[0]
    tile_w = (width + blocks[1] - 1) // blocks[1]
    pad_bottom = blocks[0] * tile_h - height
    pad_right = blocks[1] * tile_w - width

    # Pad with zeros only when the image does not already fit the grid
    if pad_bottom or pad_right:
        image = cv2.copyMakeBorder(image, 0, pad_bottom, 0, pad_right, cv2.BORDER_CONSTANT, value=0)

    # Split into tiles and flatten the grid axes into one leading axis
    tiles = view_as_blocks(image, block_shape=(tile_h, tile_w)).reshape(-1, tile_h, tile_w, order='F')

    features = []
    for tile in tiles:
        # `filters` is a flat list of complete 2-D kernels, so it is iterated
        # directly; a nested loop would walk the individual kernel rows.
        features.append([cv2.filter2D(tile, cv2.CV_64F, kernel).mean() for kernel in filters])

    # Join the per-tile feature lists into a single descriptor vector
    return np.concatenate(features)


def gist_descriptor(image, scales=(8, 8), orientations=8, blocks=(4, 4)):
    """Compute the GIST descriptor of an image with or without a channel axis.

    A 3-D input is treated as BGR and converted to grayscale before the
    single-channel descriptor is computed; any other input is passed to
    ``gist_descriptor_single_channel`` unchanged.
    """
    if len(image.shape) != 3:
        # Already single-channel: compute the descriptor once, directly
        return gist_descriptor_single_channel(image, scales, orientations, blocks)

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    per_image = [gist_descriptor_single_channel(gray, scales, orientations, blocks)]
    # Only one descriptor is produced, so this yields it as a single vector
    return np.concatenate(per_image)

In [None]:
# Compute a GIST descriptor for every image in each class sub-folder of root_dir
def get_gist_descriptors(root_dir):
    """Return the GIST descriptors of all images below ``root_dir``.

    Sub-folders are visited in sorted order, and the files inside each one
    are also sorted so that the row order of the result is reproducible.

    Args:
        root_dir: directory whose sub-directories contain the images.

    Returns:
        ``np.ndarray`` with one descriptor per image.

    Raises:
        OSError: if a file in a sub-folder cannot be read as an image.
    """
    descriptors = []
    for subdir in sorted(os.listdir(root_dir)):
        subdir_path = os.path.join(root_dir, subdir)
        if not os.path.isdir(subdir_path):
            continue
        print("Processing directory:", subdir_path)
        for i, filename in enumerate(sorted(os.listdir(subdir_path))):
            filepath = os.path.join(subdir_path, filename)
            image = cv2.imread(filepath)
            # cv2.imread reports failure by returning None instead of raising
            if image is None:
                raise OSError(f"Could not read image: {filepath}")
            descriptors.append(gist_descriptor(image))

            # Progress report every 10 images
            if i % 10 == 9:
                print("\tProcessed", i + 1, "images")
    return np.array(descriptors)


In [None]:
def resize_image(image, width, height):
    """Return ``image`` resized to ``width`` x ``height`` with area interpolation."""
    target_size = (width, height)
    return cv2.resize(image, target_size, interpolation=cv2.INTER_AREA)

def get_GIST_descriptors(train_data, img_width=64, img_height=64):
    """Return descriptors and labels for an iterable of samples.

    Each image is read, converted to grayscale, resized to
    ``img_width`` x ``img_height`` and passed to ``gist``.

    NOTE(review): ``gist`` is not defined in the visible part of this
    notebook, and its keyword arguments match ``skimage.feature.hog`` --
    confirm which function is intended here.

    Args:
        train_data: iterable of ``(_, label, path)`` triples; only ``label``
            and ``path`` are used.
        img_width: width the image is resized to.
        img_height: height the image is resized to.

    Returns:
        ``(descriptors, labels)``: a numpy array of descriptors and a list of
        labels, both in input order.

    Raises:
        OSError: if an image cannot be read from its path.
    """
    descriptors = []
    train_labels = []

    for i, (_, label, path) in enumerate(train_data):
        img = cv2.imread(path)
        # cv2.imread reports failure by returning None instead of raising
        if img is None:
            raise OSError(f"Could not read image: {path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Bring every image to the same size so all descriptors have equal length
        img = resize_image(img, img_width, img_height)

        descriptors.append(gist(img, orientations=8, pixels_per_cell=(8, 8),
                                cells_per_block=(1, 1), block_norm='L2-Hys'))
        train_labels.append(label)

        # Progress report every 100 images
        if (i + 1) % 100 == 0:
            print(f"Processed {i + 1} images")

    return np.array(descriptors), train_labels

# Assumes 'train_data' yields (_, label, path) tuples.
# This rebinds the module-level train_labels to the list returned by the call.
h_descriptors, train_labels = get_GIST_descriptors(train_data)


In [None]:
# Use the training descriptors as the feature matrix
X = h_descriptors
gist_features = X

# Training features, plus descriptors and labels for add_data (kept as the validation split) and test_data
X_train = X
X_val , val_label= get_GIST_descriptors(add_data)
X_test, test_label = get_GIST_descriptors(test_data)

### Machine learning and model ensemble result of GIST

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Metric name -> scoring callable; each one is later called as func(y_true, y_pred)
# with no extra arguments.
scores = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1': f1_score,
    'Confusion Matrix': confusion_matrix
}

# (name, estimator) pairs that are evaluated one by one and also passed to the
# voting ensemble. 'SMO' is an RBF-kernel SVC and 'J48' a DecisionTreeClassifier;
# some estimators are wrapped in a pipeline that standardises the features first.
models = [
    ('Random Forest', make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=100))),
    ('XGBoost', XGBClassifier(learning_rate=0.01, reg_lambda=0.1)),
    ('Linear SVM', make_pipeline(StandardScaler(), LinearSVC(penalty='l2', dual=False))),
    ('SMO', make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1.0, gamma='scale', probability=True))),
    ('J48', DecisionTreeClassifier())
]
# Heat-map rendering of a confusion matrix
def plot_confusion_matrix(cm, classes, model_name):
    """Show ``cm`` as a blue heat map with class names on both axes.

    Args:
        cm: confusion matrix to display.
        classes: sequence of class names used as tick labels.
        model_name: prefix for the figure title.
    """
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('{} Confusion Matrix'.format(model_name))
    plt.colorbar()

    # One tick per class on each axis
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()
    plt.show()

class_names = ['Malware', 'Benign']
# Hard-voting ensemble over the (name, estimator) pairs in `models`
ensemble_model = VotingClassifier(models, voting='hard')

# Fit each model on the training set and print every metric on the test set
for model_name, model_instance in models:
    print(model_name)
    model_instance.fit(X_train, train_labels)
    y_pred = model_instance.predict(X_test)
    for score_name, score_func in scores.items():
        print(score_name, score_func(test_label, y_pred))

# Fit the ensemble and keep its test-set predictions for later reporting
ensemble_model.fit(X_train, train_labels)
y_pred_ensemble = ensemble_model.predict(X_test)


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

# Heat-map rendering of a confusion matrix with the count written in every cell
def plot_confusion_matrix(cm, classes, model_name):
    """Show ``cm`` as a blue heat map annotated with its integer counts.

    Args:
        cm: 2-D confusion matrix of integer counts (cells are formatted
            with ``'d'``).
        classes: sequence of class names used as tick labels.
        model_name: prefix for the figure title.
    """
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('{} Confusion Matrix'.format(model_name))
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)
    plt.xlabel('Predicted')
    plt.ylabel('True')

    # Cells above half of the maximum get white text, the others black
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            count = cm[row, col]
            plt.text(col, row, format(count, 'd'),
                     horizontalalignment="center",
                     color="white" if count > half_max else "black")

    plt.tight_layout()
    plt.show()

class_names = ['Malware', 'Benign']

# Report every metric for the ensemble predictions
print('Ensemble Model')
for score_name, score_func in scores.items():
    if score_name != 'Confusion Matrix':
        print(score_name, score_func(test_label, y_pred_ensemble))
        continue
    # The confusion matrix is printed under its own heading and also plotted
    cm = score_func(test_label, y_pred_ensemble)
    print(score_name)
    print(cm)
    plot_confusion_matrix(cm, class_names, 'Ensemble Model')


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import matplotlib.pyplot as plt

# (name, estimator) pairs compared in the cross-validation run; this rebinds `models`.
# Here the estimators are not wrapped in pipelines and 'SMO' is a degree-3 polynomial SVC.
models = [
    ('XGBoost', xgb.XGBClassifier()),
    ('Linear SVM', LinearSVC()),
    ('SMO', SVC(kernel='poly', coef0=1.0, C=1.0, degree=3)),
    ('Random Forest', RandomForestClassifier(n_estimators=100)),
    ('J48', DecisionTreeClassifier(max_depth=6))
]

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

def run_cross_validation(X_train, train_labels, models, n_splits=5):
    """Evaluate each model with stratified k-fold cross-validation.

    For every fold the features are standardised with a scaler fitted on the
    fold's training part, each model is fitted on it and scored on the held
    out part.

    Args:
        X_train: feature matrix, indexable by integer index arrays (lists and
            tuples are converted to arrays).
        train_labels: labels aligned with ``X_train``; converted to an array.
        models: iterable of ``(name, estimator)`` pairs. The estimators are
            fitted in place.
        n_splits: number of folds.

    Returns:
        Dict mapping model name to ``{'Validation Accuracy': [...],
        'Validation Loss': [...]}`` with one entry per fold. The loss is the
        RMSE between true and predicted labels.
    """
    # Fold selection uses integer index arrays, which plain lists do not
    # support; convert once up front instead of on every fold.
    if isinstance(X_train, (list, tuple)):
        X_train = np.asarray(X_train)
    train_labels = np.asarray(train_labels)

    scores = {model_name: {'Validation Accuracy': [], 'Validation Loss': []} for model_name, _ in models}

    cv = StratifiedKFold(n_splits=n_splits)
    for train_index, val_index in cv.split(X_train, train_labels):
        X_cv_train, X_cv_val = X_train[train_index], X_train[val_index]
        y_cv_train, y_cv_val = train_labels[train_index], train_labels[val_index]

        # The scaled fold data is identical for every model, so scale once per fold
        sc = StandardScaler()
        X_cv_train_std = sc.fit_transform(X_cv_train)
        X_cv_val_std = sc.transform(X_cv_val)

        for model_name, model in models:
            print(f"Processing {model_name}...")
            model.fit(X_cv_train_std, y_cv_train)
            y_val_pred = model.predict(X_cv_val_std)
            scores[model_name]['Validation Accuracy'].append(accuracy_score(y_cv_val, y_val_pred))
            scores[model_name]['Validation Loss'].append(root_mean_squared_error(y_cv_val, y_val_pred))

    return scores

# Number of folds, shared by the cross-validation call and the plots so the
# x-axis always matches the number of recorded scores.
n_splits = 5  # or any positive integer value
scores_per_model = run_cross_validation(X_train, train_labels, models, n_splits=n_splits)

# Plot validation accuracy per model (one point per cross-validation fold)
plt.figure(figsize=(10, 6))
plt.xlabel('Fold')
plt.ylabel('Validation Accuracy')
for model_name, model_scores in scores_per_model.items():
    plt.plot(range(1, n_splits + 1), model_scores['Validation Accuracy'], label=model_name)
plt.legend()
plt.show()

# Plot validation loss per model (one point per cross-validation fold)
plt.figure(figsize=(10, 6))
plt.xlabel('Fold')
plt.ylabel('Validation Loss')
for model_name, model_scores in scores_per_model.items():
    plt.plot(range(1, n_splits + 1), model_scores['Validation Loss'], label=model_name)
plt.legend()
plt.show()


### SIFT

In [None]:
# Container for SIFT features and a module-level SIFT detector
sift_features = []
# Use the constructor on the main cv2 module, the same one get_sift_descriptors
# calls; cv2.xfeatures2d is only present in opencv-contrib builds.
sift = cv2.SIFT_create()

In [None]:
import cv2
import numpy as np

def get_sift_descriptors(train_data, max_keypoints=None):
    """Return zero-padded SIFT descriptors and labels for an iterable of samples.

    Args:
        train_data: iterable of ``(_, label, path)`` triples; only ``label``
            and ``path`` are used.
        max_keypoints: number of keypoint rows kept per image. ``None``
            (default) uses the largest keypoint count found in this call.
            Pass the training set's value when processing validation or test
            data so that all arrays share one shape; images with more
            keypoints are truncated.

    Returns:
        ``(descriptors, labels)``: an array of shape
        ``(n_images, max_keypoints, 128)`` and a list of labels. Images in
        which no feature is detected are skipped in both.

    Raises:
        OSError: if an image cannot be read from its path.
    """
    sift = cv2.SIFT_create()
    sift_descriptors = []
    train_labels = []

    for i, (_, label, path) in enumerate(train_data):
        img = cv2.imread(path)
        # cv2.imread reports failure by returning None instead of raising
        if img is None:
            raise OSError(f"Could not read image: {path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        _, descriptor = sift.detectAndCompute(img, None)

        # If no feature is detected, skip this image
        if descriptor is None:
            continue

        sift_descriptors.append(descriptor)
        train_labels.append(label)

        if (i + 1) % 100 == 0:
            print(f"Processed {i + 1} images")

    if max_keypoints is None:
        # default=0 keeps an empty data set from raising inside max()
        max_keypoints = max((desc.shape[0] for desc in sift_descriptors), default=0)

    # Zero-pad (or truncate) every image to the same number of keypoint rows
    s_descriptors = np.zeros((len(sift_descriptors), max_keypoints, 128))
    for i, desc in enumerate(sift_descriptors):
        kept = min(desc.shape[0], max_keypoints)
        s_descriptors[i, :kept, :] = desc[:kept]

    return s_descriptors, train_labels

# Assumes 'train_data' yields (_, label, path) tuples.
# This rebinds the module-level train_labels to the list returned by the call.
s_descriptors, train_labels = get_sift_descriptors(train_data)
print(len(s_descriptors))
print('SIFT Descriptor Shape:', s_descriptors.shape)


In [None]:
# Use the training SIFT descriptors as the feature array
X = s_descriptors

# Training features, plus SIFT descriptors and labels for add_data (kept as the validation split) and test_data.
# NOTE: each call pads to the largest keypoint count of its own data, so the
# second dimension of these arrays can differ between the three sets.
X_train = X
X_val , val_label= get_sift_descriptors(add_data)
X_test, test_label = get_sift_descriptors(test_data)

In [None]:
from sklearn.preprocessing import StandardScaler

# Assuming s_descriptors is the 3D SIFT descriptors array with shape (num_samples, num_keypoints, num_features)

# Step 1: Flatten SIFT descriptors from 3D to 2D (one feature vector per image)
s_descriptors_flat = s_descriptors.reshape(s_descriptors.shape[0], -1)

# Step 2: Apply StandardScaler to the flattened 2D array
scaler = StandardScaler()
X_train = scaler.fit_transform(s_descriptors_flat)

# Step 3: Give the test set the same layout as the training set. It was padded
# to its own keypoint count, so pad/truncate it to the training count before
# flattening, then reuse the scaler fitted on the training data.
if X_test.ndim == 3:  # skipped when the cell is re-run on already flattened data
    n_keypoints = min(X_test.shape[1], s_descriptors.shape[1])
    X_test_aligned = np.zeros((X_test.shape[0],) + s_descriptors.shape[1:])
    X_test_aligned[:, :n_keypoints, :] = X_test[:, :n_keypoints, :]
    X_test = scaler.transform(X_test_aligned.reshape(X_test_aligned.shape[0], -1))

# No model is fitted here: the evaluation cell defines the models and fits each
# of them on X_train, so fitting a leftover `model_instance` is not needed.


### Machine Learning result and model ensemble of SIFT

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Metric name -> scoring callable; each one is later called as func(y_true, y_pred)
# with no extra arguments.
scores = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1': f1_score,
    'Confusion Matrix': confusion_matrix
}

# (name, estimator) pairs that are evaluated one by one and also passed to the
# voting ensemble. 'SMO' is an RBF-kernel SVC and 'J48' a DecisionTreeClassifier;
# some estimators are wrapped in a pipeline that standardises the features first.
models = [
    ('Random Forest', make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=100))),
    ('XGBoost', XGBClassifier(learning_rate=0.01, reg_lambda=0.1)),
    ('Linear SVM', make_pipeline(StandardScaler(), LinearSVC(penalty='l2', dual=False))),
    ('SMO', make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1.0, gamma='scale', probability=True))),
    ('J48', DecisionTreeClassifier())
]
# Heat-map rendering of a confusion matrix
def plot_confusion_matrix(cm, classes, model_name):
    """Show ``cm`` as a blue heat map with class names on both axes.

    Args:
        cm: confusion matrix to display.
        classes: sequence of class names used as tick labels.
        model_name: prefix for the figure title.
    """
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('{} Confusion Matrix'.format(model_name))
    plt.colorbar()

    # One tick per class on each axis
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()
    plt.show()

class_names = ['Malware', 'Benign']
# Hard-voting ensemble over the (name, estimator) pairs in `models`
ensemble_model = VotingClassifier(models, voting='hard')

# Fit each model on the training set and print every metric on the test set
for model_name, model_instance in models:
    print(model_name)
    model_instance.fit(X_train, train_labels)
    y_pred = model_instance.predict(X_test)
    for score_name, score_func in scores.items():
        print(score_name, score_func(test_label, y_pred))

# Fit the ensemble and keep its test-set predictions for later reporting
ensemble_model.fit(X_train, train_labels)
y_pred_ensemble = ensemble_model.predict(X_test)
