In [119]:
import pandas as pd
import numpy as np
import os
import cv2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# PE, Ngram 데이터 로드
malware_pe = pd.read_csv('malware_pe.csv', on_bad_lines='skip')
normal_pe = pd.read_csv('normal_pe.csv' ,on_bad_lines='skip')
ngram = pd.read_csv('ngram.csv')

# 데이터 결합 (예: 악성 코드와 정상 코드 결합)
X_pe = pd.concat([malware_pe, normal_pe], axis=0)
y_pe = np.concatenate([np.ones(len(malware_pe)), np.zeros(len(normal_pe))])

# Ngram 데이터 결합
X_ngram = ngram.values
y_ngram = np.concatenate([np.ones(len(malware_pe)), np.zeros(len(normal_pe))])

# 이미지 데이터 로드 (이미지 크기를 64x64로 리사이즈)
def load_images(image_dir, label):
    images = []
    labels = []
    for img_name in os.listdir(image_dir):
        img_path = os.path.join(image_dir, img_name)
        img = cv2.imread(img_path)
        img = cv2.resize(img, (64, 64))  # 64x64 크기로 리사이즈
        images.append(img)
        labels.append(label)
    return np.array(images), np.array(labels)

malware_images, y_images = load_images('/home/ubuntu/images/malware', 1)
normal_images, _ = load_images('/home/ubuntu/images/normal', 0)

X_images = np.concatenate([malware_images, normal_images], axis=0)
y_images = np.concatenate([y_images, _], axis=0)


In [120]:
import pandas as pd
import numpy as np

# PE 데이터셋 로드
malware_pe = pd.read_csv('malware_pe.csv', on_bad_lines='skip')
normal_pe = pd.read_csv('normal_pe.csv', on_bad_lines='skip')

# PE 데이터셋에서 'filename'과 'MD5' 컬럼 제거
malware_pe = malware_pe.drop(columns=['filename', 'MD5'], errors='ignore')
normal_pe = normal_pe.drop(columns=['filename', 'MD5'], errors='ignore')

# PE 데이터셋에 대해 One-Hot Encoding 수행
malware_pe_encoded = pd.get_dummies(malware_pe)
normal_pe_encoded = pd.get_dummies(normal_pe)

# One-Hot Encoding된 PE 데이터셋 결합
X_pe_encoded = pd.concat([malware_pe_encoded, normal_pe_encoded], axis=0)

# 레이블 생성 (악성코드는 1, 정상은 0)
y_pe = np.concatenate([np.ones(len(malware_pe)), np.zeros(len(normal_pe))])

# Ngram 데이터셋 로드
ngram = pd.read_csv('ngram.csv')

# Ngram 데이터셋에 대해 One-Hot Encoding 수행
ngram_encoded = pd.get_dummies(ngram)

# Ngram 데이터셋 값 추출
X_ngram_encoded = ngram_encoded.values
y_ngram = np.concatenate([np.ones(len(malware_pe)), np.zeros(len(normal_pe))])

# One-Hot Encoding 후 컬럼 확인
print("PE 데이터셋 컬럼:", X_pe_encoded.columns)
print("Ngram 데이터셋 컬럼:", ngram_encoded.columns)


PE 데이터셋 컬럼: Index(['e_cblp', 'e_cp', 'e_cparhdr', 'e_maxalloc', 'e_sp', 'e_lfanew',
       'NumberOfSections', 'CreationYear', 'FH_char0', 'FH_char1',
       ...
       'packer_.gfids', 'packer_.tls', 'packer_0', 'packer_PAGER32R',
       'packer_type_.00cfg', 'packer_type_0', 'packer_type_PAGER32R',
       'E_text_INITDATA', 'E_data_PAGEDATA', 'E_file_.reloc'],
      dtype='object', length=312)
Ngram 데이터셋 컬럼: Index(['mov mov mov mov', 'add add add add', 'int3 int3 int3 int3',
       'push push push push', 'push push push call', 'mov mov mov call',
       'mov mov call push', 'nop nop nop nop', 'push push call mov',
       'mov mov call mov',
       ...
       'MD5_fb09af4f6edf6335d2778e42f1344bfd',
       'MD5_fc2ff2a09f884114b62c36cdcb730356',
       'MD5_fc9f896933b6123abebb21c8476448ec',
       'MD5_fd30acc7a696c32f661b33668e73bf7b',
       'MD5_fd442c307bc454d3930eaf6ec878fd36',
       'MD5_febba1a2aefeece75f8d29aac8baf7e3',
       'MD5_ff328a71371993ed57b6a52d94cde746',
       'M

In [121]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

# PE 데이터셋 분할
X_pe_train, X_pe_test, y_pe_train, y_pe_test = train_test_split(X_pe_encoded, y_pe, test_size=0.2, random_state=42)

# Ngram 데이터셋 분할
# X_ngram_encoded의 길이에 맞게 y_ngram을 생성합니다.
# y_ngram의 길이가 X_ngram_encoded의 길이에 맞지 않으면 오류가 발생하므로 정확히 맞춰야 합니다.
num_samples = len(X_ngram_encoded)
y_ngram = np.concatenate([np.ones(num_samples // 2), np.zeros(num_samples - num_samples // 2)])

# train_test_split을 사용하여 Ngram 데이터를 분할합니다.
X_ngram_train, X_ngram_test, y_ngram_train, y_ngram_test = train_test_split(X_ngram_encoded, y_ngram, test_size=0.2, random_state=42)

# 이미지 데이터셋 분할
X_images_train, X_images_test, y_images_train, y_images_test = train_test_split(X_images, y_images, test_size=0.2, random_state=42)

# 표준화: PE 데이터셋
scaler_pe = StandardScaler()
X_pe_train_scaled = scaler_pe.fit_transform(X_pe_train)  # 학습 데이터로 fit
X_pe_test_scaled = scaler_pe.transform(X_pe_test)  # 테스트 데이터는 transform만 수행

# 표준화: Ngram 데이터셋
scaler_ngram = StandardScaler()
X_ngram_train_scaled = scaler_ngram.fit_transform(X_ngram_train)
X_ngram_test_scaled = scaler_ngram.transform(X_ngram_test)


In [122]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
import numpy as np

# NaN 처리: SimpleImputer 사용
imputer = SimpleImputer(strategy='mean')  # NaN 값을 평균으로 대체

# PE 데이터 NaN 처리
X_pe_train = imputer.fit_transform(X_pe_train)
X_pe_test = imputer.transform(X_pe_test)

# Ngram 데이터 NaN 처리
X_ngram_train = imputer.fit_transform(X_ngram_train)
X_ngram_test = imputer.transform(X_ngram_test)

# 표준화: PE 데이터
scaler_pe = StandardScaler()
X_pe_train_scaled = scaler_pe.fit_transform(X_pe_train)
X_pe_test_scaled = scaler_pe.transform(X_pe_test)

# 표준화: Ngram 데이터
scaler_ngram = StandardScaler()
X_ngram_train_scaled = scaler_ngram.fit_transform(X_ngram_train)
X_ngram_test_scaled = scaler_ngram.transform(X_ngram_test)

# SVM 모델 학습: PE 데이터
svm_model_pe = SVC(random_state=42)
svm_model_pe.fit(X_pe_train_scaled, y_pe_train)

# SVM 모델 학습: Ngram 데이터
svm_model_ngram = SVC(random_state=42)
svm_model_ngram.fit(X_ngram_train_scaled, y_ngram_train)

# PE 데이터 예측 및 성능 평가
y_pe_pred = svm_model_pe.predict(X_pe_test_scaled)
print("PE 데이터 예측:")
print("Accuracy:", accuracy_score(y_pe_test, y_pe_pred))
print(classification_report(y_pe_test, y_pe_pred))

# Ngram 데이터 예측 및 성능 평가
y_ngram_pred = svm_model_ngram.predict(X_ngram_test_scaled)
print("Ngram 데이터 예측:")
print("Accuracy:", accuracy_score(y_ngram_test, y_ngram_pred))
print(classification_report(y_ngram_test, y_ngram_pred))



PE 데이터 예측:
Accuracy: 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        71
         1.0       1.00      1.00      1.00        58

    accuracy                           1.00       129
   macro avg       1.00      1.00      1.00       129
weighted avg       1.00      1.00      1.00       129

Ngram 데이터 예측:
Accuracy: 0.6752136752136753
              precision    recall  f1-score   support

         0.0       0.58      1.00      0.73        52
         1.0       1.00      0.42      0.59        65

    accuracy                           0.68       117
   macro avg       0.79      0.71      0.66       117
weighted avg       0.81      0.68      0.65       117



In [123]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
import numpy as np

# NaN 값 처리: SimpleImputer 사용
imputer = SimpleImputer(strategy='mean')  # NaN 값을 평균으로 대체

# PE 데이터 NaN 처리
X_pe_train = imputer.fit_transform(X_pe_train)
X_pe_test = imputer.transform(X_pe_test)

# Ngram 데이터 NaN 처리
X_ngram_train = imputer.fit_transform(X_ngram_train)
X_ngram_test = imputer.transform(X_ngram_test)

# 표준화: PE 데이터
scaler_pe = StandardScaler()
X_pe_train_scaled = scaler_pe.fit_transform(X_pe_train)
X_pe_test_scaled = scaler_pe.transform(X_pe_test)

# 표준화: Ngram 데이터
scaler_ngram = StandardScaler()
X_ngram_train_scaled = scaler_ngram.fit_transform(X_ngram_train)
X_ngram_test_scaled = scaler_ngram.transform(X_ngram_test)

# RandomForest 모델 학습: PE 데이터
rf_model_pe = RandomForestClassifier(random_state=42)
rf_model_pe.fit(X_pe_train_scaled, y_pe_train)

# RandomForest 모델 학습: Ngram 데이터
rf_model_ngram = RandomForestClassifier(random_state=42)
rf_model_ngram.fit(X_ngram_train_scaled, y_ngram_train)

# PE 데이터 예측 및 성능 평가
y_pe_pred = rf_model_pe.predict(X_pe_test_scaled)
print("PE 데이터 성능:")
print("Accuracy:", accuracy_score(y_pe_test, y_pe_pred))
print(classification_report(y_pe_test, y_pe_pred))

# Ngram 데이터 예측 및 성능 평가
y_ngram_pred = rf_model_ngram.predict(X_ngram_test_scaled)
print("Ngram 데이터 성능:")
print("Accuracy:", accuracy_score(y_ngram_test, y_ngram_pred))
print(classification_report(y_ngram_test, y_ngram_pred))


PE 데이터 성능:
Accuracy: 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        71
         1.0       1.00      1.00      1.00        58

    accuracy                           1.00       129
   macro avg       1.00      1.00      1.00       129
weighted avg       1.00      1.00      1.00       129

Ngram 데이터 성능:
Accuracy: 0.8205128205128205
              precision    recall  f1-score   support

         0.0       0.71      1.00      0.83        52
         1.0       1.00      0.68      0.81        65

    accuracy                           0.82       117
   macro avg       0.86      0.84      0.82       117
weighted avg       0.87      0.82      0.82       117



In [124]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
import numpy as np

# NaN 값 처리: SimpleImputer 사용
imputer = SimpleImputer(strategy='mean')  # NaN 값을 평균으로 대체

# PE 데이터 NaN 처리
X_pe_train = imputer.fit_transform(X_pe_train)
X_pe_test = imputer.transform(X_pe_test)

# Ngram 데이터 NaN 처리
X_ngram_train = imputer.fit_transform(X_ngram_train)
X_ngram_test = imputer.transform(X_ngram_test)

# 표준화: PE 데이터
scaler_pe = StandardScaler()
X_pe_train_scaled = scaler_pe.fit_transform(X_pe_train)
X_pe_test_scaled = scaler_pe.transform(X_pe_test)

# 표준화: Ngram 데이터
scaler_ngram = StandardScaler()
X_ngram_train_scaled = scaler_ngram.fit_transform(X_ngram_train)
X_ngram_test_scaled = scaler_ngram.transform(X_ngram_test)

# Naive Bayes 모델 학습: PE 데이터
nb_model_pe = GaussianNB()
nb_model_pe.fit(X_pe_train_scaled, y_pe_train)

# Naive Bayes 모델 학습: Ngram 데이터
nb_model_ngram = GaussianNB()
nb_model_ngram.fit(X_ngram_train_scaled, y_ngram_train)

# PE 데이터 예측 및 성능 평가
y_pe_pred = nb_model_pe.predict(X_pe_test_scaled)
print("PE 데이터 성능:")
print("Accuracy:", accuracy_score(y_pe_test, y_pe_pred))
print(classification_report(y_pe_test, y_pe_pred))

# Ngram 데이터 예측 및 성능 평가
y_ngram_pred = nb_model_ngram.predict(X_ngram_test_scaled)
print("Ngram 데이터 성능:")
print("Accuracy:", accuracy_score(y_ngram_test, y_ngram_pred))
print(classification_report(y_ngram_test, y_ngram_pred))


PE 데이터 성능:
Accuracy: 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        71
         1.0       1.00      1.00      1.00        58

    accuracy                           1.00       129
   macro avg       1.00      1.00      1.00       129
weighted avg       1.00      1.00      1.00       129

Ngram 데이터 성능:
Accuracy: 0.5641025641025641
              precision    recall  f1-score   support

         0.0       1.00      0.02      0.04        52
         1.0       0.56      1.00      0.72        65

    accuracy                           0.56       117
   macro avg       0.78      0.51      0.38       117
weighted avg       0.76      0.56      0.42       117



In [125]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, classification_report

# PE 데이터 분할
X_pe_train, X_pe_test, y_pe_train, y_pe_test = train_test_split(X_pe_encoded, y_pe, test_size=0.2, random_state=42)

# Ngram 데이터 분할
num_samples = len(X_ngram_encoded)
y_ngram = np.concatenate([np.ones(num_samples // 2), np.zeros(num_samples - num_samples // 2)])
X_ngram_train, X_ngram_test, y_ngram_train, y_ngram_test = train_test_split(X_ngram_encoded, y_ngram, test_size=0.2, random_state=42)

# NaN 값 처리: SimpleImputer 사용
imputer = SimpleImputer(strategy='mean')  # NaN 값을 평균으로 대체

# PE 데이터 NaN 처리
X_pe_train = imputer.fit_transform(X_pe_train)
X_pe_test = imputer.transform(X_pe_test)

# Ngram 데이터 NaN 처리
X_ngram_train = imputer.fit_transform(X_ngram_train)
X_ngram_test = imputer.transform(X_ngram_test)

# 표준화: PE 데이터
scaler_pe = StandardScaler()
X_pe_train_scaled = scaler_pe.fit_transform(X_pe_train)
X_pe_test_scaled = scaler_pe.transform(X_pe_test)

# 표준화: Ngram 데이터
scaler_ngram = StandardScaler()
X_ngram_train_scaled = scaler_ngram.fit_transform(X_ngram_train)
X_ngram_test_scaled = scaler_ngram.transform(X_ngram_test)

# DNN 모델 학습: PE 데이터
dnn_model_pe = Sequential()
dnn_model_pe.add(Dense(128, input_dim=X_pe_train_scaled.shape[1], activation='relu'))
dnn_model_pe.add(Dropout(0.2))
dnn_model_pe.add(Dense(64, activation='relu'))
dnn_model_pe.add(Dense(1, activation='sigmoid'))  # 이진 분류

dnn_model_pe.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
dnn_model_pe.fit(X_pe_train_scaled, y_pe_train, epochs=10, batch_size=32, verbose=1)

# DNN 모델 학습: Ngram 데이터
dnn_model_ngram = Sequential()
dnn_model_ngram.add(Dense(128, input_dim=X_ngram_train_scaled.shape[1], activation='relu'))
dnn_model_ngram.add(Dropout(0.2))
dnn_model_ngram.add(Dense(64, activation='relu'))
dnn_model_ngram.add(Dense(1, activation='sigmoid'))  # 이진 분류

dnn_model_ngram.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
dnn_model_ngram.fit(X_ngram_train_scaled, y_ngram_train, epochs=10, batch_size=32, verbose=1)

# PE 데이터 예측 및 성능 평가
y_pe_pred = dnn_model_pe.predict(X_pe_test_scaled)
y_pe_pred = (y_pe_pred > 0.5)  # 확률을 0.5 이상이면 1로 분류

print("PE 데이터 성능:")
print("Accuracy:", accuracy_score(y_pe_test, y_pe_pred))
print(classification_report(y_pe_test, y_pe_pred))

# Ngram 데이터 예측 및 성능 평가
y_ngram_pred = dnn_model_ngram.predict(X_ngram_test_scaled)
y_ngram_pred = (y_ngram_pred > 0.5)  # 확률을 0.5 이상이면 1로 분류

print("Ngram 데이터 성능:")
print("Accuracy:", accuracy_score(y_ngram_test, y_ngram_pred))
print(classification_report(y_ngram_test, y_ngram_pred))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
PE 데이터 성능:
Accuracy: 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        71
         1.0       1.00      1.00      1.00        58

    accuracy                           1.00       129
   macro avg       1.00      1.00      1.00       129
weighted avg       1.00      1.00      1.00       129

Ngram 데이터 성능:
Accuracy: 0.6153846153846154
              precision    recall  f1-score   support

         0.0       0.54      1.00      0.70        52
         1.0       1.00      0.31      0.47        65

    accuracy                           0.62       117
   macro avg       0.77      0.65      0.58       117
weighted avg       0.79      0.62      0.57       117



In [126]:
cnn_model = Sequential()
cnn_model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)))
cnn_model.add(MaxPooling2D((2, 2)))
cnn_model.add(Conv2D(64, (3, 3), activation='relu'))
cnn_model.add(MaxPooling2D((2, 2)))
cnn_model.add(Flatten())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dense(1, activation='sigmoid'))

cnn_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
cnn_model.fit(X_images_train, y_images_train, epochs=10, batch_size=32, verbose=1)
y_images_pred_cnn = (cnn_model.predict(X_images_test) > 0.5).astype("int32")
print(f"CNN Accuracy: {accuracy_score(y_images_test, y_images_pred_cnn)}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CNN Accuracy: 0.91015625


In [127]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# 예측: DNN 모델 (PE)
y_pe_pred_dnn = dnn_model_pe.predict(X_pe_test_scaled)
y_pe_pred_dnn = (y_pe_pred_dnn > 0.5)  # 확률을 이진 값으로 변환

# 예측: DNN 모델 (Ngram)
y_ngram_pred_dnn = dnn_model_ngram.predict(X_ngram_test_scaled)
y_ngram_pred_dnn = (y_ngram_pred_dnn > 0.5)  # 확률을 이진 값으로 변환

# 예측: CNN 모델 (이미지)
y_images_pred_cnn = cnn_model.predict(X_images_test)
y_images_pred_cnn = (y_images_pred_cnn > 0.5).astype("int32")  # 확률을 이진 값으로 변환

# 모델별 성능 결과
results = {
    "Model": ["SVM", "RandomForest", "NaiveBayes", "DNN", "CNN"],
    "PE Accuracy": [
        accuracy_score(y_pe_test, svm_model_pe.predict(X_pe_test_scaled)),
        accuracy_score(y_pe_test, rf_model_pe.predict(X_pe_test_scaled)),
        accuracy_score(y_pe_test, nb_model_pe.predict(X_pe_test_scaled)),
        accuracy_score(y_pe_test, y_pe_pred_dnn),  # DNN 모델 PE 정확도
        0  # CNN은 PE 데이터에서 학습하지 않음
    ],
    "Ngram Accuracy": [
        accuracy_score(y_ngram_test, svm_model_ngram.predict(X_ngram_test_scaled)),
        accuracy_score(y_ngram_test, rf_model_ngram.predict(X_ngram_test_scaled)),
        accuracy_score(y_ngram_test, nb_model_ngram.predict(X_ngram_test_scaled)),
        accuracy_score(y_ngram_test, y_ngram_pred_dnn),  # DNN 모델 Ngram 정확도
        0  # CNN은 Ngram 데이터에서 학습하지 않음
    ],
    "Images Accuracy": [
        0,  # SVM은 이미지 데이터에서 학습하지 않음
        0,  # RandomForest는 이미지 데이터에서 학습하지 않음
        0,  # NaiveBayes는 이미지 데이터에서 학습하지 않음
        0,  # DNN은 이미지 데이터에서 학습하지 않음
        accuracy_score(y_images_test, y_images_pred_cnn)  # CNN 모델 이미지 정확도
    ]
}

# DataFrame으로 변환
df_results = pd.DataFrame(results)

# 결과 출력
print(df_results)


          Model  PE Accuracy  Ngram Accuracy  Images Accuracy
0           SVM          1.0        0.675214         0.000000
1  RandomForest          1.0        0.820513         0.000000
2    NaiveBayes          1.0        0.564103         0.000000
3           DNN          1.0        0.615385         0.000000
4           CNN          0.0        0.000000         0.910156
