In [1]:
import pandas as pd
import numpy as np
import os
import cv2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

# PE, Ngram 데이터 로드
malware_pe = pd.read_csv('malware_pe.csv', on_bad_lines='skip')
normal_pe = pd.read_csv('normal_pe.csv' ,on_bad_lines='skip')
ngram = pd.read_csv('ngram.csv')

# 데이터 결합 (예: 악성 코드와 정상 코드 결합)
X_pe = pd.concat([malware_pe, normal_pe], axis=0)
y_pe = np.concatenate([np.ones(len(malware_pe)), np.zeros(len(normal_pe))])

# Ngram 데이터 결합
X_ngram = ngram.values
y_ngram = np.concatenate([np.ones(len(malware_pe)), np.zeros(len(normal_pe))])


In [2]:
import pandas as pd
import numpy as np

# PE 데이터셋 로드
malware_pe = pd.read_csv('malware_pe.csv', on_bad_lines='skip')
normal_pe = pd.read_csv('normal_pe.csv', on_bad_lines='skip')

# PE 데이터셋에서 'filename'과 'MD5' 컬럼 제거
malware_pe = malware_pe.drop(columns=['filename', 'MD5'], errors='ignore')
normal_pe = normal_pe.drop(columns=['filename', 'MD5'], errors='ignore')

# PE 데이터셋에 대해 One-Hot Encoding 수행
malware_pe_encoded = pd.get_dummies(malware_pe)
normal_pe_encoded = pd.get_dummies(normal_pe)

# One-Hot Encoding된 PE 데이터셋 결합
X_pe_encoded = pd.concat([malware_pe_encoded, normal_pe_encoded], axis=0)

# 레이블 생성 (악성코드는 1, 정상은 0)
y_pe = np.concatenate([np.ones(len(malware_pe)), np.zeros(len(normal_pe))])

# Ngram 데이터셋 로드
ngram = pd.read_csv('ngram.csv')

# Ngram 데이터셋에 대해 One-Hot Encoding 수행
ngram_encoded = pd.get_dummies(ngram)

# Ngram 데이터셋 값 추출
X_ngram_encoded = ngram_encoded.values
y_ngram = np.concatenate([np.ones(len(malware_pe)), np.zeros(len(normal_pe))])

# One-Hot Encoding 후 컬럼 확인
print("PE 데이터셋 컬럼:", X_pe_encoded.columns)
print("Ngram 데이터셋 컬럼:", ngram_encoded.columns)

PE 데이터셋 컬럼: Index(['e_cblp', 'e_cp', 'e_cparhdr', 'e_maxalloc', 'e_sp', 'e_lfanew',
       'NumberOfSections', 'CreationYear', 'FH_char0', 'FH_char1',
       ...
       'packer_.gfids', 'packer_.tls', 'packer_0', 'packer_PAGER32R',
       'packer_type_.00cfg', 'packer_type_0', 'packer_type_PAGER32R',
       'E_text_INITDATA', 'E_data_PAGEDATA', 'E_file_.reloc'],
      dtype='object', length=312)
Ngram 데이터셋 컬럼: Index(['mov mov mov mov', 'add add add add', 'int3 int3 int3 int3',
       'push push push push', 'push push push call', 'mov mov mov call',
       'mov mov call push', 'nop nop nop nop', 'push push call mov',
       'mov mov call mov',
       ...
       'MD5_fb09af4f6edf6335d2778e42f1344bfd',
       'MD5_fc2ff2a09f884114b62c36cdcb730356',
       'MD5_fc9f896933b6123abebb21c8476448ec',
       'MD5_fd30acc7a696c32f661b33668e73bf7b',
       'MD5_fd442c307bc454d3930eaf6ec878fd36',
       'MD5_febba1a2aefeece75f8d29aac8baf7e3',
       'MD5_ff328a71371993ed57b6a52d94cde746',
       'M

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd

# 1. PE와 Ngram 데이터 병합
# 1. 다중 인덱스 평면화
X_ngram_encoded.columns = X_ngram_encoded.columns.map(
    lambda x: "_".join(map(str, x)) if isinstance(x, tuple) else str(x)
)

# 2. 인덱스 재설정
X_pe_encoded = X_pe_encoded.reset_index(drop=True)
X_ngram_encoded = X_ngram_encoded.reset_index(drop=True)

# 3. 데이터프레임 병합
X_combined = pd.concat([X_pe_encoded, X_ngram_encoded], axis=1)


# 2. 학습 및 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_pe, test_size=0.2, random_state=42, stratify=y_pe)

# 3. 결측값 처리
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# 4. 표준화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 데이터 준비 상태 확인
print("학습 데이터 크기:", X_train_scaled.shape)
print("테스트 데이터 크기:", X_test_scaled.shape)


학습 데이터 크기: (515, 1579)
테스트 데이터 크기: (129, 1579)


In [10]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# 2. 랜덤 포레스트 모델로 새로운 특징 생성
rf_params = {
    'n_estimators': [100, 200],
    'max_features': ['sqrt', None]
}
rf_model = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, n_jobs=-1, verbose=1)
rf_model.fit(X_train_scaled, y_train)

# 랜덤 포레스트 확률 기반 특징 생성
rf_train_features = rf_model.best_estimator_.predict_proba(X_train_scaled)
rf_test_features = rf_model.best_estimator_.predict_proba(X_test_scaled)

# 3. SVM 모델로 새로운 특징 생성
svm_params = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}
svm_model = GridSearchCV(SVC(probability=True, random_state=42), svm_params, cv=5, n_jobs=-1, verbose=1)
svm_model.fit(X_train_scaled, y_train)

# SVM 확률 기반 특징 생성
svm_train_features = svm_model.best_estimator_.predict_proba(X_train_scaled)
svm_test_features = svm_model.best_estimator_.predict_proba(X_test_scaled)

# 4. 랜덤 포레스트와 SVM 특징 결합
train_features = np.hstack([rf_train_features, svm_train_features])
test_features = np.hstack([rf_test_features, svm_test_features])

# 5. DNN 모델 정의 및 학습
dnn_model = Sequential([
    Dense(128, activation='relu', input_dim=train_features.shape[1]),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # 이진 분류
])

dnn_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# DNN 모델 학습
dnn_model.fit(train_features, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# 6. DNN 모델 평가
test_loss, test_accuracy = dnn_model.evaluate(test_features, y_test, verbose=0)
print(f"DNN Test Accuracy: {test_accuracy:.4f}")

# 7. Classification Report 출력
y_pred = (dnn_model.predict(test_features) > 0.5).astype(int)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
DNN Test Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        80
         1.0       1.00      1.00      1.00        49

    accuracy                           1.00       129
   macro avg       1.00      1.00      1.00       129
weighted avg       1.00      1.00      1.00       129

