In [19]:
import pandas as pd
import numpy as np
import os
import cv2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

ngram = pd.read_csv('ngram.csv')

# Ngram 데이터 결합
X_ngram = ngram.values


In [21]:
import pandas as pd
import numpy as np

# Ngram 데이터 로드
ngram = pd.read_csv('ngram.csv')

# 'filename' 열 삭제
ngram = ngram.drop(columns=['filename'])

# MD5로 시작하는 열 삭제
ngram = ngram.loc[:, ~ngram.columns.str.startswith('MD5')]

# Ngram 데이터에 대해 One-Hot Encoding 수행
ngram_encoded = pd.get_dummies(ngram)

# One-Hot Encoding된 데이터
X_ngram_encoded = ngram_encoded.values

# One-Hot Encoding 후 특성 확인
print("Ngram One-Hot Encoding 후 특성:", ngram_encoded.columns)


Ngram One-Hot Encoding 후 특성: Index(['mov mov mov mov', 'add add add add', 'int3 int3 int3 int3',
       'push push push push', 'push push push call', 'mov mov mov call',
       'mov mov call push', 'nop nop nop nop', 'push push call mov',
       'mov mov call mov',
       ...
       'pop dec dec push', 'mov mov call test', 'push push dec push',
       'mov cmp jne mov', 'dec push pop inc', 'push mov mov push',
       'leave ret push mov', 'mov mov mov cmp', 'mov lea push push', 'class'],
      dtype='object', length=101)


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

num_samples = len(X_ngram_encoded)
y_ngram = np.concatenate([np.ones(num_samples // 2), np.zeros(num_samples - num_samples // 2)])

# train_test_split을 사용하여 Ngram 데이터를 분할합니다.
X_ngram_train, X_ngram_test, y_ngram_train, y_ngram_test = train_test_split(X_ngram_encoded, y_ngram, test_size=0.2, random_state=42)


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
import numpy as np

# NaN 값 처리: SimpleImputer 사용
imputer = SimpleImputer(strategy='mean')  # NaN 값을 평균으로 대체

# Ngram 데이터 NaN 처리
X_ngram_train = imputer.fit_transform(X_ngram_train)
X_ngram_test = imputer.transform(X_ngram_test)

# 표준화: Ngram 데이터
scaler_ngram = StandardScaler()
X_ngram_train_scaled = scaler_ngram.fit_transform(X_ngram_train)
X_ngram_test_scaled = scaler_ngram.transform(X_ngram_test)

# RandomForest 모델 학습: Ngram 데이터
rf_model_ngram = RandomForestClassifier(random_state=42)
rf_model_ngram.fit(X_ngram_train_scaled, y_ngram_train)

# Ngram 데이터 예측 및 성능 평가
y_ngram_pred = rf_model_ngram.predict(X_ngram_test_scaled)
print("Ngram 데이터 성능:")
print("Accuracy:", accuracy_score(y_ngram_test, y_ngram_pred))
print(classification_report(y_ngram_test, y_ngram_pred))

Ngram 데이터 성능:
Accuracy: 0.8034188034188035
              precision    recall  f1-score   support

         0.0       0.73      0.88      0.80        52
         1.0       0.89      0.74      0.81        65

    accuracy                           0.80       117
   macro avg       0.81      0.81      0.80       117
weighted avg       0.82      0.80      0.80       117



In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 파라미터 그리드 정의
param_grid = {
    'n_estimators': [100, 200, 500, 1000],  # n_estimators 값
    'max_features': ['sqrt', None]  # max_features 값
}

# GridSearchCV 정의 (10-Fold 교차 검증)
grid_search_ngram = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
    verbose=1
)

# GridSearchCV 학습 (받은 데이터를 그대로 사용)
grid_search_ngram.fit(X_ngram_train_scaled, y_ngram_train)

# 최적의 매개변수 출력
print(f"최적의 매개변수: {grid_search_ngram.best_params_}")
print(f"최고 교차 검증 정확도: {grid_search_ngram.best_score_}")

# 최적의 모델로 테스트 데이터 예측
y_ngram_pred = grid_search_ngram.best_estimator_.predict(X_ngram_test_scaled)

# 테스트 데이터 정확도 출력
print("테스트 데이터 정확도:", accuracy_score(y_ngram_test, y_ngram_pred))

# 추가로 Classification Report 출력
print("Classification Report:")
print(classification_report(y_ngram_test, y_ngram_pred))


Fitting 10 folds for each of 8 candidates, totalling 80 fits
최적의 매개변수: {'max_features': None, 'n_estimators': 100}
최고 교차 검증 정확도: 0.868963922294172
테스트 데이터 정확도: 0.8290598290598291
Classification Report:
              precision    recall  f1-score   support

         0.0       0.75      0.92      0.83        52
         1.0       0.92      0.75      0.83        65

    accuracy                           0.83       117
   macro avg       0.84      0.84      0.83       117
weighted avg       0.85      0.83      0.83       117



In [26]:
import joblib

# 최적의 랜덤 포레스트 모델 저장
joblib.dump(grid_search_ngram.best_estimator_, 'rf_model_ngram.joblib')

# 모델 로드 예시
rf_model_ngram_loaded = joblib.load('rf_model_ngram.joblib')
