In [2]:
!sudo apt-get install graphviz
!apt-get install -y graphviz libgraphviz-dev pkg-config

!pip install pygraphviz
!pip install torch torchvision torchaudio torch-geometric
!pip install networkx numpy scikit-learn


Password:
sudo: a password is required
zsh:1: command not found: apt-get
zsh:1: command not found: pip
zsh:1: command not found: pip
zsh:1: command not found: pip


In [None]:
import os
import pygraphviz as pgv
import networkx as nx
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# 1. Read a DOT file and build a graph
def read_dot_file(dot_file_path: str) -> nx.DiGraph:
    """Build a NetworkX directed graph from a Graphviz DOT file.

    Each node and each edge found in the parsed DOT graph is added to the
    result, carrying the value of its Graphviz ``label`` attribute under the
    ``label`` key.

    Args:
        dot_file_path: Location of the ``.dot`` file handed to
            ``pygraphviz.AGraph``.

    Returns:
        A ``nx.DiGraph`` populated from the parsed graph.
    """
    dot_graph = pgv.AGraph(dot_file_path)
    digraph = nx.DiGraph()

    # Nodes first, each paired with its attribute dict.
    digraph.add_nodes_from(
        (name, {"label": dot_graph.get_node(name).attr["label"]})
        for name in dot_graph.nodes()
    )

    # Then edges; the label is looked up by the (source, target) pair.
    digraph.add_edges_from(
        (src, dst, {"label": dot_graph.get_edge(src, dst).attr["label"]})
        for src, dst in ((e[0], e[1]) for e in dot_graph.edges())
    )

    return digraph


# 2. Convert a graph to the PyTorch Geometric data format
def convert_graph_to_data(G: nx.DiGraph, label: int) -> Data:
    """Convert a labelled NetworkX graph into a PyTorch Geometric ``Data``.

    Node features are a single float column holding the integer code that
    ``LabelEncoder`` assigns to each node's ``label`` attribute. Edges are
    expressed as positions in the graph's node iteration order.

    Args:
        G: Graph whose nodes all carry a ``label`` attribute.
        label: Graph-level class stored in ``data.y``.

    Returns:
        A ``Data`` object with ``x`` of shape ``(num_nodes, 1)``,
        ``edge_index`` of shape ``(2, num_edges)`` and ``y`` of shape ``(1,)``.

    Raises:
        KeyError: If a node has no ``label`` attribute.
    """
    nodes = list(G.nodes())
    node_labels = [G.nodes[node]['label'] for node in nodes]

    # NOTE(review): the encoder is fitted on this graph alone, so the same
    # label string can map to different integers in different graphs.
    # Consider fitting one encoder over the whole corpus.
    node_labels_encoded = LabelEncoder().fit_transform(node_labels)

    # Build the node -> position map once instead of scanning the node list
    # for both endpoints of every edge.
    position = {node: i for i, node in enumerate(nodes)}
    edge_pairs = [(position[edge[0]], position[edge[1]]) for edge in G.edges()]

    if edge_pairs:
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        # An empty list would otherwise produce a 1-D tensor of shape (0,);
        # keep the (2, num_edges) layout for edgeless graphs as well.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    # Encoded node labels become the only node feature column.
    x = torch.tensor(node_labels_encoded, dtype=torch.float).view(-1, 1)
    data = Data(x=x, edge_index=edge_index)
    data.y = torch.tensor([label], dtype=torch.long)  # graph-level label
    return data


# 3. GNN model definition (with Dropout and Batch Normalization added)
class EnhancedGNNModel(torch.nn.Module):
    """GCN-based classifier that assigns one of two classes to a whole graph.

    Architecture: three ``GCNConv`` layers
    (``num_node_features`` -> 32 -> 64 -> 64). The first two are each followed
    by batch normalisation, ReLU and dropout (p=0.3). The output of the third
    is mean-pooled and passed through a linear layer with two outputs.
    """

    def __init__(self, num_node_features: int):
        """Create the layers.

        Args:
            num_node_features: Width of the per-node feature vector.
        """
        super().__init__()
        self.conv1 = GCNConv(num_node_features, 32)
        self.conv2 = GCNConv(32, 64)
        self.conv3 = GCNConv(64, 64)
        self.dropout = nn.Dropout(0.3)
        self.batch_norm1 = nn.BatchNorm1d(32)
        self.batch_norm2 = nn.BatchNorm1d(64)
        # Two outputs: normal and abnormal.
        self.fc = nn.Linear(64, 2)

    def forward(self, data):
        """Return log-probabilities over the two classes.

        Args:
            data: Object exposing ``x``, ``edge_index`` and ``batch``.

        Returns:
            Log-softmax scores taken along dimension 1.
        """
        hidden = data.x
        edges = data.edge_index

        # Both hidden stages share the same conv -> norm -> ReLU -> dropout
        # recipe.
        hidden_stages = (
            (self.conv1, self.batch_norm1),
            (self.conv2, self.batch_norm2),
        )
        for conv, norm in hidden_stages:
            hidden = self.dropout(F.relu(norm(conv(hidden, edges))))

        hidden = self.conv3(hidden, edges)
        # Collapse node embeddings to one vector per graph.
        pooled = global_mean_pool(hidden, data.batch)
        return F.log_softmax(self.fc(pooled), dim=1)


# 4. Dataset definition
class GraphDataset(Dataset):
    """Dataset wrapper around an already-built list of graph objects.

    ``labels`` is kept on the instance but is not read by any method of this
    class; ``get`` returns the stored graph object as is.
    """

    def __init__(self, graphs: list, labels: list):
        """Store the graphs and their labels.

        Args:
            graphs: Graph objects returned by ``get``.
            labels: Labels kept alongside ``graphs``.
        """
        super().__init__()
        self.graphs, self.labels = graphs, labels

    def len(self) -> int:
        """Return the number of stored graphs."""
        graph_count = len(self.graphs)
        return graph_count

    def get(self, idx: int) -> Data:
        """Return the graph stored at position ``idx``."""
        graph = self.graphs[idx]
        return graph


# 5. Training procedure
def train(model: EnhancedGNNModel, data_loader: DataLoader, optimizer: torch.optim.Optimizer) -> None:
    """Run one pass over ``data_loader``, updating ``model`` after each batch.

    Args:
        model: Network to optimise; switched to training mode first.
        data_loader: Iterable of batches exposing a ``y`` target attribute.
        optimizer: Optimiser that owns the model's parameters.
    """
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        log_probs = model(batch)
        # The model emits log-probabilities, hence negative log-likelihood.
        batch_loss = F.nll_loss(log_probs, batch.y)
        batch_loss.backward()
        optimizer.step()


# 6. Test function (computes the F1-score and saves the confusion matrix)
def test(model: EnhancedGNNModel, data_loader: DataLoader) -> (float, float):
    """Evaluate ``model`` and plot its confusion matrix.

    Predictions are gathered over every batch with gradients disabled. The
    confusion matrix is drawn, written to ``confusion_matrix.png`` in the
    current working directory, and shown.

    Args:
        model: Network to evaluate; switched to evaluation mode first.
        data_loader: Iterable of batches exposing a ``y`` target attribute.

    Returns:
        A ``(f1, accuracy)`` pair, where ``f1`` is the binary F1 score and
        ``accuracy`` is the fraction of predictions equal to the target.
    """
    model.eval()
    targets = []
    predictions = []

    with torch.no_grad():
        for batch in data_loader:
            log_probs = model(batch)
            targets += batch.y.tolist()
            # Predicted class is the index of the highest score.
            predictions += log_probs.argmax(dim=1).tolist()

    # Summary metrics.
    f1 = f1_score(targets, predictions, average="binary")
    matches = np.array(targets) == np.array(predictions)
    accuracy = matches.mean()

    # Confusion matrix: draw, save to disk, then display.
    matrix = confusion_matrix(targets, predictions, labels=[0, 1])
    display = ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=["Normal", "Abnormal"],
    )
    display.plot(cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.savefig("confusion_matrix.png")
    plt.show()

    return f1, accuracy


# 7. Main function
def main(normal_dot_files: list, abnormal_dot_files: list) -> None:
    """Train the GNN on the given DOT files and report test metrics.

    Loads every file, splits each class into train and test parts, trains an
    ``EnhancedGNNModel`` for 200 epochs, then prints and plots the F1 score
    and accuracy measured on the test part.

    Args:
        normal_dot_files: Paths of DOT files labelled 0 (normal).
        abnormal_dot_files: Paths of DOT files labelled 1 (abnormal).
    """
    # Load both classes, tagging each graph with its class label.
    normal_graphs = [
        convert_graph_to_data(read_dot_file(path), label=0)
        for path in normal_dot_files
    ]
    abnormal_graphs = [
        convert_graph_to_data(read_dot_file(path), label=1)
        for path in abnormal_dot_files
    ]

    # Split per class: normal graphs 90/10, abnormal graphs 80/20.
    # NOTE(review): the two classes use different test fractions (0.1 vs 0.2);
    # confirm that this asymmetry is intended.
    normal_train, normal_test = train_test_split(
        normal_graphs, test_size=0.1, random_state=42
    )
    abnormal_train, abnormal_test = train_test_split(
        abnormal_graphs, test_size=0.2, random_state=42
    )

    # Merge the per-class parts, normal graphs first.
    train_graphs = normal_train + abnormal_train
    test_graphs = normal_test + abnormal_test
    train_labels = [0] * len(normal_train) + [1] * len(abnormal_train)
    test_labels = [0] * len(normal_test) + [1] * len(abnormal_test)

    # Wrap in datasets and loaders; only the training loader shuffles.
    train_loader = DataLoader(
        GraphDataset(train_graphs, train_labels), batch_size=32, shuffle=True
    )
    test_loader = DataLoader(GraphDataset(test_graphs, test_labels), batch_size=32)

    # One feature per node: the encoded node label.
    model = EnhancedGNNModel(num_node_features=1)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # Training: 200 passes over the training loader.
    for _ in range(200):
        train(model, train_loader, optimizer)

    # Evaluation on the held-out graphs.
    test_f1, test_accuracy = test(model, test_loader)
    print(f"Overall Test F1 Score: {test_f1:.4f}")
    print(f"Overall Test Accuracy: {test_accuracy * 100:.2f}%")

    # Bar chart of the two metrics.
    metric_names = ['F1 Score', 'Accuracy']
    metric_values = [test_f1, test_accuracy]
    plt.figure(figsize=(6, 4))
    plt.bar(metric_names, metric_values, color=['blue', 'green'])
    plt.title("Test F1 Score and Accuracy")
    plt.ylim(0, 1)
    plt.ylabel("Score")
    plt.show()

# 8. Run
# Paths of the normal (benign) graphs: files numbered 1 through 187.
normal_dot_files = [
    f"./graphs/graphs_benign_dot/system_call_graph_{i}.dot"
    for i in range(1, 188)
]

# Paths of the abnormal graphs: 200 "helloworld" files followed by
# 200 "sqldump" files, each numbered 1 through 200.
abnormal_dot_files = (
    [f"./graphs/graphs_helloworld_dot/system_call_graph_{i}.dot" for i in range(1, 201)] +
    [f"./graphs/graphs_sqldump_dot/system_call_graph_{i}.dot" for i in range(1, 201)]
)

# Start the pipeline only when this code is executed directly (script or
# notebook cell), so that importing the module has no side effects.
if __name__ == "__main__":
    main(normal_dot_files, abnormal_dot_files)


ModuleNotFoundError: No module named 'pygraphviz'