<a href="https://colab.research.google.com/github/fikrinotes/LSTM-IDS/blob/main/Evaluasi_Model_Deteksi_Intrusi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Log in through kagglehub first; the data-source cell further down calls
# kagglehub.dataset_download() for three Kaggle sources.
import kagglehub
kagglehub.login()


In [None]:
# Import library yang diperlukan
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow.keras import layers, Model
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns


# Load Dataset

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the three Kaggle data sources. The value returned by each call is kept
# and printed below as that source's path; dataset_path is later used as the
# directory prefix of the CSV files.
dataset_path = kagglehub.dataset_download('chethuhn/network-intrusion-dataset')
model_path = kagglehub.dataset_download('fikrimulyanasetiawan/rnn-model')
encoder_path = kagglehub.dataset_download('fikrimulyanasetiawan/encoder')

# Report where each data source ended up
print('Data source import complete. \n')
print("Information about your data sources:")
print(f"Dataset path: {dataset_path}")
print(f"Model path: {model_path}")
print(f"Encoder path: {encoder_path}")

# Load Model

In [None]:
from urllib.request import urlretrieve

# Download the "Model v2.3" artifacts from the GitHub repository. Every file is
# saved under its own name as a relative path, i.e. in the current working
# directory.
artifact_base_url = 'https://github.com/fikrinotes/LSTM-IDS/raw/refs/heads/main/Model%20v2.3'
artifact_names = (
    'imputer.joblib',
    'label_encoder.joblib',
    'rnn_model.keras',
    'scaler.joblib',
    'selector.joblib',
)
for artifact_name in artifact_names:
    urlretrieve(f'{artifact_base_url}/{artifact_name}', artifact_name)

In [None]:
# Load the imputer, label encoder, scaler, feature selector and RNN model
# that were downloaded by the previous cell.

import os
import pickle
from joblib import load
from tensorflow import keras

# The artifacts were downloaded with relative file names, so they live in the
# current working directory. On Colab that directory is /content; resolving it
# at run time keeps the cell working in other environments as well.
path = os.getcwd()

# Load the imputer
imputer_file = f"{path}/imputer.joblib"
imputer = load(imputer_file)

# Load the label encoder
label_encoder_file = f"{path}/label_encoder.joblib"
label_encoder = load(label_encoder_file)

# Load the scaler
scaler_file = f"{path}/scaler.joblib"
scaler = load(scaler_file)

# Load the selector
selector_file = f"{path}/selector.joblib"
selector = load(selector_file)

# Load the RNN model
rnn_model_file = f"{path}/rnn_model.keras"
rnn_model = keras.models.load_model(rnn_model_file)

print("Imputer, LabelEncoder, RNN model, Scaler, and Selector loaded successfully.")

# Baca Dataset dan Preprocessing

In [None]:
# Fungsi untuk membaca dan preprocessing setiap file
def read_and_clean_file(file_path):
    """Read one CSV file and apply the basic per-file cleaning steps.

    Steps, in order: strip whitespace from the column names, drop the
    identifier/metadata columns when they are present, drop rows without a
    'Label' value, and turn +/- infinity into NaN.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The cleaned DataFrame. Rows keep the index they had in the file.
    """
    print(f"Membaca file: {file_path}")
    raw = pd.read_csv(file_path, low_memory=False, sep=",")

    # Column names may carry stray whitespace; normalise them first so the
    # name-based steps below match.
    raw.columns = raw.columns.str.strip()

    # Identifier / metadata columns that are not wanted downstream
    redundant_column = [
        'Flow ID', 'Source IP', 'Source Port', 'Destination IP',
        'Destination Port', 'Protocol', 'Timestamp',
    ]

    cleaned = (
        raw.drop(redundant_column, axis=1, errors='ignore')
        .dropna(subset=['Label'])                # rows without a label
        .replace([np.inf, -np.inf], np.nan)      # infinities become missing
    )
    return cleaned


# Paths of the daily CSV captures inside the downloaded dataset directory
data1 = dataset_path + "/Monday-WorkingHours.pcap_ISCX.csv"
data2 = dataset_path + "/Tuesday-WorkingHours.pcap_ISCX.csv"
data3 = dataset_path + "/Wednesday-workingHours.pcap_ISCX.csv"
data4 = dataset_path + "/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv"
data5 = dataset_path + "/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv"
data6 = dataset_path + "/Friday-WorkingHours-Morning.pcap_ISCX.csv"
data7 = dataset_path + "/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv"
data8 = dataset_path + "/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"


# Files that are actually loaded.
# NOTE(review): data1 (Monday) is defined above but not part of this list;
# confirm that leaving it out is intentional.
all_files = [data2, data3, data4, data5, data6, data7, data8]

# Read and clean every file, collecting one DataFrame per file
dataframes = [read_and_clean_file(file) for file in all_files]

# Concatenate the per-file frames. The try block wraps the concat itself so
# that the failure message is reachable; the error is re-raised because
# nothing below can run without the combined frame.
print("Menggabungkan semua file...")
try:
    df = pd.concat(dataframes, ignore_index=True)
except (ValueError, MemoryError):
    print("Error! file dataset tidak berhasil digabungkan")
    raise
else:
    print("Semua file dataset berhasil digabungkan!")

# Column names need no further stripping here: read_and_clean_file already
# strips them for every file.

# Duplicate statistics (counted from the boolean mask, without building a
# separate DataFrame of the duplicated rows)
duplicate_count = int(df.duplicated().sum())
print(f'Banyak data duplikat : {duplicate_count}')
print(f'Banyak data sebelum duplikat : {df.shape[0]}')

print("menghapus data duplikat...")

# Remove duplicated rows
df.drop_duplicates(inplace=True)
print("data duplikat selesai dihapus!")
print(f"banyak data setelah data duplikat dihapus : {df.shape[0]}")

# Collapse every label other than BENIGN into a single ATTACK class
df["Label"] = df["Label"].where(df["Label"] == "BENIGN", "ATTACK")
print("Informasi Kelas : ")
# Printed explicitly: a bare expression in the middle of a cell shows nothing
print(df["Label"].unique())


# Print summary information about the combined dataset
print("\nInformasi Dataset:")
print(f"\nJumlah total data: {len(df)}")
# 'Label' is the target column, so it is not counted as a feature
feature_columns = df.columns.drop('Label', errors='ignore')
print(f"Jumlah fitur : {len(feature_columns)}")
print("\nDistribusi Label sebelum preprocessing:")

# tabel distribusi label
def create_distribution_table(df):
    """Build a table of 'Label' value counts and their share of all rows.

    Args:
        df: DataFrame that has a 'Label' column.

    Returns:
        DataFrame indexed by label value. The first column holds the counts;
        the 'percentage' column holds count / len(df), i.e. a fraction
        between 0 and 1 (it is not multiplied by 100).
    """
    counts = df['Label'].value_counts()
    total_rows = len(df)

    table = pd.DataFrame(counts)
    table['percentage'] = counts / total_rows
    return table

# Label distribution (counts and fraction of rows) of the combined dataset
create_distribution_table(df)

# Split Dataset menjadi X dan Y serta Training dan Testing

In [None]:
# Features are the numeric columns of df; the target is the 'Label' column
numerical_columns = df.select_dtypes(include=[np.number]).columns
X, y = df[numerical_columns], df["Label"]
print(f"jumlah fitur : {X.shape[1]}")
print(f"jumlah label : {len(y.unique())}")

# Order-preserving splitter: the split is positional, rows are not shuffled
tss = TimeSeriesSplit(n_splits=7)
print(tss)

# Walk through every fold and report its index range and class count.
# train_index / test_index are overwritten on each pass, so after the loop
# they hold the indices of the LAST fold only.
train_index, test_index = [], []
for fold_no, fold_indices in enumerate(tss.split(X)):
    train_index, test_index = fold_indices
    print(f"fold {fold_no}:")
    print(f"  Train: index : from {train_index.min()} up to {train_index.max()}")
    print(f"  Test:  index=from {test_index.min()} up to {test_index.max()}")
    print(f"  Jumlah kelas pada training set : {y.iloc[train_index].nunique()}")
    print(f"  Jumlah kelas pada testing set : {y.iloc[test_index].nunique()}")

# Positional split using the last fold (no stratification is applied here)
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

# Transformasi Data Mentah

In [None]:
def transform_data(X, y, scaler, imputer, le):
    """Apply already-fitted preprocessing objects to features and labels.

    Only ``transform`` is called on each object; nothing is fitted here.

    Args:
        X: Feature data passed to ``imputer.transform``.
        y: Labels passed to ``le.transform``.
        scaler: Object whose ``transform`` is applied to the imputed features.
        imputer: Object whose ``transform`` is applied to ``X`` first.
        le: Label encoder whose ``transform`` is applied to ``y``.

    Returns:
        Tuple ``(X, y)``: the features after imputation followed by scaling,
        and the labels as returned by ``le.transform``.
    """
    # Fill in missing values
    print("\nMenangani missing values...")
    X = imputer.transform(X)
    print("selesai!")

    # Scale the imputed features
    print("\nMelakukan normalisasi data...")
    X = scaler.transform(X)
    print("selesai!")

    # Encode the labels. The message says "label encoding" because the labels
    # are only passed through the label encoder; no one-hot step happens here.
    print("\nMelakukan label encoding...")
    y = le.transform(y)
    print("selesai!")

    return X, y

In [None]:
## Transform the training data with the loaded imputer, scaler and label encoder
X_train, y_train = transform_data(X_train, y_train, scaler, imputer, label_encoder)

# Transform the testing data with the same loaded objects
X_test, y_test = transform_data(X_test, y_test, scaler, imputer, label_encoder)

# Apply the loaded feature selector to the training data as well
X_train_selected = selector.transform(X_train)

# Apply the loaded feature selector to the testing data
X_test_selected = selector.transform(X_test)

# Evaluasi Model

In [None]:
# Number of columns left after feature selection
num_features = X_train_selected.shape[1]
# Number of classes known to the loaded label encoder
num_classes = len(label_encoder.classes_)
num_classes

In [None]:
def create_sequences(data, targets, timesteps):
    """Turn row-wise data into overlapping fixed-length windows.

    Window ``i`` holds rows ``i .. i + timesteps - 1`` of ``data`` and is
    paired with ``targets[i + timesteps]``, the target of the row that
    immediately follows the window.

    Args:
        data: Array-like whose first axis indexes the rows.
        targets: Array-like with at least as many entries as ``data`` has
            rows; entries are taken by position.
        timesteps: Window length, a positive integer.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(n_rows - timesteps, timesteps, *data.shape[1:])`` and ``y`` has
        ``n_rows - timesteps`` entries. When ``data`` has ``timesteps`` rows or
        fewer, both arrays are empty but keep those trailing dimensions.

    Raises:
        ValueError: If ``timesteps`` is smaller than 1, or if ``targets`` is
            shorter than ``data``.
    """
    if timesteps < 1:
        raise ValueError("timesteps must be a positive integer")

    data = np.asarray(data)
    targets = np.asarray(targets)
    n_windows = max(len(data) - timesteps, 0)

    if n_windows == 0:
        # Not enough rows for a single window plus its following target:
        # return empty arrays that still have the expected rank.
        empty_X = np.empty((0, timesteps) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,) + targets.shape[1:], dtype=targets.dtype)
        return empty_X, empty_y

    if len(targets) < len(data):
        raise ValueError("targets must have at least as many entries as data")

    # sliding_window_view appends the window axis at the end:
    # (n_rows - timesteps + 1, ..., timesteps). The last window has no
    # following target, so it is dropped; the window axis is then moved to
    # position 1 and the result copied into a regular contiguous array.
    windows = np.lib.stride_tricks.sliding_window_view(data, timesteps, axis=0)
    X = np.ascontiguousarray(np.moveaxis(windows[:n_windows], -1, 1))
    y = targets[timesteps:timesteps + n_windows].copy()
    return X, y

# Choose the number of timesteps (here: 10 rows per window)
timesteps = 10

# Build the windowed sequences for the training and testing data.
# NOTE(review): only X_test_seq / y_test_seq are used by the evaluation cell
# that follows; confirm whether the training sequences are still needed.
X_train_seq, y_train_seq = create_sequences(X_train_selected, y_train, timesteps)
X_test_seq, y_test_seq = create_sequences(X_test_selected, y_test, timesteps)

In [None]:
# Evaluate the model on the test sequences
y_pred_prob = np.asarray(rnn_model.predict(X_test_seq))
if y_pred_prob.ndim > 1 and y_pred_prob.shape[-1] > 1:
    # One score per class: take the most likely class
    y_pred_classes = y_pred_prob.argmax(axis=-1)
else:
    # Single probability per sample: threshold at 0.5 and flatten to 1-D so
    # predictions have the same shape as the true labels
    y_pred_classes = (y_pred_prob > 0.5).astype(int).ravel()
y_test_classes = y_test_seq

# Pass the label ids explicitly so the report still lines up with
# target_names when the evaluation fold does not contain every class.
# Assumes the encoder maps classes_[k] to the integer k (scikit-learn's
# LabelEncoder convention) -- TODO confirm for the loaded encoder.
class_ids = np.arange(len(label_encoder.classes_))

# Show the evaluation result
print("\nClassification Report:")
print(classification_report(
    y_test_classes,
    y_pred_classes,
    labels=class_ids,
    target_names=label_encoder.classes_,
))