# Step 1: Cleanup

In [19]:
#!/usr/bin/env python

import pandas as pd
from functools import reduce

import ipaddress
filename = "ADA_Network.csv"
file1 = pd.read_csv(filename)
file1.head(10)
file1.isnull().sum

# step-1 to replace all null
update_file = file1.fillna(" ")
update_file.isnull().sum()

update_file.to_csv('cleaned_'+filename, index = False)

# step-2 to remove all rows with null value
update_file = file1.fillna(0)

# step-3 to convert tcp.flag, ip.dst, ip.src to integer
update_file['tcp.flags'] = update_file['tcp.flags'].apply(lambda x: int(str(x), 16))

update_file['ip.dst'] = update_file['ip.dst'].apply(lambda x: int(ipaddress.IPv4Address(x.split(",")[0].strip())) if isinstance(x, str) else x)
update_file['ip.src'] = update_file['ip.src'].apply(lambda x: int(ipaddress.IPv4Address(x.split(",")[0].strip())) if isinstance(x, str) else x)

update_file['ip.len'] = update_file['ip.len'].apply(lambda x: int(x.split(",")[0].strip() if isinstance(x, str) else x))
update_file['ip.ttl'] = update_file['ip.ttl'].apply(lambda x: int(x.split(",")[0].strip() if isinstance(x, str) else x))

update_file['udp.port'] = update_file['udp.port'].apply(lambda x: int(str(x).replace(',', '')))

update_file['ip.flags.df'] = update_file['ip.flags.df'].apply(lambda x: 0 if x == 'Not set' else 1)
update_file['ip.flags.mf'] = update_file['ip.flags.mf'].apply(lambda x: 0 if x == 'Not set' else 1)

update_file['ip.proto'] = update_file['ip.proto'].apply(lambda x: 6 if x == 'TCP' else (17 if x == 'UDP' else 999))

update_file['ip.fragment'] = update_file['ip.fragment'].apply(lambda x: int(x.replace(',', '')) if isinstance(x, str) else x)
update_file['ip.fragments'] = update_file['ip.fragments'].apply(lambda x: 999 if isinstance(x, str) else x)

# Function to replace non-ASCII characters with a specific number
def replace_special_character(text, replacement_number):
    if isinstance(text, str):
        return text.replace('�', str(replacement_number))
    else:
        return text

update_file['tcp.segments'] = update_file['tcp.segments'].apply(lambda x: replace_special_character(x, 9999))

# Convert columns to numeric after replacing special characters
update_file['tcp.segments'] = pd.to_numeric(update_file['tcp.segments'], errors='coerce')

# Convert True/False to int
update_file['http.request'] = update_file['http.request'].astype(int)

update_file = update_file.fillna(0)
update_file.to_csv('cleaned_'+filename, index = False)

# Step 2: Label

In [None]:
import sys
import csv

file_name = sys.argv[1]
label = sys.argv[2]

file = open(file_name)
content = csv.reader(file)
row0 = next(content)
row0.append('label')
all = []
all.append(row0)
for item in content:
    item.append(label)
    all.append(item)

new_file = open(label+'_'+ file_name, 'w')
writer = csv.writer(new_file, lineterminator='\n')
writer.writerows(all)

# Step 3: Aggregate

In [21]:
import os
import pandas as pd

data_directory = './'

# # List of dataset files
# dataset_files = [
#     'benign_update_benign.csv',
#     'bruteforce_update_bruteforce.csv',
#     'ddos_update_ddos.csv',
#     'probe_update_nmap.csv',
#     'sqlattack_update_sqlattack.csv'
# ]
dataset_files = [
    'bruteforce_cleaned_bruteforce.csv',
    'ipdos_cleaned_ipdos.csv',
    'sql_cleaned_sql.csv'
]

# Create an empty DataFrame to store the aggregated data
master_dataset = pd.DataFrame()

# Iterate through the dataset files and concatenate them
for file in dataset_files:
    file_path = os.path.join(data_directory, file)
    df = pd.read_csv(file_path)
    master_dataset = pd.concat([master_dataset, df], ignore_index=True)

# Save the aggregated dataset to a new CSV file
master_dataset.to_csv('master_dataset.csv', index=False)
print("Master dataset has been created: master_dataset.csv")

Master dataset has been created: master_dataset.csv


# Step 4: Train

In [22]:
import pandas as pd
import numpy as np


data = pd.read_csv("master_dataset.csv")
data

Unnamed: 0,ip.src,ip.dst,ip.len,ip.flags.df,ip.flags.mf,ip.fragment,ip.fragment.count,ip.fragments,ip.ttl,ip.proto,...,tcp.analysis.ack_rtt,tcp.segments,tcp.reassembled.length,http.request,udp.port,frame.time_relative,frame.time_delta,tcp.time_relative,tcp.time_delta,label
0,168430081,168430335,78.0,0,0,0.0,0.0,0.0,128.0,17,...,0.000000,0.0,0.0,0,137137,0.000000,0.000000,0.000000,0.000000,bruteforce
1,168430081,168430335,78.0,0,0,0.0,0.0,0.0,128.0,17,...,0.000000,0.0,0.0,0,137137,0.749872,0.749872,0.000000,0.000000,bruteforce
2,168430081,168430335,78.0,0,0,0.0,0.0,0.0,128.0,17,...,0.000000,0.0,0.0,0,137137,1.502095,0.752223,0.000000,0.000000,bruteforce
3,168430210,168430082,64.0,1,0,0.0,0.0,0.0,64.0,17,...,0.000000,0.0,0.0,0,5297953,2.321336,0.819241,0.000000,0.000000,bruteforce
4,168430210,168430082,64.0,1,0,0.0,0.0,0.0,64.0,17,...,0.000000,0.0,0.0,0,5297953,2.322052,0.000717,0.000000,0.000000,bruteforce
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73313,168430081,168430210,47.0,1,0,0.0,0.0,0.0,128.0,6,...,0.002058,0.0,0.0,0,0,840.934497,0.002058,275.343675,0.002058,sql
73314,168430210,168430081,40.0,1,0,0.0,0.0,0.0,64.0,6,...,0.000036,0.0,0.0,0,0,840.934533,0.000036,275.343711,0.000036,sql
73315,168430210,168430081,43.0,1,0,0.0,0.0,0.0,64.0,6,...,0.000000,0.0,0.0,0,0,865.938311,25.003779,300.347490,25.003779,sql
73316,168430081,168430210,47.0,1,0,0.0,0.0,0.0,128.0,6,...,0.001657,0.0,0.0,0,0,865.939968,0.001657,300.349146,0.001657,sql


In [30]:
data.groupby('label').size()

label
bruteforce     5090
ipdos         66818
sql            1410
dtype: int64

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

X = data[data.columns[:-1]]
y = data[data.columns[-1]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [32]:
from sklearn.preprocessing import StandardScaler

log_reg_model = Pipeline([
    ("imputer", SimpleImputer()),
    ("scale", StandardScaler()),
    ("log_reg", LogisticRegression(max_iter=1000))
])

log_reg_model.fit(X_train, y_train)
log_reg_model.score(X_test, y_test)

0.9977904470934832

In [26]:
from sklearn.neural_network import MLPClassifier

mlp_model = Pipeline([
    ("imputer", SimpleImputer()),
    ("scale", StandardScaler()),
    ("mlp", MLPClassifier())
])

mlp_model.fit(X_train, y_train)
mlp_model.score(X_test, y_test)

0.9979087106746681

In [27]:
from sklearn.metrics import confusion_matrix

y_pred = log_reg_model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 1500,     4,    23],
       [    2, 20040,     4],
       [   18,     4,   401]])