# Description
This notebook utilizes the KDD 99 Cup dataset (https://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html) to analyze the behavior of attacker and normal connections by focusing on the most relevant variables. The resulting distributions will be applied in agent-based modeling. Note that this analysis specifically considers only "smurf" type attacks.

In [None]:
# Python
import os

# Plotting
import matplotlib.pyplot as plt

# Data handling
import pandas as pd
import numpy as np

# Machine learning modeling
from sklearn.datasets import fetch_kddcup99
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Get data

## Fetch data from Sklearn repositories

Fetch the data from the scikit-learn datasets and save it to a parquet file for later use. This saves time and memory when working with large datasets, since only the filtered subset has to be loaded afterwards.

In [246]:
reload_data = False

# Location of the cached 'smurf' + 'normal' subset used by the rest of the notebook.
data_path = os.path.join('data', 'input', 'kddcup99_data_smurf_&_normal.parquet')

if reload_data:
    # Load the KDD Cup 99 10% dataset
    kddcup99_data = fetch_kddcup99(percent10=True)

    # Create a DataFrame with the features plus the target label
    df_raw = pd.DataFrame(kddcup99_data.data, columns=kddcup99_data.feature_names)
    df_raw['target'] = kddcup99_data.target

    # Keep only 'smurf' and 'normal' connections. These are the only classes considered in this notebook.
    df_filtered = df_raw[df_raw['target'].isin([b'smurf.', b'normal.'])]

    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    df_filtered.to_parquet(data_path)

# Always read df back from the cached file so it holds the same filtered subset
# whether or not the data was just re-fetched.
# NOTE(review): reading back presumably also normalizes column dtypes compared with
# the freshly fetched array - confirm against the fetch_kddcup99 documentation.
df = pd.read_parquet(data_path)

# Create distributions for Agent-based Modeling

This code analyzes the data and finds the most relevant variables to predict whether an entry is a smurf attack or not. For each of these variables, we will find a distribution that can be used to model agents that simulate the behavior of attackers.

## Get most relevant variables

In [247]:
# Add an integer-coded "<column>_encoded" companion for every object-dtype column.
categorical_cols = df.select_dtypes(include='object').columns
le = LabelEncoder()
for col in categorical_cols:
    col_encoded = f'{col}_encoded'
    # The single encoder is re-fitted on each column in turn.
    encoded_values = le.fit_transform(df[col])
    df[col_encoded] = encoded_values

In [248]:
# Columns that must not be used as features: the raw categorical columns
# and the encoded target itself.
cols_to_delete = [*categorical_cols, 'target_encoded']

# Features (X) are everything else; the label (y) is the encoded target.
y = df['target_encoded']
X = df.drop(columns=cols_to_delete)

# Hold out 95% of the rows for testing; the remaining 5% form the training set.
# NOTE(review): test_size=0.95 is unusually large - confirm it is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.95,
    random_state=42,
)

In [249]:
# Build a random forest with a fixed seed and fit it on the training split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X=X_train, y=y_train)

In [250]:
# Rank the features by the importance reported by the fitted forest, highest first.
feature_importances = pd.DataFrame(
    clf.feature_importances_,
    index=X.columns,
    columns=["Importance"],
).sort_values("Importance", ascending=False)

# Show the ten highest-ranked features.
print("\nTop 10 Important Features:")
print(feature_importances.head(10))


Top 10 Important Features:
                             Importance
srv_count                      0.223064
count                          0.172157
protocol_type_encoded          0.163645
dst_host_same_src_port_rate    0.108634
dst_bytes                      0.079667
service_encoded                0.063947
src_bytes                      0.061102
logged_in                      0.048471
dst_host_srv_count             0.021383
dst_host_count                 0.016728


In [251]:
# Rows of the full frame that ended up in the training split.
df_train = df.loc[X_train.index]

# Split the training rows by class label.
is_normal = df_train['target'] == b'normal.'
is_smurf = df_train['target'] == b'smurf.'
df_train_normal = df_train[is_normal]
df_train_smurf = df_train[is_smurf]

## Distributions generation

### srv_count

In [252]:
# Summary statistics of srv_count for the normal connections in the training split.
df_train_normal['srv_count'].describe()

count    4806.000000
mean       11.101748
std        23.039274
min         1.000000
25%         1.000000
50%         4.000000
75%        13.000000
max       461.000000
Name: srv_count, dtype: float64

In [253]:
# Summary statistics of srv_count for the smurf connections in the training split.
df_train_smurf['srv_count'].describe()

count    14097.000000
mean       507.015961
std         18.402039
min         13.000000
25%        511.000000
50%        511.000000
75%        511.000000
max        511.000000
Name: srv_count, dtype: float64

In [254]:
def save_dist_to_parquet(series, file_path, file_name):
    """Save the empirical distribution of ``series`` as a parquet file.

    The distribution is a two-column table: ``values`` holds the sorted unique
    values of ``series`` and ``probs`` the fraction of entries equal to each.

    Parameters
    ----------
    series : array-like
        Observations whose relative frequencies are computed.
    file_path : str
        Directory in which the file is written. It is created (including
        parents) when it does not exist. An empty string means the current
        working directory.
    file_name : str
        Name of the parquet file inside ``file_path``.

    Returns
    -------
    None
    """
    values, counts = np.unique(series, return_counts=True)
    probs = counts / len(series)
    distribution = pd.DataFrame({'values': values, 'probs': probs})

    # Create the output directory up front so a fresh checkout does not fail on write.
    if file_path:
        os.makedirs(file_path, exist_ok=True)

    # os.path.join inserts the separator only when file_path lacks a trailing one.
    distribution.to_parquet(os.path.join(file_path, file_name))

In [255]:
# Persist the srv_count distribution of each class for the agent-based model.
save_dist_to_parquet(
    series=df_train_normal['srv_count'],
    file_path='distributions/normal/',
    file_name='srv_count.parquet',
)
save_dist_to_parquet(
    series=df_train_smurf['srv_count'],
    file_path='distributions/smurf/',
    file_name='srv_count.parquet',
)

### count

In [256]:
# Summary statistics of count for the normal connections in the training split.
df_train_normal['count'].describe()

count    4806.000000
mean        8.282772
std        18.663841
min         1.000000
25%         1.000000
50%         3.000000
75%        10.000000
max       461.000000
Name: count, dtype: float64

In [257]:
# Summary statistics of count for the smurf connections in the training split.
df_train_smurf['count'].describe()


count    14097.000000
mean       507.018444
std         18.402478
min         13.000000
25%        511.000000
50%        511.000000
75%        511.000000
max        511.000000
Name: count, dtype: float64

In [258]:
# Persist the count distribution of each class for the agent-based model.
save_dist_to_parquet(
    series=df_train_normal['count'],
    file_path='distributions/normal/',
    file_name='count.parquet',
)
save_dist_to_parquet(
    series=df_train_smurf['count'],
    file_path='distributions/smurf/',
    file_name='count.parquet',
)

### protocol_type

In [259]:
# Number of normal training connections per protocol.
df_train_normal['protocol_type'].value_counts()

protocol_type
b'tcp'     3795
b'udp'      949
b'icmp'      62
Name: count, dtype: int64

In [260]:
# Number of smurf training connections per protocol.
df_train_smurf['protocol_type'].value_counts()

protocol_type
b'icmp'    14097
Name: count, dtype: int64

In [261]:
# Share of each protocol among the normal training connections.
normal_protocol_counts = df_train_normal['protocol_type'].value_counts()
normal_protocol_share = normal_protocol_counts / len(df_train_normal['protocol_type'])
pd.DataFrame(normal_protocol_share).to_parquet('distributions/normal/protocol_type.parquet')

# Share of each protocol among the smurf training connections.
smurf_protocol_counts = df_train_smurf['protocol_type'].value_counts()
smurf_protocol_share = smurf_protocol_counts / len(df_train_smurf['protocol_type'])
pd.DataFrame(smurf_protocol_share).to_parquet('distributions/smurf/protocol_type.parquet')

### dst_host_same_src_port_rate

In [262]:
# Summary statistics of dst_host_same_src_port_rate for the normal connections in the training split.
df_train_normal['dst_host_same_src_port_rate'].describe()


count    4806.000000
mean        0.128583
std         0.273220
min         0.000000
25%         0.000000
50%         0.010000
75%         0.070000
max         1.000000
Name: dst_host_same_src_port_rate, dtype: float64

In [263]:
# Summary statistics of dst_host_same_src_port_rate for the smurf connections in the training split.
df_train_smurf['dst_host_same_src_port_rate'].describe()


count    14097.000000
mean         0.999913
std          0.006518
min          0.420000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: dst_host_same_src_port_rate, dtype: float64

In [264]:
# Persist the dst_host_same_src_port_rate distribution of each class for the agent-based model.
save_dist_to_parquet(
    series=df_train_normal['dst_host_same_src_port_rate'],
    file_path='distributions/normal/',
    file_name='dst_host_same_src_port_rate.parquet',
)
save_dist_to_parquet(
    series=df_train_smurf['dst_host_same_src_port_rate'],
    file_path='distributions/smurf/',
    file_name='dst_host_same_src_port_rate.parquet',
)

### service

In [265]:
# Number of normal training connections per service.
df_train_normal['service'].value_counts()


service
b'http'        3042
b'smtp'         472
b'private'      368
b'domain_u'     311
b'other'        257
b'ftp_data'     192
b'urp_i'         28
b'finger'        27
b'eco_i'         20
b'ftp'           19
b'ntp_u'         15
b'ecr_i'         14
b'telnet'        13
b'auth'           9
b'IRC'            7
b'pop_3'          7
b'time'           4
b'domain'         1
Name: count, dtype: int64

In [266]:
# Number of smurf training connections per service.
df_train_smurf['service'].value_counts()


service
b'ecr_i'    14097
Name: count, dtype: int64

In [267]:
# Share of each service among the normal training connections.
normal_service_counts = df_train_normal['service'].value_counts()
normal_service_share = normal_service_counts / len(df_train_normal['service'])
pd.DataFrame(normal_service_share).to_parquet('distributions/normal/service.parquet')

# Share of each service among the smurf training connections.
smurf_service_counts = df_train_smurf['service'].value_counts()
smurf_service_share = smurf_service_counts / len(df_train_smurf['service'])
pd.DataFrame(smurf_service_share).to_parquet('distributions/smurf/service.parquet')

### src_bytes

In [268]:
# Summary statistics of src_bytes for the normal connections in the training split.
df_train_normal['src_bytes'].describe()


count    4.806000e+03
mean     2.519129e+03
std      6.380968e+04
min      0.000000e+00
25%      1.470000e+02
50%      2.320000e+02
75%      3.130000e+02
max      2.194619e+06
Name: src_bytes, dtype: float64

In [269]:
# Summary statistics of src_bytes for the smurf connections in the training split.
df_train_smurf['src_bytes'].describe()

count    14097.000000
mean       936.115769
std        199.754276
min        520.000000
25%       1032.000000
50%       1032.000000
75%       1032.000000
max       1032.000000
Name: src_bytes, dtype: float64

In [270]:
# Persist the src_bytes distribution of each class for the agent-based model.
save_dist_to_parquet(
    series=df_train_normal['src_bytes'],
    file_path='distributions/normal/',
    file_name='src_bytes.parquet',
)
save_dist_to_parquet(
    series=df_train_smurf['src_bytes'],
    file_path='distributions/smurf/',
    file_name='src_bytes.parquet',
)

### diff_srv_rate

In [271]:
# Summary statistics of diff_srv_rate for the normal connections in the training split.
df_train_normal['diff_srv_rate'].describe()


count    4806.000000
mean        0.014667
std         0.104712
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: diff_srv_rate, dtype: float64

In [272]:
# Summary statistics of diff_srv_rate for the smurf connections in the training split.
df_train_smurf['diff_srv_rate'].describe()


count    14097.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: diff_srv_rate, dtype: float64

In [273]:
# Persist the diff_srv_rate distribution of each class for the agent-based model.
save_dist_to_parquet(
    series=df_train_normal['diff_srv_rate'],
    file_path='distributions/normal/',
    file_name='diff_srv_rate.parquet',
)
save_dist_to_parquet(
    series=df_train_smurf['diff_srv_rate'],
    file_path='distributions/smurf/',
    file_name='diff_srv_rate.parquet',
)

### same_srv_rate

In [274]:
# Summary statistics of same_srv_rate for the normal connections in the training split.
df_train_normal['same_srv_rate'].describe()

count    4806.000000
mean        0.988288
std         0.083480
min         0.020000
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: same_srv_rate, dtype: float64

In [275]:
# Summary statistics of same_srv_rate for the smurf connections in the training split.
df_train_smurf['same_srv_rate'].describe()


count    14097.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
Name: same_srv_rate, dtype: float64

In [276]:
# Persist the same_srv_rate distribution of each class for the agent-based model.
save_dist_to_parquet(
    series=df_train_normal['same_srv_rate'],
    file_path='distributions/normal/',
    file_name='same_srv_rate.parquet',
)
save_dist_to_parquet(
    series=df_train_smurf['same_srv_rate'],
    file_path='distributions/smurf/',
    file_name='same_srv_rate.parquet',
)