In [41]:
# Standard library
from pathlib import Path

# Plotting
import matplotlib.pyplot as plt

# Data handling
import numpy as np
import pandas as pd

# Machine learning modeling
from sklearn.datasets import fetch_kddcup99
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Get data

## Fetch data from Sklearn repositories

Obtain the data from the scikit-learn datasets module and save it to a Parquet file for later use. This saves some memory when dealing with large datasets.

In [42]:
# Set to True to re-download the dataset and rebuild the local cache.
reload_data = False

# Location of the cached, filtered subset of the KDD Cup 99 data.
data_path = Path('data/kddcup99_data_smurf_&_normal_&_neptune.parquet')

if reload_data:
    # Load the KDD Cup 99 10% dataset
    kddcup99_data = fetch_kddcup99(percent10=True)

    # Create a DataFrame
    df = pd.DataFrame(kddcup99_data.data, columns=kddcup99_data.feature_names)
    df['target'] = kddcup99_data.target

    # Keep only 'normal' traffic plus the 'smurf' and 'neptune' attacks.
    # These are the only classes considered in this notebook.
    df_filtered = df[df['target'].isin([b'smurf.', b'normal.', b'neptune.'])]

    # to_parquet does not create missing directories, so make sure it exists.
    data_path.parent.mkdir(parents=True, exist_ok=True)
    df_filtered.to_parquet(data_path)

# Always read back from the cache so that both branches leave `df` as the
# same filtered frame (previously a reload left `df` unfiltered).
df = pd.read_parquet(data_path)

# Create distributions for Agent-based Modeling

This section analyzes the data and finds the most relevant variables for predicting whether an entry is normal traffic, a smurf attack, or a neptune attack. For each of these variables, we will build a distribution per class that can be used to model agents that simulate the behavior of attackers (and of normal traffic).

## Get most relevant variables

In [43]:
# Encode categorical variables.
# A separate LabelEncoder is kept per column in `encoders`, so the integer
# codes written to the `*_encoded` columns can be mapped back to the original
# labels with `encoders[col].inverse_transform(...)`. Refitting one shared
# encoder would discard the mapping of every column but the last.
encoders = {}
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    col_encoded = f'{col}_encoded'
    le = LabelEncoder()
    df[col_encoded] = le.fit_transform(df[col])
    encoders[col] = le

In [44]:
# Columns excluded from the feature matrix: the raw categorical columns
# (their encoded copies are kept) and the encoded label itself.
cols_to_delete = [*categorical_cols, 'target_encoded']

# Features (X) and target (y)
X = df.drop(columns=cols_to_delete)
y = df['target_encoded']

# Hold out 95% of the rows; only the remaining 5% is used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.95,
    random_state=42,
)

In [45]:
# Random forest with default hyperparameters and a fixed seed,
# fitted on the training split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X=X_train, y=y_train)

In [46]:
# Rank the features by the importance the fitted forest assigns to them
feature_importances = (
    pd.Series(clf.feature_importances_, index=X.columns, name="Importance")
    .to_frame()
    .sort_values(by="Importance", ascending=False)
)

# Show the ten highest-ranked features
print("\nTop 10 Important Features:")
print(feature_importances.head(10))


Top 10 Important Features:
                             Importance
count                          0.161772
srv_count                      0.138192
protocol_type_encoded          0.110158
diff_srv_rate                  0.086317
src_bytes                      0.083344
same_srv_rate                  0.077351
dst_host_same_src_port_rate    0.075588
service_encoded                0.051253
dst_host_same_srv_rate         0.034839
flag_encoded                   0.034213


In [47]:
# Rows of the full frame that belong to the training split,
# then one subset per traffic class.
df_train = df.loc[X_train.index]
df_train_normal, df_train_smurf, df_train_neptune = (
    df_train[df_train['target'] == label]
    for label in (b'normal.', b'smurf.', b'neptune.')
)

## Distributions generation

### srv_count

In [48]:
# Summary statistics of srv_count for normal-traffic rows in the training split
df_train_normal['srv_count'].describe()

count    4852.000000
mean       10.638293
std        19.769055
min         1.000000
25%         1.000000
50%         4.000000
75%        13.000000
max       325.000000
Name: srv_count, dtype: float64

In [49]:
# Summary statistics of srv_count for smurf rows in the training split
df_train_smurf['srv_count'].describe()

count    14083.000000
mean       506.946247
std         17.782580
min          7.000000
25%        511.000000
50%        511.000000
75%        511.000000
max        511.000000
Name: srv_count, dtype: float64

In [50]:
# Summary statistics of srv_count for neptune rows in the training split
df_train_neptune['srv_count'].describe()

count    5328.000000
mean       11.030405
std         6.184254
min         1.000000
25%         6.000000
50%        11.000000
75%        16.000000
max        25.000000
Name: srv_count, dtype: float64

In [51]:
def save_dist_to_parquet(series, file_path, file_name):
    """Save the empirical distribution of ``series`` as a parquet file.

    The distribution is stored as a two-column table: ``values`` holds the
    sorted unique values of ``series`` and ``probs`` their relative
    frequency (occurrences divided by ``len(series)``).

    Parameters
    ----------
    series : array-like
        Observations whose distribution is saved.
    file_path : str
        Prefix of the output location. It is concatenated with ``file_name``
        as-is, so a directory must end with a path separator.
    file_name : str
        Name of the parquet file to write.
    """
    values, counts = np.unique(series, return_counts=True)
    distribution = pd.DataFrame({'values': values, 'probs': counts / len(series)})

    # Plain concatenation is kept so existing callers resolve to the same file.
    target = f'{file_path}{file_name}'
    # to_parquet does not create missing directories; do it here so the call
    # works on a fresh checkout.
    Path(target).parent.mkdir(parents=True, exist_ok=True)
    distribution.to_parquet(target)

In [52]:
# Persist the srv_count distribution of each traffic class
for frame, out_dir in (
    (df_train_normal, 'distributions/normal/'),
    (df_train_smurf, 'distributions/smurf/'),
    (df_train_neptune, 'distributions/neptune/'),
):
    save_dist_to_parquet(frame['srv_count'], out_dir, 'srv_count.parquet')

### count

In [53]:
# Summary statistics of the `count` feature for normal-traffic rows in the training split
df_train_normal['count'].describe()

count    4852.000000
mean        7.967642
std        15.450975
min         1.000000
25%         1.000000
50%         3.000000
75%        10.000000
max       326.000000
Name: count, dtype: float64

In [54]:
# Summary statistics of the `count` feature for smurf rows in the training split
df_train_smurf['count'].describe()


count    14083.000000
mean       506.948306
std         17.782968
min          7.000000
25%        511.000000
50%        511.000000
75%        511.000000
max        511.000000
Name: count, dtype: float64

In [55]:
# Summary statistics of the `count` feature for neptune rows in the training split
df_train_neptune['count'].describe()

count    5328.000000
mean      188.874437
std        69.395625
min         1.000000
25%       126.000000
50%       205.000000
75%       250.000000
max       301.000000
Name: count, dtype: float64

In [56]:
# Persist the distribution of the `count` feature for each traffic class
for frame, out_dir in (
    (df_train_normal, 'distributions/normal/'),
    (df_train_smurf, 'distributions/smurf/'),
    (df_train_neptune, 'distributions/neptune/'),
):
    save_dist_to_parquet(frame['count'], out_dir, 'count.parquet')

### protocol_type_encoded

In [57]:
# Occurrences of each encoded protocol type among normal-traffic training rows
df_train_normal['protocol_type_encoded'].value_counts()

protocol_type_encoded
1    3797
2     990
0      65
Name: count, dtype: int64

In [58]:
# Occurrences of each encoded protocol type among smurf training rows
df_train_smurf['protocol_type_encoded'].value_counts()

protocol_type_encoded
0    14083
Name: count, dtype: int64

In [59]:
# Occurrences of each encoded protocol type among neptune training rows
df_train_neptune['protocol_type_encoded'].value_counts()

protocol_type_encoded
1    5328
Name: count, dtype: int64

In [60]:
# Persist the relative frequency of each encoded protocol type, one file per
# traffic class (index = encoded value, single frequency column).
for frame, out_file in (
    (df_train_normal, 'distributions/normal/protocol_type_encoded.parquet'),
    (df_train_smurf, 'distributions/smurf/protocol_type_encoded.parquet'),
    (df_train_neptune, 'distributions/neptune/protocol_type_encoded.parquet'),
):
    encoded = frame['protocol_type_encoded']
    pd.DataFrame(encoded.value_counts() / len(encoded)).to_parquet(out_file)

### dst_host_same_src_port_rate

In [61]:
# Summary statistics of dst_host_same_src_port_rate for normal-traffic rows in the training split
df_train_normal['dst_host_same_src_port_rate'].describe()


count    4852.000000
mean        0.131849
std         0.279559
min         0.000000
25%         0.000000
50%         0.010000
75%         0.070000
max         1.000000
Name: dst_host_same_src_port_rate, dtype: float64

In [62]:
# Summary statistics of dst_host_same_src_port_rate for smurf rows in the training split
df_train_smurf['dst_host_same_src_port_rate'].describe()


count    14083.000000
mean         0.999589
std          0.017600
min          0.030000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: dst_host_same_src_port_rate, dtype: float64

In [63]:
# Summary statistics of dst_host_same_src_port_rate for neptune rows in the training split
df_train_neptune['dst_host_same_src_port_rate'].describe()

count    5328.000000
mean        0.000094
std         0.002905
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         0.200000
Name: dst_host_same_src_port_rate, dtype: float64

In [64]:
# Persist the dst_host_same_src_port_rate distribution of each traffic class
for frame, out_dir in (
    (df_train_normal, 'distributions/normal/'),
    (df_train_smurf, 'distributions/smurf/'),
    (df_train_neptune, 'distributions/neptune/'),
):
    save_dist_to_parquet(
        frame['dst_host_same_src_port_rate'],
        out_dir,
        'dst_host_same_src_port_rate.parquet',
    )

### service_encoded

In [65]:
# Occurrences of each encoded service among normal-traffic training rows
df_train_normal['service_encoded'].value_counts()


service_encoded
22    3056
49     494
44     381
11     321
40     278
19     186
60      25
13      21
14      19
17      17
18      16
39      13
55       8
3        8
58       3
42       3
10       1
0        1
1        1
Name: count, dtype: int64

In [66]:
# Occurrences of each encoded service among smurf training rows
df_train_smurf['service_encoded'].value_counts()


service_encoded
14    14083
Name: count, dtype: int64

In [67]:
# Occurrences of each encoded service among neptune training rows
df_train_neptune['service_encoded'].value_counts()

service_encoded
44    5063
22      15
55      13
52      10
17       9
48       9
58       9
9        7
19       7
26       7
40       7
18       7
24       7
49       6
42       6
20       6
30       6
10       6
37       6
50       5
15       5
53       5
2        5
64       5
36       5
4        5
31       5
62       5
28       5
27       5
33       4
8        4
35       4
34       4
51       4
61       3
47       3
16       3
23       3
25       3
63       3
38       3
46       3
32       3
3        3
6        2
41       2
5        2
21       2
7        2
54       2
12       2
29       2
43       1
Name: count, dtype: int64

In [68]:
# Persist the relative frequency of each encoded service, one file per
# traffic class (index = encoded value, single frequency column).
for frame, out_file in (
    (df_train_normal, 'distributions/normal/service_encoded.parquet'),
    (df_train_smurf, 'distributions/smurf/service_encoded.parquet'),
    (df_train_neptune, 'distributions/neptune/service_encoded.parquet'),
):
    encoded = frame['service_encoded']
    pd.DataFrame(encoded.value_counts() / len(encoded)).to_parquet(out_file)

### src_bytes

In [69]:
# Summary statistics of src_bytes for normal-traffic rows in the training split
df_train_normal['src_bytes'].describe()


count    4.852000e+03
mean     1.128623e+03
std      3.252131e+04
min      0.000000e+00
25%      1.467500e+02
50%      2.300000e+02
75%      3.130000e+02
max      2.194619e+06
Name: src_bytes, dtype: float64

In [70]:
# Summary statistics of src_bytes for smurf rows in the training split
df_train_smurf['src_bytes'].describe()

count    14083.000000
mean       935.256976
std        200.439687
min        520.000000
25%       1032.000000
50%       1032.000000
75%       1032.000000
max       1032.000000
Name: src_bytes, dtype: float64

In [71]:
# Summary statistics of src_bytes for neptune rows in the training split
df_train_neptune['src_bytes'].describe()

count    5328.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         0.0
Name: src_bytes, dtype: float64

In [72]:
# Persist the src_bytes distribution of each traffic class
for frame, out_dir in (
    (df_train_normal, 'distributions/normal/'),
    (df_train_smurf, 'distributions/smurf/'),
    (df_train_neptune, 'distributions/neptune/'),
):
    save_dist_to_parquet(frame['src_bytes'], out_dir, 'src_bytes.parquet')

### diff_srv_rate

In [73]:
# Summary statistics of diff_srv_rate for normal-traffic rows in the training split
df_train_normal['diff_srv_rate'].describe()


count    4852.000000
mean        0.017768
std         0.115248
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: diff_srv_rate, dtype: float64

In [74]:
# Summary statistics of diff_srv_rate for smurf rows in the training split
df_train_smurf['diff_srv_rate'].describe()


count    14083.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: diff_srv_rate, dtype: float64

In [75]:
# Summary statistics of diff_srv_rate for neptune rows in the training split
df_train_neptune['diff_srv_rate'].describe()

count    5328.000000
mean        0.065893
std         0.046911
min         0.000000
25%         0.060000
50%         0.060000
75%         0.070000
max         1.000000
Name: diff_srv_rate, dtype: float64

In [76]:
# Persist the diff_srv_rate distribution of each traffic class
for frame, out_dir in (
    (df_train_normal, 'distributions/normal/'),
    (df_train_smurf, 'distributions/smurf/'),
    (df_train_neptune, 'distributions/neptune/'),
):
    save_dist_to_parquet(frame['diff_srv_rate'], out_dir, 'diff_srv_rate.parquet')

### same_srv_rate

In [77]:
# Summary statistics of same_srv_rate for normal-traffic rows in the training split
df_train_normal['same_srv_rate'].describe()

count    4852.000000
mean        0.985472
std         0.093801
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: same_srv_rate, dtype: float64

In [78]:
# Summary statistics of same_srv_rate for smurf rows in the training split
df_train_smurf['same_srv_rate'].describe()


count    14083.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
Name: same_srv_rate, dtype: float64

In [79]:
# Summary statistics of same_srv_rate for neptune rows in the training split
df_train_neptune['same_srv_rate'].describe()

count    5328.000000
mean        0.070011
std         0.069336
min         0.000000
25%         0.030000
50%         0.060000
75%         0.090000
max         1.000000
Name: same_srv_rate, dtype: float64

In [80]:
# Persist the same_srv_rate distribution of each traffic class
for frame, out_dir in (
    (df_train_normal, 'distributions/normal/'),
    (df_train_smurf, 'distributions/smurf/'),
    (df_train_neptune, 'distributions/neptune/'),
):
    save_dist_to_parquet(frame['same_srv_rate'], out_dir, 'same_srv_rate.parquet')