In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
import plotly.express as px
import matplotlib.pyplot as plt

dataset_path = "/Users/marc-andrerauscher/Library/CloudStorage/OneDrive-bwedu/2.Semester/1.Master/Innovation_and_Transfer_Competence/datasets/api_behavior_ext.csv"
show_tables = True  # debugging option to control the displaying of the new datasets/tables

In [2]:
dt = pd.read_csv(dataset_path)
dt  # debugging table plotting

Unnamed: 0.1,Unnamed: 0,_id,inter_api_access_duration(sec),api_access_uniqueness,sequence_length(count),vsession_duration(min),ip_type,behavior,behavior_type,num_sessions,num_users,num_unique_apis,source
0,0,024aae60-1a81-3f37-bbe6-3f832c919706,7.010387,0.419355,31.000000,13040,default,outlier,outlier,1.0,1.0,13.0,E
1,1,028d67dd-c6d0-329f-a20e-78db9eab7a55,51.419393,0.252336,107.000000,330113,default,outlier,outlier,1.0,1.0,27.0,E
2,2,02d12bf9-5fe2-3d0c-b233-30e02224b686,25.860775,0.275000,40.000000,62066,default,outlier,outlier,1.0,1.0,11.0,E
3,3,0b3aee1e-dc3b-3728-bc55-f57a23446b3d,0.205909,0.818182,11.000000,136,default,outlier,outlier,1.0,1.0,9.0,E
4,4,0dbfffb4-3ed4-3cb2-904d-c348501a996e,0.122125,0.812500,16.000000,118,default,outlier,outlier,1.0,1.0,13.0,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34418,34418,e564f100-0a97-3dd8-8b55-fb1e801bd228,0.030952,0.016275,245.777778,4108,default,Normal,normal,99.0,9.0,36.0,F
34419,34419,e61bda8b-bff3-37dd-ab57-6f44e11ea4e8,2.300500,0.300000,30.000000,4141,private_ip,outlier,outlier,5.0,1.0,9.0,F
34420,34420,eddf6a8c-277c-36c3-affd-2b2209c2b1b9,0.393229,0.044610,8.966667,6347,default,Normal,normal,30.0,30.0,12.0,F
34421,34421,f311b468-610e-31f5-b8d8-1a2495a14cdc,0.394731,0.166667,108.000000,2558,default,outlier,outlier,4.0,1.0,18.0,F


In [3]:
# 0. Step. Get the behavior distribution
behavior_list = dt['behavior_type']
counter = {}

for elem in behavior_list:
    if elem not in counter:
        counter[elem] = 1
    else:
        counter[elem] += 1

print(counter)

{'outlier': 24146, 'bot': 1309, 'normal': 8946, 'attack': 22}


In [4]:
# 1. Step: Replace all strintgs with integers

In [5]:
# 1.1 Change the 'ip_type'
# Problem: The dataset preview does not show all options of the 'ip_type' attribute.
# Gather all possible options for the 'ip_type' fields

ip_type_values = list(dt['ip_type'].unique())
ip_type_values

['default', 'private_ip', 'datacenter', 'google_bot']

In [6]:
# 1.1 Change the 'ip_type'
# Replace the values with integers.

ip_type_replacement_matrix = {
    'default': 0,
    'private_ip': 1,
    'datacenter': 2,
    'google_bot': 3
}
# new dataset with replaced 'ip_type' value
dt.ip_type = dt.ip_type.replace(ip_type_replacement_matrix)
# dt  # debugging table plotting

In [7]:
# 1.2 Change the 'behavior_tpye' argument
# Replace the values with integers.
# Problem: The dataset preview does not show all options of the 'behavior_type' attribute.
# Gather all possible options for the 'behavior_tpye' fields

ip_type_values = list(dt['behavior_type'].unique())
ip_type_values

['outlier', 'bot', 'normal', 'attack']

In [8]:
behavior_type_replacement_matrix = {
    'outlier': 0,
    'normal': 1,
    'bot': 2,
    'attack': 3
}
# new dataset with replaced 'behavior_type' value
dt.behavior_type = dt.behavior_type.replace(behavior_type_replacement_matrix)
labels = dt.behavior_type
# dt  # debugging table plotting

In [9]:
# 2. Step: Drop the 'behavior' columns/argutment, because it is just an subset of 'bevavior_type' and the 
# dataset only requires one label

dt = dt.drop(columns='behavior')
# dt  # debugging table plotting

In [10]:
# 3. Step: Drop for the analysis unnecessary columns/fields.
# The following columns must be dropped: Unnamed, _id, source

dt = dt.drop(columns=['Unnamed: 0', '_id', 'source'])
# dt  # debugging table plotting

In [11]:
# 4. Step: Normalisation of the data
scaler = preprocessing.StandardScaler().fit(dt)
dt_scaled = pd.DataFrame(scaler.transform(dt))
dt_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.173915,-0.082972,-0.226693,-0.207404,-0.207572,-0.613171,-0.136526,-0.248597,-0.162274
1,0.396571,-0.635466,0.277944,3.762410,-0.207572,-0.613171,-0.136526,-0.248597,0.749431
2,0.068240,-0.560496,-0.166934,0.406411,-0.207572,-0.613171,-0.136526,-0.248597,-0.292517
3,-0.261327,1.236342,-0.359493,-0.368964,-0.207572,-0.613171,-0.136526,-0.248597,-0.422761
4,-0.262403,1.217546,-0.326293,-0.369189,-0.207572,-0.613171,-0.136526,-0.248597,-0.162274
...,...,...,...,...,...,...,...,...,...
34418,-0.263574,-1.416355,1.199425,-0.319234,-0.207572,1.201721,1.370177,0.511804,1.335527
34419,-0.234419,-0.477796,-0.233333,-0.318821,4.089025,-0.613171,-0.075028,-0.248597,-0.422761
34420,-0.258920,-1.322624,-0.372994,-0.291201,-0.207572,1.201721,0.309335,2.507858,-0.227396
34421,-0.258901,-0.918861,0.284584,-0.338640,-0.207572,-0.613171,-0.090403,-0.248597,0.163335


In [12]:
# Problem when normalizing the data
# The dataset seems to consist of some fields/values which are not processible.
try:
    dt_normalized = preprocessing.normalize(dt_scaled, norm='l2')
except ValueError as err:
    print(err)

Input contains NaN, infinity or a value too large for dtype('float64').


In [13]:
# With the following function, "NaN" values can be searched in the dataset.
# As the results show, the dataset actually contains "NaN" values.
np.where(np.isnan(dt_scaled))

(array([10642, 10642]), array([0, 1]))

In [14]:
# Solution of the "NaN Problem":
# Replace the all "NaN" values with 0. Consequently, the dataset should be able to
# be processed/mormalized.
# https://datascience.stackexchange.com/questions/11928/valueerror-input-contains-nan-infinity-or-a-value-too-large-for-dtypefloat32
dt_scaled = pd.DataFrame(dt_scaled).fillna(0)

# By searching for "NaN" values again, the solution can be verified.
# The empty arrays confirm the successful replacement.
np.where(np.isnan(dt_scaled))

(array([], dtype=int64), array([], dtype=int64))

In [15]:
# Try the normalization again.
# This time it works without any problem.
dt_normalized_l2 = pd.DataFrame(preprocessing.normalize(dt_scaled, norm='l2'))
dt_normalized_l1 = pd.DataFrame(preprocessing.normalize(dt_scaled, norm='l1'))
dt_normalized_max = pd.DataFrame(preprocessing.normalize(dt_scaled, norm='max'))

In [16]:
# 5. Step: Apply a PCA on the dataset
pca = PCA(n_components=3)
dt_pca = pca.fit_transform(dt_normalized_l2)
dt_pca

array([[-0.4601873 , -0.36159466,  0.52949623],
       [ 0.31750311, -0.45048655, -0.12675545],
       [ 0.03044889, -0.43090376,  0.68952038],
       ...,
       [ 0.73194103,  0.33781095,  0.02293105],
       [ 0.35403605, -0.65022358,  0.52480121],
       [-0.84309692,  0.15453725, -0.1649403 ]])

In [17]:
# 6. Step: Plot the results of the PCA analysis
total_var = pca.explained_variance_ratio_.sum() * 100
fig = px.scatter_3d(
    dt_pca, x=0, y=1, z=2, color=(labels),
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3', 'color': 'Labels'}
)
fig.update_traces(marker=dict(size=1))
fig.write_html("pca_marc_approach_max.html")

In [18]:
# 7. Step: Clustering of the dataset

# scaler = preprocessing.StandardScaler()
# X_scaled = scaler.fit_transform()
dbscan = DBSCAN(eps=0.2, min_samples=6)
dt_clustered = dbscan.fit_predict(dt_normalized_l2)
dt_clustered = pd.DataFrame(dt_clustered)
dt_clustered = pd.DataFrame(labels)
dt_clustered

Unnamed: 0,behavior_type
0,0
1,0
2,0
3,0
4,0
...,...
34418,1
34419,0
34420,1
34421,0


In [20]:
# 8. Step: Plot the results of the clustering
total_var = pca.explained_variance_ratio_.sum() * 100
fig = px.scatter_3d(
    dt_clustered, x=0, y=1, z=2, color=(labels),
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3', 'color': 'Labels'}
)
fig.update_traces(marker=dict(size=1))
fig.write_html("pca_dbscan.html")

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['behavior_type'] but received: 0