# Normalize customized dataset

This notebook normalizes the customized dataset and saves the normalized data to disk.

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import matplotlib.patches as mpatches
from sklearn.preprocessing import normalize

# Loading Benign data

In [None]:
# Benign capture: every flow read from this file is assigned class 0.
DATASET_PATH = '../datasets/'
# Add the class column, then discard the address/port/timing columns and the
# capture's own 'DoH' column so only the remaining columns plus 'label' are kept.
df_benign = (
    pd.read_csv(os.path.join(DATASET_PATH, 'all-benign-chrome.csv'))
    .assign(label=0)
    .drop(columns=[
        'SourceIP',
        'DestinationIP',
        'TimeStamp',
        'SourcePort',
        'DestinationPort',
        'Duration',
        'DoH',
    ])
)
print(df_benign.shape)

In [None]:
# Preview the first rows of the benign frame (label column already added).
display(df_benign.head())

# Loading Malicious data

In [None]:
# Tunnel capture: every flow read from this file is assigned class 1
# (the class this notebook plots as "Malicious").
DATASET_PATH = '../datasets/'
# Add the class column, then discard the same address/port/timing columns and
# 'DoH' column that are removed from the benign frame, so both frames line up.
df_malign = (
    pd.read_csv(os.path.join(DATASET_PATH, '27072024-tunnel.csv'))
    .assign(label=1)
    .drop(columns=[
        'SourceIP',
        'DestinationIP',
        'TimeStamp',
        'SourcePort',
        'DestinationPort',
        'Duration',
        'DoH',
    ])
)
print(df_malign.shape)

In [None]:
# Preview the first rows of the malicious frame (label column already added).
display(df_malign.head())

# Merging and shuffling benign and malicious data

In [None]:
# Stack both classes into one frame, then shuffle the rows so the two classes
# are interleaved. A fixed seed makes the row order -- and therefore the CSV
# files written at the end of the notebook -- identical on every
# "Restart & Run All".
RANDOM_SEED = 42
data = shuffle(
    pd.concat([df_benign, df_malign], ignore_index=True),
    random_state=RANDOM_SEED,
)

In [None]:
# Sanity check on the merged, shuffled frame: overall shape, then the first rows.
print(data.shape)
display(data.head())

In [None]:
# Per-column count of missing values in the merged frame.
data.isna().sum()

In [None]:
# Drop every ROW that has a missing value in any column
# (these are the DataFrame.dropna defaults, spelled out for clarity).
data = data.dropna(axis=0, how='any')
print(data.shape)
# Re-count missing values per column; every entry is zero after the drop above.
data.isna().sum()

# Data Exploration and Visualization
## 0: Benign, 1: Malicious

In [None]:
# One colour and one legend caption per class, defined once and shared by the
# bars and the hand-built legend.
class_colors = ['blue', 'orange']
class_captions = ['0: Benign', '1: Malicious']

# Bar chart of how many rows fall into each class.
sns.countplot(data=data, x='label', hue='label', palette=class_colors)

# Replace the automatic legend with explicit, human-readable class captions.
legend_patches = [
    mpatches.Patch(color=patch_color, label=caption)
    for patch_color, caption in zip(class_colors, class_captions)
]
plt.legend(handles=legend_patches)
plt.show()

# Normalizing the data

In [None]:
# y is the class column; X is every other column of the merged frame.
y = data['label']
X = data.drop(columns='label')

In [None]:
# L2-normalize along axis=1, i.e. each row (sample) is scaled independently to
# unit Euclidean length across its features.
# The normalized values are wrapped back into a DataFrame; the column names are
# taken from X itself rather than from a positional slice of `data`, so they
# stay correct even if 'label' is not the last column of `data`.
X = pd.DataFrame(normalize(X, norm='l2', axis=1), columns=X.columns)

In [None]:
# Show the normalized feature frame.
display(X)

In [None]:
# Replace y's existing index with a fresh 0..n-1 one (the old index is discarded).
# NOTE(review): presumably so y lines up row-for-row with the rebuilt X frame — confirm.
y = y.reset_index(drop=True)
display(y)

In [None]:
# Persist features and labels as comma-separated files in the current working
# directory. Both files include the row index as their first column.
# NOTE(review): the two file names use different stems ('x-e-valente' vs
# 'y-evalente'); kept as-is because downstream readers may depend on them.
X.to_csv('x-e-valente-custom-normalized.csv')
y.to_csv('y-evalente-custom-normalized.csv', header=['label'])

In [None]:
# Preview the first rows of the feature frame that was written to disk.
X.head()

# Feature Names

In [None]:
# List every column of the merged frame with its positional index.
# The class column is identified by name instead of by a hard-coded position,
# so the "(not considered as feature)" marker stays attached to 'label'
# regardless of how many feature columns the dataset has.
for idx, name in enumerate(data.columns):
    if name == 'label':
        print(f'{idx} -> {name} (not considered as feature)')
    else:
        print(f'{idx} -> {name}')

In [None]:
# Pearson correlation matrix of the normalized features.
corr = X.corr('pearson')
feature_names = list(X.columns)  # tick labels come from X itself, not a fixed-width slice of `data`
tick_positions = range(len(feature_names))

# Draw directly on the axes created here. pyplot.matshow with an explicit
# figure number adds a second axes to that figure, which would leave this one
# empty underneath the heat map.
fig, ax = plt.subplots(figsize=(20, 20))
im = ax.matshow(corr)

# One tick per feature on both axes; x labels are rotated so they do not overlap.
ax.set_xticks(tick_positions)
ax.set_xticklabels(feature_names, fontsize=10, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(feature_names, fontsize=10)

cb = fig.colorbar(im, ax=ax)
cb.ax.tick_params(labelsize=14)
ax.set_title('Correlation Matrix', fontsize=16)
plt.show()

# End of the experiments.