In [1]:
!!pip show scikit-learn

['Name: scikit-learn',
 'Version: 1.2.0',
 'Summary: A set of python modules for machine learning and data mining',
 'Home-page: http://scikit-learn.org',
 'Author: ',
 'Author-email: ',
 'License: new BSD',
 'Location: /Users/boramert/opt/anaconda3/lib/python3.9/site-packages',
 'Requires: joblib, numpy, scipy, threadpoolctl',
 'Required-by: pyod, scikit-learn-intelex']

In [2]:
import sys
sys.path.append('/Users/boramert/opt/anaconda3/lib/python3.9/site-packages')

In [3]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import preprocessing



In [4]:
def labelencoder(ds):
    encoded_ds=ds.copy()
    le = preprocessing.LabelEncoder()
    for col in encoded_ds.select_dtypes(include=['object']).columns:
        encoded_ds[col]=le.fit_transform(encoded_ds[col])
    
    return encoded_ds

def encoding_for_anomalous(ds):
    encoded_ds=ds.copy()
    for i in range(len(ds)):
        if encoded_ds.iloc[i]=='BENIGN':
            encoded_ds.iloc[i]= 0
        else:
            encoded_ds.iloc[i]= 1
            
    return encoded_ds  

def scaling(ds):
    scaler=preprocessing.StandardScaler()
    scaled_ds=scaler.fit_transform(ds)
    return scaled_ds

def drop_infs(ds):
    ds.replace([np.inf, -np.inf], np.nan, inplace=True)
    ds.dropna(how='any', inplace=True)
    return ds

In [5]:
pd.set_option("display.max_rows", None,
             "display.max_columns", None)

In [6]:
PATH="MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv"
normal_ds=pd.read_csv(PATH)

In [7]:
normal_ds=drop_infs(normal_ds)
normal_ds=labelencoder(normal_ds)
normal_y_train=normal_ds[' Label']
normal_x_train=normal_ds.drop(' Label', axis=1) #Label'ı yani y'yi dropluyoruz
normal_x_train=scaling(normal_x_train)

In [8]:
test_ds=pd.read_csv("test_dataset.csv")

In [9]:
test_ds=drop_infs(test_ds)
y_test=test_ds[' Label']
x_test=test_ds.drop(' Label', axis=1)
x_test=scaling(x_test)

In [10]:
y_test.value_counts()

0    1741839
1     556556
Name:  Label, dtype: int64

In [11]:
normal_x_train = np.asarray(normal_x_train).astype(np.float32)
x_test = np.asarray(x_test).astype(np.float32)

normal_x_train = tf.cast(normal_x_train, tf.float32)
x_test = tf.cast(x_test, tf.float32)

2022-12-12 23:40:47.900619: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
input = tf.keras.layers.Input(shape=(78,))

In [13]:
encoder = tf.keras.Sequential([
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(8, activation='relu')])(input)

decoder = tf.keras.Sequential([
    layers.Dense(16, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(78, activation='sigmoid')])(encoder)

In [14]:
autoencoder = tf.keras.Model(inputs=input, outputs=decoder)

In [15]:
autoencoder.compile(optimizer='adam', loss='mae')

In [16]:
history = autoencoder.fit(normal_x_train, normal_x_train, 
          epochs=10, 
          batch_size=64,
          validation_data=(x_test, x_test),
          shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
reconstructions = autoencoder.predict(normal_x_train)
train_loss = tf.keras.losses.mae(reconstructions, normal_x_train)



In [18]:
threshold = np.mean(train_loss) + np.std(train_loss)
print("Threshold: ", threshold)

Threshold:  0.49923417


In [19]:
def predict(model, data, threshold):
  reconstructions = model(data)
  loss = tf.keras.losses.mae(reconstructions, data)
  return tf.math.less(loss, threshold)

def print_stats(predictions, labels):
  print("Accuracy = {}".format(accuracy_score(labels, predictions)))
  print("Precision = {}".format(precision_score(labels, predictions)))
  print("Recall = {}".format(recall_score(labels, predictions)))

In [20]:
preds = predict(autoencoder, x_test, threshold)
print_stats(preds, y_test.astype(bool))

Accuracy = 0.2087369664483259
Precision = 0.19605112764716395
Recall = 0.7313352115510389


In [24]:
confusion_matrix(y_test.astype(bool), preds)

array([[  72731, 1669108],
       [ 149527,  407029]])