In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras import regularizers, Sequential

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score

import matplotlib.pyplot as plt
from matplotlib import gridspec

from scipy import stats

import seaborn as sns

%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
pd.set_option("display.max_rows", None,
             "display.max_columns", None)

In [2]:
def drop_infs(ds):
    ds.replace([np.inf, -np.inf], np.nan, inplace=True)
    ds.dropna(how='any', inplace=True)
    return ds

def labelencoder(ds):
    encoded_ds=ds.copy()
    le = preprocessing.LabelEncoder()
    for col in encoded_ds.select_dtypes(include=['object']).columns:
        encoded_ds[col]=le.fit_transform(encoded_ds[col])
    
    return encoded_ds

In [3]:
PATH="/Users/ecemdenizbabaoglan/Desktop/TOBBETU/yap470/test_dataset_hepsi.csv"

In [4]:
ds=pd.read_csv(PATH)

In [5]:
ds.shape

(2830743, 79)

In [6]:
ds=drop_infs(ds)

In [7]:
ds.loc[ds[' Label'] == 'BENIGN', ' Label'] = 0
normal=ds.loc[ds[' Label'] == 0, :]

In [8]:
normal_y=normal[' Label'].copy()
normal_x=normal.drop(' Label', axis=1)

In [9]:
ds.loc[ds[' Label'] != 0, ' Label'] = 1
anomaly=ds.loc[ds[' Label'] == 'BENIGN', :] 

In [10]:
anomaly_y=anomaly[' Label'].copy()
anomaly_x=anomaly.drop(' Label', axis=1)

In [11]:
normal_x_train, normal_x_test, normal_y_train, normal_y_test = train_test_split(normal_x, normal_y, 
                                                                                test_size=0.20)

In [12]:
normal_x_train=labelencoder(normal_x_train)

scaler=preprocessing.StandardScaler()
normal_x_train=scaler.fit_transform(normal_x_train)

In [13]:
normal_x_train = np.asarray(normal_x_train).astype(np.float32)
normal_x_train = tf.cast(normal_x_train, tf.float32)

In [14]:
test_x=pd.concat([normal_x_test, anomaly_x])
test_x=labelencoder(test_x)

scaler=preprocessing.StandardScaler()
test_x=scaler.fit_transform(test_x)

In [15]:
test_y=pd.concat([normal_y_test, anomaly_y])

In [16]:
test_x = np.asarray(test_x).astype(np.float32)
test_x = tf.cast(test_x, tf.float32)

In [17]:
input = tf.keras.layers.Input(shape=(78,))

In [18]:
encoder = tf.keras.Sequential([
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(8, activation='relu')])(input)

In [19]:
decoder = tf.keras.Sequential([
    layers.Dense(16, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(78, activation='sigmoid')])(encoder)

In [20]:
autoencoder = tf.keras.Model(inputs=input, outputs=decoder)

In [21]:
autoencoder.compile(optimizer='adam', loss=tf.keras.losses.MeanSquaredError())

In [22]:
history = autoencoder.fit(normal_x_train, normal_x_train, 
          epochs=11,
          batch_size=64,
          validation_split=0.1,
          shuffle=True)

2022-12-16 23:30:43.953883: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-12-16 23:30:43.954127: W tensorflow/core/platform/profile_utils/cpu_utils.cc:126] Failed to get CPU frequency: 0 Hz


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [23]:
reconstructions = autoencoder.predict(normal_x_train)
train_loss = tf.keras.losses.mae(reconstructions, normal_x_train)

In [24]:
threshold = np.mean(train_loss) + np.std(train_loss)
print("Threshold: ", threshold)

Threshold:  0.47152823


In [25]:
def predict(model, data, threshold):
  reconstructions = model(data)
  loss = tf.keras.losses.mae(reconstructions, data)
  return tf.math.less(loss, threshold)

def print_stats(predictions, labels):
  print("Accuracy = {}".format(accuracy_score(labels, predictions)))
  print("Precision = {}".format(precision_score(labels, predictions)))
  print("Recall = {}".format(recall_score(labels, predictions)))

In [26]:
preds = predict(autoencoder, test_x , threshold)
print_stats(preds, test_y.astype(bool))

Accuracy = 0.06730227356779318
Precision = 0.0
Recall = 0.0


  _warn_prf(average, modifier, msg_start, len(result))
