In [97]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import L1
from sklearn.preprocessing import LabelEncoder
import numpy as np
from collections import Counter

In [98]:
train = pd.read_csv("../data/processed/traindata.csv")
test = pd.read_csv("../data/processed/testdata.csv")

train.head()

Unnamed: 0.1,Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,...,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,service_icmp
0,0,0,239,486,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,1,0,235,1337,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,2,0,219,1337,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,3,0,217,2032,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,4,0,217,2032,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [99]:
train.shape

(492998, 121)

In [6]:
test.head()

Unnamed: 0.1,Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,...,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,service_pm_dump,service_red_i,service_urh_i
0,0,0,105,146,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,0,105,146,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,5,0,29,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,6,0,105,146,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,8,0,223,185,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [100]:
test.shape

(292299, 121)

In [102]:
encoder = LabelEncoder()
train['labels'] = encoder.fit_transform(train['labels'])
test['labels'] = encoder.fit_transform(test['labels'])

In [103]:
train_x = train.drop("labels", axis=1)
train_y = train[["labels"]]
test_x = test.drop("labels", axis=1)
test_y = test[["labels"]]

train_labs = len(train_y.labels.unique())
test_labs = len(test_y.labels.unique())

assert train_labs == test_labs
assert not set(train.columns).symmetric_difference(set(test.columns))
assert set(train.labels) == set(test.labels)


In [105]:
model = Sequential(
    [tf.keras.Input(shape=(len(train_x.columns),)),
    Dense(units=60, name="L1", activation="relu", kernel_initializer="he_normal", kernel_regularizer=L1(0.01)),
    Dense(units=40, name="L2", activation="relu", kernel_initializer="he_normal", kernel_regularizer=L1(0.01)),
    Dense(units=train_labs, name="L3", activation="linear")], name="my_model"
)

model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 L1 (Dense)                  (None, 60)                7260      
                                                                 
 L2 (Dense)                  (None, 40)                2440      
                                                                 
 L3 (Dense)                  (None, 21)                861       
                                                                 
Total params: 10,561
Trainable params: 10,561
Non-trainable params: 0
_________________________________________________________________


In [106]:
l1, l2, l3 = model.layers

l1.get_weights()

[array([[ 0.05643423, -0.16491634,  0.00492953, ...,  0.09993041,
          0.12813663,  0.02654676],
        [-0.02323315, -0.2736648 ,  0.0280675 , ..., -0.05825223,
         -0.04402432, -0.03537798],
        [ 0.133852  ,  0.11454333,  0.00544056, ..., -0.00748676,
         -0.09658032,  0.02204844],
        ...,
        [-0.08946439,  0.07595997, -0.01344377, ..., -0.01873898,
         -0.07357429, -0.00733952],
        [ 0.09531335, -0.06815206,  0.11804809, ..., -0.19064245,
         -0.06754567,  0.04548772],
        [ 0.13314871, -0.10972337, -0.01450934, ...,  0.04219949,
          0.05573809, -0.07641761]], dtype=float32),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)]

In [108]:
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    metrics=["Accuracy", "SparseCategoricalCrossentropy"]
)

model.fit(train_x, train_y, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x31bf38400>

In [109]:
preds_test = model.predict(test_x)
preds_train = model.predict(train_x)




In [110]:
Counter(list(map(np.argmax, tf.nn.softmax(preds_test).numpy())))

Counter({18: 292299})

In [111]:
Counter(list(map(np.argmax, tf.nn.softmax(preds_train).numpy())))

Counter({18: 492998})

In [112]:
encoder.inverse_transform(np.array([11,18]))

array(['normal', 'smurf'], dtype=object)

In [113]:
train_y["labels"].value_counts()

18    280790
9     107201
11     97277
0       2203
17      1589
5       1247
15      1040
19       979
14       264
10       231
3         53
1         30
6         21
20        20
4         12
16        10
7          9
2          8
8          7
13         4
12         3
Name: labels, dtype: int64