In [None]:
# Mount Google Drive into the Colab VM; later cells read and write under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras.metrics as metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# Column name -> dtype. This mapping is handed to pd.read_csv both as `dtype`
# and (through its keys) as `usecols`, so only the entries that are NOT
# commented out are loaded.
cols = {

    # ---- columns currently loaded ----
    'src_port': float,
    'dst_port': float,
    'proto': 'category',
    'label': int,

    # ---- disabled columns: uncomment an entry to load it ----
    # (presumably all present in the CSV -- verify before enabling)

    # connection
    # 'ts': int,
    # 'src_ip': 'category',
    # 'dst_ip': 'category',
    # 'service': 'category',
    # 'duration': float,
    # 'src_bytes': float,
    # 'dst_bytes': float,
    # 'conn_state': 'category',
    # 'missed_bytes': float,

    # packet statistics
    # 'src_pkts': float,
    # 'src_ip_bytes': float,
    # 'dst_pkts': float,
    # 'dst_ip_bytes': float,

    # DNS
    # 'dns_query': 'category',
    # 'dns_qclass': float,
    # 'dns_qtype': float,
    # 'dns_rcode': float,
    # 'dns_AA': float, #bool
    # 'dns_RD': float, #bool
    # 'dns_rejected': float, #bool

    # SSL
    # 'ssl_version': 'category',
    # 'ssl_cipher': 'category',
    # 'ssl_resumed': float, #bool
    # 'ssl_established': float, #bool
    # 'ssl_subject': 'category',
    # 'ssl_issuer': 'category',

    # HTTP
    # 'http_trans_depth': float,
    # 'http_method': 'category',
    # 'http_uri': 'category',
    # 'http_version': 'category',
    # 'http_request_body_len': float,
    # 'http_response_body_len': float,
    # 'http_status_code': float,
    # 'http_user_agent': 'category',
    # 'http_orig_mime_types': 'category',
    # 'http_resp_mime_types': 'category',

    # "weird" records
    # 'weird_name': 'category',
    # 'weird_addl': 'category',
    # 'weird_notice': float, #bool

}

def bool_to_number(x: str) -> float:
  """Convert a 'T' / 'F' / '-' flag string into a float.

  Args:
    x: Raw cell text; expected to be exactly 'T', 'F' or '-'.

  Returns:
    1.0 for 'T', 0.0 for 'F' and 0.5 for '-'.

  Raises:
    ValueError: If ``x`` is not one of the three recognised strings.
  """
  flag_values = {'T': 1.0, 'F': 0.0, '-': 0.5}
  try:
    return flag_values[x]
  except (KeyError, TypeError):
    # The previous `raise x` tried to raise a plain str, which itself fails with
    # "TypeError: exceptions must derive from BaseException" and hides the bad value.
    raise ValueError(f"unexpected boolean flag value: {x!r}") from None

# Per-column parsers for pd.read_csv: every listed flag column is parsed by
# bool_to_number.
converters = {
    flag_column: bool_to_number
    for flag_column in (
        'dns_AA',
        'dns_RD',
        'dns_rejected',
        'ssl_resumed',
        'ssl_established',
        'weird_notice',
    )
}

In [None]:
# Read only the columns declared in `cols`, using their declared dtypes;
# '-' is registered as a missing-value marker.
df = pd.read_csv(
    '/content/drive/Shareddrives/datasci/ton_iot/Train_Test_Network.csv',
    sep=',',
    header=0,
    usecols=list(cols),
    dtype=cols,
    converters=converters,
    na_values=['-'],
)

In [None]:
# Show the loaded frame as this cell's output.
df

Unnamed: 0,src_port,dst_port,proto,label
0,1883.0,52976.0,tcp,0
1,47260.0,15600.0,udp,0
2,1880.0,51782.0,tcp,0
3,34296.0,10502.0,tcp,0
4,46608.0,53.0,udp,0
...,...,...,...,...
461038,33108.0,80.0,tcp,1
461039,37242.0,443.0,tcp,1
461040,4444.0,49178.0,tcp,1
461041,60816.0,443.0,tcp,1


In [None]:
# Input column names: every column of df except the 'label' target.
features = df.columns.tolist()
features.remove('label')

In [None]:
# Make every column numeric and NaN-free, one column at a time:
#   * category columns are replaced by their integer category codes;
#   * all other columns get their NaNs filled according to how many distinct
#     values (NaN counted as one) the column holds.
for name, dtype in dict(df.dtypes).items():
  if dtype == 'category':
    df[name] = df[name].cat.codes.astype(int)
    continue

  distinct = list(df[name].unique())
  n_distinct = len(distinct)

  # should not happen in the full dataset, otherwise would be removed in 1st step of preproc
  if n_distinct == 1 and np.isnan(distinct[0]):
    print('error',name,'const column with NaN. Replacing with 0')
    df[name] = 0
  elif n_distinct == 2 and (np.isnan(distinct[0]) or np.isnan(distinct[1])):
    # Exactly one real value plus NaN: fill NaN with a value that differs from
    # the observed one (its negation, or 1 when the observed value is 0).
    observed = distinct[0] if not np.isnan(distinct[0]) else distinct[1]
    filler = 1 if observed == 0 else -observed
    df[name] = df[name].fillna(value=filler)
  else:
    # General case: mean imputation.
    df[name] = df[name].fillna(df[name].mean())

In [None]:
# Split first, then fit the scaler on the training rows only. Fitting
# StandardScaler on the full matrix before splitting lets the test rows
# influence the mean/variance used for scaling (train/test leakage).
X_unscaled = df.drop('label',axis=1).values

X_train, X_test, Y_train, Y_test = train_test_split(X_unscaled,
                                                    df['label'].values,
                                                    test_size=0.3,
                                                    random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # statistics come from the training rows only
X_test = scaler.transform(X_test)        # test rows reuse the training statistics

# Keep `X` defined for the cells that follow: the full feature matrix,
# scaled with the train-fitted scaler.
X = scaler.transform(X_unscaled)

In [None]:
# Fully connected binary classifier: an input sized to the feature count,
# four Dense(64, relu) hidden layers and a single sigmoid output unit.
model = tf.keras.models.Sequential(
    [tf.keras.Input(shape=(X.shape[1],))]
    + [tf.keras.layers.Dense(64, activation='relu') for _ in range(4)]
    + [tf.keras.layers.Dense(1, activation='sigmoid')]
)

# Binary cross-entropy loss with Adam; accuracy, precision and recall are tracked.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', metrics.Precision(), metrics.Recall()],
)

In [None]:
# Train for 20 epochs on the training split; `history` holds whatever fit() returns.
history = model.fit(x=X_train, y=Y_train, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Model outputs for the held-out test rows.
predict = model.predict(x=X_test, verbose=1)



In [None]:
# Turn the sigmoid outputs into hard 0/1 labels by rounding; kept as a plain
# list because a later cell iterates over `predictn`.
predictn = predict.flatten().round().tolist()

# Confusion counts, with label 1 as the positive class.
tp,tn,fp,fn = 0,0,0,0
for predicted, actual in zip(predictn, Y_test):
  if predicted==1 and actual==1:
    tp+=1
  elif predicted==0 and actual==0:
    tn+=1
  elif predicted==1 and actual==0:
    # Predicted positive but actually negative -> false positive.
    # (These two branches were previously swapped, so fp and fn were reported
    # under each other's name.)
    fp+=1
  elif predicted==0 and actual==1:
    # Predicted negative but actually positive -> false negative.
    fn+=1

print(tp,tn,fp,fn)
classification_report(Y_test, predictn, output_dict=True)

44025 85585 4316 4387


{'0': {'f1-score': 0.951615862302847,
  'precision': 0.951991635243212,
  'recall': 0.9512403858978349,
  'support': 89972},
 '1': {'f1-score': 0.9100493007968746,
  'precision': 0.9093819714120466,
  'recall': 0.9107176103100888,
  'support': 48341},
 'accuracy': 0.9370774981382806,
 'macro avg': {'f1-score': 0.9308325815498608,
  'precision': 0.9306868033276292,
  'recall': 0.9309789981039618,
  'support': 138313},
 'weighted avg': {'f1-score': 0.9370881667878902,
  'precision': 0.9370993708916154,
  'recall': 0.9370774981382806,
  'support': 138313}}

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                256       
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 12,801
Trainable params: 12,801
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Persist the trained model, then write the test-set predictions as a single
# comma-separated line of integers terminated by a newline.
model.save('/content/drive/Shareddrives/datasci/ton_iot/DNN.tf')
with open('/content/drive/Shareddrives/datasci/ton_iot/result/DNN.csv','w') as outfile:
  # str.join builds the line in one pass and needs no trailing-comma trimming.
  outfile.write(','.join(str(int(pred)) for pred in predictn) + '\n')

INFO:tensorflow:Assets written to: /content/drive/Shareddrives/datasci/ton_iot/DNN.tf/assets
