In [3]:
from scipy.io import arff
import pandas as pd
import numpy as np
import os

from f01_preprocess_datasets import datasets

In [4]:
def read_data(dataset):
    return pd.read_csv(f'./datasets/{dataset}.csv')

# Carpetas de datasets

## 0. CIC-IDS2017: https://www.unb.ca/cic/datasets/ids-2017.html

In [5]:
data = read_data(datasets[0])

In [6]:
data

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3056483,53,32215,4,2,112,152,28,28,28.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3056484,53,324,2,2,84,362,42,42,42.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3056485,58030,82,2,1,31,6,31,0,15.5,21.92031,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3056486,53,1048635,6,2,192,256,32,32,32.0,0.00000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [7]:
data.dtypes.unique()

array([dtype('int64'), dtype('float64'), dtype('O')], dtype=object)

In [8]:
data.dtypes[data.dtypes == 'O']

 Label    object
dtype: object

In [9]:
data[' Label'].value_counts()

BENIGN                        2370815
DDoS                           256054
DoS Hulk                       231073
PortScan                       158930
DoS GoldenEye                   10293
FTP-Patator                      7938
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1966
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name:  Label, dtype: int64

In [10]:
data[' Label'].unique().shape

(15,)

In [11]:
np.sum(data[' Label'] == 'BENIGN')/data.shape[0]

0.7756663857342152

### Observaciones

Los datos son numéricos, a excepción del Label, por lo que están listos para ser utilizados en modelos de clasificación. Se puede utilizar una clasificación binaria (BENIGN, OTROS) o una clasificación múltiple con todos los tipos de ataque.

## 1. UNSW-NB15: https://research.unsw.edu.au/projects/unsw-nb15-dataset

### Result CSV file

In [12]:
data = read_data(datasets[1])

  return pd.read_csv(f'./datasets/{dataset}.csv')


In [13]:
data

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,﻿59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0,3,7,1,3,1,1,1,,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3240043,59.166.0.5,33094,149.171.126.7,43433,tcp,FIN,0.087306,320,1828,31,...,,1,2,3,3,1,1,3,,0
3240044,59.166.0.7,20848,149.171.126.4,21,tcp,CON,0.365058,456,346,31,...,2,2,2,2,2,2,2,2,,0
3240045,59.166.0.3,21511,149.171.126.9,21,tcp,CON,6.335154,1802,2088,31,...,2,2,2,4,2,2,2,2,,0
3240046,59.166.0.9,35433,149.171.126.0,80,tcp,CON,2.200934,3498,166054,31,...,,1,1,2,4,2,2,2,,0


In [14]:
data.dtypes.unique()

array([dtype('O'), dtype('float64'), dtype('int64')], dtype=object)

In [15]:
data.dtypes[data.dtypes == 'O']

srcip         object
sport         object
dstip         object
dsport        object
proto         object
state         object
service       object
ct_ftp_cmd    object
attack_cat    object
dtype: object

In [16]:
data['attack_cat'].value_counts()

Generic             223003
Exploits             49934
 Fuzzers             19195
DoS                  17520
 Reconnaissance      12228
 Fuzzers             10102
Reconnaissance        3518
Analysis              3203
Backdoor              1795
 Shellcode            1288
Backdoors             1068
Shellcode              446
Worms                  198
Name: attack_cat, dtype: int64

In [17]:
data['attack_cat'].unique().shape

(14,)

In [18]:
np.sum(data['Label'] == 0)/data.shape[0]

0.8939836693777377

### Observaciones

-Se tiene algunos datos string que deberán ser mapeados a números para poder procesar el dataset dentro de un modelo de clasificación. Se puede utilizar una clasificación binaria con la columna Label (Ataque: 1, No ataque: 0) o una clasificación múltiple con la columna de attack_cat

## 2. NF-UNSW-NB15-v2: https://staff.itee.uq.edu.au/marius/NIDS_datasets/#RA1

Contiene un unico archivo csv con información de flujos etiquetados con un tipo de ataque.

In [19]:
data = read_data(datasets[2])

In [20]:
data

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,59.166.0.5,1305,149.171.126.8,21,6,1.0,9,1,193,3,...,0,7240,0,0,0,0,0,331.0,0,Benign
1,59.166.0.5,1305,149.171.126.8,21,6,1.0,261,5,469,7,...,8688,8688,18944,74,0,0,0,230.0,0,Benign
2,59.166.0.5,1305,149.171.126.8,21,6,1.0,481,9,750,11,...,10136,10136,33792,132,0,0,0,229.0,0,Benign
3,59.166.0.5,1305,149.171.126.8,21,6,1.0,701,13,1054,15,...,11584,11584,48640,190,0,0,0,125.0,0,Benign
4,59.166.0.5,1305,149.171.126.8,21,6,1.0,1031,19,1474,21,...,14480,13032,64256,251,0,0,0,230.0,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390270,59.166.0.4,58663,149.171.126.1,5190,6,0.0,1064,12,2364,14,...,10136,10136,29696,116,0,0,0,0.0,0,Benign
2390271,59.166.0.6,60977,149.171.126.4,53,17,0.0,146,2,178,2,...,0,0,0,0,19348,1,60,0.0,0,Benign
2390272,59.166.0.6,54553,149.171.126.9,80,6,7.0,994,10,8896,10,...,10136,7240,7424,29,0,0,0,0.0,0,Benign
2390273,59.166.0.6,55026,149.171.126.5,8248,6,0.0,4014,68,60268,70,...,44888,14480,8960,35,0,0,0,0.0,0,Benign


In [21]:
data.dtypes.unique()

array([dtype('O'), dtype('int64'), dtype('float64')], dtype=object)

In [22]:
data.dtypes[data.dtypes == 'O']

IPV4_SRC_ADDR    object
IPV4_DST_ADDR    object
Attack           object
dtype: object

In [23]:
data['Label'].value_counts()

0    2295222
1      95053
Name: Label, dtype: int64

In [24]:
data['Attack'].value_counts()

Benign            2295222
Exploits            31551
Fuzzers             22310
Generic             16560
Reconnaissance      12779
DoS                  5794
Analysis             2299
Backdoor             2169
Shellcode            1427
Worms                 164
Name: Attack, dtype: int64

In [25]:
data['Attack'].unique().shape

(10,)

In [26]:
np.sum(data['Attack'] == 'Benign') / data.shape[0]

0.9602334459424124

### Observaciones

Además del tipo de ataque, únicamente las direcciones IP de fuente y destino son strings. Se puede utilizar una clasificación binaria con la columna Label (Ataque: 1, No ataque: 0) o una clasificación múltiple con la columna de Attack

## 3. CSE-CIC-IDS2018: https://www.unb.ca/cic/datasets/ids-2018.html

In [27]:
data = read_data(datasets[3])

In [28]:
data

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,443,6,02/03/2018 08:47:38,141385,9,7,553,3773.0,202,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1,49684,6,02/03/2018 08:47:38,281,2,1,38,0.0,38,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
2,443,6,02/03/2018 08:47:40,279824,11,15,1086,10527.0,385,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
3,443,6,02/03/2018 08:47:40,132,2,0,0,0.0,0,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
4,443,6,02/03/2018 08:47:41,274016,9,13,1285,6141.0,517,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,3389,6,02/03/2018 02:08:18,3982183,14,8,1442,1731.0,725,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1048571,3389,6,02/03/2018 02:08:22,3802316,14,8,1440,1731.0,725,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1048572,3389,6,02/03/2018 02:08:25,4004239,14,8,1459,1731.0,741,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign
1048573,3389,6,02/03/2018 02:08:29,3998435,14,8,1459,1731.0,741,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Benign


In [29]:
data.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [30]:
data.dtypes[data.dtypes == 'O']

Timestamp    object
Label        object
dtype: object

In [31]:
data['Label'].value_counts()

Benign    762384
Bot       286191
Name: Label, dtype: int64

In [32]:
np.sum(data['Label'] == 'Benign') / data.shape[0]

0.7270667334239325

## 4. NSL-KDD: https://www.unb.ca/cic/datasets/nsl.html

In [33]:
data = read_data(datasets[4])

In [34]:
data

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,difficulty
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148512,0,tcp,smtp,SF,794,333,0,0,0,0,...,0.72,0.06,0.01,0.01,0.01,0.00,0.00,0.00,normal,21
148513,0,tcp,http,SF,317,938,0,0,0,0,...,1.00,0.00,0.01,0.01,0.01,0.00,0.00,0.00,normal,21
148514,0,tcp,http,SF,54540,8314,0,0,0,2,...,1.00,0.00,0.00,0.00,0.00,0.00,0.07,0.07,back,15
148515,0,udp,domain_u,SF,42,42,0,0,0,0,...,0.99,0.01,0.00,0.00,0.00,0.00,0.00,0.00,normal,21


In [35]:
data.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [36]:
data.dtypes[data.dtypes == 'O']

protocol_type    object
service          object
flag             object
class            object
dtype: object

In [37]:
data['class'].value_counts().shape

(40,)

In [38]:
np.sum(data['class'] == 'normal') / data.shape[0]

0.5188227610307238