# Libraries

In [9]:
import pandas as pd

# Read first_dataset.csv generated by subdataset notebook 

In [10]:
df = pd.read_csv('first_dataset.csv', low_memory=False, sep=',')

# Initial Analysis

# Checking correlations for DoS attack labeling

## Calculate correlation between frame delta and packet lenght

In [11]:
#There's exactly one value in this column that goes beyond a  float64 which is weird but is ignored, probably it's the datatype
frame_delta = pd.to_numeric(df['frame.time_delta'], errors='coerce') 
packet_lenght = df['ipv6.plen']

correlation = frame_delta.corr(packet_lenght)
print(correlation)

0.023931876963261337


### No correlation here, this is due to fragmentation of packets, as confirmed in the RFC8200 (https://tools.ietf.org/html/rfc8200) as of july 2017, that standardizes ipv6, the maximum packet size for ipv6 is the MTU (maximum transmiter unit) of the link, according to the same, the minimum MTU required for ipv6 implementation has to have 1280 octanes of throughput. The only time a big packet does not get fragmented is when it size meets or surpasses the MTU link in the latter case you have to have another link.

### Check if any row contains a packet larger than 1280 bytes

In [12]:
df.loc[df['ipv6.plen'] >= 1280*8]

Unnamed: 0,frame.time_delta,frame.time_epoch,frame.time_relative,ipv6.plen,ipv6.nxt,ipv6.src,ipv6.dst,tcp.srcport,tcp.dstport,eth.src,...,mqtt.msgtype,mqtt.passwd,mqtt.qos,mqtt.retain,mqtt.topic,mqtt.topic_len,mqtt.username,mqtt.willmsg,mqtt.willtopic,label


## Calculate correlation between frame delta and message lenght

In [13]:
frame_delta = pd.to_numeric(df['frame.time_delta'], errors='coerce') 
mqtt_lenght = pd.to_numeric(df['mqtt.len'], errors='coerce')

correlation = mqtt_lenght.corr(packet_lenght)
print(correlation)

0.9883822306471168


### Now we can observe some correlation here, the mqtt message lenght is associated with the delay between packets, which makes sense since the DoS packets have higher message sizes and causes the server to send packets with higher delays, this is a good correlation to prove and futher investigate

### Obs: Maybe the correlations are wrong even if they check with theory because of the excluded tuples and the data type conversion

### TODO: Check if the datatype conversion is wrong and if any of the tuples that is not included in the correlation affects those assumptions

### excluded tuples:

In [14]:
df['frame.time_delta'] = pd.to_numeric(df['frame.time_delta'], errors='coerce')
df[df['frame.time_delta'].isnull()]



Unnamed: 0,frame.time_delta,frame.time_epoch,frame.time_relative,ipv6.plen,ipv6.nxt,ipv6.src,ipv6.dst,tcp.srcport,tcp.dstport,eth.src,...,mqtt.msgtype,mqtt.passwd,mqtt.qos,mqtt.retain,mqtt.topic,mqtt.topic_len,mqtt.username,mqtt.willmsg,mqtt.willtopic,label
0,0.000000,1.573.757.292.769.270.000,0.000000000,68,6,fd9e:6c51:2336:0:6201:94ff:fe0e:877a,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,50728.0,1883.0,60:01:94:0e:87:7a,...,3,,0,1,ufpi/ppgcc/esp-dht22-node07/temperatura,39,,,,normal
1,0.000023,1.573.757.292.769.290.000,0.000023000,20,6,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,fd9e:6c51:2336:0:6201:94ff:fe0e:877a,1883.0,50728.0,68:a3:c4:6e:50:12,...,,,,,,,,,,normal
2,0.003886,1.573.757.292.773.170.000,0.003909000,64,6,fd9e:6c51:2336:0:6201:94ff:fe0e:877a,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,50728.0,1883.0,60:01:94:0e:87:7a,...,3,,0,1,ufpi/ppgcc/esp-dht22-node07/umidade,35,,,,normal
3,0.000010,1.573.757.292.773.180.000,0.003919000,20,6,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,fd9e:6c51:2336:0:6201:94ff:fe0e:877a,1883.0,50728.0,68:a3:c4:6e:50:12,...,,,,,,,,,,normal
4,,1.573.757.296.060.980.000,3.291.717.000,68,6,fd9e:6c51:2336:0:ce50:e3ff:fe1c:2a5b,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,60066.0,1883.0,cc:50:e3:1c:2a:5b,...,3,,0,1,ufpi/ppgcc/esp-dht22-node03/temperatura,39,,,,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670047,0.002997,1.574.035.197.120.260.000,277.904.350.995.000,64,6,fd9e:6c51:2336:0:6201:94ff:fe0e:877a,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,50746.0,1883.0,60:01:94:0e:87:7a,...,3,,0,1,ufpi/ppgcc/esp-dht22-node07/umidade,35,,,,normal
670048,0.000010,1.574.035.197.120.270.000,277.904.351.005.000,20,6,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,fd9e:6c51:2336:0:6201:94ff:fe0e:877a,1883.0,50746.0,68:a3:c4:6e:50:12,...,,,,,,,,,,normal
670049,0.809234,1.574.035.197.929.500.000,277.905.160.239.000,68,6,fd9e:6c51:2336:0:ce50:e3ff:fe55:cd91,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,62795.0,1883.0,cc:50:e3:55:cd:91,...,3,,0,1,ufpi/ppgcc/esp-dht22-node01/temperatura,39,,,,normal
670050,0.000023,1.574.035.197.929.530.000,277.905.160.262.000,20,6,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,fd9e:6c51:2336:0:ce50:e3ff:fe55:cd91,1883.0,62795.0,68:a3:c4:6e:50:12,...,,,,,,,,,,normal


In [20]:
#670052 100 
#95642 x
print(str((95642*100)/670052) + "% of the tuples were excluded in the first correlation")

14.27381755445846% of the tuples were excluded in the first correlation


In [24]:
df['mqtt.len'] = pd.to_numeric(df['mqtt.len'], errors='coerce')
df[df['mqtt.len'].isnull()].loc[df['label'] != 'normal']

Unnamed: 0,frame.time_delta,frame.time_epoch,frame.time_relative,ipv6.plen,ipv6.nxt,ipv6.src,ipv6.dst,tcp.srcport,tcp.dstport,eth.src,...,mqtt.msgtype,mqtt.passwd,mqtt.qos,mqtt.retain,mqtt.topic,mqtt.topic_len,mqtt.username,mqtt.willmsg,mqtt.willtopic,label
12218,0.318415,1.573.762.306.023.030.000,5.013.253.767.000,40,6,fd9e:6c51:2336:0:cdac:31e2:320d:90b6,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,44245.0,1883.0,80:86:f2:f9:f9:4d,...,,,,,,,,,,bruteforce
12219,0.000014,1.573.762.306.023.050.000,5.013.253.781.000,40,6,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,fd9e:6c51:2336:0:cdac:31e2:320d:90b6,1883.0,44245.0,30:b5:c2:4c:23:8a,...,,,,,,,,,,bruteforce
12220,0.002561,1.573.762.306.025.610.000,5.013.256.342.000,32,6,fd9e:6c51:2336:0:cdac:31e2:320d:90b6,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,44245.0,1883.0,80:86:f2:f9:f9:4d,...,,,,,,,,,,bruteforce
12222,0.750819,1.573.762.306.776.440.000,5.014.007.170.000,32,6,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,fd9e:6c51:2336:0:cdac:31e2:320d:90b6,1883.0,44245.0,30:b5:c2:4c:23:8a,...,,,,,,,,,,bruteforce
12224,0.003944,1.573.762.306.780.400.000,5.014.011.135.000,32,6,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,fd9e:6c51:2336:0:cdac:31e2:320d:90b6,1883.0,44245.0,30:b5:c2:4c:23:8a,...,,,,,,,,,,bruteforce
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665018,0.000009,1.574.033.108.135.310.000,275.815.366.048.000,32,6,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,fd9e:6c51:2336:0:b043:1759:1f44:81e8,1883.0,42774.0,68:a3:c4:6e:50:12,...,,,,,,,,,,DoS
665019,0.588280,1.574.033.108.723.590.000,275.815.954.328.000,32,6,fd9e:6c51:2336:0:b043:1759:1f44:81e8,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,42774.0,1883.0,80:86:f2:f9:f9:4d,...,,,,,,,,,,DoS
665020,0.000024,1.574.033.108.723.620.000,275.815.954.352.000,32,6,fd9e:6c51:2336:0:b043:1759:1f44:81e8,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,42780.0,1883.0,80:86:f2:f9:f9:4d,...,,,,,,,,,,DoS
665021,0.002138,1.574.033.108.725.760.000,275.815.956.490.000,32,6,fd9e:6c51:2336:0:717b:e02b:9c00:d43a,fd9e:6c51:2336:0:b043:1759:1f44:81e8,1883.0,42780.0,68:a3:c4:6e:50:12,...,,,,,,,,,,DoS


In [26]:
#670052 100 
#322215  x
print(str((105283*100)/670052) + "% of the tuples were excluded in the second correlation this may or may not overlap with the loss from the first correlation which is still present")

15.712661106899166% of the tuples were excluded in the second correlation this may or may not overlap with the loss from the first correlation which is still present
