In [2]:
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
# Load the pickled benign captures, one DataFrame per IoT device.
# NOTE: unpickling can execute arbitrary code, so these files must come from a
# trusted source.
benign_capture_paths = (
    'Datasets/IOT23/Benign-Amazon-Echo/zeek_normal.pkl',
    'Datasets/IOT23/Benign-Phillips-HUE/zeek_normal.pkl',
    'Datasets/IOT23/Benign-Soomfy-Doorlock/zeek_normal.pkl',
)
amazon_echo, phillips_hue, soomfy_doorlock = map(pd.read_pickle, benign_capture_paths)

## Amazon Echo

In [7]:
# Preview the first five flows of the Amazon Echo capture.
amazon_echo.head(n=5)

Unnamed: 0,date,src_ip,src_port,dst_ip,dst_port,protocol,duration,state,missed_bytes,orig_packets,orig_ip_bytes,resp_packets,resp_ip_bytes,label,detailed_label,protocol_num,state_num
0,2018-09-21 09:40:22.965529919,0.0.0.0,68,255.255.255.255,67,udp,8.322388,S0,0,2,656,0,0,benign,missing,0,0
1,2018-09-21 09:40:26.845520020,192.168.2.1,57621,192.168.2.255,57621,udp,19576.598629,S0,0,617,44424,0,0,benign,missing,0,0
2,2018-09-21 09:41:37.732295036,192.168.2.1,5353,224.0.0.251,5353,udp,7.628973,S0,0,1,391,0,0,benign,missing,0,0
3,2018-09-21 09:41:37.732373953,fe80::80e6:50ff:fe12:1464,5353,ff02::fb,5353,udp,7.628974,S0,0,1,411,0,0,benign,missing,0,0
4,2018-09-21 09:41:37.732506990,169.254.15.115,5353,224.0.0.251,5353,udp,7.369418,S0,0,1,391,0,0,benign,missing,0,0


In [8]:
# Numeric columns summarised with describe() for each capture.
# NOTE(review): protocol_num, state_num and the two port columns look like
# encoded categories rather than continuous measurements - confirm before
# interpreting their mean/std.
continuous_features = [
    'duration',
    'protocol_num',
    'state_num',
    'missed_bytes',
    'orig_packets',
    'orig_ip_bytes',
    'resp_packets',
    'resp_ip_bytes',
    'src_port',
    'dst_port',
]

In [9]:
# Summary statistics (count, mean, std, quartiles, extremes) of the numeric columns.
amazon_echo.loc[:, continuous_features].describe()

Unnamed: 0,duration,protocol_num,state_num,missed_bytes,orig_packets,orig_ip_bytes,resp_packets,resp_ip_bytes,src_port,dst_port
count,1374.0,1374.0,1374.0,1374.0,1374.0,1374.0,1374.0,1374.0,1374.0,1374.0
mean,76.559374,0.475983,1.544396,0.0,103.873362,8794.371,185.092431,260023.2,28292.233624,1073.914119
std,646.878562,0.612273,2.089894,0.0,3343.205784,196736.8,6463.852208,9432581.0,21448.604753,3270.789519
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0
25%,0.015156,0.0,0.0,0.0,1.0,74.0,0.0,0.0,5353.0,53.0
50%,0.243638,0.0,1.0,0.0,2.0,216.0,1.0,84.0,34930.0,80.0
75%,58.109089,1.0,1.0,0.0,9.0,840.0,2.0,139.0,47091.25,443.0
max,19576.598629,2.0,9.0,0.0,123657.0,6527241.0,239484.0,349618700.0,64769.0,57621.0


## Host level analysis

### Bytes and packets statistics per host

In [22]:
# Packet and byte statistics per source IP, for both the originator and the
# responder side. Named aggregation yields flat column names directly, so the
# labels cannot drift out of step with the statistics they describe.
# `count` is the number of non-null `orig_packets` entries for the host.
amazon_grouped_source_ip_pb = (
    amazon_echo
    .groupby('src_ip')
    .agg(
        count=('orig_packets', 'count'),
        orig_packets_mean=('orig_packets', 'mean'),
        orig_packets_max=('orig_packets', 'max'),
        orig_packets_min=('orig_packets', 'min'),
        orig_packets_std=('orig_packets', 'std'),
        orig_ip_bytes_mean=('orig_ip_bytes', 'mean'),
        orig_ip_bytes_max=('orig_ip_bytes', 'max'),
        orig_ip_bytes_min=('orig_ip_bytes', 'min'),
        orig_ip_bytes_std=('orig_ip_bytes', 'std'),
        resp_packets_mean=('resp_packets', 'mean'),
        resp_packets_max=('resp_packets', 'max'),
        resp_packets_min=('resp_packets', 'min'),
        resp_packets_std=('resp_packets', 'std'),
        resp_ip_bytes_mean=('resp_ip_bytes', 'mean'),
        resp_ip_bytes_max=('resp_ip_bytes', 'max'),
        resp_ip_bytes_min=('resp_ip_bytes', 'min'),
        resp_ip_bytes_std=('resp_ip_bytes', 'std'),
    )
    # Busiest hosts first; src_ip becomes a regular column again.
    .sort_values(by=['count'], ascending=False)
    .reset_index()
)
amazon_grouped_source_ip_pb

Unnamed: 0,src_ip,count,orig_packets_mean,orig_packets_max,orig_packets_min,orig_packets_std,orig_ip_bytes_mean,orig_ip_bytes_max,orig_ip_bytes_min,orig_ip_bytes_std,resp_packets_mean,resp_packets_max,resp_packets_min,resp_packets_std,resp_ip_bytes_mean,resp_ip_bytes_max,resp_ip_bytes_min,resp_ip_bytes_std
0,192.168.2.3,979,142.687436,123657,0,3960.508469,11583.75383,6527241,0,233040.61364,259.766088,239484,0,7657.471116,364934.017365,349618679,0,11174540.0
1,0.0.0.0,153,10.058824,22,2,1.204352,3299.333333,7219,656,395.255131,0.0,0,0,0.0,0.0,0,0,0.0
2,192.168.2.1,51,15.0,617,1,86.009302,1455.137255,44424,77,6151.040114,0.0,0,0,0.0,0.0,0,0,0.0
3,169.254.15.115,51,2.901961,11,1,2.100047,596.411765,1637,77,407.623266,0.0,0,0,0.0,0.0,0,0,0.0
4,fe80::80e6:50ff:fe12:1464,50,2.96,11,1,2.156717,654.96,1857,97,454.778796,0.0,0,0,0.0,0.0,0,0,0.0
5,fe80::482:6d20:b3e:adf4,15,3.933333,32,1,7.87824,1006.0,9988,72,2516.744666,0.2,1,0,0.414039,12.8,64,0,26.49852
6,fe80::4eef:c0ff:fe27:561e,12,3.833333,17,1,4.687184,477.75,2634,137,745.999284,0.0,0,0,0.0,0.0,0,0,0.0
7,fe80::1847:a1bd:8d13:f43c,8,2.375,4,1,0.916125,391.875,703,114,217.554878,0.0,0,0,0.0,0.0,0,0,0.0
8,192.168.69.73,7,2.142857,3,1,1.069045,420.714286,643,94,212.897248,0.0,0,0,0.0,0.0,0,0,0.0
9,192.168.69.192,7,4.571429,18,0,6.187545,1335.714286,5919,0,2073.223393,0.285714,2,0,0.755929,164.571429,1152,0,435.4151


### Duration and average time difference between consecutive flows per host

In [26]:
def _inter_flow_gaps(timestamps):
    """Return the differences between consecutive timestamps, taken in ascending order."""
    return timestamps.sort_values().diff()


def _mean_gap(timestamps):
    """Mean gap between consecutive timestamps."""
    return _inter_flow_gaps(timestamps).mean()


def _max_gap(timestamps):
    """Largest gap between consecutive timestamps."""
    return _inter_flow_gaps(timestamps).max()


def _min_gap(timestamps):
    """Smallest gap between consecutive timestamps."""
    return _inter_flow_gaps(timestamps).min()


def _std_gap(timestamps):
    """Standard deviation of the gaps between consecutive timestamps."""
    return _inter_flow_gaps(timestamps).std()


# Flow duration statistics and inter-flow timing per source IP.
# `count` is the number of non-null `duration` entries for the host.
amazon_grouped_source_ip_time = (
    amazon_echo
    .groupby('src_ip')
    .agg(
        count=('duration', 'count'),
        duration_mean=('duration', 'mean'),
        duration_max=('duration', 'max'),
        duration_min=('duration', 'min'),
        duration_std=('duration', 'std'),
        mean_time_diff=('date', _mean_gap),
        max_time_diff=('date', _max_gap),
        min_time_diff=('date', _min_gap),
        std_time_diff=('date', _std_gap),
    )
    # Busiest hosts first; src_ip becomes a regular column again.
    .sort_values(by=['count'], ascending=False)
    .reset_index()
)
amazon_grouped_source_ip_time

Unnamed: 0,src_ip,count,duration_mean,duration_max,duration_min,duration_std,mean_time_diff,max_time_diff,min_time_diff,std_time_diff
0,192.168.2.3,979,75.956158,11851.214389,0.0,444.31343,00:00:18.253228,00:04:59.807264,00:00:00.000002,00:00:52.627639
1,0.0.0.0,153,60.147332,188.26218,8.322388,11.623248,00:02:08.403643,00:04:16.724261,00:01:16.533275,00:00:11.309331
2,192.168.2.1,51,391.33426,19576.598629,0.10099,2740.205736,00:06:30.492747,00:15:09.969765,00:00:13.036051,00:05:42.370194
3,169.254.15.115,51,7.369418,20.411436,0.100984,3.382637,00:06:29.075011,00:15:09.969779,00:00:13.036313,00:05:43.746615
4,fe80::80e6:50ff:fe12:1464,50,7.628974,20.411316,0.100977,3.249482,00:06:37.015317,00:15:09.969743,00:00:13.036051,00:05:42.765028
5,fe80::482:6d20:b3e:adf4,15,4.018468,17.250803,0.002716,5.666301,00:00:08.976808,00:00:34.626947,00:00:00.000001,00:00:13.377206
6,fe80::4eef:c0ff:fe27:561e,12,23.593763,38.175279,7.550691,8.873173,00:09:51.893625,01:43:24.293062,00:00:00.001798,00:31:01.571964
7,fe80::1847:a1bd:8d13:f43c,8,9.554011,59.390351,1.023844,20.155887,00:00:17.232763,00:00:34.509592,00:00:00.000735,00:00:14.479750
8,192.168.69.73,7,2.89652,4.094252,0.0,1.390861,00:00:20.445888,00:00:34.403316,00:00:00.105968,00:00:13.506459
9,192.168.69.192,7,6.466902,13.654073,1.038254,5.414228,00:00:20.706823,00:00:39.574230,00:00:00.688812,00:00:15.593624


### Destination IPs, ports, and protocol statistics per host

In [30]:
# How widely each source IP spreads its traffic: distinct destinations, ports
# and protocols. The output labels contain spaces, so the named-aggregation
# spec is built as a dict and unpacked into agg().
# `count` is the number of non-null `dst_ip` entries for the host.
network_spread_spec = {
    'count': ('dst_ip', 'count'),
    'number of unique destination ips': ('dst_ip', 'nunique'),
    'number of unique source ports': ('src_port', 'nunique'),
    'number of unique destination ports': ('dst_port', 'nunique'),
    'number of unique protocols': ('protocol', 'nunique'),
}
amazon_grouped_source_ip_net = (
    amazon_echo
    .groupby('src_ip')
    .agg(**network_spread_spec)
    # Busiest hosts first; src_ip becomes a regular column again.
    .sort_values(by=['count'], ascending=False)
    .reset_index()
)
amazon_grouped_source_ip_net

Unnamed: 0,src_ip,count,number of unique destination ips,number of unique source ports,number of unique destination ports,number of unique protocols
0,192.168.2.3,979,120,763,7,3
1,0.0.0.0,153,1,1,1,1
2,192.168.2.1,51,2,2,2,1
3,169.254.15.115,51,1,1,1,1
4,fe80::80e6:50ff:fe12:1464,50,1,1,1,1
5,fe80::482:6d20:b3e:adf4,15,6,5,5,2
6,fe80::4eef:c0ff:fe27:561e,12,3,3,3,2
7,fe80::1847:a1bd:8d13:f43c,8,2,2,2,2
8,192.168.69.73,7,2,1,1,1
9,192.168.69.192,7,2,2,2,1


## Connection level analysis

## Flow level analysis