# Data exploration
Review the servers' traffic data.

In [2]:
# import libraries
from ydata_profiling import ProfileReport
import pandas as pd
import glob
import os
from utils.constants import RELEVANT_COLUMNS, VALID_TRAFFIC_TYPES

Read the base data (the daily reports merged into a single weekly dataset) and keep only the relevant columns

In [3]:
# Load the weekly traffic report and keep only the relevant columns.
traffic_filepath = './data/Weekly-WorkingHours_report.csv'
traffic_df = pd.read_csv(traffic_filepath, low_memory=False)[RELEVANT_COLUMNS]
traffic_df

clean data


Unnamed: 0,Source Port,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Mean,...,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Idle Mean,Idle Std,Label
0,49459,80,38308,1,1,6.0,6.0,6.000000,0.000000,6.000000,...,6,255,946,0,20,0.0,0.0,0.0,0.0,BENIGN
1,49453,389,479,11,5,172.0,326.0,15.636364,31.449238,65.200000,...,326,29200,260,4,32,0.0,0.0,0.0,0.0,BENIGN
2,46124,88,1095,10,6,3150.0,3150.0,315.000000,632.561635,525.000000,...,3150,29200,2081,3,32,0.0,0.0,0.0,0.0,BENIGN
3,49454,389,15206,17,12,3452.0,6660.0,203.058823,425.778474,555.000000,...,6660,29200,0,10,32,0.0,0.0,0.0,0.0,BENIGN
4,46126,88,1092,9,6,3150.0,3152.0,350.000000,694.509719,525.333333,...,3152,29200,2081,2,32,0.0,0.0,0.0,0.0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8182751,443,55641,49,1,3,6.0,18.0,6.000000,0.000000,6.000000,...,18,0,0,0,20,0.0,0.0,0.0,0.0,BENIGN
8182752,443,45337,217,2,1,31.0,6.0,15.500000,21.920310,6.000000,...,6,137,0,0,32,0.0,0.0,0.0,0.0,BENIGN
8182753,60148,22,1387547,41,46,2728.0,6634.0,66.536585,110.129945,144.217391,...,6634,29200,243,24,32,0.0,0.0,0.0,0.0,BENIGN
8182754,60146,22,207,1,1,0.0,0.0,0.000000,0.000000,0.000000,...,0,290,243,0,32,0.0,0.0,0.0,0.0,BENIGN


In [5]:
# List the retained column names. As the output shows, most names carry a
# leading space (e.g. ' Label'), so later lookups must include it.
traffic_df.columns

Index([' Source Port', ' Destination Port', ' Flow Duration',
       ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', ' Flow IAT Mean',
       ' Flow IAT Std', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Fwd Header Length',
       ' Bwd Header Length', ' Packet Length Mean', ' Packet Length Std',
       ' Packet Length Variance', ' PSH Flag Count', ' ACK Flag Count',
       'Subflow Fwd Packets', ' Subflow Fwd Bytes', ' Subflow Bwd Packets',
       ' Subflow Bwd Bytes', 'Init_Win_bytes_forward',
       ' Init_Win_bytes_backward', ' act_data_pkt_fwd',
       ' min_seg_size_forward', 'Active Mean', ' Active Std', 'Idle Mean',
       ' Idle Std', ' Label'],
      dtype='object')

In [12]:
# Label distribution of the flows with more than 26 forward packets.
# Rows and column are selected in a single .loc call (no chained indexing),
# so no intermediate frame holding every column is built.
traffic_df.loc[traffic_df[' Total Fwd Packets'] > 26, ' Label'].value_counts()

 Label
BENIGN                       200801
SSH-Patator                      96
Web Attack  Brute Force         74
Web Attack Â Brute Force        74
Infiltration                     51
Bot                              48
Heartbleed                       33
Web Attack Â XSS                16
Web Attack  XSS                 16
PortScan                          3
Name: count, dtype: int64

In [13]:
# Label distribution of the flows with more than 28 backward packets.
# Rows and column are selected in a single .loc call (no chained indexing),
# so no intermediate frame holding every column is built.
traffic_df.loc[traffic_df[' Total Backward Packets'] > 28, ' Label'].value_counts()

 Label
BENIGN                       193404
SSH-Patator                    8784
Bot                              76
Web Attack  Brute Force         72
Web Attack Â Brute Force        72
Infiltration                     51
Heartbleed                       33
Web Attack Â XSS                16
Web Attack  XSS                 16
PortScan                          3
Name: count, dtype: int64

In [16]:
# Label distribution of the flows whose total forward packet length exceeds 3375.
# Rows and column are selected in a single .loc call (no chained indexing),
# so no intermediate frame holding every column is built.
traffic_df.loc[traffic_df['Total Length of Fwd Packets'] > 3375, ' Label'].value_counts()

 Label
BENIGN                       202911
DoS Slowhttptest                708
DoS GoldenEye                   210
Bot                              92
Infiltration                     87
Web Attack  Brute Force         74
Web Attack Â Brute Force        74
Heartbleed                       33
Web Attack  XSS                 16
Web Attack Â XSS                16
DoS Hulk                          3
SSH-Patator                       3
Name: count, dtype: int64

In [19]:
# Label distribution of the flows whose forward header length exceeds 736.
# Rows and column are selected in a single .loc call (no chained indexing),
# so no intermediate frame holding every column is built.
traffic_df.loc[traffic_df[' Fwd Header Length'] > 736, ' Label'].value_counts()

 Label
BENIGN                       203725
SSH-Patator                     996
Web Attack  Brute Force         74
Web Attack Â Brute Force        74
Infiltration                     42
Heartbleed                       33
Web Attack  XSS                 16
Web Attack Â XSS                16
Bot                               8
DoS Slowhttptest                  3
Name: count, dtype: int64

Save the weekly report and generate a profiling report including all the daily traffic reports

In [9]:
# Base name (without extension) of the weekly report; the cells below append
# '.csv', '.html' and '_valid_traffic.html' to it to build the output paths.
filename = 'Weekly-WorkingHours_report'

In [10]:
# Write the weekly dataset to data/<filename>.csv, leaving out the index column.
# NOTE(review): `weekly_traffic_df` is not assigned in any cell shown above this
# one — confirm the cell that builds it still exists so Restart & Run All works.
weekly_traffic_report_filename = os.path.join('data', f'{filename}.csv')
weekly_traffic_df.to_csv(weekly_traffic_report_filename, index=False)

In [11]:
# Generate the profiling report with ydata-profiling; minimal=True is used
# because of the amount of data to process.
# Declare the ' Label' feature as categorical.
type_schema = {' Label': "categorical"}
# Build the report and save the result as an HTML file under data/.
weekly_traffic_profile = ProfileReport(
    weekly_traffic_df,
    title="Server Traffic Profiling Report",
    minimal=True,
    type_schema=type_schema,
)
weekly_traffic_profiling_report_filename = os.path.join('data', f'{filename}.html')
weekly_traffic_profile.to_file(weekly_traffic_profiling_report_filename)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  x = asanyarray(arr - arrmean)
  x = asanyarray(arr - arrmean)
  x = asanyarray(arr - arrmean)
  x = asanyarray(arr - arrmean)
  adjusted = values - mean
  adjusted = values - mean
  adjusted = values - mean
  adjusted = values - mean


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Features review
Review the dataset features. Define the business meaning of each one and keep only the relevant ones.

In [12]:
# Show the columns of the full weekly dataset, before it is restricted to
# RELEVANT_COLUMNS further down.
weekly_traffic_df.columns

Index(['Flow ID', ' Source IP', ' Source Port', ' Destination IP',
       ' Destination Port', ' Protocol', ' Timestamp', ' Flow Duration',
       ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
       ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Length', ' Max Pa

The meaning of each feature is as follows:

1. Source Port
Meaning: The TCP/UDP source port number that initiated the connection/flow.
Typical Valid Values: Integer in the range 0 to 65,535.
2. Destination Port
Meaning: The TCP/UDP destination port number to which the connection/flow was directed.
Typical Valid Values: Integer in the range 0 to 65,535.
3. Protocol
Meaning: IP protocol number identifying the transport layer protocol (e.g., TCP, UDP, ICMP).
Typical Valid Values: Integer in [0, 255], though the most common are:
6 = TCP
17 = UDP
1 = ICMP
etc.
4. Flow Duration
Meaning: Total time for which the flow was active, often measured from the first packet to the last packet of that flow.
Typical Valid Values: Non-negative integer. Tools often store it in microseconds, so it can range from 0 up to large values depending on how long the flow lasted.
5. Total Fwd Packets
Meaning: The total number of packets sent in the “forward” direction (usually source → destination).
Typical Valid Values: Non-negative integer (0, 1, 2, …).
6. Total Backward Packets
Meaning: The total number of packets sent in the “backward” direction (destination → source).
Typical Valid Values: Non-negative integer.
7. Total Length of Fwd Packets
Meaning: Sum of the sizes (in bytes) of all forward-direction packets.
Typical Valid Values: Non-negative integer (in bytes). Could range from 0 up to millions of bytes, depending on the flow.
8. Total Length of Bwd Packets
Meaning: Sum of the sizes (in bytes) of all backward-direction packets.
Typical Valid Values: Non-negative integer (in bytes).
9. Fwd Packet Length Max
Meaning: The maximum size of any single forward-direction packet (in bytes).
Typical Valid Values: Non-negative integer. Often limited by typical MTU sizes (e.g., up to ~1500 bytes in Ethernet networks), but can be larger in some cases.
10. Fwd Packet Length Min
Meaning: The minimum size of any single forward-direction packet (in bytes).
Typical Valid Values: Non-negative integer.
11. Fwd Packet Length Mean
Meaning: The average size (in bytes) of forward-direction packets.
Typical Valid Values: Non-negative real number.
12. Fwd Packet Length Std
Meaning: The standard deviation of forward packet sizes (in bytes).
Typical Valid Values: Non-negative real number (0 if all packets are of identical size).
13. Bwd Packet Length Max
Meaning: The maximum size of any single backward-direction packet (in bytes).
Typical Valid Values: Non-negative integer (similar constraints as #9).
14. Bwd Packet Length Min
Meaning: The minimum size of any single backward-direction packet (in bytes).
Typical Valid Values: Non-negative integer.
15. Bwd Packet Length Mean
Meaning: The average size (in bytes) of backward-direction packets.
Typical Valid Values: Non-negative real number.
16. Bwd Packet Length Std
Meaning: The standard deviation of backward packet sizes (in bytes).
Typical Valid Values: Non-negative real number.
17. Flow Bytes/s
Meaning: The average rate of bytes per second for the entire flow
18. Flow Packets/s
Meaning: The average rate of packets per second for the entire flow
Typical Valid Values: Non-negative real number.
19. Flow IAT Mean
Meaning: Mean of the Inter-Arrival Times (IAT) between all consecutive packets (in the entire flow).
Typical Valid Values: Non-negative real number (often in microseconds).
20. Flow IAT Std
Meaning: Standard deviation of the Inter-Arrival Times for the entire flow.
Typical Valid Values: Non-negative real number.
21. Flow IAT Max
Meaning: Maximum inter-arrival time observed between any two consecutive packets in the flow.
Typical Valid Values: Non-negative integer/real (time units, often microseconds).
22. Flow IAT Min
Meaning: Minimum inter-arrival time between any two consecutive packets in the flow.
Typical Valid Values: Non-negative integer/real (time units).
23. Fwd IAT Total
Meaning: Sum of all inter-arrival times for packets moving forward (source → destination).
Typical Valid Values: Non-negative integer/real (time units).
24. Fwd IAT Mean
Meaning: Average inter-arrival time for packets moving forward.
Typical Valid Values: Non-negative real number (time units).
25. Fwd IAT Std
Meaning: Standard deviation of inter-arrival times in the forward direction.
Typical Valid Values: Non-negative real number.
26. Fwd IAT Max
Meaning: Maximum inter-arrival time in the forward direction.
Typical Valid Values: Non-negative integer/real (time units).
27. Fwd IAT Min
Meaning: Minimum inter-arrival time in the forward direction.
Typical Valid Values: Non-negative integer/real (time units).
28. Bwd IAT Total
Meaning: Sum of all inter-arrival times for packets moving backward (destination → source).
Typical Valid Values: Non-negative integer/real (time units).
29. Bwd IAT Mean
Meaning: Average inter-arrival time for packets moving backward.
Typical Valid Values: Non-negative real number (time units).
30. Bwd IAT Std
Meaning: Standard deviation of inter-arrival times in the backward direction.
Typical Valid Values: Non-negative real number.
31. Bwd IAT Max
Meaning: Maximum inter-arrival time in the backward direction.
Typical Valid Values: Non-negative integer/real (time units).
32. Bwd IAT Min
Meaning: Minimum inter-arrival time in the backward direction.
Typical Valid Values: Non-negative integer/real (time units).
33. Fwd Header Length
Meaning: Total header length (in bytes) of all forward-direction packets. In TCP, this may include IP + TCP headers.
Typical Valid Values: Non-negative integer.
34. Bwd Header Length
Meaning: Total header length (in bytes) of all backward-direction packets.
Typical Valid Values: Non-negative integer.
35. Fwd Packets/s
Meaning: Forward packets per second
36. Bwd Packets/s
Meaning: Backward packets per second
37. Min Packet Length
Meaning: The smallest packet size observed in the entire flow (forward or backward).
Typical Valid Values: Non-negative integer (bytes).
38. Max Packet Length
Meaning: The largest packet size observed in the entire flow.
Typical Valid Values: Non-negative integer (bytes).
39. Packet Length Mean
Meaning: The mean size of all packets (in bytes) in the entire flow.
Typical Valid Values: Non-negative real number.
40. Packet Length Std
Meaning: The standard deviation of packet sizes in the flow.
Typical Valid Values: Non-negative real number.
41. Packet Length Variance
Meaning: The variance (standard deviation squared) of packet sizes in the flow.
Typical Valid Values: Non-negative real number.
42. PSH Flag Count
Meaning: Number of packets (in the entire flow) that have the TCP PSH (Push) flag set.
Typical Valid Values: Non-negative integer.
43. ACK Flag Count
Meaning: Number of packets (in the entire flow) that have the TCP ACK (Acknowledgment) flag set.
Typical Valid Values: Non-negative integer.
44. Down/Up Ratio
Meaning: Ratio of the “downstream” (backward) traffic to “upstream” (forward) traffic
45. Average Packet Size
Meaning: Average packet size across the entire flow (forward + backward)
46. Avg Fwd Segment Size
Meaning: Average size of the segments in the forward direction
47. Avg Bwd Segment Size
Meaning: Average size of the segments in the backward direction.
Typical Valid Values: Non-negative real number (bytes).
48. Subflow Fwd Packets
Meaning: In some flow measurement tools (like CICFlowMeter), if a single flow is split into smaller “subflows” (e.g., due to inactivity timeouts), this is the number of forward packets in those subflows (often an aggregate or average).
Typical Valid Values: Non-negative integer.
49. Subflow Fwd Bytes
Meaning: The total (or average) number of forward-direction bytes for the subflows created from the main flow.
Typical Valid Values: Non-negative integer (bytes).
50. Subflow Bwd Packets
Meaning: The number of backward packets (destination → source) for the subflows.
Typical Valid Values: Non-negative integer.
51. Subflow Bwd Bytes
Meaning: The total (or average) number of backward-direction bytes for the subflows.
Typical Valid Values: Non-negative integer (bytes).
52. Init_Win_bytes_forward
Meaning: The TCP initial window size (in bytes) in the forward direction (based on the first packet(s) in forward direction).
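Typical Valid Values: Non-negative integer (commonly in ranges like 0 to 65,535 for classic TCP windows, but can be larger for modern TCP). Note: the sampled rows shown later in this notebook also contain -1 for this feature, presumably a sentinel for flows where no TCP window was observed — confirm before treating negative values as invalid.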
53. Init_Win_bytes_backward
Meaning: The TCP initial window size (in bytes) in the backward direction.
Typical Valid Values: Non-negative integer (similar range as above).
54. act_data_pkt_fwd
Meaning: Number of forward-direction packets that contain actual payload data (non-zero TCP payload length).
Typical Valid Values: Non-negative integer.
55. min_seg_size_forward
Meaning: The smallest TCP segment size observed in the forward direction (excluding headers, if the tool measures payload only).
Typical Valid Values: Non-negative integer (bytes).
56. Active Mean
Meaning: The mean time (often in microseconds or milliseconds) that the flow was active before becoming idle.
“Active” periods are intervals when packets are continuously being transmitted.
Typical Valid Values: Non-negative real number (time units).
57. Active Std
Meaning: The standard deviation of active periods for the flow.
Typical Valid Values: Non-negative real number.
58. Active Max
Meaning: The maximum length of any continuous active period within the flow.
Typical Valid Values: Non-negative integer/real (time units).
59. Active Min
Meaning: The minimum length of any continuous active period within the flow.
Typical Valid Values: Non-negative integer/real (time units).
60. Idle Mean
Meaning: The mean time (often in microseconds or milliseconds) that the flow was idle.
“Idle” periods are intervals with no packets arriving in either direction.
Typical Valid Values: Non-negative real number (time units).
61. Idle Std
Meaning: The standard deviation of the idle times within the flow.
Typical Valid Values: Non-negative real number.
62. Idle Max
Meaning: The maximum length of any idle period within the flow.
Typical Valid Values: Non-negative integer/real (time units).
63. Idle Min
Meaning: The minimum length of any idle period within the flow.
Typical Valid Values: Non-negative integer/real (time units).
64. Label
Meaning: The class or category of the flow (e.g., “BENIGN,” “Botnet,” “DDoS,” etc.). Used for supervised machine learning or threat classification.
Typical Valid Values: String or categorical value (e.g., “BENIGN,” “FTP-Patator,” “SSH-Patator,” “DoS slowloris,” etc.).

### Filter invalid features


Non-informative
* Flow ID
* Source IP
* Destination IP
* Timestamp

Low Variance
* Fwd PSH Flags
* Fwd URG Flags
* FIN Flag Count
* SYN Flag Count
* RST Flag Count
* URG Flag Count
* CWE Flag Count
* ECE Flag Count

Single Value
* Bwd PSH Flags
* Bwd URG Flags
* Fwd Avg Bytes/Bulk
* Fwd Avg Packets/Bulk
* Fwd Avg Bulk Rate
* Bwd Avg Bytes/Bulk
* Bwd Avg Packets/Bulk
* Bwd Avg Bulk Rate

Duplicated feature
* Fwd Header Length.1

Redundant features
* Flow Bytes/s
* Flow Packets/s
* Fwd Packets/s
* Bwd Packets/s
* Average Packet Size
* Avg Fwd Segment Size
* Avg Bwd Segment Size
* Down/Up Ratio

In [13]:
# Restrict the weekly dataset to the relevant feature columns.
# NOTE: RELEVANT_COLUMNS could be an input argument
base_traffic_df = weekly_traffic_df.loc[:, RELEVANT_COLUMNS]
base_traffic_df

Unnamed: 0,Source Port,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Mean,...,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Idle Mean,Idle Std,Label
0,49459,80,38308,1,1,6.0,6.0,6.000000,0.000000,6.000000,...,6,255,946,0,20,0.0,0.0,0.0,0.0,BENIGN
1,49453,389,479,11,5,172.0,326.0,15.636364,31.449238,65.200000,...,326,29200,260,4,32,0.0,0.0,0.0,0.0,BENIGN
2,46124,88,1095,10,6,3150.0,3150.0,315.000000,632.561635,525.000000,...,3150,29200,2081,3,32,0.0,0.0,0.0,0.0,BENIGN
3,49454,389,15206,17,12,3452.0,6660.0,203.058823,425.778474,555.000000,...,6660,29200,0,10,32,0.0,0.0,0.0,0.0,BENIGN
4,46126,88,1092,9,6,3150.0,3152.0,350.000000,694.509719,525.333333,...,3152,29200,2081,2,32,0.0,0.0,0.0,0.0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170324,443,55641,49,1,3,6.0,18.0,6.000000,0.000000,6.000000,...,18,0,0,0,20,0.0,0.0,0.0,0.0,BENIGN
170325,443,45337,217,2,1,31.0,6.0,15.500000,21.920310,6.000000,...,6,137,0,0,32,0.0,0.0,0.0,0.0,BENIGN
170326,60148,22,1387547,41,46,2728.0,6634.0,66.536585,110.129945,144.217391,...,6634,29200,243,24,32,0.0,0.0,0.0,0.0,BENIGN
170327,60146,22,207,1,1,0.0,0.0,0.000000,0.000000,0.000000,...,0,290,243,0,32,0.0,0.0,0.0,0.0,BENIGN


In [14]:
# Keep only the rows whose ' Label' is one of the valid traffic types.
valid_label_mask = base_traffic_df[' Label'].isin(VALID_TRAFFIC_TYPES)
base_traffic_df = base_traffic_df.loc[valid_label_mask]
base_traffic_df

Unnamed: 0,Source Port,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Mean,...,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Idle Mean,Idle Std,Label
0,49459,80,38308,1,1,6.0,6.0,6.000000,0.000000,6.000000,...,6,255,946,0,20,0.0,0.0,0.0,0.0,BENIGN
1,49453,389,479,11,5,172.0,326.0,15.636364,31.449238,65.200000,...,326,29200,260,4,32,0.0,0.0,0.0,0.0,BENIGN
2,46124,88,1095,10,6,3150.0,3150.0,315.000000,632.561635,525.000000,...,3150,29200,2081,3,32,0.0,0.0,0.0,0.0,BENIGN
3,49454,389,15206,17,12,3452.0,6660.0,203.058823,425.778474,555.000000,...,6660,29200,0,10,32,0.0,0.0,0.0,0.0,BENIGN
4,46126,88,1092,9,6,3150.0,3152.0,350.000000,694.509719,525.333333,...,3152,29200,2081,2,32,0.0,0.0,0.0,0.0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170324,443,55641,49,1,3,6.0,18.0,6.000000,0.000000,6.000000,...,18,0,0,0,20,0.0,0.0,0.0,0.0,BENIGN
170325,443,45337,217,2,1,31.0,6.0,15.500000,21.920310,6.000000,...,6,137,0,0,32,0.0,0.0,0.0,0.0,BENIGN
170326,60148,22,1387547,41,46,2728.0,6634.0,66.536585,110.129945,144.217391,...,6634,29200,243,24,32,0.0,0.0,0.0,0.0,BENIGN
170327,60146,22,207,1,1,0.0,0.0,0.000000,0.000000,0.000000,...,0,290,243,0,32,0.0,0.0,0.0,0.0,BENIGN


## Balance Target

In [15]:
# Down-sample to reduce the class imbalance: every valid traffic type is capped
# at `n_instances_per_traffic_type` rows; types with fewer rows are kept whole.
n_instances_per_traffic_type = 150000
# Fixed seed so the sampled subset (and every report built from it) is the same
# after a kernel restart instead of changing on each run.
sampling_random_state = 42
traffic_valid_traffic_list = []
for valid_traffic_type in VALID_TRAFFIC_TYPES:
    # select the rows of this traffic type
    traffic_type_df = base_traffic_df.loc[base_traffic_df[' Label'] == valid_traffic_type]
    # sample only when the type has more rows than the cap
    if traffic_type_df.shape[0] > n_instances_per_traffic_type:
        traffic_type_df = traffic_type_df.sample(
            n_instances_per_traffic_type,
            random_state=sampling_random_state,
        )
    traffic_valid_traffic_list.append(traffic_type_df)

# stack the per-type frames; the original row index is preserved
base_traffic_df = pd.concat(traffic_valid_traffic_list, axis=0)
base_traffic_df

Unnamed: 0,Source Port,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Mean,...,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Idle Mean,Idle Std,Label
113085,443,53842,0,2,0,37.0,0.0,18.500000,17.677670,0.000000,...,0,110,-1,1,20,0.0,0.0,0.0,0.0,BENIGN
129279,80,52716,6048853,1,5,1375.0,24.0,1375.000000,0.000000,4.800000,...,24,229,0,0,20,11483.0,0.0,6037370.0,0.0,BENIGN
8114610,41045,53,61539,2,2,68.0,182.0,34.000000,0.000000,91.000000,...,182,-1,-1,1,20,0.0,0.0,0.0,0.0,BENIGN
299378,7076,53,49667,2,2,68.0,184.0,34.000000,0.000000,92.000000,...,184,-1,-1,1,32,0.0,0.0,0.0,0.0,BENIGN
2065371,57030,53,101878,2,2,68.0,218.0,34.000000,0.000000,109.000000,...,218,-1,-1,1,32,0.0,0.0,0.0,0.0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6750506,52114,21,9794876,9,15,96.0,188.0,10.666667,8.231039,12.533333,...,188,29200,227,6,32,0.0,0.0,0.0,0.0,FTP-Patator
6750507,52126,21,9802316,9,15,96.0,188.0,10.666667,8.246211,12.533333,...,188,29200,227,6,32,0.0,0.0,0.0,0.0,FTP-Patator
6750508,52118,21,9803729,9,15,93.0,188.0,10.333333,7.874008,12.533333,...,188,29200,227,6,32,0.0,0.0,0.0,0.0,FTP-Patator
6750510,52130,21,9811139,9,15,94.0,188.0,10.444444,7.986099,12.533333,...,188,29200,227,6,32,0.0,0.0,0.0,0.0,FTP-Patator


In [16]:
# Profile the base (filtered and balanced) dataset and export it as HTML,
# reusing the `type_schema` and `filename` defined in earlier cells.
base_traffic_filename = os.path.join('data', f'{filename}_valid_traffic.html')
base_traffic_profile = ProfileReport(
    base_traffic_df,
    title="Server Base Traffic Profiling Report",
    minimal=True,
    type_schema=type_schema,
)
base_traffic_profile.to_file(base_traffic_filename)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]