In [1]:
import pandas as pd 
import sys
import os
from format_normal_traffic import DataFrameFormatterNormalTraffic
from format_cowrie_df import DataFrameFormatterCowrie
from format_dionea_df import DataFrameFormatterDionea
from format_suricata_df import DataFrameFormatterSuricata
from handler_df_formatter import DataFrameFormatter

# Make the parent of the current working directory importable.
# The directory is put at the front of the module search path, and only when
# it is not listed yet, so re-running this cell does not pile up duplicates.
# NOTE(review): this relies on the kernel's working directory being the
# notebook's own folder - confirm when running from elsewhere.
parent_dir = os.path.split(os.getcwd())[0]
if not any(entry == parent_dir for entry in sys.path):
    sys.path.insert(0, parent_dir)

from df_initializing.handler_init_dfs import DataFrameInitializer

In [4]:


# Load the four raw log sources. The paths are relative to the notebook's
# working directory. sample_size presumably caps how many records are read
# per source - confirm in DataFrameInitializer.initialize_dfs.
df_initializer = DataFrameInitializer(
    cowrie_json_path='../../data/cowrie/log/cowrie.json',
    dionea_json_path='../../data/dionaea/log/dionaea.json',
    suricata_json_path='../../data/suricata/log/suricata.json',
    normal_traffic_json_path="../../data/normal_traffic/benign_traffic_fixed.json",
)
(
    df_cowrie,
    df_dionea,
    df_suricata,
    df_normal_traffic,
) = df_initializer.initialize_dfs(sample_size=1000)

# Hand the raw frames to the formatter; it exposes one formatted frame per
# source as an attribute (cowrie_df, dionea_df, suricata_df, normal_traffic_df).
df_formatter = DataFrameFormatter(df_cowrie, df_dionea, df_suricata, df_normal_traffic)

# One row per source: (heading for the label preview, heading for the column
# listing, formatted frame). Row order is the print order.
formatted_sources = [
    ("Formatted Cowrie DataFrame:", "Cowrie Columns:", df_formatter.cowrie_df),
    ("Formatted Dionea DataFrame:", "Dionea Columns:", df_formatter.dionea_df),
    ("Formatted Suricata DataFrame:", "Suricata Columns:", df_formatter.suricata_df),
    ("Formatted Normal Traffic DataFrame:", "Normal Traffic Columns:", df_formatter.normal_traffic_df),
]

# First pass: preview the 'label' column of every formatted frame.
for label_heading, columns_heading, source_df in formatted_sources:
    print(label_heading, source_df['label'].head())

# Second pass: list the columns of every formatted frame so the schemas can
# be compared side by side.
for label_heading, columns_heading, source_df in formatted_sources:
    print(columns_heading, source_df.columns)

# Merge the formatted frames into a single frame and summarise the result.
combined_df = df_formatter.unite_all_dfs()

combined_summary = (
    ("Combined DataFrame:", combined_df.head()),
    ("Combined DataFrame shape:", combined_df.shape),
    ("Combined DataFrame columns:", combined_df.columns),
)
for summary_heading, summary_value in combined_summary:
    print(summary_heading, summary_value)

# Bare last expression: the notebook renders the first ten formatted Cowrie rows.
df_formatter.cowrie_df.head(10)











Index(['dst_ip', 'dst_port', 'src_hostname', 'src_ip', 'src_port', 'timestamp',
       'ftp', 'credentials', 'protocol', 'transport', 'type'],
      dtype='object')
Formatted Cowrie DataFrame: 0    malicious
1    malicious
2    malicious
3    malicious
4    malicious
Name: label, dtype: object
Formatted Dionea DataFrame: 0    malicious
1    malicious
2    malicious
3    malicious
4    malicious
Name: label, dtype: object
Formatted Suricata DataFrame: 0    malicious
1    malicious
2    malicious
3    malicious
4    malicious
Name: label, dtype: object
Formatted Normal Traffic DataFrame: 0    benign
1    benign
2    benign
3    benign
4    benign
Name: label, dtype: object
Cowrie Columns: Index(['source_ip', 'destination_ip', 'source_port', 'destination_port',
       'timestamp_start', 'transport_protocol', 'application_protocol',
       'duration', 'label'],
      dtype='object')
Dionea Columns: Index(['source_ip', 'destination_ip', 'source_port', 'destination_port',
       'timestamp_s

Unnamed: 0,source_ip,destination_ip,source_port,destination_port,timestamp_start,transport_protocol,application_protocol,duration,label
0,14.1.107.116,172.29.0.2,53639.0,23.0,,tcp,telnet,12.735529,malicious
1,193.32.162.146,172.29.0.2,34664.0,22.0,,tcp,ssh,1.7,malicious
2,103.146.202.84,172.29.0.2,34400.0,22.0,,tcp,ssh,2.1,malicious
3,103.146.202.84,172.29.0.2,36284.0,22.0,,tcp,ssh,2.1,malicious
4,143.20.185.225,172.29.0.2,38136.0,23.0,,tcp,telnet,30.547589,malicious
5,114.138.98.37,172.29.0.2,50390.0,23.0,,tcp,telnet,13.832137,malicious
6,65.49.1.152,172.29.0.2,10640.0,22.0,,tcp,ssh,0.0,malicious
7,103.177.227.135,172.29.0.2,34812.0,22.0,,tcp,ssh,3.2,malicious
8,185.247.137.195,172.29.0.2,47797.0,22.0,,tcp,ssh,2.0,malicious
9,103.146.202.84,172.29.0.2,45252.0,22.0,,tcp,ssh,2.1,malicious


In [3]:
# List every column of the Suricata frame returned by initialize_dfs, to see
# which fields are available before formatting.
print(df_suricata.columns)

# Preview the first 20 values of the "app_proto_ts" column. Kept as the bare
# last expression so the notebook renders it as the cell's output.
df_suricata.loc[:, "app_proto_ts"].head(20)


Index(['timestamp', 'flow_id', 'in_iface', 'event_type', 'src_ip', 'dest_ip',
       'proto', 'icmp_type', 'icmp_code', 'pkt_src', 'alert', 'direction',
       'flow', 'payload', 'payload_printable', 'stream', 'src_port',
       'dest_port', 'app_proto', 'metadata', 'tls', 'tx_id', 'http',
       'fileinfo', 'tcp', 'smb', 'response_icmp_type', 'response_icmp_code',
       'sip', 'files', 'app_proto_tc', 'anomaly', 'ssh', 'tftp', 'tx_guessed',
       'app_proto_orig', 'snmp', 'rfb', 'app_proto_ts', 'pgsql', 'smtp',
       'label'],
      dtype='object')


0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
10    NaN
11    NaN
12    NaN
13    NaN
14    NaN
15    NaN
16    NaN
17    NaN
18    NaN
19    NaN
Name: app_proto_ts, dtype: object