In [3]:
import pandas as pd 
import sys
import os
from pathlib import Path


# Put the project root (two levels above the notebook's working directory)
# at the front of sys.path so the project-local imports below resolve.
project_root = Path.cwd().parent.parent
# Drop any copy left by a previous run of this cell before re-inserting, so
# re-executing the cell does not keep growing sys.path with duplicates while
# the root still ends up at position 0.
sys.path[:] = [entry for entry in sys.path if entry != str(project_root)]
sys.path.insert(0, str(project_root))

from feature_engineering.df_formatting.format_normal_traffic_df import DataFrameFormatterNormalTraffic
from format_suricata_df import DataFrameFormatterSuricata

from feature_engineering.df_initializing.handler_init_dfs import DataFrameInitializer
from handler_df_formatter import DataFrameFormatter

In [4]:
# Lift pandas' display limits so frames are rendered without truncation.
pd.options.display.max_rows = None        # no cap on the number of rows shown
pd.options.display.max_columns = None     # no cap on the number of columns shown
pd.options.display.width = None           # no fixed display width
pd.options.display.max_colwidth = None    # never truncate cell contents

# Sanity-check the normal-traffic JSON before trying to load it
file_path = Path('../../data/normal_traffic/benign_traffic_fixed.json')
file_is_present = file_path.exists()
print(f"File exists: {file_is_present}")
print(f"Absolute path: {file_path.resolve()}")

# Report the size (in MB) only when the file is actually there
if file_is_present:
    size_in_mb = file_path.stat().st_size / (1024**2)
    print(f"File size: {size_in_mb:.2f} MB")

# Point the initializer at the two JSON inputs (Suricata log and normal traffic)
df_initializer = DataFrameInitializer(
    normal_traffic_json_path="../../data/normal_traffic/benign_traffic_fixed.json",
    suricata_json_path='../../data/suricata/log/suricata.json',
)

# initialize_dfs hands back a pair: the Suricata frame first, the normal-traffic frame second
initialized_frames = df_initializer.initialize_dfs(sample_size=1000)
df_suricata, df_normal_traffic = initialized_frames

# Wrap both frames in the formatter used by the inspection cells below
df_formatter = DataFrameFormatter(df_suricata, df_normal_traffic)


# DataFrame.info() prints its summary itself and returns None, so the label is
# printed on its own line. The previous print(label, df.info()) emitted the
# summary first and then a misleading "Suricata Columns: None".
print("Suricata Columns:")
df_formatter.suricata_df.info()

# Disabled toggles for the other frames (same label-then-info pattern):
#print("Cowrie Columns:"); df_formatter.cowrie_df.info()
#print("Dionea Columns:"); df_formatter.dionea_df.info()
#print("Normal Traffic Columns:"); df_formatter.normal_traffic_df.info()

#df_formatter.dionea_df

# Disabled per-frame application-protocol counts:
#df_formatter.cowrie_df['application_protocol'].value_counts()
#df_formatter.dionea_df['application_protocol'].value_counts()
#df_formatter.suricata_df['application_protocol'].value_counts()
#df_formatter.normal_traffic_df['application_protocol'].value_counts()

# Disabled formatter calls:
# df_formatter.unite_honeypot_and_normal_traffic_dfs()
# df_formatter.unite_all_honeypot_dfs()

#df_dionea["connection"]

# Plain-text peek at the first two rows of df_suricata (the frame handed to the formatter)
print(df_suricata.head(2))

# Last expression of the cell: shown as the cell's output
df_formatter.suricata_df.head()






File exists: True
Absolute path: C:\University\Magistrale\Data Driven System Engineering\real-time-cyber-anomaly-detection\data\normal_traffic\benign_traffic_fixed.json
File size: 4899.36 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71679 entries, 0 to 71678
Data columns (total 37 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   source_ip                      71679 non-null  object        
 1   destination_ip                 71679 non-null  object        
 2   source_port                    71679 non-null  Int64         
 3   destination_port               71679 non-null  Int64         
 4   timestamp_start                71679 non-null  datetime64[ns]
 5   transport_protocol             71679 non-null  object        
 6   application_protocol           71679 non-null  object        
 7   duration                       71679 non-null  float64       
 8   bytes_sent               

Unnamed: 0,source_ip,destination_ip,source_port,destination_port,timestamp_start,transport_protocol,application_protocol,duration,bytes_sent,bytes_received,pkts_sent,pkts_received,direction,label,bytes_per_second,packets_per_second,bytes_per_packet,bytes_sent_ratio,packets_sent_ratio,hour,day_of_week,is_weekend,is_business_hours,src_is_private,dst_is_private,is_internal,dst_port_is_common,events_in_window,malicious_events_in_window,unique_malicious_ips,events_pct_change,malicious_events_pct_change,burst_indicator,events_to_dst_port,total_events_for_protocol,malicious_events_for_protocol,malicious_ratio_for_protocol
0,16.62.107.190,10.128.0.2,0,0,2025-10-24 02:44:06.405778,icmp,unknown,0.0,82,0,1,0,L2R,malicious,0.0,0.0,82.0,1.0,1.0,2,4,0,0,0,1,0,0,2488,2488,106,0.0,0.0,0,1948,2157,2157,100.0
1,169.254.169.254,10.128.0.2,53,36113,2025-10-24 02:44:07.619878,udp,failed,0.000592,164,172,2,1,R2L,malicious,567567.6,5067.567568,112.0,0.488095,0.666667,2,4,0,0,1,1,1,0,2488,2488,106,0.0,0.0,0,1,152,152,100.0
2,10.128.0.2,216.239.34.174,59728,443,2025-10-24 02:44:07.623239,tcp,unknown,0.0,0,0,0,0,unknown,malicious,0.0,0.0,0.0,0.0,0.0,2,4,0,0,1,0,0,1,2488,2488,106,0.0,0.0,0,271,2157,2157,100.0
3,10.128.0.2,216.239.34.174,53790,443,2025-10-24 02:44:08.117768,tcp,unknown,0.000123,156,0,2,0,L2R,malicious,1268293.0,16260.162602,78.0,1.0,1.0,2,4,0,0,1,0,0,1,2488,2488,106,0.0,0.0,0,271,2157,2157,100.0
4,216.239.34.174,10.128.0.2,443,53790,2025-10-24 02:44:08.118390,tcp,unknown,0.000745,156,66,2,1,R2L,malicious,297986.6,4026.845638,74.0,0.702703,0.666667,2,4,0,0,0,1,0,0,2488,2488,106,0.0,0.0,0,1,2157,2157,100.0


In [None]:

# Plain-text peek at the first two rows of df_normal_traffic as returned by initialize_dfs
normal_traffic_preview = df_normal_traffic.head(2)
print(normal_traffic_preview)


         generated            appName  totalSourceBytes  \
0  3/11/2014 18:21        Unknown_UDP             16076   
1  3/11/2014 18:21  HTTPImageTransfer               384   

   totalDestinationBytes  totalDestinationPackets  totalSourcePackets  \
0                      0                        0                 178   
1                      0                        0                   6   

  sourcePayloadAsBase64 sourcePayloadAsUTF destinationPayloadAsBase64  \
0                  None               None                       None   
1                  None               None                       None   

  destinationPayloadAsUTF direction sourceTCPFlagsDescription  \
0                    None       L2R                      None   
1                    None       L2R                       F,A   

  destinationTCPFlagsDescription         source protocolName  sourcePort  \
0                           None  192.168.5.122       udp_ip        5353   
1                           None  

In [None]:

# First five rows of the formatter's normal-traffic frame (n=5 is head()'s default),
# shown as the cell's output because it is the last expression.
df_formatter.normal_traffic_df.head(n=5)

Unnamed: 0,source_ip,destination_ip,source_port,destination_port,timestamp_start,transport_protocol,application_protocol,duration,bytes_sent,bytes_received,pkts_sent,pkts_received,direction,label,bytes_per_second,packets_per_second,bytes_per_packet,bytes_sent_ratio,packets_sent_ratio,hour,day_of_week,is_weekend,is_business_hours,src_is_private,dst_is_private,is_internal,dst_port_is_common,events_in_window,malicious_events_in_window,unique_malicious_ips,events_pct_change,malicious_events_pct_change,burst_indicator,events_to_dst_port,total_events_for_protocol,malicious_events_for_protocol,malicious_ratio_for_protocol
0,192.168.5.122,224.0.0.251,5353,5353,2010-06-13 23:57:00,udp,unknown_udp,840.0,16076,0,178,0,L2R,benign,19.138095,0.211905,90.314607,1.0,1.0,23,6,1,0,1,0,0,0,142,,0,0.0,0.0,0,1,2,0,0.0
1,192.168.2.111,206.217.198.186,4435,80,2010-06-13 23:58:00,tcp,httpimagetransfer,180.0,384,0,6,0,L2R,benign,2.133333,0.033333,64.0,1.0,1.0,23,6,1,0,1,0,0,1,142,,0,0.0,0.0,0,45,9,0,0.0
2,192.168.4.119,192.168.5.122,4428,53,2010-06-13 23:58:00,udp,dns,60.0,171,642,2,4,L2L,benign,13.55,0.1,135.5,0.210332,0.333333,23,6,1,0,1,1,1,1,142,,0,0.0,0.0,0,32,32,0,0.0
3,192.168.4.119,219.94.203.105,3639,80,2010-06-13 23:58:00,tcp,httpimagetransfer,120.0,384,0,6,0,L2R,benign,3.2,0.05,64.0,1.0,1.0,23,6,1,0,1,0,0,1,142,,0,0.0,0.0,0,45,9,0,0.0
4,192.168.4.119,98.137.80.50,3641,80,2010-06-13 23:58:00,tcp,httpimagetransfer,60.0,186,128,2,2,L2R,benign,5.233333,0.066667,78.5,0.592357,0.5,23,6,1,0,1,0,0,1,142,,0,0.0,0.0,0,45,9,0,0.0


In [None]:
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" after each report.
df_formatter.suricata_df.info()
df_formatter.normal_traffic_df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71679 entries, 0 to 71678
Data columns (total 37 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   source_ip                      71679 non-null  object        
 1   destination_ip                 71679 non-null  object        
 2   source_port                    71679 non-null  Int64         
 3   destination_port               71679 non-null  Int64         
 4   timestamp_start                71679 non-null  datetime64[ns]
 5   transport_protocol             71679 non-null  object        
 6   application_protocol           71679 non-null  object        
 7   duration                       71679 non-null  float64       
 8   bytes_sent                     71679 non-null  int64         
 9   bytes_received                 71679 non-null  int64         
 10  pkts_sent                      71679 non-null  int64         
 11  pkts_received  

In [None]:
import requests

def get_location(ip_address="161.185.160.93", *, timeout=10, session=None):
    """Look up city/region/country for an IP address via the ipapi.co JSON API.

    Args:
        ip_address: IP address to look up; interpolated into the request URL.
        timeout: Seconds passed to the HTTP call as its ``timeout`` argument,
            so a stalled connection cannot block the kernel indefinitely.
        session: Optional object exposing a requests-style
            ``get(url, timeout=...)`` method (for example a
            ``requests.Session``). When omitted, the module-level
            ``requests`` API is used.

    Returns:
        dict with the keys ``"ip"``, ``"city"``, ``"region"`` and
        ``"country"``. Fields absent from the service's reply are ``None``.

    Warns:
        RuntimeWarning: when the decoded reply carries a truthy ``"error"``
            field. The all-``None`` result is still returned, so the lookup
            stays best-effort but no longer fails silently.
    """
    import warnings

    http = requests if session is None else session
    response = http.get(f'https://ipapi.co/{ip_address}/json/', timeout=timeout).json()

    # NOTE(review): ipapi.co appears to report failures (rate limiting,
    # reserved addresses, ...) in the JSON body via "error"/"reason" —
    # confirm against the service documentation.
    if response.get("error"):
        warnings.warn(
            f"ipapi.co lookup for {ip_address} failed: {response.get('reason')}",
            RuntimeWarning,
            stacklevel=2,
        )

    location_data = {
        "ip": ip_address,
        "city": response.get("city"),
        "region": response.get("region"),
        "country": response.get("country_name")
    }
    return location_data

# Look up a sample address and show the resulting dict
location_info = get_location(ip_address="161.185.160.93")
print(location_info)

{'ip': '161.185.160.93', 'city': None, 'region': None, 'country': None}


In [None]:
import requests

# Query the ipwho.is JSON endpoint for a sample address
ip = "8.8.8.8"
url = f"https://ipwho.is/{ip}"

# requests applies no timeout by default; without one a stalled connection
# would block the kernel indefinitely.
response = requests.get(url, timeout=10).json()

print(response)

{'ip': '8.8.8.8', 'success': True, 'type': 'IPv4', 'continent': 'North America', 'continent_code': 'NA', 'country': 'United States', 'country_code': 'US', 'region': 'California', 'region_code': 'CA', 'city': 'Mountain View', 'latitude': 37.3860517, 'longitude': -122.0838511, 'is_eu': False, 'postal': '94039', 'calling_code': '1', 'capital': 'Washington D.C.', 'borders': 'CA,MX', 'flag': {'img': 'https://cdn.ipwhois.io/flags/us.svg', 'emoji': '🇺🇸', 'emoji_unicode': 'U+1F1FA U+1F1F8'}, 'connection': {'asn': 15169, 'org': 'Google LLC', 'isp': 'Google LLC', 'domain': 'google.com'}, 'timezone': {'id': 'America/Los_Angeles', 'abbr': 'PST', 'is_dst': False, 'offset': -28800, 'utc': '-08:00', 'current_time': '2025-12-02T03:51:08-08:00'}}
