In [1]:
import pandas as pd
import numpy as np

In [38]:
masterk = pd.read_parquet('masterkag.parquet')
master2019 = pd.read_parquet('master2019.parquet')

In [43]:
print(masterk.shape)
pd.set_option('display.max_rows', None)    # Display all rows (in case you use .head())
pd.set_option('display.max_columns', None) # Display all columns
pd.set_option('display.width', 1000)
print(masterk.dtypes)

(8303334, 65)
protocol_0           float32
protocol_6           float32
protocol_17          float32
flow_duration        float32
flow_byts_s          float32
flow_pkts_s          float32
fwd_pkts_s           float32
bwd_pkts_s           float32
tot_fwd_pkts         float32
tot_bwd_pkts         float32
totlen_fwd_pkts      float32
totlen_bwd_pkts      float32
fwd_pkt_len_max      float32
fwd_pkt_len_min      float32
fwd_pkt_len_mean     float32
fwd_pkt_len_std      float32
bwd_pkt_len_max      float32
bwd_pkt_len_min      float32
bwd_pkt_len_mean     float32
bwd_pkt_len_std      float32
pkt_len_max          float32
pkt_len_min          float32
pkt_len_mean         float32
pkt_len_std          float32
pkt_len_var          float32
fwd_seg_size_min     float32
fwd_act_data_pkts    float32
flow_iat_mean        float32
flow_iat_max         float32
flow_iat_min         float32
flow_iat_std         float32
fwd_iat_tot          float32
fwd_iat_max          float32
fwd_iat_min          float32


In [44]:
print(master2019.shape)
pd.set_option('display.max_rows', None)    # Display all rows (in case you use .head())
pd.set_option('display.max_columns', None) # Display all columns
pd.set_option('display.width', 1000)
print(master2019.dtypes)

(9902502, 65)
protocol_0           float32
protocol_6           float32
protocol_17          float32
flow_duration        float32
flow_byts_s          float32
flow_pkts_s          float32
fwd_pkts_s           float32
bwd_pkts_s           float32
tot_fwd_pkts         float32
tot_bwd_pkts         float32
totlen_fwd_pkts      float32
totlen_bwd_pkts      float32
fwd_pkt_len_max      float32
fwd_pkt_len_min      float32
fwd_pkt_len_mean     float32
fwd_pkt_len_std      float32
bwd_pkt_len_max      float32
bwd_pkt_len_min      float32
bwd_pkt_len_mean     float32
bwd_pkt_len_std      float32
pkt_len_max          float32
pkt_len_min          float32
pkt_len_mean         float32
pkt_len_std          float32
pkt_len_var          float32
fwd_seg_size_min     float32
fwd_act_data_pkts    float32
flow_iat_mean        float32
flow_iat_max         float32
flow_iat_min         float32
flow_iat_std         float32
fwd_iat_tot          float32
fwd_iat_max          float32
fwd_iat_min          float32


In [41]:
import pandas as pd
import numpy as np

def standardise_to_float32(df: pd.DataFrame) -> pd.DataFrame:
    print(f"--- Processing DataFrame (Shape: {df.shape}) ---")
    
    # 1. Drop NaNs first (Crucial step)
    before = len(df)
    df.dropna(inplace=True)
    after = len(df)
    if before != after:
        print(f"   Dropped {before - after} rows containing NaN values.")
    
    # 2. Identify columns to process
    # We want to convert all numbers, BUT we usually keep 'Label' as Int/Category
    # and we skip strings (like IPs if you still have them)
    cols_to_convert = []
    
    for col in df.columns:
        # Skip Label (Keep it as int for classification)
        if col == 'Label':
            continue
            
        # Skip Objects (IPs, Timestamp, Flow ID)
        if df[col].dtype == 'object' or df[col].dtype.name == 'category':
            continue
            
        # Add to list
        cols_to_convert.append(col)

    # 3. Handle Infinity (Replace with 1e15)
    # Check if any columns in our list actually have infinity
    if np.isinf(df[cols_to_convert]).values.any():
        print("   Found Infinity values. Capping at 1e15...")
        df[cols_to_convert] = df[cols_to_convert].replace([np.inf, -np.inf], 1e15)

    # 4. Force Conversion to float32
    print(f"   Converting {len(cols_to_convert)} columns to float32...")
    df[cols_to_convert] = df[cols_to_convert].astype('float32')

    # 5. Final Memory Check
    mem_usage = df.memory_usage(deep=True).sum() / (1024**2)
    print(f"   Done. New Memory Usage: {mem_usage:.2f} MB")
    
    return df

# ==========================================
# APPLY TO YOUR DATASETS HERE
# ==========================================

# Assuming your datasets are named 'df_old' and 'df_new'

# 1. Convert OLD Dataset
print("Standardizing OLD Dataset...")
master2019 = standardise_to_float32(master2019)

# 2. Convert NEW Dataset
print("\nStandardizing NEW Dataset...")
masterk = standardise_to_float32(masterk)

# 3. Verify Compatibility
print("\n--- Verification ---")
print(f"Old 'Flow Bytes/s' dtype: {master2019['Flow Bytes/s'].dtype}")
print(f"New 'Flow Bytes/s' dtype: {masterk['Flow Bytes/s'].dtype}")

Standardizing OLD Dataset...
--- Processing DataFrame (Shape: (9902502, 67)) ---
   Converting 66 columns to float32...
   Done. New Memory Usage: 2530.93 MB

Standardizing NEW Dataset...
--- Processing DataFrame (Shape: (8303334, 66)) ---
   Converting 65 columns to float32...
   Done. New Memory Usage: 2090.53 MB

--- Verification ---
Old 'Flow Bytes/s' dtype: float32


KeyError: 'Flow Bytes/s'

In [42]:
import pandas as pd

# 1. Update the Dictionary to include the missing columns
to_snake_case_mapping = {
    # --- Identifiers / Protocols (UPDATED FOR PCA) ---
    'Protocol': 'protocol',        # Kept just in case
    'Protocol_0': 'protocol_0',    # <--- ADDED
    'Protocol_6': 'protocol_6',    # <--- ADDED
    'Protocol_17': 'protocol_17',  # <--- ADDED
    'Flow Duration': 'flow_duration',
    
    # --- Packets ---
    'Total Fwd Packets': 'tot_fwd_pkts',      'Tot Fwd Pkts': 'tot_fwd_pkts',
    'Total Backward Packets': 'tot_bwd_pkts', 'Tot Bwd Pkts': 'tot_bwd_pkts',
    
    # --- Lengths ---
    'Total Length of Fwd Packets': 'totlen_fwd_pkts', 'TotLen Fwd Pkts': 'totlen_fwd_pkts',
    'Total Length of Bwd Packets': 'totlen_bwd_pkts', 'TotLen Bwd Pkts': 'totlen_bwd_pkts',
    'Fwd Packet Length Max': 'fwd_pkt_len_max',       'Fwd Pkt Len Max': 'fwd_pkt_len_max',
    'Fwd Packet Length Min': 'fwd_pkt_len_min',       'Fwd Pkt Len Min': 'fwd_pkt_len_min',
    'Fwd Packet Length Mean': 'fwd_pkt_len_mean',     'Fwd Pkt Len Mean': 'fwd_pkt_len_mean',
    'Fwd Packet Length Std': 'fwd_pkt_len_std',       'Fwd Pkt Len Std': 'fwd_pkt_len_std',
    'Bwd Packet Length Max': 'bwd_pkt_len_max',       'Bwd Pkt Len Max': 'bwd_pkt_len_max',
    'Bwd Packet Length Min': 'bwd_pkt_len_min',       'Bwd Pkt Len Min': 'bwd_pkt_len_min',
    'Bwd Packet Length Mean': 'bwd_pkt_len_mean',     'Bwd Pkt Len Mean': 'bwd_pkt_len_mean',
    'Bwd Packet Length Std': 'bwd_pkt_len_std',       'Bwd Pkt Len Std': 'bwd_pkt_len_std',
    
    # --- Rates ---
    'Flow Bytes/s': 'flow_byts_s',     'Flow Byts/s': 'flow_byts_s',
    'Flow Packets/s': 'flow_pkts_s',   'Flow Pkts/s': 'flow_pkts_s',
    'Fwd Packets/s': 'fwd_pkts_s',     'Fwd Pkts/s': 'fwd_pkts_s',
    'Bwd Packets/s': 'bwd_pkts_s',     'Bwd Pkts/s': 'bwd_pkts_s',
    
    # --- IAT (Inter-Arrival Time) ---
    'Flow IAT Mean': 'flow_iat_mean',
    'Flow IAT Std': 'flow_iat_std',
    'Flow IAT Max': 'flow_iat_max',
    'Flow IAT Min': 'flow_iat_min',
    'Fwd IAT Total': 'fwd_iat_tot',    'Fwd IAT Tot': 'fwd_iat_tot',
    'Fwd IAT Mean': 'fwd_iat_mean',
    'Fwd IAT Std': 'fwd_iat_std',
    'Fwd IAT Max': 'fwd_iat_max',
    'Fwd IAT Min': 'fwd_iat_min',
    'Bwd IAT Total': 'bwd_iat_tot',    'Bwd IAT Tot': 'bwd_iat_tot',
    'Bwd IAT Mean': 'bwd_iat_mean',
    'Bwd IAT Std': 'bwd_iat_std',
    'Bwd IAT Max': 'bwd_iat_max',
    'Bwd IAT Min': 'bwd_iat_min',
    
    # --- Flags ---
    'Fwd PSH Flags': 'fwd_psh_flags',
    'Bwd PSH Flags': 'bwd_psh_flags',
    'Fwd URG Flags': 'fwd_urg_flags',
    'Bwd URG Flags': 'bwd_urg_flags',
    'FIN Flag Count': 'fin_flag_cnt',   'FIN Flag Cnt': 'fin_flag_cnt',
    'SYN Flag Count': 'syn_flag_cnt',   'SYN Flag Cnt': 'syn_flag_cnt',
    'RST Flag Count': 'rst_flag_cnt',   'RST Flag Cnt': 'rst_flag_cnt',
    'PSH Flag Count': 'psh_flag_cnt',   'PSH Flag Cnt': 'psh_flag_cnt',
    'ACK Flag Count': 'ack_flag_cnt',   'ACK Flag Cnt': 'ack_flag_cnt',
    'URG Flag Count': 'urg_flag_cnt',   'URG Flag Cnt': 'urg_flag_cnt',
    'ECE Flag Count': 'ece_flag_cnt',   'ECE Flag Cnt': 'ece_flag_cnt',
    'CWE Flag Count': 'cwr_flag_count', 'CWE Flag Cnt': 'cwr_flag_count', 
    
    # --- Stats ---
    'Down/Up Ratio': 'down_up_ratio',
    'Min Packet Length': 'pkt_len_min',       'Pkt Len Min': 'pkt_len_min',
    'Max Packet Length': 'pkt_len_max',       'Pkt Len Max': 'pkt_len_max',
    'Packet Length Mean': 'pkt_len_mean',     'Pkt Len Mean': 'pkt_len_mean',
    'Packet Length Std': 'pkt_len_std',       'Pkt Len Std': 'pkt_len_std',
    'Packet Length Variance': 'pkt_len_var',  'Pkt Len Var': 'pkt_len_var',
    
    # --- Subflow / Active / Idle ---
    'Active Max': 'active_max',
    'Active Min': 'active_min',
    'Active Mean': 'active_mean',
    'Active Std': 'active_std',
    'Idle Max': 'idle_max',
    'Idle Min': 'idle_min',
    'Idle Mean': 'idle_mean',
    'Idle Std': 'idle_std',
    
    # --- Weird Ones ---
    'Init_Win_bytes_forward': 'init_fwd_win_byts',  'Init Fwd Win Byts': 'init_fwd_win_byts',
    'Init_Win_bytes_backward': 'init_bwd_win_byts', 'Init Bwd Win Byts': 'init_bwd_win_byts',
    'act_data_pkt_fwd': 'fwd_act_data_pkts',        'Fwd Act Data Pkts': 'fwd_act_data_pkts',
    'min_seg_size_forward': 'fwd_seg_size_min',     'Fwd Seg Size Min': 'fwd_seg_size_min',
    
    'Label': 'Label' 
}

def standardize_to_live_format(df):
    # 1. Rename columns
    df = df.rename(columns=to_snake_case_mapping)
    
    # 2. Drop columns that are NOT in the live tool output (Safety)
    # UPDATED list to include Protocols and Fwd Header
    live_cols = [
        # Protocols (Now included for PCA)
        'protocol_0', 'protocol_6', 'protocol_17',
        
        # Identifiers / Basics
        'flow_duration', 'flow_byts_s', 'flow_pkts_s', 'fwd_pkts_s', 'bwd_pkts_s',
        
        # Headers (Now included)
        'fwd_header_len', 
        
        # Packet Stats
        'tot_fwd_pkts', 'tot_bwd_pkts', 'totlen_fwd_pkts', 'totlen_bwd_pkts',
        'fwd_pkt_len_max', 'fwd_pkt_len_min', 'fwd_pkt_len_mean', 'fwd_pkt_len_std',
        'bwd_pkt_len_max', 'bwd_pkt_len_min', 'bwd_pkt_len_mean', 'bwd_pkt_len_std',
        'pkt_len_max', 'pkt_len_min', 'pkt_len_mean', 'pkt_len_std', 'pkt_len_var',
        'fwd_seg_size_min', 'fwd_act_data_pkts', 
        
        # IAT
        'flow_iat_mean', 'flow_iat_max', 'flow_iat_min', 'flow_iat_std',
        'fwd_iat_tot', 'fwd_iat_max', 'fwd_iat_min', 'fwd_iat_mean', 'fwd_iat_std',
        'bwd_iat_tot', 'bwd_iat_max', 'bwd_iat_min', 'bwd_iat_mean', 'bwd_iat_std',
        
        # Flags
        'fwd_psh_flags', 'bwd_psh_flags', 'fwd_urg_flags', 'bwd_urg_flags',
        'fin_flag_cnt', 'syn_flag_cnt', 'rst_flag_cnt', 'psh_flag_cnt', 'ack_flag_cnt',
        'urg_flag_cnt', 'ece_flag_cnt', 'cwr_flag_count',
        
        # Stats
        'down_up_ratio', 'init_fwd_win_byts', 'init_bwd_win_byts',
        'active_max', 'active_min', 'active_mean', 'active_std',
        'idle_max', 'idle_min', 'idle_mean', 'idle_std', 
        
        'Label'
    ]
    
    # Filter: Keep only columns that exist in both the DF and the Live Tool list
    cols_to_keep = [c for c in live_cols if c in df.columns]
    
    return df[cols_to_keep]

# --- Usage ---
masterk = standardize_to_live_format(masterk)
master2019 = standardize_to_live_format(master2019)

print('done - Protocols and Fwd Header Length preserved')

done - Protocols and Fwd Header Length preserved


In [45]:
gm = pd.concat([master2019, masterk], ignore_index=True)

In [49]:
print(gm.shape)
print(gm.dtypes)

(18205836, 65)
protocol_0           float32
protocol_6           float32
protocol_17          float32
flow_duration        float32
flow_byts_s          float32
flow_pkts_s          float32
fwd_pkts_s           float32
bwd_pkts_s           float32
tot_fwd_pkts         float32
tot_bwd_pkts         float32
totlen_fwd_pkts      float32
totlen_bwd_pkts      float32
fwd_pkt_len_max      float32
fwd_pkt_len_min      float32
fwd_pkt_len_mean     float32
fwd_pkt_len_std      float32
bwd_pkt_len_max      float32
bwd_pkt_len_min      float32
bwd_pkt_len_mean     float32
bwd_pkt_len_std      float32
pkt_len_max          float32
pkt_len_min          float32
pkt_len_mean         float32
pkt_len_std          float32
pkt_len_var          float32
fwd_seg_size_min     float32
fwd_act_data_pkts    float32
flow_iat_mean        float32
flow_iat_max         float32
flow_iat_min         float32
flow_iat_std         float32
fwd_iat_tot          float32
fwd_iat_max          float32
fwd_iat_min          float32

In [2]:
import pandas as pd
gm = pd.read_parquet('gmgm.parquet')

In [3]:
# fiixng inf values cuz it's problematic
import numpy as np

def clean_infinity(df):
    # Select only numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    
    for col in numeric_cols:
        # 1. Check if the column has Infinity
        if np.isinf(df[col]).any():
            # 2. Find the maximum value in that column that ISN'T infinity
            max_val = df.loc[~np.isinf(df[col]), col].max()
            
            # 3. Replace Infinity with that max value
            # (Optional: Multiply by 1.1 to show it's slightly higher, but max_val is usually safe)
            df[col] = df[col].replace([np.inf, -np.inf], max_val)
            
            print(f"Fixed 'inf' in {col}: Replaced with {max_val}")
            
    return df

# Usage
gm = clean_infinity(gm)
print('done')

done


In [4]:
y = gm['Label']

print("--- Binary Label Counts ---")
print(y.value_counts())

total_count = len(y)
print("\n--- Percentage Balance ---")
print(f"Attack (1): {y.value_counts()[1] / total_count * 100:.2f}%")
print(f"Benign (0): {y.value_counts()[0] / total_count * 100:.2f}%")

--- Binary Label Counts ---
Label
1    13784153
0     4421683
Name: count, dtype: int64

--- Percentage Balance ---
Attack (1): 75.71%
Benign (0): 24.29%


In [None]:
pd.set_option('display.max_rows', None)    # Display all rows (in case you use .head())
pd.set_option('display.max_columns', None) # Display all columns
pd.set_option('display.width', 1000)
print(gm.head(200))

In [8]:
#final preprocessing check
std_devs = gm.std()
cols_to_drop = std_devs[std_devs == 0].index
gm = gm.drop(columns=cols_to_drop)
print(f"Dropped zero-variance columns: {list(cols_to_drop)}")

Dropped zero-variance columns: ['fwd_urg_flags', 'bwd_urg_flags']


In [10]:
# Check shape before
print(f"Shape before dedup: {gm.shape}")

# Drop duplicates
gm = gm.drop_duplicates()

# Check shape after
print(f"Shape after dedup: {gm.shape}")

Shape before dedup: (18205836, 63)
Shape after dedup: (17143447, 63)


In [13]:
y = gm['Label']

print("--- Binary Label Counts ---")
print(y.value_counts())

total_count = len(y)
print("\n--- Percentage Balance ---")
print(f"Attack (1): {y.value_counts()[1] / total_count * 100:.2f}%")
print(f"Benign (0): {y.value_counts()[0] / total_count * 100:.2f}%")

--- Binary Label Counts ---
Label
1    12758955
0     4384492
Name: count, dtype: int64

--- Percentage Balance ---
Attack (1): 74.42%
Benign (0): 25.58%


In [18]:
import pandas as pd
import numpy as np

def clean_all_negatives(df):
    """
    Checks the entire dataframe for ANY negative value.
    If found, replaces them with 0.
    """
    # 1. Select numeric columns only
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    
    print("--- Scanning for ANY negative values ---")
    found = False
    
    for col in numeric_cols:
        # Check for values less than 0
        neg_mask = df[col] < 0
        neg_count = neg_mask.sum()
        
        if neg_count > 0:
            found = True
            # Find the worst negative value (just for info)
            min_val = df.loc[neg_mask, col].min()
            print(f" -> Found {neg_count} negatives in '{col}'. Lowest: {min_val}")
            
            # CRITICAL FIX: Clip values to 0
            # This turns -1 into 0, -50 into 0, etc.
            df[col] = df[col].clip(lower=0)
            
    if not found:
        print(" -> Data is clean. No negative values found.")
    else:
        print("--- Fixed: All negative values clipped to 0 ---")
        
    return df

# --- USAGE ORDER ---
# 1. Clean Negatives
gm = clean_all_negatives(gm)


# 2. Clean Infinity
# (Run the infinity cleaner we made earlier)

# 3. Log Transform (Safe now!)
# masterk_log = np.log1p(masterk)

--- Scanning for ANY negative values ---
 -> Found 13 negatives in 'flow_pkts_s'. Lowest: -2000000.0
 -> Found 458663 negatives in 'fwd_seg_size_min'. Lowest: -1408237568.0
 -> Found 17 negatives in 'flow_iat_min'. Lowest: -13.0
--- Fixed: All negative values clipped to 0 ---


In [19]:
np.isinf(gm).sum()

protocol_0           0
protocol_6           0
protocol_17          0
flow_duration        0
flow_byts_s          0
flow_pkts_s          0
fwd_pkts_s           0
bwd_pkts_s           0
tot_fwd_pkts         0
tot_bwd_pkts         0
totlen_fwd_pkts      0
totlen_bwd_pkts      0
fwd_pkt_len_max      0
fwd_pkt_len_min      0
fwd_pkt_len_mean     0
fwd_pkt_len_std      0
bwd_pkt_len_max      0
bwd_pkt_len_min      0
bwd_pkt_len_mean     0
bwd_pkt_len_std      0
pkt_len_max          0
pkt_len_min          0
pkt_len_mean         0
pkt_len_std          0
pkt_len_var          0
fwd_seg_size_min     0
fwd_act_data_pkts    0
flow_iat_mean        0
flow_iat_max         0
flow_iat_min         0
flow_iat_std         0
fwd_iat_tot          0
fwd_iat_max          0
fwd_iat_min          0
fwd_iat_mean         0
fwd_iat_std          0
bwd_iat_tot          0
bwd_iat_max          0
bwd_iat_min          0
bwd_iat_mean         0
bwd_iat_std          0
fwd_psh_flags        0
bwd_psh_flags        0
fin_flag_cn

In [23]:
(gm.select_dtypes(include='number') < 0).any()

protocol_0           False
protocol_6           False
protocol_17          False
flow_duration        False
flow_byts_s          False
flow_pkts_s          False
fwd_pkts_s           False
bwd_pkts_s           False
tot_fwd_pkts         False
tot_bwd_pkts         False
totlen_fwd_pkts      False
totlen_bwd_pkts      False
fwd_pkt_len_max      False
fwd_pkt_len_min      False
fwd_pkt_len_mean     False
fwd_pkt_len_std      False
bwd_pkt_len_max      False
bwd_pkt_len_min      False
bwd_pkt_len_mean     False
bwd_pkt_len_std      False
pkt_len_max          False
pkt_len_min          False
pkt_len_mean         False
pkt_len_std          False
pkt_len_var          False
fwd_seg_size_min     False
fwd_act_data_pkts    False
flow_iat_mean        False
flow_iat_max         False
flow_iat_min         False
flow_iat_std         False
fwd_iat_tot          False
fwd_iat_max          False
fwd_iat_min          False
fwd_iat_mean         False
fwd_iat_std          False
bwd_iat_tot          False
b

In [24]:
gm.to_parquet('gmgm.parquet', index = False)
print('done')

done
