In [8]:
# Essential Imports
import os

import joblib
import numpy as np
import pandas as pd
from dask import dataframe as dd
from sklearn.preprocessing import RobustScaler




In [9]:
# ====== File Path Validation ======
# Location of the raw CSV. The literal below is the original machine-specific
# default; set the FLOW_CSV_PATH environment variable to point at the file on
# another machine without editing the notebook.
file_path = os.environ.get(
    "FLOW_CSV_PATH",
    r"C:\Users\dhruv\Desktop\project\Fedrated_Privacy_Proj\02-14-2018.csv",
)
# Raise explicitly rather than `assert`: assertions are skipped under `python -O`,
# which would let a missing file surface later as a less obvious read error.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found at {file_path}")

# ====== Memory-Optimized Loading ======
# Explicit narrow dtypes for a few columns; columns not listed here are left to
# pandas' own type inference when the CSV is read.
dtypes = {
    'Flow Duration': 'uint32',
    'Tot Fwd Pkts': 'uint16',
    'Flow Byts/s': 'float32',
    'Flow Pkts/s': 'float32',
    'Label': 'category'
}

# Helper for chunk-wise loading when memory is constrained
# (not called anywhere in the cells visible here).
def process_chunk(chunk):
    """Replace +inf and -inf in ``chunk`` with NaN.

    Parameters
    ----------
    chunk : pandas object supporting ``.replace`` (e.g. a DataFrame chunk).

    Returns
    -------
    The object returned by ``chunk.replace``; ``chunk`` itself is not
    modified in place.
    """
    non_finite = [np.inf, -np.inf]
    cleaned = chunk.replace(non_finite, np.nan)
    return cleaned

# Read the whole CSV into memory using the narrowed `dtypes` mapping and parse
# the 'Timestamp' column as datetimes.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (strict format inference is already the default there), so it only
# produced a warning.
# NOTE(review): if pandas still warns that it cannot infer the timestamp format,
# pass an explicit `date_format=...` once the file's actual format is confirmed.
df = pd.read_csv(
    file_path,
    dtype=dtypes,
    low_memory=False,
    parse_dates=['Timestamp'],
)


  df = pd.read_csv(file_path, dtype=dtypes, low_memory=False,
  df = pd.read_csv(file_path, dtype=dtypes, low_memory=False,


In [10]:
# ====== Critical Column Checks ======
# Diagnostics are printed before any cleaning, so +/-inf values are not yet
# counted as missing here.
print("Initial Data Shape:", df.shape)
print("Missing Values:\n", df.isna().sum())
print("Label Categories:", df['Label'].unique())

# ====== Infinite Value Handling ======
# Treat +/-inf in these columns as missing so they can be imputed below.
inf_cols = ['Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Max', 'Idle Max']
df[inf_cols] = df[inf_cols].replace([np.inf, -np.inf], np.nan)

# Protocol-aware imputation
# Compute the per-protocol median once for all columns (vectorised) instead of
# running a Python lambda per group and per column.
protocol_medians = df.groupby('Protocol', observed=True)[inf_cols].transform('median')
for col in inf_cols:
    # Fill from the row's protocol median first. Where that median is itself NaN
    # (a protocol group with no finite value, or a missing Protocol), fall back
    # to the column-wide median so no NaN survives the imputation.
    # The final cast keeps the column's existing dtype (e.g. float32).
    df[col] = (
        df[col]
        .fillna(protocol_medians[col])
        .fillna(df[col].median())
        .astype(df[col].dtype)
    )

# ====== Irrelevant Column Removal ======
# Note: this cell is not idempotent — re-running it after the columns are gone
# raises a KeyError; re-run from the loading cell instead.
cols_to_drop = [
    'Timestamp', 'Fwd URG Flags', 'Bwd URG Flags',
    'Init Fwd Win Byts', 'Init Bwd Win Byts'
]
df = df.drop(columns=cols_to_drop)

# ====== Categorical Conversion ======
# `cat.codes` numbers the distinct raw Protocol values 0, 1, 2, ... in sorted
# order of those raw values.
# NOTE(review): the original comment claimed "TCP=0, UDP=1". That only holds if
# the raw column contains exactly those two protocols in that sort order —
# inspect the categories before filtering on these codes downstream.
df['Protocol'] = df['Protocol'].astype('category').cat.codes
df['Dst Port'] = df['Dst Port'].astype('category')


Initial Data Shape: (1048575, 80)
Missing Values:
 Dst Port         0
Protocol         0
Timestamp        0
Flow Duration    0
Tot Fwd Pkts     0
                ..
Idle Mean        0
Idle Std         0
Idle Max         0
Idle Min         0
Label            0
Length: 80, dtype: int64
Label Categories: ['Benign', 'FTP-BruteForce', 'SSH-Bruteforce']
Categories (3, object): ['Benign', 'FTP-BruteForce', 'SSH-Bruteforce']


In [11]:
# ====== Robust Scaling ======
# Columns that are clipped and then rescaled in this cell.
robust_features = [
    'Flow Byts/s', 'Flow Pkts/s',
    'Flow IAT Max', 'Idle Max'
]

# Scale by the 5th-95th percentile spread. Centering is disabled, so values are
# only divided by the scale and never shifted by the median.
scaler = RobustScaler(
    quantile_range=(5, 95),
    with_centering=False,
    unit_variance=True,
)

# Work in float32 for these columns (halves their memory footprint compared
# with float64; it does not by itself guard against overflow).
df[robust_features] = df[robust_features].astype('float32')

# Clip each column to its own 5th/95th percentiles before scaling so the
# scaled output has no extreme tails.
for feature in robust_features:
    lower_bound, upper_bound = df[feature].quantile([0.05, 0.95])
    df[feature] = df[feature].clip(lower_bound, upper_bound)

# Fit on the clipped columns and overwrite them with the scaled values.
# Not idempotent: re-running this cell rescales already-scaled data.
df[robust_features] = scaler.fit_transform(df[robust_features])


In [12]:
# ====== Federated Client Simulation ======
# Strategy 1: Split by protocol type (random 50% sample of each subset).
# NOTE(review): the literals 0 and 1 assume those 'Protocol' codes correspond to
# TCP and UDP — confirm against the encoding step before relying on these frames.
tcp_data = df[df['Protocol'] == 0].sample(frac=0.5, random_state=42)
udp_data = df[df['Protocol'] == 1].sample(frac=0.5, random_state=42)

# Strategy 2: contiguous, near-equal slices of the rows ordered by 'Flow Duration'.
# NOTE(review): this was originally labelled "temporal splitting (using original
# timestamp order)", but the rows are ordered by flow duration, not by time —
# 'Timestamp' is dropped earlier in this notebook. Keep the column and sort on it
# if a true chronological split is intended.
df_sorted = df.sort_values('Flow Duration')
client_count = 5
# Split row positions rather than the frame itself: np.array_split on a DataFrame
# goes through DataFrame.swapaxes, which is deprecated and emits a FutureWarning.
# The resulting partition sizes and order are the same as splitting the frame.
position_groups = np.array_split(np.arange(len(df_sorted)), client_count)
client_datasets = [df_sorted.iloc[positions] for positions in position_groups]

# Save partitions (one parquet file per simulated client, in the working directory)
for i, client_df in enumerate(client_datasets):
    client_df.to_parquet(
        f'client_{i}.parquet',
        engine='pyarrow',
        compression='ZSTD'
    )


  return bound(*args, **kwds)


In [13]:
# ====== Post-Processing Verification ======
# Sanity checks on the scaled columns: every value must be present, finite,
# below 100 and non-negative. The checks run in this order, so the first
# violated one determines the reported message.
assert df[robust_features].notna().all().all(), "NaNs present!"
assert (~np.isinf(df[robust_features])).all().all(), "Infinite values!"
assert 100 > df[robust_features].max().max(), "Scaling overflow"
assert 0 <= df[robust_features].min().min(), "Negative scaled values"

# Label distribution check: the rarest class must exceed 1% of all rows.
label_dist = df['Label'].value_counts(normalize=True)
assert 0.01 < label_dist.min(), "Severe class imbalance remains"

# Memory check (target <4GB); reported in GiB (bytes / 2**30), shallow usage only.
print("Final Memory Usage:", df.memory_usage().sum() / 2**30, "GB")


Final Memory Usage: 0.5572707038372755 GB


In [14]:
# Save processed dataset (row index is not written to the file)
df.to_parquet(
    'processed_data.parquet',
    engine='pyarrow',
    compression='ZSTD',
    index=False,
)

# Save the fitted scaler so federated clients can apply the same transform.
# joblib files are pickle-based: only load this file back from a trusted source.
joblib.dump(scaler, 'robust_scaler.pkl')


['robust_scaler.pkl']

In [1]:
# federated_feature_selection.py
import pygad
import flwr as fl
import numpy as np
from flwr.common.parameter import parameters_to_ndarrays

class FedChimpStrategy(fl.server.strategy.FedAvg):
    """FedAvg variant that aggregates client feature masks by majority vote."""

    def aggregate_fit(self, server_round, results, failures):
        """Combine the clients' feature masks into one global binary mask.

        Parameters
        ----------
        server_round : int
            Index of the current federated round (supplied by the Flower
            server; not used by the vote itself).
        results : list of (ClientProxy, FitRes) tuples
            Successful client results. The first array in each result's
            parameters is taken to be that client's feature mask.
        failures : list
            Failed client results; ignored here.

        Returns
        -------
        (Parameters or None, dict)
            The aggregated mask (as 0/1 integers) wrapped as Flower
            ``Parameters`` plus an empty metrics dict, or ``(None, {})`` when
            no client returned a result.
        """
        # Nothing to aggregate: averaging an empty list would only yield NaN.
        if not results:
            return None, {}

        # Each entry is a (client_proxy, fit_res) pair; the mask lives on fit_res.
        all_masks = [
            parameters_to_ndarrays(fit_res.parameters)[0]
            for _, fit_res in results
        ]
        # Majority voting: keep a feature only if strictly more than half of the
        # clients selected it (an exact 50/50 tie drops the feature).
        global_mask = np.mean(all_masks, axis=0) > 0.5
        aggregated = fl.common.ndarrays_to_parameters([global_mask.astype(int)])
        return aggregated, {}

def chimp_optimization(X_client, num_generations=20, sol_per_pop=10,
                       num_parents_mating=5):
    """Search for a binary feature mask over the columns of ``X_client``.

    Despite the name, the search is run with PyGAD's genetic algorithm
    (``pygad.GA``), not a Chimp Optimization Algorithm implementation.

    Parameters
    ----------
    X_client : DataFrame-like with ``.columns``
        The client's local feature table; one gene is created per column.
    num_generations : int, default 20
        Number of GA generations to run.
    sol_per_pop : int, default 10
        Population size. Must be at least ``num_parents_mating``.
    num_parents_mating : int, default 5
        Number of parents selected for mating in each generation.

    Returns
    -------
    The best solution found by the GA: a 0/1 vector with one entry per column.
    """
    # PyGAD cannot build an initial population without knowing the chromosome
    # length and the population size, so both are passed explicitly.
    gene_count = len(X_client.columns)

    # PyGAD (2.20+) calls the fitness function with three arguments:
    # (ga_instance, solution, solution_idx). Only the solution is needed here.
    def _score(ga_instance, solution, solution_idx):
        return fitness(solution, X_client)

    ga = pygad.GA(
        num_generations=num_generations,
        num_parents_mating=num_parents_mating,
        sol_per_pop=sol_per_pop,
        num_genes=gene_count,
        fitness_func=_score,
        gene_type=int,
        gene_space=[0, 1],      # each gene is a keep (1) / drop (0) flag
        suppress_warnings=True  # Windows console compatibility
    )
    ga.run()
    # best_solution() returns (solution, fitness, index); only the mask is returned.
    return ga.best_solution()[0]

def fitness(solution, X):
    """Score a binary feature mask by the total variance of the chosen columns.

    Parameters
    ----------
    solution : sequence of 0/1 flags (list, tuple or NumPy array)
        One flag per column of ``X``; entries equal to 1 mark selected columns.
    X : pandas.DataFrame
        Feature table whose columns are being selected.

    Returns
    -------
    float
        Sum of the per-column variances (pandas default, ``ddof=1``) of the
        selected columns; ``0.0`` when no column is selected.

    NOTE(review): variance is never negative, so adding a column can never lower
    this score — the maximum is reached by selecting every column. Confirm this
    objective is what the feature selection is meant to optimise.
    """
    # Coerce first: comparing a plain list with `== 1` yields a single False,
    # which would silently select no columns at all.
    mask = np.asarray(solution) == 1
    chosen_positions = np.where(mask)[0]
    if chosen_positions.size == 0:
        return 0.0
    # Select by position so duplicate column labels cannot pull in extra columns.
    return float(X.iloc[:, chosen_positions].var().sum())  # Maximize feature variance

