In [12]:
import pandas as pd
from sklearn.utils import resample

# Load your CSV
df = pd.read_csv("./processed_data/merged_cic_andmal2017.csv")

# Map benign and malware labels
benign_labels = ["benign", "malware_2015_1016"]
df['label'] = df['label'].apply(lambda x: 0 if x in benign_labels else x)

# Assign malware classes 1,2,3... based on unique labels
malware_labels = df[df['label'] != 0]['label'].unique()
malware_label_map = {label: idx + 1 for idx, label in enumerate(malware_labels)}
df['label'] = df['label'].apply(lambda x: malware_label_map.get(x, x) if x != 0 else 0)

# Undersample benign samples to 1200
benign_df = df[df['label'] == 0]
benign_sampled = resample(benign_df, replace=False, n_samples=1200, random_state=42)

# Keep malware samples for augmentation
malware_df = df[df['label'] != 0]

print("Benign samples:", len(benign_sampled))
print("Malware class distribution:\n", malware_df['label'].value_counts())


Benign samples: 1200
Malware class distribution:
 label
4    109
3    109
2    101
1     84
Name: count, dtype: int64


In [16]:
import torch
import torch.nn as nn

# Generator for 1D tabular data
class Generator1D(nn.Module):
    def __init__(self, input_dim):
        super(Generator1D, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(64, 128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(64, input_dim)
        )

    def forward(self, x):
        return self.model(x)

# Discriminator for 1D tabular data
class Discriminator1D(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator1D, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(64, 32),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# Dynamically determine input dimension based on the dataset
input_dim = benign_sampled.select_dtypes(include=[np.number]).shape[1]

# Initialize models with the correct input dimension
G_AB = Generator1D(input_dim=input_dim)
G_BA = Generator1D(input_dim=input_dim)
D_A = Discriminator1D(input_dim=input_dim)
D_B = Discriminator1D(input_dim=input_dim)

# Move models to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
G_AB.to(device)
G_BA.to(device)
D_A.to(device)
D_B.to(device)


Discriminator1D(
  (model): Sequential(
    (0): Linear(in_features=12, out_features=64, bias=True)
    (1): LeakyReLU(negative_slope=0.2, inplace=True)
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): LeakyReLU(negative_slope=0.2, inplace=True)
    (4): Linear(in_features=32, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

In [17]:
# Replace the third cell with this improved implementation

from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Ensure only numeric columns are used
benign_data = benign_sampled.select_dtypes(include=[np.number]).values
malware_data = malware_df.select_dtypes(include=[np.number]).values

# Get counts
n_benign = len(benign_data)
n_malware = len(malware_data)

# Convert to tensors
benign_tensor = torch.tensor(benign_data, dtype=torch.float32)
malware_tensor = torch.tensor(malware_data, dtype=torch.float32)

# Balance batch sizes - we'll use the smaller dataset size to determine batch structure
batch_size = 64
n_batches = min(n_benign, n_malware) // batch_size

# Create separate dataloaders
benign_dataset = TensorDataset(benign_tensor)
malware_dataset = TensorDataset(malware_tensor)

benign_loader = DataLoader(benign_dataset, batch_size=batch_size, shuffle=True)
malware_loader = DataLoader(malware_dataset, batch_size=batch_size, shuffle=True)

print(f"DataLoaders created with {n_batches} batches of size {batch_size}")
print(f"Benign samples: {n_benign}")
print(f"Malware samples: {n_malware}")

DataLoaders created with 6 batches of size 64
Benign samples: 1200
Malware samples: 403


In [20]:
# Replace the training loop cell with this fixed version

import torch.optim as optim

# Loss functions
adversarial_loss = nn.MSELoss()
cycle_loss = nn.L1Loss()

# Optimizers
lr = 0.0002
optimizer_G = optim.Adam(list(G_AB.parameters()) + list(G_BA.parameters()), lr=lr, betas=(0.5, 0.999))
optimizer_D_A = optim.Adam(D_A.parameters(), lr=lr, betas=(0.5, 0.999))
optimizer_D_B = optim.Adam(D_B.parameters(), lr=lr, betas=(0.5, 0.999))
loss_history_G = []
loss_history_D_A = []
loss_history_D_B = []

# Training loop
num_epochs = 1000
for epoch in range(num_epochs):
    epoch_loss_G = 0
    epoch_loss_D_A = 0
    epoch_loss_D_B = 0
    batches = 0
    
    # Create iterators for the dataloaders
    benign_iter = iter(benign_loader)
    malware_iter = iter(malware_loader)
    
    for i in range(min(len(benign_loader), len(malware_loader))):
        try:
            real_A = next(benign_iter)[0].to(device)
            real_B = next(malware_iter)[0].to(device)
            
            # Ensure batch sizes match by truncating the larger batch
            min_batch_size = min(real_A.size(0), real_B.size(0))
            real_A = real_A[:min_batch_size]
            real_B = real_B[:min_batch_size]

            # Dynamically create valid and fake tensors to match the batch size
            valid = torch.ones(min_batch_size, 1).to(device)
            fake = torch.zeros(min_batch_size, 1).to(device)

            # ------------------
            #  Train Generators
            # ------------------
            optimizer_G.zero_grad()

            fake_B = G_AB(real_A)
            recov_A = G_BA(fake_B)

            fake_A = G_BA(real_B)
            recov_B = G_AB(fake_A)

            loss_GAN_AB = adversarial_loss(D_B(fake_B), valid)
            loss_GAN_BA = adversarial_loss(D_A(fake_A), valid)
            loss_cycle_A = cycle_loss(recov_A, real_A)
            loss_cycle_B = cycle_loss(recov_B, real_B)

            loss_G = loss_GAN_AB + loss_GAN_BA + 10 * (loss_cycle_A + loss_cycle_B)
            loss_G.backward()
            optimizer_G.step()

            # -----------------------
            #  Train Discriminator A
            # -----------------------
            optimizer_D_A.zero_grad()
            loss_real = adversarial_loss(D_A(real_A), valid)
            loss_fake = adversarial_loss(D_A(fake_A.detach()), fake)
            loss_D_A = (loss_real + loss_fake) * 0.5
            loss_D_A.backward()
            optimizer_D_A.step()

            # -----------------------
            #  Train Discriminator B
            # -----------------------
            optimizer_D_B.zero_grad()
            loss_real = adversarial_loss(D_B(real_B), valid)
            loss_fake = adversarial_loss(D_B(fake_B.detach()), fake)
            loss_D_B = (loss_real + loss_fake) * 0.5
            loss_D_B.backward()
            optimizer_D_B.step()
            
            # Track losses
            epoch_loss_G += loss_G.item()
            epoch_loss_D_A += loss_D_A.item()
            epoch_loss_D_B += loss_D_B.item()
            batches += 1
            
        except StopIteration:
            break

    # Store average losses for this epoch
    loss_history_G.append(epoch_loss_G / batches)
    loss_history_D_A.append(epoch_loss_D_A / batches)
    loss_history_D_B.append(epoch_loss_D_B / batches)

    if epoch % 10 == 0:
        print(f"Epoch {epoch}/{num_epochs}: Generator Loss {loss_history_G[-1]:.4f}, D_A Loss {loss_history_D_A[-1]:.4f}, D_B Loss {loss_history_D_B[-1]:.4f}")


Epoch 0/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 100/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 100/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 200/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 200/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 300/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 300/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 400/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 400/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 500/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 500/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 600/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 600/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 700/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 700/1000: Generator Loss nan, D_A Loss nan, D_B Loss nan
Epoch 800/1000: Generator Loss nan, D_A Loss nan, D_B Los

In [None]:
# Plot Generator and Discriminator Losses
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.plot(loss_history_G, label='Generator Loss', linewidth=2)
plt.plot(loss_history_D_A, label='Discriminator A Loss', linestyle='--')
plt.plot(loss_history_D_B, label='Discriminator B Loss', linestyle='--')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('CycleGAN Training Loss Over Time')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Replace the sample generation and final dataset creation cells

# Improved sample generation function
def generate_samples(generator, base_samples, target_count, batch_size=64):
    generated = []
    
    # Process in batches to prevent memory issues
    with torch.no_grad():
        for i in range(0, len(base_samples), batch_size):
            batch = base_samples[i:i+batch_size]
            base_tensor = torch.tensor(batch, dtype=torch.float32).to(device)
            fake_data = generator(base_tensor).cpu().numpy()
            generated.extend(fake_data)
    
    # If we need more samples than we generated, repeat the process
    while len(generated) < target_count:
        # Randomly select from what we've already generated
        idx = np.random.choice(len(generated), min(batch_size, target_count - len(generated)))
        batch = np.array([generated[i] for i in idx])
        base_tensor = torch.tensor(batch, dtype=torch.float32).to(device)
        fake_data = generator(base_tensor).cpu().numpy()
        generated.extend(fake_data)
    
    return np.array(generated[:target_count])

# Balance the dataset - set target count per class
target_per_class = 1200  # Same as benign samples
balanced_data = []
balanced_labels = []

# Add all benign samples
balanced_data.append(benign_sampled.drop(columns=['label']).values)
balanced_labels.extend([0] * len(benign_sampled))

# Get unique malware classes
malware_classes = sorted(malware_df['label'].unique())

for malware_class in malware_classes:
    # Get real samples for this class
    class_samples = malware_df[malware_df['label'] == malware_class].drop(columns=['label']).values
    class_count = len(class_samples)
    
    print(f"Processing malware class {malware_class}: {class_count} real samples available")
    
    if class_count >= target_per_class:
        # Downsample if we have too many
        selected_idx = np.random.choice(class_count, target_per_class, replace=False)
        balanced_data.append(class_samples[selected_idx])
    else:
        # Use all real samples
        balanced_data.append(class_samples)
        
        # Generate synthetic samples to reach the target
        needed = target_per_class - class_count
        print(f"  Generating {needed} synthetic samples")
        synthetic_samples = generate_samples(G_AB, class_samples, needed)
        balanced_data.append(synthetic_samples)
    
    # Add labels
    balanced_labels.extend([malware_class] * target_per_class)

# Combine all data
all_data = np.vstack(balanced_data)
final_df = pd.DataFrame(all_data, columns=benign_sampled.columns.drop('label'))
final_df['label'] = balanced_labels

# Save balanced dataset
final_df.to_csv("balanced_augmented_dataset.csv", index=False)
print(f"Final balanced dataset created with {len(final_df)} samples")
print("Class distribution:")
print(final_df['label'].value_counts())


In [None]:
# After generating synthetic samples for each malware class
final_dataset = pd.concat([benign_sampled, malware_df], ignore_index=True)

# Replace original data with generated malware samples as needed
final_dataset.to_csv("balanced_augmented_dataset.csv", index=False)
print("Final dataset saved.")
