In [2]:
!pip install -q pytorch_lightning


In [3]:
!pip install -q neptune

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.9/487.9 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m76.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.2/85.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for bravado-core (setup.py) ... [?25l[?25hdone


Import Libraries and Setup


In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from pytorch_lightning import Trainer
import neptune
import warnings
warnings.filterwarnings('ignore')

Data Loading and Preprocessing


In [5]:
# Load the Adult dataset from an ARFF file; `comment='@'` makes pandas treat the
# ARFF header lines (which start with '@') as comments, and the file has no header row.
data = pd.read_csv('/content/adult.arff', comment='@', header=None)
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
         'marital-status', 'occupation', 'relationship', 'race', 'sex',
         'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
data.columns = columns

# Work on a copy so the raw frame stays untouched.
data_clean = data.copy()
# NOTE: 'income' is listed as a categorical feature here, so it is one-hot encoded
# into `final_data` together with the other columns.
categorical_cols = ['workclass', 'education', 'marital-status', 'occupation',
                  'relationship', 'race', 'sex', 'native-country', 'income']
continuous_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week', 'education-num']

# Remove leading/trailing whitespace from every categorical value.
for col in categorical_cols:
   data_clean[col] = data_clean[col].str.strip()

continuous_data = data_clean[continuous_cols].copy()
categorical_data = data_clean[categorical_cols].copy()

# Standardize the continuous columns.
# NOTE(review): the scaler is fit on ALL rows before the train/test split below,
# so test-set statistics influence the scaling — confirm this is acceptable.
scaler = StandardScaler()
continuous_scaled = scaler.fit_transform(continuous_data)
continuous_scaled = pd.DataFrame(continuous_scaled, columns=continuous_cols)

# One-hot encode every categorical column (all levels kept) and build the final float matrix.
categorical_encoded = pd.get_dummies(categorical_data, drop_first=False)
final_data = pd.concat([continuous_scaled, categorical_encoded], axis=1).astype(float)

# 80/20 split with a fixed seed, then wrap both parts as float tensors.
X_train, X_test = train_test_split(final_data.values, test_size=0.2, random_state=42)
X_train_tensor = torch.FloatTensor(X_train)
X_test_tensor = torch.FloatTensor(X_test)

# Feature-only datasets: each item is a 1-tuple `(features,)`.
train_dataset = TensorDataset(X_train_tensor)
test_dataset = TensorDataset(X_test_tensor)

batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Width of the encoded feature vector and the device used by later cells.
input_dim = X_train.shape[1]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Regularized VAE Model


In [6]:
class RegularizedVAE(nn.Module):
    """Fully connected variational autoencoder with regularized hidden layers.

    Each hidden layer of the encoder and the decoder is
    ``Linear -> BatchNorm1d -> LeakyReLU(0.2) -> Dropout(0.3)``. The encoder
    trunk feeds two linear heads (posterior mean and log-variance); the decoder
    mirrors the encoder widths and finishes with a plain linear projection back
    to ``input_dim`` (no output activation).

    Args:
        input_dim: Number of input features.
        latent_dim: Size of the latent code.
        hidden_dims: Encoder hidden widths, in order; the decoder uses them reversed.
    """

    def __init__(self, input_dim, latent_dim=64, hidden_dims=[512, 256]):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim

        # Encoder trunk followed by the two posterior heads.
        enc_widths = list(hidden_dims)
        self.encoder = nn.Sequential(*self._hidden_stack(input_dim, enc_widths))
        trunk_out = enc_widths[-1] if enc_widths else input_dim
        self.fc_mu = nn.Linear(trunk_out, latent_dim)
        self.fc_logvar = nn.Linear(trunk_out, latent_dim)

        # Decoder mirrors the encoder, then projects back to the input space.
        dec_widths = enc_widths[::-1]
        dec_modules = self._hidden_stack(latent_dim, dec_widths)
        dec_out = dec_widths[-1] if dec_widths else latent_dim
        dec_modules.append(nn.Linear(dec_out, input_dim))
        self.decoder = nn.Sequential(*dec_modules)

        self.apply(self._init_weights)

    @staticmethod
    def _hidden_stack(in_features, widths):
        """Return the list of modules for a chain of regularized hidden layers."""
        modules = []
        for width in widths:
            modules += [
                nn.Linear(in_features, width),
                nn.BatchNorm1d(width),
                nn.LeakyReLU(0.2),
                nn.Dropout(0.3),
            ]
            in_features = width
        return modules

    def _init_weights(self, module):
        """Xavier-uniform weights and zero biases for every linear layer."""
        if not isinstance(module, nn.Linear):
            return
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

    def encode(self, x):
        """Return ``(mu, logvar)`` for ``x``; ``logvar`` is clamped to ``[-10, 3]``."""
        hidden = self.encoder(x)
        mu = self.fc_mu(hidden)
        logvar = self.fc_logvar(hidden).clamp(min=-10, max=3)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps`` drawn from a standard normal."""
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return mu + noise * std

    def decode(self, z):
        """Map latent codes back to the input space."""
        return self.decoder(z)

    def forward(self, x):
        """Encode, sample and decode; returns ``(x_recon, mu, logvar)``."""
        mu, logvar = self.encode(x)
        x_recon = self.decode(self.reparameterize(mu, logvar))
        return x_recon, mu, logvar

Early Stopping Utility


In [7]:
class EarlyStopping:
    """Stop training when a monitored loss stops improving.

    Call the instance once per epoch with the current loss and the model. A
    snapshot of the model weights is kept (on the CPU) every time the loss
    improves by more than ``min_delta``. Once ``patience`` consecutive calls
    have passed without such an improvement the call returns ``True`` and,
    if ``restore_best_weights`` is set, the best snapshot is loaded back into
    the model.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        min_delta: Minimum decrease of the loss that counts as an improvement.
        restore_best_weights: Whether to reload the best snapshot when stopping.
    """

    def __init__(self, patience=15, min_delta=0.001, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_loss = None      # lowest loss seen so far
        self.counter = 0           # consecutive calls without improvement
        self.best_weights = None   # CPU copy of the state_dict at best_loss

    def __call__(self, val_loss, model):
        """Record ``val_loss``; return ``True`` when training should stop."""
        if self.best_loss is None:
            # First call: everything is an improvement.
            self.best_loss = val_loss
            self.save_checkpoint(model)
        elif val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            self.save_checkpoint(model)
        else:
            self.counter += 1

        if self.counter >= self.patience:
            if self.restore_best_weights:
                self.restore_checkpoint(model)
            return True
        return False

    def save_checkpoint(self, model):
        """Keep an independent CPU copy of the model's current state_dict."""
        self.best_weights = {k: v.cpu().clone() for k, v in model.state_dict().items()}

    def restore_checkpoint(self, model):
        """Load the best snapshot back into ``model`` (no-op if none was saved)."""
        if self.best_weights is not None:
            # load_state_dict copies values into the existing parameters/buffers,
            # so the tensors end up on whatever device the model lives on. This
            # avoids assuming the model has an ``fc_mu`` layer to read the device from.
            model.load_state_dict(self.best_weights)

VAE Loss Function


In [8]:
def vae_loss_with_logging(x_recon, x, mu, logvar, beta=1.0):
    """Compute the beta-weighted VAE objective and its two components.

    Both terms are summed over all elements and divided by the batch size
    (``x.size(0)``), i.e. they are per-sample averages.

    Args:
        x_recon: Reconstruction produced by the decoder.
        x: Original input batch.
        mu: Posterior means.
        logvar: Posterior log-variances.
        beta: Weight applied to the KL term.

    Returns:
        Tuple ``(total_loss, recon_loss, kl_loss)`` of scalar tensors, where
        ``total_loss = recon_loss + beta * kl_loss``.
    """
    n_samples = x.size(0)

    # Squared-error reconstruction term.
    recon_loss = nn.functional.mse_loss(x_recon, x, reduction='sum') / n_samples

    # Closed-form KL divergence between N(mu, exp(logvar)) and N(0, I).
    kl_elements = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_loss = -0.5 * torch.sum(kl_elements) / n_samples

    return recon_loss + beta * kl_loss, recon_loss, kl_loss

VAE Training Function


In [9]:
def _run_vae_epoch(model, loader, beta, device, optimizer=None):
    """Run one pass over ``loader`` and return mean ``(total, recon, kl)`` per batch.

    Trains (with gradient clipping at norm 1.0) when ``optimizer`` is given,
    otherwise evaluates with gradients disabled.
    """
    training = optimizer is not None
    model.train(training)
    sums = [0.0, 0.0, 0.0]
    num_batches = 0

    with torch.set_grad_enabled(training):
        for (batch,) in loader:
            batch = batch.to(device)
            if training:
                optimizer.zero_grad()

            x_recon, mu, logvar = model(batch)
            losses = vae_loss_with_logging(x_recon, batch, mu, logvar, beta)

            if training:
                losses[0].backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

            for i, value in enumerate(losses):
                sums[i] += value.item()
            num_batches += 1

    return tuple(total / num_batches for total in sums)


def train_vae_with_early_stopping(model, train_loader, test_loader, neptune_run, epochs=200):
    """Train a VAE with KL warm-up, LR-on-plateau decay and early stopping.

    The KL weight ``beta`` rises linearly from 0 to 0.001 over the first 70% of
    ``epochs`` and then keeps rising, capped at 0.01. The scheduler and the
    early-stopping monitor both use the beta-weighted validation loss. When
    early stopping triggers, the best weights are restored by ``EarlyStopping``;
    if all epochs run, the last-epoch weights are returned.

    Args:
        model: VAE whose ``forward`` returns ``(x_recon, mu, logvar)``; it must
            already be on the device it should train on.
        train_loader: Loader yielding 1-tuples ``(features,)`` for training.
        test_loader: Loader yielding 1-tuples ``(features,)`` for validation.
        neptune_run: Neptune run used for metric logging, or a falsy value to disable.
        epochs: Maximum number of epochs.

    Returns:
        The trained ``model``.
    """
    # Train on the device the model already lives on instead of depending on a
    # notebook-level ``device`` global being defined (and matching the model).
    device = next(model.parameters()).device

    optimizer = optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-4)
    # The scheduler's ``verbose`` flag is deprecated in recent PyTorch releases;
    # learning-rate reductions are reported explicitly after each step below.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.7, patience=10
    )

    early_stopping = EarlyStopping(patience=20, min_delta=0.001)

    for epoch in range(epochs):
        # KL annealing schedule.
        if epoch < epochs * 0.7:
            beta = 0.001 * (epoch / (epochs * 0.7))
        else:
            beta = min(0.001 + 0.01 * ((epoch - epochs * 0.7) / (epochs * 0.3)), 0.01)

        avg_train_loss, avg_train_recon, avg_train_kl = _run_vae_epoch(
            model, train_loader, beta, device, optimizer
        )
        avg_val_loss, avg_val_recon, avg_val_kl = _run_vae_epoch(
            model, test_loader, beta, device
        )

        if neptune_run:
            neptune_run["vae/train/total_loss"].append(avg_train_loss)
            neptune_run["vae/train/reconstruction_loss"].append(avg_train_recon)
            neptune_run["vae/train/kl_loss"].append(avg_train_kl)
            neptune_run["vae/val/total_loss"].append(avg_val_loss)
            neptune_run["vae/val/reconstruction_loss"].append(avg_val_recon)
            neptune_run["vae/val/kl_loss"].append(avg_val_kl)
            neptune_run["vae/beta"].append(beta)
            neptune_run["vae/learning_rate"].append(optimizer.param_groups[0]['lr'])

            # L2 norm of the gradients left over from the last training batch
            # (i.e. measured after clipping).
            squared_norm = sum(
                p.grad.data.norm(2).item() ** 2
                for p in model.parameters()
                if p.grad is not None
            )
            neptune_run["vae/gradient_norm"].append(squared_norm ** (1. / 2))

        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(avg_val_loss)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after < lr_before:
            print(f"Epoch {epoch+1}: reducing learning rate to {lr_after:.4e}.")

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs}:")
            print(f"  Train Loss: {avg_train_loss:.4f} (Recon: {avg_train_recon:.4f}, KL: {avg_train_kl:.4f})")
            print(f"  Val Loss: {avg_val_loss:.4f} (Recon: {avg_val_recon:.4f}, KL: {avg_val_kl:.4f})")
            print(f"  Beta: {beta:.6f}, LR: {optimizer.param_groups[0]['lr']:.6f}")

        if early_stopping(avg_val_loss, model):
            print(f"Early stopping at epoch {epoch+1}")
            break

    return model

GAN Generator Model


In [10]:
class RegularizedGenerator(nn.Module):
    """MLP generator that maps noise vectors to latent codes.

    Each hidden layer is ``Linear -> BatchNorm1d -> LeakyReLU(0.2) ->
    Dropout(0.2)``; the final layer is a plain linear projection to
    ``latent_dim``.

    Args:
        noise_dim: Size of the input noise vector.
        latent_dim: Size of the generated latent code.
        hidden_dims: Hidden layer widths, in order.
    """

    def __init__(self, noise_dim, latent_dim, hidden_dims=[256, 512, 256]):
        super().__init__()
        widths = [noise_dim, *hidden_dims]

        blocks = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            blocks += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.LeakyReLU(0.2),
                nn.Dropout(0.2),
            ]
        blocks.append(nn.Linear(widths[-1], latent_dim))

        self.model = nn.Sequential(*blocks)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-uniform weights and zero biases for every linear layer."""
        if not isinstance(module, nn.Linear):
            return
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

    def forward(self, noise):
        """Return the latent codes generated from ``noise``."""
        return self.model(noise)

GAN Discriminator Model


In [11]:
class RegularizedDiscriminator(nn.Module):
    """MLP critic that scores latent codes with a single unbounded output.

    Each hidden layer is ``Linear -> LeakyReLU(0.2) -> Dropout(0.3)`` (no batch
    normalization); the final layer is a linear projection to one value.

    Args:
        latent_dim: Size of the latent codes being scored.
        hidden_dims: Hidden layer widths, in order.
    """

    def __init__(self, latent_dim, hidden_dims=[256, 128]):
        super().__init__()
        widths = [latent_dim, *hidden_dims]

        blocks = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            blocks += [nn.Linear(fan_in, fan_out), nn.LeakyReLU(0.2), nn.Dropout(0.3)]
        blocks.append(nn.Linear(widths[-1], 1))

        self.model = nn.Sequential(*blocks)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-uniform weights and zero biases for every linear layer."""
        if not isinstance(module, nn.Linear):
            return
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

    def forward(self, latent_code):
        """Return one score per row of ``latent_code``."""
        return self.model(latent_code)

Gradient Penalty Function


In [12]:
def gradient_penalty(discriminator, real_data, fake_data, device, labels=None):
    """WGAN-GP gradient penalty evaluated on random real/fake interpolations.

    Args:
        discriminator: Critic called as ``discriminator(x)`` or, when ``labels``
            is given, ``discriminator(x, labels)``.
        real_data: Batch of real samples; the first dimension is the batch.
        fake_data: Batch of generated samples with the same shape as ``real_data``.
        device: Device on which the interpolation coefficients are created.
        labels: Optional conditioning labels forwarded to the critic.

    Returns:
        Scalar tensor: mean over the batch of ``(||grad||_2 - 1) ** 2``. It is
        differentiable with respect to the critic's parameters.
    """
    batch_size = real_data.size(0)

    # One coefficient per sample, broadcastable over any number of feature dims.
    alpha = torch.rand(batch_size, *([1] * (real_data.dim() - 1)), device=device)
    alpha = alpha.expand_as(real_data)

    # Detach both endpoints: the penalty regularizes the critic only, so it must
    # not backpropagate into whatever produced the samples (e.g. the generator).
    interpolated = alpha * real_data.detach() + (1 - alpha) * fake_data.detach()
    interpolated.requires_grad_(True)

    if labels is not None:
        prob_interpolated = discriminator(interpolated, labels)
    else:
        prob_interpolated = discriminator(interpolated)

    # create_graph=True keeps the graph so the penalty itself can be backpropagated.
    gradients = torch.autograd.grad(
        outputs=prob_interpolated,
        inputs=interpolated,
        grad_outputs=torch.ones_like(prob_interpolated),
        create_graph=True,
    )[0]

    gradients = gradients.reshape(batch_size, -1)
    # Small epsilon keeps the square root differentiable at zero.
    gradients_norm = torch.sqrt(torch.sum(gradients ** 2, dim=1) + 1e-12)

    return ((gradients_norm - 1) ** 2).mean()

Regularized Latent GAN Model


In [13]:
class RegularizedLatentGAN_Pure(nn.Module):
    """WGAN-GP trained in the latent space of a frozen, pre-trained VAE.

    Real samples are the posterior means produced by the VAE encoder; the
    generator maps noise to latent codes and the critic scores latent codes.
    Synthetic data is obtained by decoding generated codes with the VAE decoder.

    Args:
        vae_model: Trained VAE exposing ``encode(x) -> (mu, logvar)`` and
            ``decode(z)``. Its parameters are frozen and it is kept in eval mode.
        neptune_run: Neptune run used for logging, or ``None`` to disable.
        noise_dim: Size of the generator's noise input.
        latent_dim: Size of the VAE latent code.
        lr: Critic learning rate; the generator uses ``lr / 2``.
    """

    def __init__(self, vae_model, neptune_run=None, noise_dim=64, latent_dim=64, lr=1e-4):
        super().__init__()
        self.vae_model = vae_model
        self.vae_model.eval()
        for param in self.vae_model.parameters():
            param.requires_grad = False

        self.generator = RegularizedGenerator(noise_dim, latent_dim)
        self.discriminator = RegularizedDiscriminator(latent_dim)

        self.noise_dim = noise_dim
        self.latent_dim = latent_dim
        self.lr = lr
        self.neptune_run = neptune_run

        self.lambda_gp = 10.0  # weight of the gradient penalty in the critic loss
        self.n_critic = 3      # the generator is updated on every n_critic-th batch

    def train(self, mode=True):
        """Switch generator/critic mode while keeping the frozen VAE in eval mode.

        Without this override ``.train()`` would also put the VAE into training
        mode, so its Dropout layers would perturb the "real" latents and its
        BatchNorm running statistics would drift during GAN training.
        """
        super().train(mode)
        self.vae_model.eval()
        return self

    def setup_optimizers(self, device):
        """Create the Adam optimizers for generator and critic (``device`` is unused)."""
        self.optimizer_g = torch.optim.Adam(self.generator.parameters(), lr=self.lr/2, betas=(0.5, 0.9), weight_decay=1e-5)
        self.optimizer_d = torch.optim.Adam(self.discriminator.parameters(), lr=self.lr, betas=(0.5, 0.9), weight_decay=1e-5)

    def training_step_manual(self, real_data, batch_idx, device):
        """Run one critic update and, every ``n_critic``-th batch, one generator update.

        Args:
            real_data: Batch of real feature vectors.
            batch_idx: Index of the batch within the epoch.
            device: Device on which the step is executed.

        Returns:
            Dict with float entries ``'d_loss'``, ``'g_loss'`` (0.0 when the
            generator was not updated on this batch) and ``'gp'``.
        """
        batch_size = real_data.size(0)

        real_data = real_data.to(device)

        # Real latents are the posterior means of the frozen encoder.
        with torch.no_grad():
            real_latent, _ = self.vae_model.encode(real_data)
            real_latent = real_latent.to(device)

        # ---- critic update ----
        self.optimizer_d.zero_grad()

        real_pred = self.discriminator(real_latent)

        noise = torch.randn(batch_size, self.noise_dim, device=device)
        # Detach once: nothing in the critic update may backpropagate into the generator.
        fake_latent = self.generator(noise).detach()
        fake_pred = self.discriminator(fake_latent)

        d_loss = -torch.mean(real_pred) + torch.mean(fake_pred)

        gp = gradient_penalty(self.discriminator, real_latent, fake_latent, device)
        d_loss = d_loss + self.lambda_gp * gp

        d_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.discriminator.parameters(), max_norm=1.0)
        self.optimizer_d.step()

        # ---- generator update (every n_critic-th batch) ----
        g_loss = torch.tensor(0.0, device=device)
        if batch_idx % self.n_critic == 0:
            self.optimizer_g.zero_grad()

            noise = torch.randn(batch_size, self.noise_dim, device=device)
            fake_latent = self.generator(noise)
            fake_pred = self.discriminator(fake_latent)

            g_loss = -torch.mean(fake_pred)

            g_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.generator.parameters(), max_norm=1.0)
            self.optimizer_g.step()

        if self.neptune_run and batch_idx % 10 == 0:
            self.neptune_run["gan/train/discriminator_loss"].append(d_loss.item())
            self.neptune_run["gan/train/generator_loss"].append(g_loss.item())
            self.neptune_run["gan/train/gradient_penalty"].append(gp.item())
            self.neptune_run["gan/train/real_pred_mean"].append(torch.mean(real_pred).item())
            self.neptune_run["gan/train/fake_pred_mean"].append(torch.mean(fake_pred).item())

        return {'d_loss': d_loss.item(), 'g_loss': g_loss.item(), 'gp': gp.item()}

    def generate_synthetic_data(self, num_samples, device):
        """Generate ``num_samples`` decoded samples as a NumPy array.

        Puts the model into eval mode and generates in chunks of at most 1000
        rows, decoding each chunk of generated latents with the VAE decoder.
        """
        self.eval()
        with torch.no_grad():
            batch_size = 1000
            all_samples = []

            for i in range(0, num_samples, batch_size):
                current_batch_size = min(batch_size, num_samples - i)
                noise = torch.randn(current_batch_size, self.noise_dim, device=device)
                fake_latent = self.generator(noise)
                synthetic_data = self.vae_model.decode(fake_latent)
                all_samples.append(synthetic_data.cpu().numpy())

            return np.vstack(all_samples)

Conditional VAE Model


In [14]:
class RegularizedConditionalVAE(nn.Module):
    """Conditional VAE: encoder and decoder both receive a one-hot class label.

    The one-hot label is concatenated to the features before encoding and to
    the latent code before decoding. Hidden layers are ``Linear -> BatchNorm1d
    -> LeakyReLU(0.2) -> Dropout(0.3)``; the decoder mirrors the encoder widths
    and ends in a plain linear projection to ``input_dim``.

    Args:
        input_dim: Number of input features (without the label).
        latent_dim: Size of the latent code.
        num_classes: Number of label classes used for the one-hot encoding.
        hidden_dims: Encoder hidden widths, in order; the decoder uses them reversed.
    """

    def __init__(self, input_dim, latent_dim=64, num_classes=2, hidden_dims=[512, 256]):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.num_classes = num_classes

        # Encoder trunk (features + one-hot label) and the two posterior heads.
        enc_widths = list(hidden_dims)
        enc_in = input_dim + num_classes
        self.encoder = nn.Sequential(*self._hidden_stack(enc_in, enc_widths))
        trunk_out = enc_widths[-1] if enc_widths else enc_in
        self.fc_mu = nn.Linear(trunk_out, latent_dim)
        self.fc_logvar = nn.Linear(trunk_out, latent_dim)

        # Decoder (latent code + one-hot label) mirrors the encoder widths.
        dec_widths = enc_widths[::-1]
        dec_in = latent_dim + num_classes
        dec_modules = self._hidden_stack(dec_in, dec_widths)
        dec_out = dec_widths[-1] if dec_widths else dec_in
        dec_modules.append(nn.Linear(dec_out, input_dim))
        self.decoder = nn.Sequential(*dec_modules)

        self.apply(self._init_weights)

    @staticmethod
    def _hidden_stack(in_features, widths):
        """Return the list of modules for a chain of regularized hidden layers."""
        modules = []
        for width in widths:
            modules += [
                nn.Linear(in_features, width),
                nn.BatchNorm1d(width),
                nn.LeakyReLU(0.2),
                nn.Dropout(0.3),
            ]
            in_features = width
        return modules

    def _init_weights(self, module):
        """Xavier-uniform weights and zero biases for every linear layer."""
        if not isinstance(module, nn.Linear):
            return
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

    def _append_labels(self, tensor, labels):
        """Concatenate the float one-hot encoding of ``labels`` to ``tensor``."""
        onehot = torch.nn.functional.one_hot(labels, self.num_classes).float()
        return torch.cat([tensor, onehot], dim=1)

    def encode(self, x, labels):
        """Return ``(mu, logvar)`` for ``x`` given ``labels``; ``logvar`` is clamped to ``[-10, 3]``."""
        hidden = self.encoder(self._append_labels(x, labels))
        mu = self.fc_mu(hidden)
        logvar = self.fc_logvar(hidden).clamp(min=-10, max=3)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps`` drawn from a standard normal."""
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return mu + noise * std

    def decode(self, z, labels):
        """Decode latent codes ``z`` conditioned on ``labels``."""
        return self.decoder(self._append_labels(z, labels))

    def forward(self, x, labels):
        """Encode, sample and decode; returns ``(x_recon, mu, logvar)``."""
        mu, logvar = self.encode(x, labels)
        x_recon = self.decode(self.reparameterize(mu, logvar), labels)
        return x_recon, mu, logvar

Conditional VAE Training Function


In [15]:
def train_conditional_vae_with_early_stopping(model, train_loader, neptune_run, epochs=150):
    """Train a conditional VAE with KL warm-up, LR-on-plateau decay and early stopping.

    The KL weight ``beta`` rises linearly from 0 to 0.001 over the first 70% of
    ``epochs`` and then keeps rising, capped at 0.01. There is no validation
    set here: the scheduler and the early-stopping monitor both use the mean
    beta-weighted *training* loss of the epoch.

    Args:
        model: Conditional VAE whose ``forward(x, labels)`` returns
            ``(x_recon, mu, logvar)``; it must already be on the device it
            should train on.
        train_loader: Loader yielding ``(features, labels)`` pairs.
        neptune_run: Neptune run used for metric logging, or a falsy value to disable.
        epochs: Maximum number of epochs.

    Returns:
        The trained ``model``.
    """
    # Train on the device the model already lives on instead of depending on a
    # notebook-level ``device`` global being defined (and matching the model).
    device = next(model.parameters()).device

    optimizer = optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.7, patience=10)
    early_stopping = EarlyStopping(patience=20, min_delta=0.001)

    for epoch in range(epochs):
        # KL annealing schedule.
        if epoch < epochs * 0.7:
            beta = 0.001 * (epoch / (epochs * 0.7))
        else:
            beta = min(0.001 + 0.01 * ((epoch - epochs * 0.7) / (epochs * 0.3)), 0.01)

        model.train()
        total_loss = 0
        total_recon = 0
        total_kl = 0
        num_batches = 0

        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)
            optimizer.zero_grad()
            x_recon, mu, logvar = model(features, labels)
            loss, recon_loss, kl_loss = vae_loss_with_logging(x_recon, features, mu, logvar, beta)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            total_recon += recon_loss.item()
            total_kl += kl_loss.item()
            num_batches += 1

        avg_loss = total_loss / num_batches
        avg_recon = total_recon / num_batches
        avg_kl = total_kl / num_batches

        if neptune_run:
            neptune_run["conditional_vae/train/total_loss"].append(avg_loss)
            neptune_run["conditional_vae/train/reconstruction_loss"].append(avg_recon)
            neptune_run["conditional_vae/train/kl_loss"].append(avg_kl)
            neptune_run["conditional_vae/beta"].append(beta)
            neptune_run["conditional_vae/learning_rate"].append(optimizer.param_groups[0]['lr'])

        scheduler.step(avg_loss)

        if (epoch + 1) % 15 == 0:
            print(f"Conditional VAE Epoch {epoch+1}: Loss = {avg_loss:.4f} (Recon: {avg_recon:.4f}, KL: {avg_kl:.4f}), Beta = {beta:.6f}")

        if early_stopping(avg_loss, model):
            print(f"Conditional VAE early stopping at epoch {epoch+1}")
            break

    return model

Conditional Data Setup


In [16]:
# Binary target: 1 where the income column equals '>50K', 0 otherwise.
income_labels = (data_clean['income'] == '>50K').astype(int).values
# For the conditional models the label is supplied separately, so 'income'
# is removed from the one-hot encoded features.
categorical_cols_no_income = [col for col in categorical_cols if col != 'income']
categorical_data_no_income = data_clean[categorical_cols_no_income].copy()
categorical_encoded_no_income = pd.get_dummies(categorical_data_no_income, drop_first=False)
# Reuses `continuous_scaled` computed in the preprocessing cell.
final_data_conditional = pd.concat([continuous_scaled, categorical_encoded_no_income], axis=1).astype(float)

# 80/20 split with a fixed seed, stratified on the income label.
X_train_cond, X_test_cond, y_train_cond, y_test_cond = train_test_split(
   final_data_conditional.values, income_labels, test_size=0.2, random_state=42, stratify=income_labels
)

# Labels are stored as int64 (LongTensor) because the models one-hot encode them.
X_train_cond_tensor = torch.FloatTensor(X_train_cond)
y_train_cond_tensor = torch.LongTensor(y_train_cond)
train_dataset_cond = TensorDataset(X_train_cond_tensor, y_train_cond_tensor)
train_loader_cond = DataLoader(train_dataset_cond, batch_size=batch_size, shuffle=True)

# Feature width without the label, and the number of label classes.
input_dim_cond = X_train_cond.shape[1]
num_classes = 2

Neptune Runs Initialization


In [17]:
# SECURITY: no API token is embedded in the notebook. When `api_token` is not
# passed, neptune.init_run() reads it from the NEPTUNE_API_TOKEN environment
# variable, so set that variable (e.g. from a secrets manager) before running
# this cell. The token that used to be hard-coded here was exposed in the
# notebook and should be revoked and regenerated.
NEPTUNE_PROJECT = "alon.sadot02/DeepLearning-task4"

# One run per model family so their metrics are tracked separately.
run_vae = neptune.init_run(
    project=NEPTUNE_PROJECT,
    tags=["regularized-VAE", "early-stopping", "adult-dataset"],
    name="Regularized-VAE-Training",
)

run_gan = neptune.init_run(
    project=NEPTUNE_PROJECT,
    tags=["regularized-GAN", "early-stopping", "WGAN-GP", "adult-dataset"],
    name="Regularized-GAN-Training",
)

run_cgan = neptune.init_run(
    project=NEPTUNE_PROJECT,
    tags=["regularized-cGAN", "early-stopping", "conditional", "adult-dataset"],
    name="Regularized-Conditional-GAN-Training",
)

[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/alon.sadot02/DeepLearning-task4/e/DEEP1-100
[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/alon.sadot02/DeepLearning-task4/e/DEEP1-101
[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/alon.sadot02/DeepLearning-task4/e/DEEP1-102


Hyperparameters Logging


In [18]:
# Record the VAE training configuration on its Neptune run.
run_vae["hyperparameters"] = dict(
    latent_dim=64,
    hidden_dims=[512, 256],
    learning_rate=5e-4,
    weight_decay=1e-4,
    dropout=0.3,
    early_stopping_patience=20,
    batch_size=batch_size,
    max_epochs=200,
    beta_max=0.01,
)

# Record the latent-GAN training configuration on its Neptune run.
run_gan["hyperparameters"] = dict(
    noise_dim=64,
    latent_dim=64,
    generator_hidden=[256, 512, 256],
    discriminator_hidden=[256, 128],
    learning_rate_g=5e-5,
    learning_rate_d=1e-4,
    weight_decay=1e-5,
    dropout=0.2,
    lambda_gp=10.0,
    n_critic=3,
    batch_size=batch_size,
    max_epochs=150,
)

Model Training


In [19]:
# Stage 1: train the unconditional VAE (validated on `test_loader`, logged to `run_vae`).
print("Training Regularized VAE with Early Stopping...")
regularized_vae = RegularizedVAE(input_dim, latent_dim=64).to(device)
regularized_vae = train_vae_with_early_stopping(regularized_vae, train_loader, test_loader, run_vae, epochs=200)

# Stage 2: train the conditional VAE on the label-free features (logged to `run_cgan`).
print("Training Regularized Conditional VAE with Early Stopping...")
regularized_conditional_vae = RegularizedConditionalVAE(input_dim_cond, latent_dim=64, num_classes=2).to(device)
regularized_conditional_vae = train_conditional_vae_with_early_stopping(regularized_conditional_vae, train_loader_cond, run_cgan, epochs=150)

print("Training Regularized GANs with Early Stopping...")

# Stage 3: build (but do not yet train) the latent GAN on top of the trained VAE;
# the constructor freezes the VAE's parameters.
regularized_gan = RegularizedLatentGAN_Pure(
    vae_model=regularized_vae,
    neptune_run=run_gan,
    noise_dim=64,
    latent_dim=64,
    lr=1e-4
)

# Both VAEs were already created with .to(device) above; these calls re-apply it.
print("Moving models to device...")
regularized_vae = regularized_vae.to(device)
regularized_conditional_vae = regularized_conditional_vae.to(device)

Training Regularized VAE with Early Stopping...
Epoch 10/200:
  Train Loss: 2.3635 (Recon: 2.3437, KL: 307.8863)
  Val Loss: 1.1809 (Recon: 1.1628, KL: 282.1476)
  Beta: 0.000064, LR: 0.000500
Epoch 20/200:
  Train Loss: 1.7174 (Recon: 1.6846, KL: 241.2368)
  Val Loss: 0.8437 (Recon: 0.8136, KL: 221.6266)
  Beta: 0.000136, LR: 0.000500
Epoch 30/200:
  Train Loss: 1.4690 (Recon: 1.4288, KL: 193.9276)
  Val Loss: 0.6401 (Recon: 0.6030, KL: 179.2718)
  Beta: 0.000207, LR: 0.000500
Epoch 40/200:
  Train Loss: 1.3244 (Recon: 1.2762, KL: 172.9607)
  Val Loss: 0.5234 (Recon: 0.4784, KL: 161.7161)
  Beta: 0.000279, LR: 0.000500
Epoch 50/200:
  Train Loss: 1.2383 (Recon: 1.1823, KL: 159.9030)
  Val Loss: 0.4489 (Recon: 0.3973, KL: 147.3377)
  Beta: 0.000350, LR: 0.000500
Epoch 60/200:
  Train Loss: 1.2041 (Recon: 1.1403, KL: 151.4395)
  Val Loss: 0.4445 (Recon: 0.3840, KL: 143.6146)
  Beta: 0.000421, LR: 0.000500
Epoch 70/200:
  Train Loss: 1.1748 (Recon: 1.1032, KL: 145.3891)
  Val Loss: 0.408

GAN Training Function with Early Stopping


In [20]:
def train_gan_with_early_stopping_fixed(gan_model, train_loader, device, epochs=150):
    """Train a latent GAN epoch by epoch with checkpoint-based early stopping.

    The monitored quantity is ``|mean critic loss| + |mean generator loss|`` of
    the epoch. Whenever it reaches a new minimum the model's state_dict is
    written to ``best_gan_model.pth``; after 30 consecutive epochs without a new
    minimum that checkpoint is loaded back and training stops.

    Args:
        gan_model: Model exposing ``setup_optimizers(device)``,
            ``training_step_manual(batch, batch_idx, device)`` (returning a dict
            with ``'d_loss'`` and ``'g_loss'``) and a ``neptune_run`` attribute.
        train_loader: Loader yielding 1-tuples ``(features,)``.
        device: Device on which training runs.
        epochs: Maximum number of epochs.

    Returns:
        The trained ``gan_model``.
    """
    gan_model = gan_model.to(device)
    gan_model.setup_optimizers(device)

    best_score = float('inf')
    stale_epochs = 0
    max_stale_epochs = 30

    print(f"Training GAN on device: {device}")

    for epoch in range(epochs):
        gan_model.train()

        critic_losses, generator_losses = [], []
        for step, (batch,) in enumerate(train_loader):
            stats = gan_model.training_step_manual(batch, step, device)
            critic_losses.append(stats['d_loss'])
            # A generator loss of exactly 0 marks a batch without a generator update.
            if stats['g_loss'] != 0:
                generator_losses.append(stats['g_loss'])

        avg_d_loss = np.mean(critic_losses)
        avg_g_loss = np.mean(generator_losses) if generator_losses else 0.0
        combined_loss = abs(avg_d_loss) + abs(avg_g_loss)

        run = gan_model.neptune_run
        if run:
            run["gan/epoch/discriminator_loss"].append(avg_d_loss)
            run["gan/epoch/generator_loss"].append(avg_g_loss)
            run["gan/epoch/combined_loss"].append(combined_loss)

        if combined_loss < best_score:
            # New best epoch: reset the patience counter and checkpoint to disk.
            best_score, stale_epochs = combined_loss, 0
            torch.save(gan_model.state_dict(), 'best_gan_model.pth')
        else:
            stale_epochs += 1

        if (epoch + 1) % 10 == 0:
            print(f"GAN Epoch {epoch+1}: D_loss = {avg_d_loss:.4f}, G_loss = {avg_g_loss:.4f}, Combined = {combined_loss:.4f}")

        if stale_epochs >= max_stale_epochs:
            print(f"GAN early stopping at epoch {epoch+1}")
            gan_model.load_state_dict(torch.load('best_gan_model.pth'))
            break

    return gan_model

GAN Training Execution


In [21]:
# NOTE(review): this Lightning Trainer is not passed to the training call below,
# which runs a hand-written loop instead — confirm whether `trainer_gan` is used
# by any later cell before removing it.
trainer_gan = Trainer(
   max_epochs=1,
   accelerator='gpu' if torch.cuda.is_available() else 'cpu',
   devices=1,
   logger=False,
   enable_checkpointing=False,
   enable_progress_bar=False
)

# Train the unconditional latent GAN on the feature-only loader.
regularized_gan = train_gan_with_early_stopping_fixed(regularized_gan, train_loader, device, epochs=150)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Training GAN on device: cuda
GAN Epoch 10: D_loss = -1.3229, G_loss = 1.2850, Combined = 2.6079
GAN Epoch 20: D_loss = -1.4638, G_loss = 1.6194, Combined = 3.0831
GAN Epoch 30: D_loss = -1.3088, G_loss = 1.3141, Combined = 2.6229
GAN early stopping at epoch 37


Conditional GAN Models


In [22]:
class RegularizedConditionalGenerator(nn.Module):
    """MLP generator mapping (noise, class label) to a latent code.

    The label is one-hot encoded and concatenated to the noise vector. Each
    hidden layer is ``Linear -> BatchNorm1d -> LeakyReLU(0.2) -> Dropout(0.2)``;
    the final layer is a plain linear projection to ``latent_dim``.

    Args:
        noise_dim: Size of the input noise vector.
        latent_dim: Size of the generated latent code.
        num_classes: Number of label classes used for the one-hot encoding.
        hidden_dims: Hidden layer widths, in order.
    """

    def __init__(self, noise_dim, latent_dim, num_classes, hidden_dims=[256, 512, 256]):
        super().__init__()
        self.num_classes = num_classes

        # The first layer sees the noise vector plus the one-hot label.
        widths = [noise_dim + num_classes, *hidden_dims]
        blocks = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            blocks += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.LeakyReLU(0.2),
                nn.Dropout(0.2),
            ]
        blocks.append(nn.Linear(widths[-1], latent_dim))

        self.model = nn.Sequential(*blocks)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-uniform weights and zero biases for every linear layer."""
        if not isinstance(module, nn.Linear):
            return
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

    def forward(self, noise, labels):
        """Return latent codes generated from ``noise`` conditioned on ``labels``."""
        onehot = torch.nn.functional.one_hot(labels, self.num_classes).float()
        return self.model(torch.cat([noise, onehot], dim=1))

class RegularizedConditionalDiscriminator(nn.Module):
    """MLP critic scoring (latent code, class label) pairs with one unbounded output.

    The label is one-hot encoded and concatenated to the latent code. Each
    hidden layer is ``Linear -> LeakyReLU(0.2) -> Dropout(0.3)`` (no batch
    normalization); the final layer is a linear projection to one value.

    Args:
        latent_dim: Size of the latent codes being scored.
        num_classes: Number of label classes used for the one-hot encoding.
        hidden_dims: Hidden layer widths, in order.
    """

    def __init__(self, latent_dim, num_classes, hidden_dims=[256, 128]):
        super().__init__()
        self.num_classes = num_classes

        # The first layer sees the latent code plus the one-hot label.
        widths = [latent_dim + num_classes, *hidden_dims]
        blocks = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            blocks += [nn.Linear(fan_in, fan_out), nn.LeakyReLU(0.2), nn.Dropout(0.3)]
        blocks.append(nn.Linear(widths[-1], 1))

        self.model = nn.Sequential(*blocks)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-uniform weights and zero biases for every linear layer."""
        if not isinstance(module, nn.Linear):
            return
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

    def forward(self, latent_code, labels):
        """Return one score per row of ``latent_code`` conditioned on ``labels``."""
        onehot = torch.nn.functional.one_hot(labels, self.num_classes).float()
        return self.model(torch.cat([latent_code, onehot], dim=1))

Conditional Latent GAN Model


In [23]:
class RegularizedConditionalLatentGAN_Pure(nn.Module):
    """Conditional GAN trained in the latent space of a frozen conditional VAE.

    The critic learns to separate VAE-encoded real rows from generator outputs
    (both conditioned on the class label) using a Wasserstein-style loss plus a
    penalty term computed by ``gradient_penalty`` (defined elsewhere in the
    notebook). Generated latents are mapped back to data space with the VAE
    decoder.

    Args:
        conditional_vae_model: module exposing ``encode(x, labels)`` (returning a
            2-tuple whose first item is used as the latent code) and
            ``decode(z, labels)``. Its parameters are frozen here.
        neptune_run: optional logger supporting ``run[key].append(value)``;
            nothing is logged when it is falsy.
        noise_dim: width of the generator's noise input.
        latent_dim: width of the latent code produced by the generator and
            scored by the critic.
        num_classes: number of label classes used for conditioning.
        lr: critic learning rate; the generator uses half of this value.
    """

    def __init__(self, conditional_vae_model, neptune_run=None, noise_dim=64, latent_dim=64, num_classes=2, lr=1e-4):
        super().__init__()
        self.conditional_vae_model = conditional_vae_model
        self.conditional_vae_model.eval()
        for param in self.conditional_vae_model.parameters():
            param.requires_grad = False

        self.generator = RegularizedConditionalGenerator(noise_dim, latent_dim, num_classes)
        self.discriminator = RegularizedConditionalDiscriminator(latent_dim, num_classes)

        self.noise_dim = noise_dim
        self.latent_dim = latent_dim
        self.num_classes = num_classes
        self.lr = lr
        self.neptune_run = neptune_run

        # Weight of the gradient-penalty term in the critic loss.
        self.lambda_gp = 10.0
        # The generator is updated once every ``n_critic`` batches.
        self.n_critic = 3

    def train(self, mode=True):
        """Set train/eval mode on the GAN while keeping the frozen VAE in eval mode.

        The VAE is a registered sub-module, so the default ``nn.Module.train``
        would switch it back to training mode (re-enabling dropout and
        batch-norm statistic updates, if it has any) every time the training
        loop calls ``model.train()``. Re-asserting ``eval()`` here keeps the
        encoder/decoder deterministic for the whole GAN training run.
        """
        super().train(mode)
        self.conditional_vae_model.eval()
        return self

    def setup_optimizers(self, device):
        """Create the Adam optimizers for generator and critic.

        ``device`` is accepted for call-site compatibility and is not used.
        """
        self.optimizer_g = torch.optim.Adam(self.generator.parameters(), lr=self.lr/2, betas=(0.5, 0.9), weight_decay=1e-5)
        self.optimizer_d = torch.optim.Adam(self.discriminator.parameters(), lr=self.lr, betas=(0.5, 0.9), weight_decay=1e-5)

    def training_step_manual(self, real_data, labels, batch_idx, device):
        """Run one critic update and, every ``n_critic``-th batch, one generator update.

        Args:
            real_data: batch of real rows accepted by the VAE encoder.
            labels: integer class labels for ``real_data``.
            batch_idx: index of the batch within the epoch; controls the
                generator update schedule and the logging cadence.
            device: device the batch is moved to.

        Returns:
            ``{'d_loss': float, 'g_loss': float}``. ``g_loss`` is ``0.0`` on
            batches where the generator was not updated.
        """
        batch_size = real_data.size(0)

        real_data = real_data.to(device)
        labels = labels.to(device)

        # Real latents come from the frozen encoder; no graph is needed for them.
        with torch.no_grad():
            real_latent, _ = self.conditional_vae_model.encode(real_data, labels)
            real_latent = real_latent.to(device)

        # ---- critic update ----
        self.optimizer_d.zero_grad()

        real_pred = self.discriminator(real_latent, labels)

        noise = torch.randn(batch_size, self.noise_dim, device=device)
        fake_latent = self.generator(noise, labels)
        # Detached so the critic loss does not back-propagate into the generator.
        fake_pred = self.discriminator(fake_latent.detach(), labels)

        d_loss = -torch.mean(real_pred) + torch.mean(fake_pred)

        gp = gradient_penalty(self.discriminator, real_latent, fake_latent, device, labels)
        d_loss += self.lambda_gp * gp

        d_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.discriminator.parameters(), max_norm=1.0)
        self.optimizer_d.step()

        # ---- generator update (only every n_critic-th batch) ----
        g_loss = torch.tensor(0.0, device=device)
        if batch_idx % self.n_critic == 0:
            self.optimizer_g.zero_grad()

            noise = torch.randn(batch_size, self.noise_dim, device=device)
            fake_latent = self.generator(noise, labels)
            fake_pred = self.discriminator(fake_latent, labels)

            g_loss = -torch.mean(fake_pred)

            g_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.generator.parameters(), max_norm=1.0)
            self.optimizer_g.step()

        # Batch-level logging every 10th batch. Note that generator_loss is
        # logged as 0.0 on batches where the generator was not stepped.
        if self.neptune_run and batch_idx % 10 == 0:
            self.neptune_run["conditional_gan/train/discriminator_loss"].append(d_loss.item())
            self.neptune_run["conditional_gan/train/generator_loss"].append(g_loss.item())
            self.neptune_run["conditional_gan/train/gradient_penalty"].append(gp.item())

        return {'d_loss': d_loss.item(), 'g_loss': g_loss.item()}

    def generate_conditional_synthetic_data(self, num_samples, target_labels, device):
        """Generate decoded synthetic rows, one per requested label.

        Args:
            num_samples: number of rows requested.
            target_labels: sequence of integer class labels; row ``i`` of the
                result is conditioned on ``target_labels[i]``.
            device: device on which noise and labels are created.

        Returns:
            A NumPy array stacking the decoder outputs. If ``target_labels`` is
            shorter than ``num_samples`` only ``len(target_labels)`` rows are
            produced.

        Raises:
            ValueError: if there is nothing to generate (zero samples or an
                empty label sequence).
        """
        # Clamp once up front; the per-batch clamp used previously could
        # produce empty or negative-sized batches when labels ran out early.
        total = min(num_samples, len(target_labels))
        if total <= 0:
            raise ValueError("need at least one sample and one target label to generate data")

        self.eval()
        batch_size = 1000
        all_samples = []

        with torch.no_grad():
            for start_idx in range(0, total, batch_size):
                end_idx = min(start_idx + batch_size, total)

                noise = torch.randn(end_idx - start_idx, self.noise_dim, device=device)
                # one_hot in the generator requires integer (long) labels.
                batch_labels = torch.tensor(target_labels[start_idx:end_idx], dtype=torch.long, device=device)
                fake_latent = self.generator(noise, batch_labels)
                synthetic_data = self.conditional_vae_model.decode(fake_latent, batch_labels)
                all_samples.append(synthetic_data.cpu().numpy())

        return np.vstack(all_samples)

Conditional GAN Training Function


In [24]:
def train_conditional_gan_with_early_stopping_fixed(cgan_model, train_loader, device, epochs=150,
                                                    patience=30, checkpoint_path='best_cgan_model.pth'):
    """Train the conditional GAN with early stopping and return the best checkpoint.

    After every epoch the sum of the absolute mean critic loss and the absolute
    mean generator loss is compared with the best value seen so far. An
    improvement saves the model's ``state_dict`` to ``checkpoint_path``;
    ``patience`` consecutive epochs without improvement stop training.

    The best saved weights are restored before returning, both when early
    stopping triggers and when all ``epochs`` complete (previously the latter
    case returned the last-epoch weights).

    Args:
        cgan_model: model exposing ``setup_optimizers(device)``,
            ``training_step_manual(data, labels, batch_idx, device)`` returning
            ``{'d_loss', 'g_loss'}``, and a ``neptune_run`` attribute.
        train_loader: iterable of ``(data, labels)`` batches.
        device: device to train on.
        epochs: maximum number of epochs.
        patience: epochs without improvement tolerated before stopping.
        checkpoint_path: file the best ``state_dict`` is written to.

    Returns:
        ``cgan_model`` (moved to ``device``) carrying the best saved weights.
    """
    cgan_model = cgan_model.to(device)
    cgan_model.setup_optimizers(device)

    best_combined_loss = float('inf')
    patience_counter = 0
    checkpoint_saved = False

    print(f"Training Conditional GAN on device: {device}")

    for epoch in range(epochs):
        epoch_d_losses = []
        epoch_g_losses = []

        cgan_model.train()

        for batch_idx, (batch_data, batch_labels) in enumerate(train_loader):
            result = cgan_model.training_step_manual(batch_data, batch_labels, batch_idx, device)
            epoch_d_losses.append(result['d_loss'])
            # g_loss is reported as 0 on batches without a generator step.
            if result['g_loss'] != 0:
                epoch_g_losses.append(result['g_loss'])

        avg_d_loss = np.mean(epoch_d_losses)
        avg_g_loss = np.mean(epoch_g_losses) if epoch_g_losses else 0.0

        combined_loss = abs(avg_d_loss) + abs(avg_g_loss)

        if cgan_model.neptune_run:
            cgan_model.neptune_run["conditional_gan/epoch/discriminator_loss"].append(avg_d_loss)
            cgan_model.neptune_run["conditional_gan/epoch/generator_loss"].append(avg_g_loss)
            cgan_model.neptune_run["conditional_gan/epoch/combined_loss"].append(combined_loss)

        if combined_loss < best_combined_loss:
            best_combined_loss = combined_loss
            patience_counter = 0
            torch.save(cgan_model.state_dict(), checkpoint_path)
            checkpoint_saved = True
        else:
            patience_counter += 1

        if (epoch + 1) % 10 == 0:
            print(f"Conditional GAN Epoch {epoch+1}: D_loss = {avg_d_loss:.4f}, G_loss = {avg_g_loss:.4f}")

        if patience_counter >= patience:
            print(f"Conditional GAN early stopping at epoch {epoch+1}")
            break

    # Restore the best weights regardless of how the loop ended; skip when no
    # epoch ever improved (e.g. empty loader) so we never read a stale file.
    if checkpoint_saved:
        cgan_model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    return cgan_model

Conditional GAN Training Execution


In [25]:
# Build the latent-space conditional GAN around the conditional VAE from an
# earlier cell (the constructor freezes the VAE's parameters). `run_cgan` is
# the logger handle the model appends its training metrics to.
regularized_cgan = RegularizedConditionalLatentGAN_Pure(
    conditional_vae_model=regularized_conditional_vae,
    neptune_run=run_cgan,
    noise_dim=64,
    latent_dim=64,
    num_classes=2,
    lr=1e-4
)

# Train for at most 150 epochs; the same name is rebound to the returned model.
regularized_cgan = train_conditional_gan_with_early_stopping_fixed(regularized_cgan, train_loader_cond, device, epochs=150)

Training Conditional GAN on device: cuda
Conditional GAN Epoch 10: D_loss = -1.4239, G_loss = 1.7963
Conditional GAN Epoch 20: D_loss = -1.3716, G_loss = 1.8531
Conditional GAN Epoch 30: D_loss = -1.2162, G_loss = 1.6927
Conditional GAN early stopping at epoch 37


Synthetic Data Generation


In [26]:
print("Generating synthetic data from regularized models...")

def generate_regularized_synthetic_datasets(gan_model, cgan_model, train_size, y_train_cond):
   """Sample one unconditional and one class-conditional synthetic dataset.

   The conditional dataset reproduces the class balance of ``y_train_cond``:
   for label 0 and label 1 it requests as many rows as that label occurs in
   ``y_train_cond``, then shuffles rows and labels together.

   Reads the notebook-level ``device`` variable.

   Args:
       gan_model: model exposing ``generate_synthetic_data(n, device)``.
       cgan_model: model exposing
           ``generate_conditional_synthetic_data(n, labels, device)``.
       train_size: number of unconditional rows to generate.
       y_train_cond: non-negative integer labels accepted by ``np.bincount``.

   Returns:
       ``(regular_synthetic_data, conditional_synthetic_data,
       conditional_synthetic_labels)``.
   """
   # Unconditional samples are drawn first, before any conditional sampling.
   regular_synthetic_data = gan_model.generate_synthetic_data(train_size, device)

   counts = np.bincount(y_train_cond)
   rows_per_class = ((0, counts[0]), (1, counts[1]))

   sample_blocks = []
   label_blocks = []
   for class_id, n_rows in rows_per_class:
       sample_blocks.append(
           cgan_model.generate_conditional_synthetic_data(n_rows, [class_id] * n_rows, device)
       )
       label_blocks.append(np.full(n_rows, class_id, dtype=int))

   conditional_synthetic_data = np.vstack(sample_blocks)
   conditional_synthetic_labels = np.concatenate(label_blocks)

   # One shared permutation keeps every row paired with its label.
   order = np.random.permutation(len(conditional_synthetic_data))

   return regular_synthetic_data, conditional_synthetic_data[order], conditional_synthetic_labels[order]

# Generate both synthetic datasets: len(X_train) unconditional rows, and a
# conditional set whose per-class row counts follow y_train_cond.
regularized_regular_synthetic, regularized_conditional_synthetic, regularized_conditional_labels = generate_regularized_synthetic_datasets(
  regularized_gan, regularized_cgan, len(X_train), y_train_cond
)

Generating synthetic data from regularized models...


Data Format Conversion Functions


In [27]:
# Continuous columns that are stored as non-negative whole numbers.
_INTEGER_VALUED_COLS = ('age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week')


def _decode_synthetic_features(synthetic_array, scaler, categorical_cols, continuous_cols, encoded_cols):
  """Split a synthetic matrix into un-scaled continuous and decoded categorical frames.

  The first ``len(continuous_cols)`` columns of ``synthetic_array`` are passed
  through ``scaler.inverse_transform``; the remaining columns are treated as
  one-hot groups named ``'<feature>_<category>'`` (``encoded_cols``) and each
  group is collapsed to the category with the largest value.
  """
  n_continuous = len(continuous_cols)
  continuous_df = pd.DataFrame(
      scaler.inverse_transform(synthetic_array[:, :n_continuous]),
      columns=continuous_cols,
  )

  encoded_cols = list(encoded_cols)
  encoded_df = pd.DataFrame(synthetic_array[:, n_continuous:], columns=encoded_cols)

  categorical_df = pd.DataFrame()
  for cat_col in categorical_cols:
      prefix = f'{cat_col}_'
      group = [col for col in encoded_cols if col.startswith(prefix)]
      if not group:
          continue
      # Slice off only the leading prefix. str.replace would also delete any
      # later occurrence of the prefix inside the category name itself.
      categories = [col[len(prefix):] for col in group]
      winners = np.argmax(encoded_df[group].values, axis=1)
      categorical_df[cat_col] = [categories[i] for i in winners]

  return continuous_df, categorical_df


def _finalize_original_frame(continuous_df, categorical_df, continuous_cols, categorical_cols):
  """Join both halves, order columns, and cast whole-number columns to int."""
  result_df = pd.concat([continuous_df, categorical_df], axis=1)
  result_df = result_df[continuous_cols + categorical_cols].copy()

  for col in continuous_cols:
      if col in _INTEGER_VALUED_COLS:
          # Negative values are set to zero before rounding.
          result_df[col] = result_df[col].clip(lower=0).round().astype(int)

  return result_df


def convert_to_original_format(synthetic_array, scaler, categorical_cols, continuous_cols, final_data):
  """Convert a synthetic matrix (scaled continuous + one-hot columns) to a readable frame.

  Args:
      synthetic_array: 2-D array; continuous columns first, one-hot columns after.
      scaler: object with ``inverse_transform`` for the continuous columns.
      categorical_cols: list of categorical feature names to reconstruct.
      continuous_cols: list of continuous feature names, in array order.
      final_data: frame whose columns after the first ``len(continuous_cols)``
          name the one-hot columns of ``synthetic_array``.

  Returns:
      DataFrame with columns ``continuous_cols + categorical_cols``.

  Raises:
      KeyError: if a categorical feature has no matching one-hot columns.
  """
  continuous_df, categorical_df = _decode_synthetic_features(
      synthetic_array, scaler, categorical_cols, continuous_cols,
      final_data.columns[len(continuous_cols):],
  )
  return _finalize_original_frame(continuous_df, categorical_df, continuous_cols, categorical_cols)

def convert_conditional_to_original_format(synthetic_array, labels, scaler, categorical_cols, continuous_cols, final_data_conditional):
  """Same as ``convert_to_original_format`` for data generated without an income column.

  ``income`` is not decoded from ``synthetic_array``; it is filled from
  ``labels`` instead (``0`` -> ``'<=50K'``, anything else -> ``'>50K'``).

  Args:
      synthetic_array: 2-D array; continuous columns first, one-hot columns after.
      labels: one conditioning label per row of ``synthetic_array``.
      scaler: object with ``inverse_transform`` for the continuous columns.
      categorical_cols: list of categorical feature names; ``'income'`` is
          skipped during decoding.
      continuous_cols: list of continuous feature names, in array order.
      final_data_conditional: frame whose columns after the first
          ``len(continuous_cols)`` name the one-hot columns of ``synthetic_array``.

  Returns:
      DataFrame with columns ``continuous_cols + categorical_cols``.
  """
  categorical_cols_no_income = [col for col in categorical_cols if col != 'income']
  continuous_df, categorical_df = _decode_synthetic_features(
      synthetic_array, scaler, categorical_cols_no_income, continuous_cols,
      final_data_conditional.columns[len(continuous_cols):],
  )

  categorical_df['income'] = ['<=50K' if label == 0 else '>50K' for label in labels]

  return _finalize_original_frame(continuous_df, categorical_df, continuous_cols, categorical_cols)

Data Conversion and Constraints


In [28]:
# Map both synthetic matrices back to readable frames. The conditional variant
# takes its income column from the conditioning labels instead of decoding it.
regularized_dataset1_original = convert_to_original_format(regularized_regular_synthetic, scaler, categorical_cols, continuous_cols, final_data)
regularized_dataset2_original = convert_conditional_to_original_format(regularized_conditional_synthetic, regularized_conditional_labels, scaler, categorical_cols, continuous_cols, final_data_conditional)

def minimal_domain_constraints(synthetic_data):
   """Return a copy of ``synthetic_data`` with basic range and type constraints applied.

   * ``age`` is clipped to [17, 90] and ``hours-per-week`` to [1, 99].
   * ``fnlwgt``, ``capital-gain`` and ``capital-loss`` are clipped at 0 from below.
   * Those columns plus ``education-num`` are rounded and cast to ``int``.

   Columns that are absent are skipped; the input frame is not modified.
   """
   clip_limits = {
       'age': {'lower': 17, 'upper': 90},
       'hours-per-week': {'lower': 1, 'upper': 99},
       'fnlwgt': {'lower': 0},
       'capital-gain': {'lower': 0},
       'capital-loss': {'lower': 0},
   }
   whole_number_cols = ('age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week')

   processed = synthetic_data.copy()

   # Range constraints first ...
   for name, limits in clip_limits.items():
       if name not in processed.columns:
           continue
       processed[name] = processed[name].clip(**limits)

   # ... then integer casting of the whole-number columns.
   for name in whole_number_cols:
       if name not in processed.columns:
           continue
       processed[name] = processed[name].round().astype(int)

   return processed

# Apply the range/integer constraints to both synthetic datasets.
regularized_dataset1_final = minimal_domain_constraints(regularized_dataset1_original)
regularized_dataset2_final = minimal_domain_constraints(regularized_dataset2_original)

# Persist the two synthetic datasets and the cleaned real data as CSV files
# (written relative to the current working directory, without the index).
regularized_dataset1_final.to_csv("regularized_synthetic_adult_regular_gan.csv", index=False)
regularized_dataset2_final.to_csv("regularized_synthetic_adult_conditional_gan.csv", index=False)
data_clean.to_csv("original_adult_reference.csv", index=False)

Evaluation Functions


In [29]:
def detection_evaluation_4fold(original_training_data, synthetic_data):
  """Estimate how easily a classifier can tell real rows from synthetic rows.

  Both frames are label-encoded with encoders fitted on their combined values,
  down-sampled to the same size, and split into 4 folds. For each fold a random
  forest is trained on the other three folds (real rows labelled 0, synthetic
  rows labelled 1) and scored by ROC AUC on the held-out fold. An AUC close to
  0.5 means the synthetic data is hard to distinguish from the real data.

  Two defects of the earlier version are fixed here:

  * training data was assembled from the *train* partitions of the other three
    splits, each of which contains the current fold's test rows, so the model
    was evaluated on rows it had been trained on;
  * real and synthetic frames were encoded with independently fitted encoders,
    so one category could map to different integers in the two frames.

  Args:
      original_training_data: DataFrame of real rows.
      synthetic_data: DataFrame of synthetic rows with the same columns.

  Returns:
      ``(mean_auc, auc_scores)`` where ``auc_scores`` holds the 4 per-fold AUCs.
  """
  from sklearn.model_selection import KFold

  def prepare_ml_data(real_df, synth_df):
      """Integer-encode object columns using one encoder per column for both frames."""
      real_ml = real_df.copy()
      synth_ml = synth_df.copy()

      object_cols = list(real_df.select_dtypes(include=['object']).columns)
      object_cols += [col for col in synth_df.select_dtypes(include=['object']).columns
                      if col not in object_cols]

      for col in object_cols:
          holders = [frame for frame in (real_ml, synth_ml) if col in frame.columns]
          le = LabelEncoder()
          le.fit(pd.concat([frame[col].astype(str) for frame in holders], ignore_index=True))
          for frame in holders:
              frame[col] = le.transform(frame[col].astype(str))

      return real_ml, synth_ml

  original_ml, synthetic_ml = prepare_ml_data(original_training_data, synthetic_data)

  # Balance the two classes of the detection task.
  min_size = min(len(original_ml), len(synthetic_ml))
  real_sample = original_ml.sample(n=min_size, random_state=42).reset_index(drop=True)
  synthetic_sample = synthetic_ml.sample(n=min_size, random_state=42).reset_index(drop=True)

  kf = KFold(n_splits=4, shuffle=True, random_state=42)
  real_splits = list(kf.split(real_sample))
  synthetic_splits = list(kf.split(synthetic_sample))

  auc_scores = []

  for fold, (real_split, synthetic_split) in enumerate(zip(real_splits, synthetic_splits)):
      real_train_idx, real_test_idx = real_split
      synthetic_train_idx, synthetic_test_idx = synthetic_split

      # Each split's train indices are exactly the complement of its test
      # indices, i.e. "the other three folds" with no overlap.
      real_train = real_sample.iloc[real_train_idx]
      real_test = real_sample.iloc[real_test_idx]
      synthetic_train = synthetic_sample.iloc[synthetic_train_idx]
      synthetic_test = synthetic_sample.iloc[synthetic_test_idx]

      X_train = pd.concat([real_train, synthetic_train], ignore_index=True)
      y_train = np.concatenate([np.zeros(len(real_train)), np.ones(len(synthetic_train))])

      X_test = pd.concat([real_test, synthetic_test], ignore_index=True)
      y_test = np.concatenate([np.zeros(len(real_test)), np.ones(len(synthetic_test))])

      train_shuffle_idx = np.random.RandomState(42 + fold).permutation(len(X_train))
      test_shuffle_idx = np.random.RandomState(42 + fold).permutation(len(X_test))

      X_train = X_train.iloc[train_shuffle_idx].reset_index(drop=True)
      y_train = y_train[train_shuffle_idx]
      X_test = X_test.iloc[test_shuffle_idx].reset_index(drop=True)
      y_test = y_test[test_shuffle_idx]

      rf = RandomForestClassifier(n_estimators=100, random_state=42 + fold, n_jobs=-1)
      rf.fit(X_train, y_train)

      y_pred_proba = rf.predict_proba(X_test)[:, 1]
      auc_scores.append(roc_auc_score(y_test, y_pred_proba))

  mean_auc = np.mean(auc_scores)
  return mean_auc, auc_scores

def efficacy_evaluation(original_train_data, original_test_data, synthetic_data):
    """Compare a classifier trained on synthetic rows with one trained on real rows.

    Two random forests predict ``income``: a baseline trained on
    ``original_train_data`` and a second one trained on ``synthetic_data``.
    Both are scored by ROC AUC on ``original_test_data``.

    Args:
        original_train_data: real training rows, including an ``income`` column.
        original_test_data: real held-out rows, including an ``income`` column.
        synthetic_data: synthetic rows with the same columns.

    Returns:
        ``(baseline_auc, synthetic_auc, efficacy_ratio)`` where the ratio is
        ``synthetic_auc / baseline_auc``.
    """
    def prepare_ml_data_consistent(train_df, test_df, synthetic_df):
        """Label-encode object columns with encoders fitted on all three frames."""
        all_data = pd.concat([train_df, test_df, synthetic_df], ignore_index=True)
        categorical_cols_eval = train_df.select_dtypes(include=['object']).columns
        label_encoders = {}

        for col in categorical_cols_eval:
            le = LabelEncoder()
            le.fit(all_data[col].astype(str))
            label_encoders[col] = le

        train_ml = train_df.copy()
        test_ml = test_df.copy()
        synthetic_ml = synthetic_df.copy()

        for col in categorical_cols_eval:
            train_ml[col] = label_encoders[col].transform(train_df[col].astype(str))
            test_ml[col] = label_encoders[col].transform(test_df[col].astype(str))
            synthetic_ml[col] = label_encoders[col].transform(synthetic_df[col].astype(str))

        return train_ml, test_ml, synthetic_ml

    def positive_class_scores(model, features, positive_label):
        """Predicted probability of ``positive_label``, or zeros if the model never saw it."""
        known_classes = list(model.classes_)
        if positive_label not in known_classes:
            # A model trained on a single class has no column for the other
            # one; report an uninformative constant score instead of crashing.
            return np.zeros(len(features))
        return model.predict_proba(features)[:, known_classes.index(positive_label)]

    original_train_ml, original_test_ml, synthetic_ml = prepare_ml_data_consistent(
        original_train_data, original_test_data, synthetic_data
    )

    X_train_orig = original_train_ml.drop('income', axis=1)
    y_train_orig = original_train_ml['income']
    X_test_orig = original_test_ml.drop('income', axis=1)
    y_test_orig = original_test_ml['income']

    rf_baseline = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    rf_baseline.fit(X_train_orig, y_train_orig)
    baseline_auc = roc_auc_score(y_test_orig, rf_baseline.predict_proba(X_test_orig)[:, 1])
    # The class the baseline scores in column 1 is the reference positive class.
    positive_label = rf_baseline.classes_[1]

    X_train_synth = synthetic_ml.drop('income', axis=1)
    y_train_synth = synthetic_ml['income']

    # Match the real data's feature order before fitting.
    X_train_synth = X_train_synth[X_train_orig.columns]

    rf_synthetic = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    rf_synthetic.fit(X_train_synth, y_train_synth)
    synthetic_auc = roc_auc_score(
        y_test_orig, positive_class_scores(rf_synthetic, X_test_orig, positive_label)
    )

    efficacy_ratio = synthetic_auc / baseline_auc

    return baseline_auc, synthetic_auc, efficacy_ratio

Model Evaluation


In [30]:
# Split the cleaned real data positionally: the first len(X_train) rows are
# treated as the training portion, the remainder as the held-out portion.
# NOTE(review): this only lines up with the rows the models were trained on if
# X_train was taken from the leading rows of data_clean without shuffling —
# confirm against the cell that creates X_train.
original_train = data_clean.iloc[:len(X_train)].reset_index(drop=True)
original_test = data_clean.iloc[len(X_train):].reset_index(drop=True)

# Detection: real-vs-synthetic classifier AUC for each synthetic dataset
# (the per-fold scores are discarded here).
print("Running Regularized Model Evaluation...")
reg_detection_auc_reg, _ = detection_evaluation_4fold(original_train, regularized_dataset1_final)
reg_detection_auc_cond, _ = detection_evaluation_4fold(original_train, regularized_dataset2_final)

# Efficacy: income classifier trained on synthetic rows vs. trained on real
# rows, both scored on original_test.
print("Running Regularized Efficacy Evaluation...")
reg_baseline_auc_reg, reg_synthetic_auc_reg, reg_efficacy_ratio_reg = efficacy_evaluation(original_train, original_test, regularized_dataset1_final)
reg_baseline_auc_cond, reg_synthetic_auc_cond, reg_efficacy_ratio_cond = efficacy_evaluation(original_train, original_test, regularized_dataset2_final)

Running Regularized Model Evaluation...
Running Regularized Efficacy Evaluation...
