# **0. Introduction**
This notebook build a GAN, cGAn and Variational AutoEncoder for data augmentation and
latent space feature absraction

In [1]:
# Import necesary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
print('Libraries loaded')

Libraries loaded


# **1. GAN**

In [3]:
# Define a simple GAN for tabular data generation
# Define Generator
class Generator(nn.Module):
    def __init__(self, input_size, output_size):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, output_size)
        )
    
    def forward(self, x):
        return self.model(x)
# Define Discriminator
class Discriminator(nn.Module):
    def __init__(self, input_size):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        return self.model(x)

# Generate random tabular data for training
def generate_random_data(num_samples, input_size):
    return torch.randn(num_samples, input_size)

# Training parameters
batch_size = 32
num_epochs = 100
input_size = 10  # Number of features in the tabular data
latent_size = 20  # Size of the input noise vector for the generator
data_size = 1000  # Number of samples in the dataset

# Create generator and discriminator models
generator = Generator(latent_size, input_size)
discriminator = Discriminator(input_size)

# Optimizers
optimizer_G = optim.Adam(generator.parameters(), lr=0.0002)
optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002)

# Binary cross-entropy loss
criterion = nn.BCELoss()

# Generate random tabular data
real_data = generate_random_data(data_size, input_size)
data_loader = DataLoader(TensorDataset(real_data), batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    for i, data in enumerate(data_loader):
        real_samples = data[0]
        batch_size = real_samples.size(0)
        real_labels = torch.ones(batch_size, 1)  # Labels for real data
        fake_labels = torch.zeros(batch_size, 1)  # Labels for generated data
        
        # Train Discriminator 
        discriminator.zero_grad()
        
        # Train with real data
        real_output = discriminator(real_samples)
        d_loss_real = criterion(real_output, real_labels)
        
        # Train with fake data
        z = torch.randn(batch_size, latent_size)
        fake_samples = generator(z)
        fake_output = discriminator(fake_samples.detach())  # Detach to avoid backpropagating through G
        d_loss_fake = criterion(fake_output, fake_labels)
        
        # Combined loss
        d_loss = d_loss_real + d_loss_fake
        d_loss.backward()
        optimizer_D.step()
        
        # Train Generator 
        generator.zero_grad()
        
        # Generate fake data
        z = torch.randn(batch_size, latent_size)
        fake_samples = generator(z)
        fake_output = discriminator(fake_samples)
        
        # Generator loss (maximize log(D(G(z))))
        g_loss = criterion(fake_output, real_labels)
        g_loss.backward()
        optimizer_G.step()
        
        # Print progress
        if i % 100 == 0:
            print(f"Epoch [{epoch}/{num_epochs}], Step [{i}/{len(data_loader)}], "
                  f"Generator Loss: {g_loss.item():.4f}, Discriminator Loss: {d_loss.item():.4f}")
# Generate augmented data
augmented_examples = 10
print(generator(torch.randn(augmented_examples, latent_size)))


Epoch [0/100], Step [0/32], Generator Loss: 0.7762, Discriminator Loss: 1.4646
Epoch [1/100], Step [0/32], Generator Loss: 0.6864, Discriminator Loss: 1.4245
Epoch [2/100], Step [0/32], Generator Loss: 0.6427, Discriminator Loss: 1.4087
Epoch [3/100], Step [0/32], Generator Loss: 0.6989, Discriminator Loss: 1.3179
Epoch [4/100], Step [0/32], Generator Loss: 0.7660, Discriminator Loss: 1.2429
Epoch [5/100], Step [0/32], Generator Loss: 0.7843, Discriminator Loss: 1.1737
Epoch [6/100], Step [0/32], Generator Loss: 0.7729, Discriminator Loss: 1.1436
Epoch [7/100], Step [0/32], Generator Loss: 0.7632, Discriminator Loss: 1.2006
Epoch [8/100], Step [0/32], Generator Loss: 0.7291, Discriminator Loss: 1.1476
Epoch [9/100], Step [0/32], Generator Loss: 0.6632, Discriminator Loss: 1.1909
Epoch [10/100], Step [0/32], Generator Loss: 0.5584, Discriminator Loss: 1.3916
Epoch [11/100], Step [0/32], Generator Loss: 0.5333, Discriminator Loss: 1.3338
Epoch [12/100], Step [0/32], Generator Loss: 0.576

# **2. cGAN**

In [4]:

# Define a simple cGAN for tabular data generation
class Generator(nn.Module):
    def __init__(self, latent_size, input_size, num_classes, embedding_dim=16):
        super(Generator, self).__init__()
        self.embedding = nn.Embedding(num_classes, embedding_dim)
        self.model = nn.Sequential(
            nn.Linear(latent_size + embedding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, input_size),
            nn.Sigmoid()  # For scaling data between 0 and 1
        )

    def forward(self, z, labels):
        label_embedding = self.embedding(labels)
        combined_input = torch.cat((z, label_embedding), dim=1)
        return self.model(combined_input)

class Discriminator(nn.Module):
    def __init__(self, input_size, num_classes, embedding_dim=16):
        super(Discriminator, self).__init__()
        self.embedding = nn.Embedding(num_classes, embedding_dim)
        self.model = nn.Sequential(
            nn.Linear(input_size + embedding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x, labels):
        label_embedding = self.embedding(labels)
        combined_input = torch.cat((x, label_embedding), dim=1)
        return self.model(combined_input)

# Sample data generation (replace with your actual data loading logic)
def generate_data(num_samples, input_size, num_classes):
    # Assuming data is normally distributed within a specific range
    data = np.random.rand(num_samples, input_size) * (upper_bound - lower_bound) + lower_bound
    labels = np.random.randint(0, num_classes, size=(num_samples,))
    return data, labels

# Training parameters
batch_size = 32
num_epochs = 100
input_size = 10  # Number of features in the data
latent_size = 20  # Size of the input noise vector for the generator
data_size = 1000  # Number of samples in the dataset
num_classes = 2  # Number of classes in your data (adjust as needed)
lower_bound = 0  # Lower bound for data values (adjust as needed)
upper_bound = 1  # Upper bound for data values (adjust as needed)

# Create generator and discriminator models
generator = Generator(latent_size, input_size, num_classes)
discriminator = Discriminator(input_size, num_classes)

# Optimizers
optimizer_G = optim.Adam(generator.parameters(), lr=0.0002)
optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0002)

# Binary cross-entropy loss
criterion = nn.BCELoss()

# Generate real tabular data
real_data, real_labels = generate_data(data_size, input_size, num_classes)
data_tensor = torch.from_numpy(np.concatenate((real_data, real_labels.reshape(-1, 1)), axis=1))
data_loader = DataLoader(TensorDataset(data_tensor), batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    for i, data in enumerate(data_loader):
        real_samples, real_labels = data[0][:, :-1], data[0][:, -1]
        batch_size = real_samples.size(0)

        # Train Discriminator
        discriminator.zero_grad()

        # Train with real data
        real_output = discriminator(real_samples.float(), real_labels.long())
        d_loss_real = criterion(real_output, torch.ones(batch_size, 1))

        # Train with fake data
        z = torch.randn(batch_size, latent_size)
        fake_labels = torch.randint(0, num_classes, size=(batch_size,))  # Random labels for fake data (can be adjusted)
        fake_samples = generator(z, fake_labels)
        fake_output = discriminator(fake_samples.detach(), fake_labels)  # Detach to avoid backprop through G
        d_loss_fake = criterion(fake_output, torch.zeros(batch_size, 1))

        # Combined loss
        d_loss = d_loss_real + d_loss_fake
        d_loss.backward(retain_graph=True)
        optimizer_D.step()

        # Train Generator
        generator.zero_grad()

        # Generate fake data (with a specific class label for illustration)
        specific_class_label = 1  # You can choose any class label here
        z = torch.randn(batch_size, latent_size)
        fake_samples = generator(z, torch.full((batch_size, ), specific_class_label))
        fake_output = discriminator(fake_samples, torch.full((batch_size, ), specific_class_label))

        # Generator loss (maximize log(D(G(z))))
        g_loss = criterion(fake_output, torch.ones(batch_size, 1))
        g_loss.backward(retain_graph=True)
        optimizer_G.step()

        # Print progress
        if i % 10 == 0:
            print(f"Epoch [{epoch}/{num_epochs}], Step [{i}/{len(data_loader)}], "
                  f"Generator Loss: {g_loss.item():.4f}, Discriminator Loss: {d_loss.item():.4f}")

# Sample generated data (optional)
def sample_data(generator, latent_size, num_samples, specific_class_label, device):
    with torch.no_grad():
        z = torch.randn(num_samples, latent_size).to(device)
        fake_labels = torch.full((num_samples, ), specific_class_label).to(device)
        return generator(z, fake_labels)

# Generate data with a specific class label (optional)
specific_class_label = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generated_data = sample_data(generator.to(device), latent_size, 10, specific_class_label, device)
print("Generated data (class label:", specific_class_label, "):")
print(generated_data.cpu().numpy())

Epoch [0/100], Step [0/32], Generator Loss: 0.7127, Discriminator Loss: 1.3845
Epoch [0/100], Step [10/32], Generator Loss: 0.7194, Discriminator Loss: 1.3795
Epoch [0/100], Step [20/32], Generator Loss: 0.7173, Discriminator Loss: 1.3773
Epoch [0/100], Step [30/32], Generator Loss: 0.7129, Discriminator Loss: 1.3743
Epoch [1/100], Step [0/32], Generator Loss: 0.7101, Discriminator Loss: 1.3713
Epoch [1/100], Step [10/32], Generator Loss: 0.7022, Discriminator Loss: 1.3742
Epoch [1/100], Step [20/32], Generator Loss: 0.7012, Discriminator Loss: 1.3644
Epoch [1/100], Step [30/32], Generator Loss: 0.6954, Discriminator Loss: 1.3704
Epoch [2/100], Step [0/32], Generator Loss: 0.6985, Discriminator Loss: 1.3634
Epoch [2/100], Step [10/32], Generator Loss: 0.6914, Discriminator Loss: 1.3665
Epoch [2/100], Step [20/32], Generator Loss: 0.6890, Discriminator Loss: 1.3722
Epoch [2/100], Step [30/32], Generator Loss: 0.6964, Discriminator Loss: 1.3729
Epoch [3/100], Step [0/32], Generator Loss:

# **3. Autoencoder**

In [6]:

# Define the Autoencoder model
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, input_dim),
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Generate random data function
def generate_data(size, factor_range=(0, 1)):
    return torch.rand(size, input_dim) * (factor_range[1] - factor_range[0]) + factor_range[0]

# Define model parameters
input_dim = 10  # Number of features
latent_dim = 4  # Dimension of the latent space

# Generate training data
train_data = generate_data(size=1000, factor_range=(0, 10))

# Train the autoencoder
model = Autoencoder(input_dim, latent_dim)
optimizer = torch.optim.Adam(model.parameters())
loss_fn = nn.MSELoss()

for epoch in range(10):
    for i in range(len(train_data)):
        # Get a mini-batch (consider using a DataLoader for larger datasets)
        batch = train_data[i:i+batch_size]  # Replace 'batch_size' with your desired size

        # Forward pass
        reconstructed = model(batch)
        loss = loss_fn(reconstructed, batch)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print training progress (optional)
        if i % 100 == 0:
            print(f"Epoch: {epoch+1}, Batch: {i+1}, Loss: {loss.item()}")

# Generate data for augmentation
new_data = generate_data(size=100, factor_range=(0, 10))  # Generate 100 data points

# Assuming no preprocessing needed for this example
processed_data = new_data

# Get latent representation
with torch.no_grad():
    latent_representation = model.encoder(processed_data)

# Augment data using latent space manipulation
noise = torch.randn(latent_representation.size()) * 0.1  # Add small noise
augmented_latent_rep = latent_representation + noise

# Reconstruct data from augmented latent representation
augmented_data = model.decoder(augmented_latent_rep)

# Print original and augmented data (optional)
print("Original Data:")
print(processed_data)
print("Augmented Data:")
print(augmented_data)
print('Latent Representation:')
print(latent_representation)

Epoch: 1, Batch: 1, Loss: 32.25492477416992
Epoch: 1, Batch: 101, Loss: 5.7697343826293945
Epoch: 1, Batch: 201, Loss: 4.383053779602051
Epoch: 1, Batch: 301, Loss: 4.34030818939209
Epoch: 1, Batch: 401, Loss: 6.303532600402832
Epoch: 1, Batch: 501, Loss: 6.092098236083984
Epoch: 1, Batch: 601, Loss: 5.223959922790527
Epoch: 1, Batch: 701, Loss: 5.139459609985352
Epoch: 1, Batch: 801, Loss: 4.191755771636963
Epoch: 1, Batch: 901, Loss: 4.849924087524414
Epoch: 2, Batch: 1, Loss: 4.327437400817871
Epoch: 2, Batch: 101, Loss: 4.087098121643066
Epoch: 2, Batch: 201, Loss: 4.1558098793029785
Epoch: 2, Batch: 301, Loss: 4.613105773925781
Epoch: 2, Batch: 401, Loss: 5.213890552520752
Epoch: 2, Batch: 501, Loss: 5.258609294891357
Epoch: 2, Batch: 601, Loss: 5.1807146072387695
Epoch: 2, Batch: 701, Loss: 4.436731815338135
Epoch: 2, Batch: 801, Loss: 4.099480628967285
Epoch: 2, Batch: 901, Loss: 4.4074811935424805
Epoch: 3, Batch: 1, Loss: 4.249795913696289
Epoch: 3, Batch: 101, Loss: 4.4476995