## Input parameters

In [None]:
# Hyper-parameters shared by every cell below.
batch_size = 32         # rows per training step
learning_rate = 0.0005  # Adam learning rate
noise_dim = 32          # dimension of random noise as generator's input
layers_dim = 128        # base width of the dense layers inside the networks
epochs = 100000+1       # the +1 makes the final epoch index (100000) a multiple of the save interval
model_name = 'model'    # directory where weights and generated CSV files are written

## Import libraries

In [None]:
import pandas as pd
import numpy as np
import sklearn.datasets as ds
from sklearn.preprocessing import StandardScaler

import tensorflow
from tensorflow.keras.layers import Input, Dense, Dropout, ReLU, LeakyReLU
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
import keras

import os
import sys

import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import wasserstein_distance
from table_evaluator import load_data, TableEvaluator

## Prepare data

In [None]:
# Load data
#
# Fetch the scikit-learn diabetes dataset once and assemble a single frame
# holding the feature columns followed by the target column ('Outcome').

diabetes = ds.load_diabetes()
dfdiabetes_x = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
dfdiabetes_y = pd.DataFrame(diabetes.target, columns=['Outcome'])
df = pd.concat([dfdiabetes_x, dfdiabetes_y], axis=1)
df.head()

In [None]:
# Scale data
#
# Standardize every column (features and target alike). The fitted scaler is
# kept so the generated data can be mapped back to original units later.

scaler = StandardScaler()
scaler.fit(df)
df = pd.DataFrame(scaler.transform(df), columns=df.columns)
df.head()

## GAN

In [None]:
class GAN():
    """Fully connected GAN for tabular data.

    Three Keras models are built on construction:

    * ``generator``: noise vector -> synthetic row of ``data_dim`` values
      (three ReLU dense layers, linear output).
    * ``discriminator``: row -> probability that the row is real
      (three LeakyReLU dense layers with dropout, sigmoid output).
    * ``gan``: generator followed by the frozen discriminator; training it
      updates only the generator.
    """

    def __init__(self, batch_size, learning_rate, noise_dim, data_dim, layers_dim):
        """Build and compile the generator, discriminator and combined model.

        Args:
            batch_size: number of rows per training step; also the fixed
                batch size declared on the generator/discriminator inputs.
            learning_rate: learning rate passed to the Adam optimizers.
            noise_dim: length of the noise vector fed to the generator.
            data_dim: number of columns in a data row.
            layers_dim: base width of the dense layers (also used x2 and x4).
        """
        # Initialize input values
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.noise_dim = noise_dim
        self.data_dim = data_dim
        self.layers_dim = layers_dim

        self.generator = self._build_generator()
        self.discriminator = self._build_discriminator()

        # Discriminator is a binary classification real/fake -> loss is binary
        # crossentropy and metric accuracy. It gets its own Adam instance: the
        # combined model below trains a different set of weights, and sharing
        # one optimizer object across two models is not reliably supported by
        # newer Keras optimizers.
        self.discriminator.compile(
            loss='binary_crossentropy',
            optimizer=Adam(self.learning_rate),
            metrics=['accuracy'],
        )

        self.gan = self._build_gan()

    def _build_generator(self):
        """Return the noise -> data network (widening ReLU MLP, linear output)."""
        inputs = Input(shape=(self.noise_dim,), batch_size=self.batch_size)
        x = Dense(self.layers_dim)(inputs)
        x = ReLU()(x)
        x = Dense(self.layers_dim * 2)(x)
        x = ReLU()(x)
        x = Dense(self.layers_dim * 4)(x)
        x = ReLU()(x)
        x = Dense(self.data_dim)(x)
        return Model(inputs=inputs, outputs=x)

    def _build_discriminator(self):
        """Return the data -> real/fake probability network (narrowing LeakyReLU MLP)."""
        inputs = Input(shape=(self.data_dim,), batch_size=self.batch_size)
        x = Dense(self.layers_dim * 4)(inputs)
        x = LeakyReLU()(x)
        x = Dropout(0.1)(x)
        x = Dense(self.layers_dim * 2)(x)
        x = LeakyReLU()(x)
        x = Dropout(0.1)(x)
        x = Dense(self.layers_dim)(x)
        x = LeakyReLU()(x)
        x = Dense(1, activation='sigmoid')(x)
        return Model(inputs=inputs, outputs=x)

    def _build_gan(self):
        """Return the compiled generator+discriminator stack used to train the generator."""
        # For the combined model, only train the generator
        self.discriminator.trainable = False

        gen_input = Input(shape=(self.noise_dim,))  # The generator takes noise as input
        gen_output = self.generator(gen_input)

        # The discriminator scores the generated rows
        disc_output = self.discriminator(gen_output)

        # GAN model: generator + discriminator
        gan = Model(gen_input, disc_output)
        gan.compile(loss='binary_crossentropy', optimizer=Adam(self.learning_rate))

        return gan

    def train(self, data, epochs, model_name):
        """Run the adversarial training loop.

        Each epoch is a single step: the discriminator is trained on one real
        batch (label 1) and one generated batch (label 0), then the generator
        is trained through the combined model to make the discriminator
        output 1 on fresh noise.

        NOTE: losses are appended to the module-level lists ``genlosses`` and
        ``disclosses``, which must exist before this method is called.

        Args:
            data: pandas DataFrame with ``data_dim`` columns to sample real
                rows from.
            epochs: number of training steps.
            model_name: directory that receives the periodic weight files
                (created if missing).
        """
        os.makedirs(model_name, exist_ok=True)

        ones = np.ones((self.batch_size, 1))
        zeros = np.zeros((self.batch_size, 1))
        noise_shape = (self.batch_size, self.noise_dim)  # batch_size x noise_dim

        for epoch in range(epochs):
            # First input: noise for the generator
            noise = tensorflow.random.normal(noise_shape)
            # Call the model directly instead of predict(): for a single small
            # batch inside a loop this avoids predict()'s per-call overhead
            # and progress output.
            gen_data = self.generator(noise, training=False).numpy()
            # Second input: a batch of real rows for the discriminator.
            # Use the instance's batch size so it always matches the labels.
            real_data = data.sample(n=self.batch_size).to_numpy()

            # Train the discriminator on both paths: from real data and from generated data
            # When data is real, it outputs 1; when data is generated, it outputs 0
            dloss_real = self.discriminator.train_on_batch(real_data, ones)
            dloss_gen = self.discriminator.train_on_batch(gen_data, zeros)
            d_loss = np.add(dloss_real, dloss_gen)/2

            # Train the generator to fool the discriminator
            # When data comes from noise and is generated, it outputs 1
            noise = tensorflow.random.normal(noise_shape)
            g_loss = self.gan.train_on_batch(noise, ones)

            # Save losses from generator and discriminator
            # (d_loss holds [loss, accuracy]; only the loss is recorded)
            genlosses.append(g_loss)
            disclosses.append(d_loss[0])

            # Print the progress every 10 epochs
            if epoch % 10 == 0:
                print("Epoch %d with discriminator loss %f and generator loss %f (x100)" % (epoch, d_loss[0]*100, g_loss*100))

            # Save model every 100 epochs
            if epoch % 100 == 0:
                self.generator.save_weights(model_name + '/gen_weights_' + str(epoch) + '.h5')
                self.discriminator.save_weights(model_name + '/disc_weights_' + str(epoch) + '.h5')

    # Save generator weights
    def save(self, path):
        """Write the generator weights to ``path``, creating its parent directory if needed."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        self.generator.save_weights(path)

## GAN training

In [None]:
# Loss histories filled in by GAN.train (it appends to these module-level lists).
genlosses = []
disclosses = []
data_dim = df.shape[1]  # number of columns the generator must produce

# Create the output directory; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs(model_name, exist_ok=True)

model = GAN(batch_size, learning_rate, noise_dim, data_dim, layers_dim)
model.train(df, epochs, model_name)
model.save(model_name + '/gan/saved/generator')

## Training output

In [None]:
# Loss functions
#
# Plot both loss histories on one axis. Naming the columns gives the legend
# meaningful labels instead of two entries both called "0".

fig, ax = plt.subplots()
losses = pd.DataFrame({'Generator': genlosses, 'Discriminator': disclosses})
losses.plot(ax=ax, title='Loss Functions')

In [None]:
# Generator schema: print the Keras summary of the generator model

model.generator.summary()

In [None]:
# Discriminator schema: print the Keras summary of the discriminator model

model.discriminator.summary()

In [None]:
# GAN schema: print the Keras summary of the combined generator + discriminator model

model.gan.summary()

## Tabular data generation

In [None]:
test_size = len(df) # number of generated cases
noise = np.random.normal(size=(test_size, noise_dim))
generator_model = model.generator

# Weights are written only on epochs that are multiples of 100 (see
# GAN.train), so load the last checkpoint that was actually saved rather
# than assuming epochs-1 is one of them.
checkpoint_every = 100  # must match the save interval used in GAN.train
last_checkpoint = ((epochs - 1) // checkpoint_every) * checkpoint_every
generator_model.load_weights(model_name+'/gen_weights_'+str(last_checkpoint)+'.h5')

# Generate one synthetic row per noise vector and store it with the real column names
g_z = generator_model.predict(noise)
dfgen = pd.DataFrame(g_z, columns=df.columns)
dfgen.to_csv(model_name+'/Generated_data.csv')

dfgen.head()

## Testing

In [None]:
# Wasserstein distance
#
# Per-column distance between the real (scaled) data and the generated data,
# followed by the mean over all columns.

wd = [wasserstein_distance(df[col], dfgen[col]) for col in df.columns]

distances = pd.DataFrame({'Variable': list(df.columns), 'Distance': wd})

print(distances) 
print('\nMean Wasserstein Distance:', np.mean(wd))

In [None]:
# Visual metrics
#
# Hand the real (scaled) frame and the generated frame to TableEvaluator and
# run its visual evaluation of the two.

table_evaluator = TableEvaluator(df, dfgen)
table_evaluator.visual_evaluation()

## Save unstandardized generated dataset

In [None]:
# Undo the standardization with the scaler fitted on the real data, so the
# generated rows are expressed in the original units, then write them to CSV.
unscaled_values = scaler.inverse_transform(dfgen)
dfgen_inv = pd.DataFrame(unscaled_values, columns=dfgen.columns)
dfgen_inv.to_csv(model_name+'/Unstandarized_Generated_data.csv')

dfgen_inv.head()