In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score,\
                            accuracy_score, balanced_accuracy_score,classification_report,\
                            plot_confusion_matrix, confusion_matrix
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from tensorflow.keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply, Concatenate
from tensorflow.keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D, LeakyReLU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import RandomNormal
import tensorflow.keras.backend as K
from sklearn.utils import shuffle
import keras

np.random.seed(1635848)

In [2]:
class Generator(keras.Model):
    def __init__(self, latent_dim=32, out_shape=14, num_classes=2):
        super(Generator, self).__init__(name="generator")
        
        self.latent_dim = latent_dim
        self.out_shape = out_shape 
        self.num_classes = num_classes
        
        self.dense_in = Dense(128, use_bias=False, input_dim=self.latent_dim, name="Dense1")
        self.dense_out = Dense(self.out_shape, activation='tanh')
        self.dense1 = Dense(256)
        self.dense2 = Dense(512)
        self.dropout02 = Dropout(0.2)
        self.bn1 = BatchNormalization(momentum=0.4)
        self.bn2 = BatchNormalization(momentum=0.8)
        self.leaky_relu01 = LeakyReLU(alpha=0.1)
        
        
    def call(self, model_input):
        x = self.dense_in(model_input)
        x = self.dropout02(x)
        x = self.leaky_relu01(x)
        x = self.bn1(x)
        x = self.dense1(x)
        x = self.dropout02(x)
        x = self.leaky_relu01(x)
        x = self.bn2(x)
        x = self.dense2(x)
        x = self.dropout02(x)
        x = self.leaky_relu01(x)
        gen_sample = self.dense_out(x)
        return gen_sample

In [3]:
class Discriminator(keras.Model):
    def __init__(self, out_shape=14, num_classes=2):
        super(Discriminator, self).__init__(name="discriminator")
        
        self.out_shape = out_shape 
        self.num_classes = num_classes
        
        self.init = RandomNormal(mean=0.0, stddev=0.02)
        self.dense_in = Dense(512, input_dim=self.out_shape, kernel_initializer=self.init)
        self.dense_out = Dense(1, activation='sigmoid')
        self.leaky_relu02 = LeakyReLU(alpha=0.2)
        self.dropout04 = Dropout(0.4)
        self.dense1 = Dense(256, kernel_initializer=self.init)
        self.dense2 = Dense(128, kernel_initializer=self.init)
        
    def call(self, model_input):
        x = self.dense_in(model_input)
        x = self.leaky_relu02(x)
        
        x = self.dense1(x)
        x = self.leaky_relu02(x)
        x = self.dropout04(x)
        
        x = self.dense2(x)
        x = self.leaky_relu02(x)
        x = self.dropout04(x)
        
        validity = self.dense_out(x)
        return validity

In [None]:
class cGAN():
    
    def __init__(self, latent_dim=32, out_shape=14, num_classes=2):
        
        self.latent_dim = latent_dim
        self.out_shape = out_shape 
        self.num_classes = num_classes
        
        # creating discriminator and generator objects
        self.discriminator_obj = Discriminator(out_shape=self.out_shape, num_classes=self.num_classes)
        self.generator_obj = Generator(latent_dim=self.latent_dim, out_shape=self.out_shape, num_classes=self.num_classes)
        
        # using Adam as our optimizer
        optimizer = Adam(0.0002, 0.5)
        
        # building the discriminator
        self.discriminator = self.discriminator_obj.create_discriminator()
        self.discriminator.compile(loss=['binary_crossentropy'],
                                   optimizer=optimizer,
                                   metrics=['accuracy'])

        # building the generator
        self.generator = self.generator_obj.create_generator()

        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,))
        gen_samples = self.generator([noise, label])
        
        # we don't train discriminator when training generator
        self.discriminator.trainable = False
        valid = self.discriminator([gen_samples, label])

        # combining both models
        self.combined = Model([noise, label], valid)
        self.combined.compile(loss=['binary_crossentropy'],
                              optimizer=optimizer,
                             metrics=['accuracy'])
        
    def train(self, X_train, y_train, pos_index, neg_index, epochs, sampling=False, batch_size=32, sample_interval=100, plot=True): 
        
        # though not recommended, defining losses as global helps as in analysing our cgan out of the class
        global G_losses
        global D_losses
        
        G_losses = []
        D_losses = []
        # Adversarial ground truths
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))

        for epoch in range(epochs):
            
            # if sampling==True --> train discriminator with 8 sample from postivite class and rest with negative class
            if sampling:
                idx1 = np.random.choice(pos_index, 8)
                idx0 = np.random.choice(neg_index, batch_size-8)
                idx = np.concatenate((idx1, idx0))
            # if sampling!=True --> train discriminator using random instances in batches of 32
            else:
                idx = np.random.choice(len(y_train), batch_size)
            samples, labels = X_train[idx], y_train[idx]
            samples, labels = shuffle(samples, labels)
            
            # Sample noise as generator input
            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            #print(noise)
            #print(labels)
            gen_samples = self.generator.predict([noise, labels])

            # label smoothing
            if epoch < epochs//1.5:
                valid_smooth = (valid+0.1)-(np.random.random(valid.shape)*0.1)
                fake_smooth = (fake-0.1)+(np.random.random(fake.shape)*0.1)
            else:
                valid_smooth = valid 
                fake_smooth = fake
                
            # Train the discriminator
            self.discriminator.trainable = True
            d_loss_real = self.discriminator.train_on_batch([samples, labels], valid_smooth)
            d_loss_fake = self.discriminator.train_on_batch([gen_samples, labels], fake_smooth)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Train Generator
            self.discriminator.trainable = False
            sampled_labels = np.random.randint(0, 2, batch_size).reshape(-1, 1)
            # Train the generator
            g_loss = self.combined.train_on_batch([noise, sampled_labels], valid)

            if (epoch+1)%sample_interval==0:
                print('[%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f'
                  % (epoch, epochs, d_loss[0], g_loss[0]))
            G_losses.append(g_loss[0])
            D_losses.append(d_loss[0])
            if plot:
                if epoch+1==epochs:
                    plt.figure(figsize=(10,5))
                    plt.title("Generator and Discriminator Loss")
                    plt.plot(G_losses,label="G")
                    plt.plot(D_losses,label="D")
                    plt.xlabel("iterations")
                    plt.ylabel("Loss")
                    plt.legend()
                    plt.show()

In [None]:
latent_dim = 32
num_classes = 2
out_shape = 14

noise = Input(shape=(latent_dim,))
label = Input(shape=(1,), dtype='int32')
label_embedding = Flatten()(Embedding(num_classes, latent_dim)(label))
generator_input = multiply([noise, label_embedding])

In [None]:
generator = Generator(latent_dim=latent_dim, out_shape=out_shape, num_classes=num_classes)

In [None]:
gen_samples = generator(generator_input)

In [None]:
gen_samples

In [None]:
gen_sample = Input(shape=(out_shape,))
label = Input(shape=(1,), dtype='int32')
label_embedding = Flatten()(Embedding(num_classes, out_shape)(label))
discriminator_input = multiply([gen_samples, label_embedding])

In [None]:
discriminator_input

In [None]:
discriminator = Discriminator(out_shape=out_shape, num_classes=num_classes)

In [None]:
discriminator.trainable = False
valid = discriminator(discriminator_input)

In [None]:
valid

In [None]:
class cGAN():
    
    def __init__(self, latent_dim=32, out_shape=14, num_classes=2):
        
        self.latent_dim = latent_dim
        self.out_shape = out_shape 
        self.num_classes = num_classes
        
        # creating discriminator and generator objects
        self.discriminator = Discriminator(out_shape=out_shape, num_classes=num_classes)
        self.generator = Generator(latent_dim=latent_dim, out_shape=out_shape, num_classes=num_classes)
        
        # using Adam as our optimizer
        optimizer = Adam(0.0002, 0.5)
        
        # building the discriminator
        self.discriminator.compile(loss=['binary_crossentropy'],
                                   optimizer=optimizer,
                                   metrics=['accuracy'])

        # building the generator
        noise = Input(shape=(latent_dim,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(num_classes, latent_dim)(label))
        generator_input = multiply([noise, label_embedding])
        gen_samples = self.generator(generator_input)
        
        # we don't train discriminator when training generator
        self.discriminator.trainable = False
        gen_sample = Input(shape=(out_shape,))
        
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(num_classes, out_shape)(label))
        discriminator_input = multiply([gen_samples, label_embedding])
        
        valid = self.discriminator(discriminator_input)

        # combining both models
        self.combined = Model([noise, label], valid)
        self.combined.compile(loss=['binary_crossentropy'],
                              optimizer=optimizer,
                             metrics=['accuracy'])
        
    def train(self, X_train, y_train, pos_index, neg_index, epochs, sampling=False, batch_size=32, sample_interval=100, plot=True): 
        
        # though not recommended, defining losses as global helps as in analysing our cgan out of the class
        global G_losses
        global D_losses
        
        G_losses = []
        D_losses = []
        # Adversarial ground truths
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))

        for epoch in range(epochs):
            
            # if sampling==True --> train discriminator with 8 sample from postivite class and rest with negative class
            if sampling:
                idx1 = np.random.choice(pos_index, 8)
                idx0 = np.random.choice(neg_index, batch_size-8)
                idx = np.concatenate((idx1, idx0))
            # if sampling!=True --> train discriminator using random instances in batches of 32
            else:
                idx = np.random.choice(len(y_train), batch_size)
            samples, labels = X_train[idx], y_train[idx]
            samples, labels = shuffle(samples, labels)
            
            # Sample noise as generator input
            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            #print(noise)
            #print(labels)
            gen_samples = self.generator.predict([noise, labels])

            # label smoothing
            if epoch < epochs//1.5:
                valid_smooth = (valid+0.1)-(np.random.random(valid.shape)*0.1)
                fake_smooth = (fake-0.1)+(np.random.random(fake.shape)*0.1)
            else:
                valid_smooth = valid 
                fake_smooth = fake
                
            # Train the discriminator
            self.discriminator.trainable = True
            d_loss_real = self.discriminator.train_on_batch([samples, labels], valid_smooth)
            d_loss_fake = self.discriminator.train_on_batch([gen_samples, labels], fake_smooth)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Train Generator
            self.discriminator.trainable = False
            sampled_labels = np.random.randint(0, 2, batch_size).reshape(-1, 1)
            # Train the generator
            g_loss = self.combined.train_on_batch([noise, sampled_labels], valid)

            if (epoch+1)%sample_interval==0:
                print('[%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f'
                  % (epoch, epochs, d_loss[0], g_loss[0]))
            G_losses.append(g_loss[0])
            D_losses.append(d_loss[0])
            if plot:
                if epoch+1==epochs:
                    plt.figure(figsize=(10,5))
                    plt.title("Generator and Discriminator Loss")
                    plt.plot(G_losses,label="G")
                    plt.plot(D_losses,label="D")
                    plt.xlabel("iterations")
                    plt.ylabel("Loss")
                    plt.legend()
                    plt.show()

In [4]:
latent_dim = 32
num_classes = 2
out_shape = 14

In [5]:
latent_dim = latent_dim
out_shape = out_shape 
num_classes = num_classes

In [6]:
# using Adam as our optimizer
optimizer = keras.optimizer_v2.adam.Adam()

In [7]:
# building the discriminator
discriminator = Discriminator(out_shape=out_shape, num_classes=num_classes)
discriminator.compile(loss=['binary_crossentropy'],
                            optimizer=optimizer,
                            metrics=['accuracy'])

In [13]:
discriminator

<__main__.Discriminator at 0x23087716850>

In [14]:
# building the generator
generator = Generator(latent_dim=latent_dim, out_shape=out_shape, num_classes=num_classes)

noise = Input(shape=(latent_dim,))
label = Input(shape=(1,), dtype='int32')

label_embedding = Flatten()(Embedding(num_classes, latent_dim)(label))
generator_input = multiply([noise, label_embedding])
gen_samples = generator(generator_input)

In [25]:
gen_sample

<KerasTensor: shape=(None, 14) dtype=float32 (created by layer 'input_3')>

In [23]:
gen_samples

<KerasTensor: shape=(None, 14) dtype=float32 (created by layer 'dense_4')>

In [24]:
# we don't train discriminator when training generator
discriminator.trainable = False
gen_sample = Input(shape=(out_shape,))

In [27]:
label = Input(shape=(1,), dtype='int32')
label_embedding = Flatten()(Embedding(num_classes, out_shape)(label))
discriminator_input = multiply([gen_samples, label_embedding])

In [28]:
valid = discriminator(discriminator_input)

In [46]:
class cGAN(keras.Model):
    def __inti__(self, latent_dim=32, out_shape=14, num_classes=2):
        #super().__init__()
        
        self.latent_dim = latent_dim
        self.out_shape = out_shape 
        self.num_classes = num_classes
        
        optimizer = keras.optimizer_v2.adam.Adam()
        self.discriminator = Discriminator(out_shape=out_shape, num_classes=num_classes)
        self.discriminator.compile(loss=['binary_crossentropy'],
                                    optimizer=optimizer,
                                    metrics=['accuracy'])
        self.discriminator.trainable = False
        
        self.generator = Generator(latent_dim=latent_dim, out_shape=out_shape, num_classes=num_classes)
    
    def call(self):
        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(self.num_classes, self.latent_dim)(label))
        generator_input = multiply([noise, label_embedding])
        gen_samples = self.generator(generator_input)
        label_embedding2 = Flatten()(Embedding(self.num_classes, self.out_shape)(label))
        discriminator_input = multiply([gen_samples, label_embedding2])
        valid = self.discriminator(discriminator_input)
        return valid

In [47]:
cgan = cGAN()

In [48]:
prob = cgan()

IndexError: list index out of range

In [None]:
# building the discriminator
self.discriminator = self.discriminator()
self.discriminator.compile(loss=['binary_crossentropy'],
                           optimizer=optimizer,
                           metrics=['accuracy'])

# building the generator
self.generator = self.generator()

noise = Input(shape=(self.latent_dim,))
label = Input(shape=(1,))
gen_samples = self.generator([noise, label])

# we don't train discriminator when training generator
self.discriminator.trainable = False
valid = self.discriminator([gen_samples, label])

# combining both models
self.combined = Model([noise, label], valid)
self.combined.compile(loss=['binary_crossentropy'],
                      optimizer=optimizer,
                     metrics=['accuracy'])

In [None]:
df = pd.read_csv('adult.csv')
df.head()

Before employing any algorithms, we will first preprocess some data.

## Preprocessing

Since the goal of this notebook is to examine how good the generated synthetic data is, we won't analyse and do any feature engineering. It is also not that important that we get the best possible result with the algorithm, so that's one of the reasons why we will only use label-encoding (on some features normally one-hot encoding should be a better approach). 

In [None]:
le = preprocessing.LabelEncoder()
for i in ['workclass','education','marital.status','occupation','relationship','race','sex','native.country','income']:
    df[i] = le.fit_transform(df[i].astype(str))

In [None]:
df.head()

In [None]:
df.income.value_counts()

### Splitting the dataframe

In [None]:
scaler = StandardScaler()

X = scaler.fit_transform(df.drop('income', 1))
y = df['income'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Classifying using real trainset

In [None]:
lgb_1 = lgb.LGBMClassifier()
lgb_1.fit(X_train, y_train)

y_pred = lgb_1.predict(X_test)

# evaluation
print(classification_report(y_test, y_pred))
plot_confusion_matrix(lgb_1, X_test, y_test)
plt.show()

### Training cGAN

In [None]:
self.latent_dim = latent_dim
self.out_shape = out_shape 
self.num_classes = num_classes

# creating discriminator and generator objects
self.discriminator = Discriminator(out_shape=out_shape, num_classes=num_classes)
self.generator = Generator(latent_dim=latent_dim, out_shape=out_shape, num_classes=num_classes)

# using Adam as our optimizer
optimizer = Adam(0.0002, 0.5)

# building the discriminator

self.discriminator.compile(loss=['binary_crossentropy'],
                           optimizer=optimizer,
                           metrics=['accuracy'])

# building the generator

noise = Input(shape=(latent_dim,))
label = Input(shape=(1,), dtype='int32')
label_embedding = Flatten()(Embedding(num_classes, latent_dim)(label))
generator_input = multiply([noise, label_embedding])

gen_samples = self.generator(generator_input)

# we don't train discriminator when training generator
self.discriminator.trainable = False
gen_sample = Input(shape=(out_shape,))

label = Input(shape=(1,), dtype='int32')
label_embedding = Flatten()(Embedding(num_classes, out_shape)(label))
discriminator_input = multiply([gen_samples, label_embedding])

valid = self.discriminator(discriminator_input)

# combining both models
self.combined = Model([noise, label], valid)
self.combined.compile(loss=['binary_crossentropy'],
                      optimizer=optimizer,
                     metrics=['accuracy'])

In [None]:
cgan = cGAN(num_classes=2)

In [None]:
y_train = y_train.reshape(-1,1)
pos_index = np.where(y_train==1)[0]
neg_index = np.where(y_train==0)[0]
cgan.train(X_train, y_train, pos_index, neg_index, epochs=500)

### Generating new instances

In [None]:
# we want to generate 19758 instances with class value 0 since that represents how many 0s are in the label of the real training set
noise = np.random.normal(0, 1, (len(df[df["income"] == 0]), 32))
sampled_labels = np.zeros(len(df[df["income"] == 0])).reshape(-1, 1)


gen_samples = cgan.generator.predict([noise, sampled_labels])

gen_df = pd.DataFrame(data = gen_samples,
                      columns = df.drop('income',1).columns)

In [None]:
# we want to generate 6290 instances with class value 1 since that represents how many 1s are in the label of the real training set
noise_2 = np.random.normal(0, 1, (len(df[df["income"] == 1]), 32))
sampled_labels_2 = np.ones(len(df[df["income"] == 1])).reshape(-1, 1)


gen_samples_2 = cgan.generator.predict([noise_2, sampled_labels_2])

gen_df_2 = pd.DataFrame(data = gen_samples_2,
                      columns = df.drop('income',1).columns)

### Combining generated instances into a dataframe

In [None]:
gen_df_2['income'] = 1
gen_df['income']=0

df_gan = pd.concat([gen_df_2, gen_df], ignore_index=True, sort=False)
df_gan = df_gan.sample(frac=1).reset_index(drop=True)

X_train_2 = df_gan.drop('income', 1)
y_train_2 = df_gan['income'].values