# Imports

In [None]:
!pip install plotly_express

Collecting plotly_express
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl (2.9 kB)
Installing collected packages: plotly-express
Successfully installed plotly-express-0.4.1


In [51]:
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from scipy.spatial import distance
import math
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, Reshape
from keras.layers import Concatenate, Flatten, concatenate
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras.optimizers import Adam

path_to_drive_folder = "drive/MyDrive/DL/ass4/"
#path_to_drive_folder = "/content/"

In [None]:
!mkdir model/

# Load Datasets

In [None]:
def extract_data(filename):
  data = arff.loadarff(path_to_drive_folder+filename)
  df = pd.DataFrame(data[0])
  return df

def featurize_ger_df(df):
    categorical_features = ['3', '4', '9', '10', '12', '14', '15', '17']
    numeric_categorical_features = ['1', '6', '7', '19', '20']
    continuous_features = ['2', '5', '8', '11', '13', '16', '18']

    for c in numeric_categorical_features:
        df[c] = df[c].astype('category').cat.codes

    cat_dfs = []
    for c in categorical_features:
        cat_dfs.append(pd.get_dummies(df[c], prefix=c))
    cat_dfs = pd.concat(cat_dfs, axis=1)

    df = df.drop(columns=categorical_features)
    df = pd.concat([df, cat_dfs], axis=1)

    for column in continuous_features:
      min = df[column].min()
      max = df[column].max()
      feature_bins = pd.cut(df[column], bins=np.linspace(min, max, 21), labels=False)
      df.drop([column], axis=1, inplace=True)
      df = pd.concat([df, feature_bins], axis=1)

    pw = PowerTransformer(method='yeo-johnson', standardize=True, copy=True)
    df[df.columns]=pw.fit_transform(df[df.columns])
    df.fillna(0, inplace=True)
    return df, pw

def featurize_diabetes_df(df):
    pw = PowerTransformer(method='yeo-johnson', standardize=True, copy=True)
    df[df.columns]= pw.fit_transform(df[df.columns])
    df.fillna(0, inplace=True) 
    return df, pw

In [None]:
diabetes_df = extract_data('diabetes.arff')
diabetes_df['class'] = diabetes_df['class'].apply(lambda x: x.decode("utf-8"))
diabetes_df_y = diabetes_df['class'].apply(lambda x: 1 if x=='tested_positive' else 0)
diabetes_df.drop('class', axis='columns', inplace=True)
diabetes_df, diabetes_pw_scaler = featurize_diabetes_df(diabetes_df)
diabetes_df

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
0,0.813399,0.848665,0.068935,0.876707,-1.008294,0.174124,0.821764,1.364180
1,-0.833906,-1.123027,-0.306484,0.632185,-1.008294,-0.725726,-0.168409,0.126452
2,1.188996,1.930906,-0.427244,-1.430072,-1.008294,-1.129341,0.935284,0.230161
3,-0.833906,-0.996671,-0.306484,0.362347,0.859700,-0.537700,-1.298725,-1.480075
4,-1.603317,0.506848,-1.693204,0.876707,1.077013,1.477376,2.336680,0.327328
...,...,...,...,...,...,...,...,...
763,1.500834,-0.618705,0.329924,1.345791,1.102624,0.081666,-1.267837,1.765331
764,-0.339229,0.039240,-0.058370,0.545442,-1.008294,0.603281,-0.222191,-0.368619
765,0.590955,0.008000,0.068935,0.362347,0.925586,-0.775397,-0.753116,0.015535
766,-0.833906,0.164114,-0.662035,-1.430072,-1.008294,-0.282815,-0.178081,1.238144


In [114]:
german_credit_df = extract_data('german_credit.arff')
german_credit_df['21'] = german_credit_df['21'].apply(lambda x: x.decode("utf-8"))
german_credit_df_y = german_credit_df['21'].apply(lambda x: 1 if x=='2' else 0)

german_credit_df.drop('21', axis='columns', inplace=True)
german_credit_df, german_pw_scaler = featurize_ger_df(german_credit_df)
german_credit_df


divide by zero encountered in log



Unnamed: 0,1,6,7,19,20,3_b'A30',3_b'A31',3_b'A32',3_b'A33',3_b'A34',4_b'A40',4_b'A41',4_b'A410',4_b'A42',4_b'A43',4_b'A44',4_b'A45',4_b'A46',4_b'A48',4_b'A49',9_b'A91',9_b'A92',9_b'A93',9_b'A94',10_b'A101',10_b'A102',10_b'A103',12_b'A121',12_b'A122',12_b'A123',12_b'A124',14_b'A141',14_b'A142',14_b'A143',15_b'A151',15_b'A152',15_b'A153',17_b'A171',17_b'A172',17_b'A173',17_b'A174',2,5,8,11,13,16,18
0,-1.337742,1.468573,1.335051,1.214598,-0.196014,-0.204124,-0.226991,-1.061913,-0.31063,1.553374,-0.552705,-0.338862,-0.110208,-0.470108,1.603567,-0.110208,-0.149983,-0.229416,-0.095298,-0.327749,-0.229416,-0.670280,0.908195,-0.318311,0.320212,-0.206768,-0.234206,1.595650,-0.549621,-0.704987,-0.426653,-0.401796,-0.222076,0.478018,-0.466933,0.634448,-0.347960,-0.149983,-0.5,0.766356,-0.416784,-1.941306,-0.531713,0.851245,0.960839,2.028867,-0.319216,0.0
1,-0.335073,-0.795431,-0.314804,-0.823318,-0.196014,-0.204124,-0.226991,0.941697,-0.31063,-0.643760,-0.552705,-0.338862,-0.110208,-0.470108,1.603567,-0.110208,-0.149983,-0.229416,-0.095298,-0.327749,-0.229416,1.491914,-1.101086,-0.318311,0.320212,-0.206768,-0.234206,1.595650,-0.549621,-0.704987,-0.426653,-0.401796,-0.222076,0.478018,-0.466933,0.634448,-0.347960,-0.149983,-0.5,0.766356,-0.416784,1.752756,1.200643,-1.454248,-1.258944,-1.394315,0.000000,0.0
2,1.089210,-0.795431,0.511789,-0.823318,-0.196014,-0.204124,-0.226991,-1.061913,-0.31063,1.553374,-0.552705,-0.338862,-0.110208,-0.470108,-0.623610,-0.110208,-0.149983,4.358899,-0.095298,-0.327749,-0.229416,-0.670280,0.908195,-0.318311,0.320212,-0.206768,-0.234206,1.595650,-0.549621,-0.704987,-0.426653,-0.401796,-0.222076,0.478018,-0.466933,0.634448,-0.347960,-0.149983,2.0,-1.304877,-0.416784,-0.641366,0.069025,-1.454248,-0.060885,1.150717,0.000000,-1.0
3,-1.337742,-0.795431,0.511789,-0.823318,-0.196014,-0.204124,-0.226991,0.941697,-0.31063,-0.643760,-0.552705,-0.338862,-0.110208,2.127172,-0.623610,-0.110208,-0.149983,-0.229416,-0.095298,-0.327749,-0.229416,-0.670280,0.908195,-0.318311,-3.122929,-0.206768,4.269750,-0.626704,1.819435,-0.704987,-0.426653,-0.401796,-0.222076,0.478018,-0.466933,-1.576173,2.873893,-0.149983,-0.5,0.766356,-0.416784,1.598290,1.506687,-1.454248,0.960839,0.988827,0.000000,-1.0
4,-1.337742,-0.795431,-0.314804,-0.823318,-0.196014,-0.204124,-0.226991,-1.061913,3.21926,-0.643760,1.809283,-0.338862,-0.110208,-0.470108,-0.623610,-0.110208,-0.149983,-0.229416,-0.095298,-0.327749,-0.229416,-0.670280,0.908195,-0.318311,0.320212,-0.206768,-0.234206,-0.626704,-0.549621,-0.704987,2.343823,-0.401796,-0.222076,0.478018,-0.466933,-1.576173,2.873893,-0.149983,-0.5,0.766356,-0.416784,0.382001,1.006478,-0.441155,0.960839,1.440662,-0.319216,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.089210,-0.795431,0.511789,-0.823318,-0.196014,-0.204124,-0.226991,0.941697,-0.31063,-0.643760,-0.552705,-0.338862,-0.110208,2.127172,-0.623610,-0.110208,-0.149983,-0.229416,-0.095298,-0.327749,-0.229416,1.491914,-1.101086,-0.318311,0.320212,-0.206768,-0.234206,1.595650,-0.549621,-0.704987,-0.426653,-0.401796,-0.222076,0.478018,-0.466933,0.634448,-0.347960,-0.149983,2.0,-1.304877,-0.416784,-0.641366,-0.531713,-0.441155,0.960839,-0.115216,0.000000,0.0
996,-1.337742,-0.795431,-0.314804,1.214598,-0.196014,-0.204124,-0.226991,0.941697,-0.31063,-0.643760,-0.552705,2.951057,-0.110208,-0.470108,-0.623610,-0.110208,-0.149983,-0.229416,-0.095298,-0.327749,4.358899,-0.670280,-1.101086,-0.318311,0.320212,-0.206768,-0.234206,-0.626704,1.819435,-0.704987,-0.426653,-0.401796,-0.222076,0.478018,-0.466933,0.634448,-0.347960,-0.149983,-0.5,-1.304877,2.399324,0.861447,0.471292,0.851245,0.960839,0.619581,0.000000,0.0
997,1.089210,-0.795431,1.335051,-0.823318,-0.196014,-0.204124,-0.226991,0.941697,-0.31063,-0.643760,-0.552705,-0.338862,-0.110208,-0.470108,1.603567,-0.110208,-0.149983,-0.229416,-0.095298,-0.327749,-0.229416,-0.670280,0.908195,-0.318311,0.320212,-0.206768,-0.234206,-0.626704,-0.549621,1.418467,-0.426653,-0.401796,-0.222076,0.478018,-0.466933,0.634448,-0.347960,-0.149983,-0.5,0.766356,-0.416784,-0.641366,-1.657571,0.851245,0.960839,0.405013,0.000000,0.0
998,-1.337742,-0.795431,-0.314804,1.214598,-0.196014,-0.204124,-0.226991,0.941697,-0.31063,-0.643760,-0.552705,-0.338862,-0.110208,-0.470108,1.603567,-0.110208,-0.149983,-0.229416,-0.095298,-0.327749,-0.229416,-0.670280,0.908195,-0.318311,0.320212,-0.206768,-0.234206,-0.626704,-0.549621,-0.704987,2.343823,-0.401796,-0.222076,0.478018,-0.466933,-1.576173,2.873893,-0.149983,-0.5,0.766356,-0.416784,1.752756,-0.531713,0.851245,0.960839,-1.394315,0.000000,0.0


# Visualization

In [None]:
def drow_loss_plot(hist, epochs, x_axis='epoch_num'):
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=hist[x_axis], y=hist['d_loss'],
                      mode='lines',
                      name='d loss'))
  fig.add_trace(go.Scatter(x=hist[x_axis], y=hist['g_loss'],
                      mode='lines',
                      name='g loss'))
  fig.add_trace(go.Scatter(x=hist[x_axis], y=hist['d_real_loss'],
                      mode='lines', name='d real loss'))
  fig.add_trace(go.Scatter(x=hist[x_axis], y=hist['d_fake_loss'],
                      mode='lines', name='d fake loss'))

  fig.update_layout(title=f"GAN Loss for {epochs} epochs",
      xaxis_title="Epoch Number",
      yaxis_title="Loss")
  fig.show()


def drow_acc_plot(hist, epochs, x_axis='epoch_num'):
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=hist[x_axis], y=hist['d_acc'],
                      mode='lines',
                      name='d acc'))
  fig.add_trace(go.Scatter(x=hist[x_axis], y=hist['d_real_acc'],
                      mode='lines', name='d real acc'))
  fig.add_trace(go.Scatter(x=hist[x_axis], y=hist['d_fake_acc'],
                      mode='lines', name='d fake acc'))

  fig.update_layout(title=f"GAN accuracy for {epochs} epochs",
      xaxis_title="Epoch Number",
      yaxis_title="Accuracy")
  fig.show()


def draw_mean_stats_for_hist(hist):
  hist['epoch_100'] = hist['epoch_num'].apply(lambda x: round(x/100,0))
  mean_hist = hist.groupby('epoch_100').mean().reset_index()
  mean_hist['epoch_100'] = mean_hist['epoch_100'].apply(lambda x: x*100)
  drow_loss_plot(mean_hist, epochs, 'epoch_100')
  drow_acc_plot(mean_hist, epochs, 'epoch_100')

# Part 1 - Train GAN Model

In [None]:
class Generator():

    def __init__(self, input_shape, layers_size, output_size):
      self.input_shape = input_shape
      self.layers_size = layers_size
      self.output_size = output_size

    def build_model(self):
      input1 = Input(shape=self.input_shape)
      x = Dense(self.layers_size, activation='relu')(input1)
      x = Dense(self.layers_size * 2, activation='relu')(x)
      x = Dense(self.layers_size * 3, activation='relu')(x)
      x = Dense(self.layers_size * 4, activation='relu')(x)
      x = Dense(self.output_size)(x)
      self.model = Model(inputs=input1, outputs=x, name='Gen')
      print(self.model.summary())


class Discriminator():

    def __init__(self, input_shape, layers_size):
      self.input_shape = input_shape
      self.layers_size = layers_size
      
    def build_model(self):
      input1 = Input(shape=self.input_shape)
      x = Dense(self.layers_size * 2, activation='relu')(input1)
      x = Dropout(0.5)(x)
      x = Dense(self.layers_size, activation='relu')(x)
      x = Dropout(0.5)(x)
      x = Dense(1, activation='sigmoid')(x)
      self.model = Model(inputs=input1, outputs=x, name='Dis')
      print(self.model.summary())


In [None]:
class GAN():
    
    def __init__(self, batch_size, lr, noise_dim, output_size, layers_size):
        self.batch_size = batch_size
        self.noise_dim = noise_dim
        self.output_size = output_size

        self.generator = Generator((self.noise_dim,), layers_size, self.output_size)
        self.generator.build_model()

        self.discriminator = Discriminator((self.output_size,), layers_size)
        self.discriminator.build_model()
        
        optimizer = Adam(lr, 0.5)

        # Build and compile the discriminator
        self.discriminator.model.compile(loss='binary_crossentropy', optimizer=optimizer, 
                                         metrics=['accuracy'])

        # The generator takes noise as input and generates samples
        z = Input(shape=(self.noise_dim,))
        sample = self.generator.model(z)

        # For the combined model we will only train the generator
        self.discriminator.model.trainable = False

        # The discriminator takes generated samples as input and determines validity
        validity = self.discriminator.model(sample)

        # The combined model  (stacked generator and discriminator)
        # Trains the generator to fool the discriminator
        self.combined = Model(z, validity, name='GAN')
        print(self.combined.summary())
        self.combined.compile(loss='binary_crossentropy', optimizer=optimizer)
        
    def get_data_batch(self, train, batch_size, seed=0):
        start_i = (batch_size * seed) % len(train)
        stop_i = start_i + batch_size
        shuffle_seed = (batch_size * seed) // len(train)
        np.random.seed(shuffle_seed)
        train_ix = np.random.choice(list(train.index), replace=False, size=len(train)) 
        train_ix = list(train_ix) + list(train_ix) 
        x = train.loc[train_ix[start_i: stop_i]].values
        reshaped = np.reshape(x, (batch_size, -1))
        return reshaped
        
    def generate_fake(self):
      noise = tf.random.normal((self.batch_size, self.noise_dim))
      gen_data = self.generator.model.predict(noise)
      return noise, gen_data    

    def train(self, data, train_arguments): 

        [cache_prefix, epochs, sample_interval] = train_arguments
        data_cols = data.columns       

        # Adversarial ground truths
        valid = np.ones((self.batch_size, 1))
        fake = np.zeros((self.batch_size, 1))
        
        d_loss_hist = []
        d_real_loss_hist = []
        d_fake_loss_hist = []
        d_acc_hist = []
        g_loss_hist = []
        d_real_acc_hist = []
        d_fake_acc_hist = []
        
        for epoch in range(epochs):    
            # ---------------------
            #  Train Discriminator
            # ---------------------
            batch_data = self.get_data_batch(data, self.batch_size)

            # Generate a batch of new samples
            noise, gen_data = self.generate_fake()
    
            # Train the discriminator
            d_loss_real = self.discriminator.model.train_on_batch(batch_data, valid)
            d_loss_fake = self.discriminator.model.train_on_batch(gen_data, fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
    
            # ---------------------
            #  Train Generator
            # ---------------------
#           
            # Train the generator (to have the discriminator label samples as valid)
            g_loss = self.combined.train_on_batch(noise, valid)        

   
            # Plot the progress
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss))
            
            d_loss_hist.append(d_loss[0])
            d_acc_hist.append(d_loss[1])
            g_loss_hist.append(g_loss)

            d_real_loss_hist.append(d_loss_real[0])
            d_fake_loss_hist.append(d_loss_fake[0])

            d_real_acc_hist.append(d_loss_real[1])
            d_fake_acc_hist.append(d_loss_fake[1])
            
            # If at save interval => save generated events
            if epoch % sample_interval == 0:
                #Test here data generation step
                # save model checkpoints
                model_checkpoint_base_name = 'model/' + cache_prefix + '_{}_model_weights_step_{}.h5'
                self.generator.model.save_weights(model_checkpoint_base_name.format('generator', epoch))
                self.discriminator.model.save_weights(model_checkpoint_base_name.format('discriminator', epoch))

                #Here is generating the data
                z = tf.random.normal((1024, self.noise_dim))
                gen_data = self.generator.model.predict(z)
                print('generated_data')

        return d_loss_hist, d_acc_hist, g_loss_hist, d_real_loss_hist, d_fake_loss_hist, d_real_acc_hist, d_fake_acc_hist

    def save(self, path, name):
        assert os.path.isdir(path) == True, \
            "Please provide a valid path. Path must be a directory."
        model_path = os.path.join(path, name)
        self.generator.model.save_weights(model_path)  # Load the generator
        return
    
    def load(self, path):
        assert os.path.isdir(path) == True, \
            "Please provide a valid path. Path must be a directory."
        self.generator = Generator((self.noise_dim,), self.layers_size, self.output_size)
        self.generator.model = self.generator.model.load_weights(path)
        return self.generator

In [None]:
!mkdir model

mkdir: cannot create directory ‘model’: File exists


In [None]:
# train_ger_model = True
train_ger_model = True

if train_ger_model:
    # # ger (48)
    df = german_credit_df
    noise_dim = 64
    layers_size = 25
    batch_size = 64
    learning_rate = 1e-3
    cache_prefix = 'ger_v1'
else:
    # # # diabetes (8)    
    df = diabetes_df
    noise_dim = 32
    layers_size = 30 
    batch_size = 64
    learning_rate = 5e-4
    cache_prefix = 'dia_v1'

sample_interval = 100
epochs = 1000
data_dim = df.shape[1]
models_dir = 'model'


model = GAN(batch_size, learning_rate, noise_dim, data_dim, layers_size)
train_arguments = [cache_prefix, epochs, sample_interval]

d_loss_hist, d_acc_hist, g_loss_hist, d_real_loss_hist, d_fake_loss_hist, d_real_acc_hist, d_fake_acc_hist = model.train(df, train_arguments)

hist = pd.DataFrame([d_loss_hist, d_acc_hist, g_loss_hist, d_real_loss_hist, d_fake_loss_hist, d_real_acc_hist, d_fake_acc_hist]).T
hist = hist.reset_index()
hist.columns = ['epoch_num','d_loss', 'd_acc', 'g_loss', 'd_real_loss', 'd_fake_loss', 'd_real_acc', 'd_fake_acc']
hist['epoch_num'] = hist['epoch_num'].apply(lambda x: x+1)
hist

Model: "Gen"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        [(None, 64)]              0         
_________________________________________________________________
dense_32 (Dense)             (None, 25)                1625      
_________________________________________________________________
dense_33 (Dense)             (None, 50)                1300      
_________________________________________________________________
dense_34 (Dense)             (None, 75)                3825      
_________________________________________________________________
dense_35 (Dense)             (None, 100)               7600      
_________________________________________________________________
dense_36 (Dense)             (None, 48)                4848      
Total params: 19,198
Trainable params: 19,198
Non-trainable params: 0
___________________________________________________________

Unnamed: 0,epoch_num,d_loss,d_acc,g_loss,d_real_loss,d_fake_loss,d_real_acc,d_fake_acc
0,1,0.813981,0.578125,0.881631,1.061758,0.566203,0.406250,0.750000
1,2,0.846889,0.515625,0.818686,1.033822,0.659956,0.375000,0.656250
2,3,0.815044,0.484375,0.731158,0.943267,0.686822,0.437500,0.531250
3,4,0.813741,0.429688,0.681273,0.884557,0.742926,0.421875,0.437500
4,5,0.887389,0.429688,0.671260,0.988530,0.786248,0.468750,0.390625
...,...,...,...,...,...,...,...,...
995,996,0.678396,0.539062,0.719681,0.655272,0.701520,0.515625,0.562500
996,997,0.723406,0.445312,0.666688,0.692632,0.754180,0.421875,0.468750
997,998,0.723383,0.437500,0.664320,0.663569,0.783197,0.406250,0.468750
998,999,0.732176,0.468750,0.679063,0.638791,0.825562,0.500000,0.437500


In [None]:
draw_mean_stats_for_hist(hist)

In [None]:
drow_loss_plot(hist, epochs)
drow_acc_plot(hist, epochs)

## samples that fooled the descriminatot:


In [None]:
z = tf.random.normal((100, noise_dim))
gen_data = pd.DataFrame(model.generator.model.predict(z), columns=df.columns)
disc_threshold = 0.5
discriminator_preds = [1 if s > disc_threshold else 0 for s in model.discriminator.model.predict(gen_data)]
gen_data['discriminator_pred'] = discriminator_preds
fooled_samples = gen_data[gen_data['discriminator_pred'] == 1].iloc[:,:-1]
not_fooled_samples = gen_data[gen_data['discriminator_pred'] == 0].iloc[:,:-1]

In [None]:
print(f'number of fooled samples: {len(fooled_samples)}')
print(f'number of not fooled samples: {len(not_fooled_samples)}')

number of fooled samples: 43
number of not fooled samples: 57


In [None]:
# calculate auclidean distance from fooled_samples vs. random real samples
batch_data = model.get_data_batch(df, 70)
fooled_samples.reset_index(drop=True)
dst_list = []
for i in range(len(fooled_samples)):
  dst_list.append(distance.euclidean(fooled_samples.iloc[i].values, batch_data[i]))

# plot distance distribution
dst_hist = pd.DataFrame(pd.Series(dst_list).value_counts()).reset_index()
dst_hist.columns = ['auclidean distance', 'count']
fig = px.histogram(dst_hist, x="auclidean distance", y="count")
fig.update_layout(title=f"Auclidean distance distribution for fooled samples",
                  xaxis_title="Auclidean distance",
                  yaxis_title="Number of samples")
fig.show()

In [None]:
# visualization for fooled vs not fooled
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(gen_data)
principal_df = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])

principal_df['discriminator_pred'] = gen_data['discriminator_pred'].apply(lambda x: 'fooled' if x==1 else 'not fooled')

In [None]:
fig = px.scatter(principal_df, x="principal component 1", y="principal component 2", color="discriminator_pred").for_each_trace(lambda t: t.update(name=t.name.split("=")[1]))

fig.update_layout(title=f"Fooled samples visualization with PCA",
                  xaxis_title="principal component 1",
                  yaxis_title="principal component 2")
fig.show()

In [None]:
not_fooled_samples

In [None]:
df

# Part 2 - Train GAN Model with BB

In [90]:
class Generator2():

    def __init__(self, input_shape, layers_size, output_size):
      self.input_shape = input_shape
      self.layers_size = layers_size
      self.output_size = output_size

    def build_model(self):
      input1 = Input(shape=self.input_shape)
      input2 = Input(shape=(1,))      
      input_combined = Concatenate(axis=1)([input1,input2])
      x = Dense(self.layers_size, activation='relu')(input_combined)
      x = Dense(self.layers_size * 2, activation='relu')(x)      
      x = Dense(self.output_size)(x)
      self.model = Model(inputs=[input1,input2], outputs=x, name='Gen')
      print(self.model.summary())


class Discriminator2():

    def __init__(self, input_shape, layers_size):
      self.input_shape = input_shape
      self.layers_size = layers_size
      
    def build_model(self):
      input1 = Input(shape=self.input_shape)
      input2 = Input(shape=(1,))

      input_combined = Concatenate(axis=1)([input1,input2])
      
      x = Dense(self.layers_size * 2, activation='relu')(input_combined)
      x = Dropout(0.5)(x)
      x = Dense(self.layers_size, activation='relu')(x)
      x = Dropout(0.5)(x)
      x = Dense(1, activation='sigmoid')(x)
      self.model = Model(inputs=[input1,input2], outputs=x, name='Dis')
      print(self.model.summary())

In [72]:
class GAN2():
 
    def __init__(self, batch_size, lr, noise_dim, output_size, layers_size):
        self.batch_size = batch_size
        self.noise_dim = noise_dim
        self.output_size = output_size

        self.generator = Generator2((self.noise_dim,), layers_size, self.output_size)
        self.generator.build_model()

        self.discriminator = Discriminator2((self.output_size,), layers_size)
        self.discriminator.build_model()
        
        optimizer = Adam(lr, 0.5)

        # Build and compile the discriminator
        self.discriminator.model.compile(loss='binary_crossentropy', optimizer=optimizer, 
                                         metrics=['accuracy'])

        # The generator takes noise as input and generates samples
        z = Input(shape=(self.noise_dim,))
        fake_confidence = Input(shape=(1,))
        
        gen_sample = self.generator.model([z, fake_confidence])

        # For the combined model we will only train the generator
        self.discriminator.model.trainable = False
        # The discriminator takes generated samples as input and determines validity
        validity = self.discriminator.model([gen_sample, fake_confidence])
        
        # The combined model  (stacked generator and discriminator)
        # Trains the generator to fool the discriminator
        self.combined = Model(inputs=[z, fake_confidence], outputs=validity, name='GAN')  

        print(self.combined.summary())
        self.combined.compile(loss='binary_crossentropy', optimizer=optimizer)
        
    def get_data_batch(self, train, batch_size, seed=0):
        start_i = (batch_size * seed) % len(train)
        stop_i = start_i + batch_size
        shuffle_seed = (batch_size * seed) // len(train)
        np.random.seed(shuffle_seed)
        train_ix = np.random.choice(list(train.index), replace=False, size=len(train)) 
        train_ix = list(train_ix) + list(train_ix) 
        x = train.loc[train_ix[start_i: stop_i]].values
        reshaped = np.reshape(x, (batch_size, -1))
        return reshaped
        
    
    def generate_fake(self):
        noise = tf.random.normal((self.batch_size, self.noise_dim))
        y = tf.random.uniform((self.batch_size, 1), minval=0, maxval=1)
        # Generate a batch of new samples
        gen_data = self.generator.model.predict([noise, y])
        return noise, gen_data, y 
   
    def train_rf(self, X_train, y_train, X_valid, y_valid):
        self.rf = RandomForestClassifier(n_estimators=500, criterion='entropy', random_state=11)
        self.rf.fit(X_train, y_train)
        pred = self.rf.predict(X_valid)
        acc = accuracy_score(y_valid,pred)
        print ('RF accuracy: ' + str(acc))
        return acc

    def train(self, train_arguments): 

        [cache_prefix, epochs, sample_interval] = train_arguments     

        # Adversarial ground truths
        valid = np.ones((self.batch_size, 1))
        fake = np.zeros((self.batch_size, 1))
        
        d_loss_hist = []
        d_real_loss_hist = []
        d_fake_loss_hist = []
        d_acc_hist = []
        g_loss_hist = []
        d_real_acc_hist = []
        d_fake_acc_hist = []
        
        for epoch in range(epochs):    
            # ---------------------
            #  Train Discriminator
            # ---------------------

            # Generate a batch of new samples
            noise, gen_data, fake_confidence = self.generate_fake()
            # Generate the real confidence from RF
            real_confidence = self.rf.predict_proba(gen_data)    
            # extract confidence for the 1 class
            real_confidence = np.array([[x[1]] for x in real_confidence])          
    
            # Train the discriminator
            d_loss_real = self.discriminator.model.train_on_batch([gen_data, real_confidence], valid)
            d_loss_fake = self.discriminator.model.train_on_batch([gen_data, fake_confidence], fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)              
          
            # ---------------------
            #  Train Generator
            # ---------------------
         
            # Train the generator (to have the discriminator label samples as valid)
            g_loss = self.combined.train_on_batch([noise, fake_confidence], valid)        

   
            # Plot the progress
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss))
            
            d_loss_hist.append(d_loss[0])
            d_acc_hist.append(d_loss[1])
            g_loss_hist.append(g_loss)

            d_real_loss_hist.append(d_loss_real[0])
            d_fake_loss_hist.append(d_loss_fake[0])

            d_real_acc_hist.append(d_loss_real[1])
            d_fake_acc_hist.append(d_loss_fake[1])
            
            # If at save interval => save generated events
            if epoch % sample_interval == 0:             
                # save model checkpoints
                model_checkpoint_base_name = 'model/' + cache_prefix + '_{}_model_weights_step_{}.h5'
                self.generator.model.save_weights(model_checkpoint_base_name.format('generator', epoch))
                self.discriminator.model.save_weights(model_checkpoint_base_name.format('discriminator', epoch))            

        return d_loss_hist, d_acc_hist, g_loss_hist, d_real_loss_hist, d_fake_loss_hist, d_real_acc_hist, d_fake_acc_hist


In [137]:
train_ger_model = True

if train_ger_model:
    # # ger (48)
    df = german_credit_df
    df_y = german_credit_df_y
    noise_dim = 48
    layers_size = 20
    batch_size = 64
    learning_rate = 1e-3
    cache_prefix = 'ger_v1'
    dataset_title = 'German Credit'
else:
    # # # diabetes (8)    
    df = diabetes_df
    df_y = diabetes_df_y
    noise_dim = 8
    layers_size = 16 
    batch_size = 64
    learning_rate = 5e-4
    cache_prefix = 'dia_v1'
    dataset_title = 'Diabetes'

sample_interval = 100
epochs = 1000
data_dim = df.shape[1]
models_dir = 'model'

X_df_train, X_df_valid, y_df_train, y_df_valid = train_test_split(df,df_y,test_size=0.3,shuffle=True, random_state=11) 

model = GAN2(batch_size, learning_rate, noise_dim, data_dim, layers_size)
train_arguments = [cache_prefix, epochs, sample_interval]
model.train_rf(X_df_train, y_df_train, X_df_valid, y_df_valid)
d_loss_hist, d_acc_hist, g_loss_hist, d_real_loss_hist, d_fake_loss_hist, d_real_acc_hist, d_fake_acc_hist = model.train(train_arguments)

hist = pd.DataFrame([d_loss_hist, d_acc_hist, g_loss_hist, d_real_loss_hist, d_fake_loss_hist, d_real_acc_hist, d_fake_acc_hist]).T
hist = hist.reset_index()
hist.columns = ['epoch_num','d_loss', 'd_acc', 'g_loss', 'd_real_loss', 'd_fake_loss', 'd_real_acc', 'd_fake_acc']
hist['epoch_num'] = hist['epoch_num'].apply(lambda x: x+1)
hist

Model: "Gen"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_87 (InputLayer)           [(None, 48)]         0                                            
__________________________________________________________________________________________________
input_88 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
tf.concat_26 (TFOpLambda)       (None, 49)           0           input_87[0][0]                   
                                                                 input_88[0][0]                   
__________________________________________________________________________________________________
dense_98 (Dense)                (None, 20)           1000        tf.concat_26[0][0]             

Unnamed: 0,epoch_num,d_loss,d_acc,g_loss,d_real_loss,d_fake_loss,d_real_acc,d_fake_acc
0,1,0.819998,0.460938,0.455943,0.514120,1.125876,0.828125,0.093750
1,2,0.820331,0.476562,0.412023,0.504689,1.135973,0.843750,0.109375
2,3,0.803070,0.492188,0.460131,0.481928,1.124213,0.906250,0.078125
3,4,0.791667,0.507812,0.502644,0.442059,1.141276,0.906250,0.109375
4,5,0.862804,0.453125,0.413796,0.421145,1.304463,0.890625,0.015625
...,...,...,...,...,...,...,...,...
995,996,0.693500,0.500000,0.709748,0.709873,0.677127,0.000000,1.000000
996,997,0.693773,0.500000,0.708863,0.709933,0.677614,0.015625,0.984375
997,998,0.693409,0.500000,0.709610,0.709735,0.677083,0.000000,1.000000
998,999,0.693055,0.500000,0.709720,0.709683,0.676428,0.000000,1.000000


The BB - random Forest performance

In [116]:
import math 

def draw_prob_histogram(valid_prob, dataset_title, prob_min, prob_max):
  valid_prob_df = pd.DataFrame(valid_prob, columns=['valid_prob'])
  bin_width= 0.05
  nbins = math.ceil((prob_max - prob_min) / bin_width)
  fig = px.histogram(valid_prob_df, x="valid_prob", nbins=nbins)
  fig.update_layout(title=f"Random Forest probability distribution for {dataset_title}",
                    xaxis_title="Probability",
                    yaxis_title="Count")
  fig.show()


def validate_random_forest_performance(GAN_model, X_df_valid, y_df_valid, dataset_title):
  valid_prob = GAN_model.rf.predict_proba(X_df_valid)[:,1]
  prob_min, prob_max, prob_mean = valid_prob.min(), valid_prob.max(), valid_prob.mean()

  print(f'{dataset_title}/n Min: {prob_min}, Max: {prob_max}, Mean: {prob_mean}') 
  valid_pred = [1 if prob>= 0.5 else 0 for prob in valid_prob]
  print(accuracy_score(y_df_valid, valid_pred))
  draw_prob_histogram(valid_prob, dataset_title, prob_min, prob_max)


validate_random_forest_performance(model, X_df_valid, y_df_valid, dataset_title)

German Credit/n Min: 0.014, Max: 0.742, Mean: 0.29430666666666666
0.7333333333333333


The Gan with BB performance

In [138]:
draw_mean_stats_for_hist(hist)

In [139]:
drow_acc_plot(hist, 2000)
drow_loss_plot(hist, 2000)

In [140]:
# generate fake samples
number_of_samples_to_generate = 1000
noise = tf.random.normal((number_of_samples_to_generate, model.noise_dim))
fake_confidence = tf.random.uniform((number_of_samples_to_generate, 1), minval=0, maxval=1)
# Generate a batch of new samples
gen_data = model.generator.model.predict([noise, fake_confidence])
real_confidence = model.rf.predict_proba(gen_data)[:,1]
prob_min, prob_max, prob_mean = real_confidence.min(), real_confidence.max(), real_confidence.mean()

print(f'{dataset_title}/n Min: {prob_min}, Max: {prob_max}, Mean: {prob_mean}') 
# draw_prob_histogram(real_confidence, dataset_title +' on generated samples', real_confidence.min(), real_confidence.max())
fake_confidence = np.array([x[0] for x in fake_confidence.numpy()])

German Credit/n Min: 0.31, Max: 0.518, Mean: 0.41891199999999995


In [145]:
prob_min, prob_max, prob_mean = fake_confidence.min(), fake_confidence.max(), fake_confidence.mean()

print(f'fake_confidence: {dataset_title}/n Min: {prob_min}, Max: {prob_max}, Mean: {prob_mean}') 

fake_confidence: German Credit/n Min: 8.809566497802734e-05, Max: 0.9979918003082275, Mean: 0.5164296627044678


In [141]:
bin_width= 0.05
nbins_real = math.ceil((real_confidence.max() - real_confidence.min()) / bin_width)
nbins_rf = math.ceil((fake_confidence.max() - fake_confidence.min()) / bin_width)

fig = go.Figure()
fig.add_trace(go.Histogram(x=real_confidence, nbinsx=nbins_real, name='RF Confidence'))
fig.add_trace(go.Histogram(x=fake_confidence, nbinsx=nbins_rf, name='Fake Confidence'))

# Overlay both histograms
fig.update_layout(barmode='overlay',
                  title=f"Confidence scores for generated samples on {dataset_title}",
                  xaxis_title="Confidence score",
                  yaxis_title="Samples count")
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.75)
fig.show()

In [142]:
# identify to which class the generated samples looks like
pca = PCA(n_components=2)
principalComponents_origin_data = pca.fit_transform(df)
principal_origin_data_df = pd.DataFrame(data = principalComponents_origin_data
             , columns = ['principal component 1', 'principal component 2'])

principal_origin_data_df['y'] = df_y
principalComponents_generated_data = pca.transform(gen_data)
principal_generated_data_df = pd.DataFrame(data = principalComponents_generated_data
             , columns = ['principal component 1', 'principal component 2'])

principal_generated_data_df['y'] = ['generated data']*len(principal_generated_data_df)

principal_df = pd.concat([principal_origin_data_df, principal_generated_data_df])

fig = px.scatter(principal_df, x="principal component 1", y="principal component 2", color="y").for_each_trace(lambda t: t.update(name=t.name.split("=")[1]))

fig.update_layout(title=f"Origin and generated samples - visualization with PCA",
                  xaxis_title="principal component 1",
                  yaxis_title="principal component 2")
fig.show()

In [143]:
gen_samples_confidence = pd.DataFrame(list(zip(fake_confidence, real_confidence)), columns=['fake_confidence', 'real_confidence'])
gen_samples_confidence.corr()

Unnamed: 0,fake_confidence,real_confidence
fake_confidence,1.0,-0.041025
real_confidence,-0.041025,1.0


In [144]:
fig = px.scatter(gen_samples_confidence, x="fake_confidence", y="real_confidence")

fig.update_layout(title=f"Confidence scores for generated samples",
                  xaxis_title="Fake Confidence",
                  yaxis_title="RF Confidence")
fig.show()