In [None]:
# List the GPUs visible to this runtime (one line per device).
!nvidia-smi -L

GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-1ed8c666-34c8-7707-18f0-d13cac717d1f)


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [None]:
# Pick the project root. Outside Colab the project lives one level above the
# notebook; on Colab it lives on the mounted Google Drive.
if 'google.colab' not in str(get_ipython()):
  proj_dir = "../"
else:
  from google.colab import drive
  # Drive has to be mounted before anything under /content/drive is readable.
  drive.mount('/content/drive')
  proj_dir = "/content/drive/MyDrive/ece884_project/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the cleaned taxi table from the project's data_clean/ folder.
df = pd.read_csv(f"{proj_dir}data_clean/taxi.csv")

In [None]:
# Keep the column labels: they are needed later to rebuild a labelled
# DataFrame from the generated samples.
column_names = df.columns
# NOTE: `df` is rebound from a DataFrame to a NumPy array here, so this cell
# cannot be re-run on its own without first re-running the load cell.
df = df.to_numpy()

In [None]:
from sklearn import preprocessing
# Fit a per-column standardiser on the whole array.
scaler = preprocessing.StandardScaler().fit(df) 
# Transform the data with the fitted scaler. `scaler` is kept around so the
# scaling can be inverted on generated samples later.
df = scaler.transform(df)

In [None]:
# TODO lets put this away as a script
import tensorflow as tf

import numpy as np
from tensorflow import keras

def build_network(output_dim, n_hidden, n_neurons, learning_rate, output_activation="sigmoid"):
    """
    Build a fully connected Keras network usable as a generator or a discriminator.

    The network is: Flatten -> n_hidden x (Dense(n_neurons, selu) -> BatchNormalization)
    -> Dense(output_dim + 10, selu) -> Dense(output_dim, output_activation).

    output_dim: width of the final layer.
        Generator: the number of data columns to produce.
        Discriminator: 1, i.e. p(data is real | data seen).

    n_hidden: number of hidden Dense + BatchNormalization pairs.

    n_neurons: number of units in each hidden Dense layer.

    learning_rate: learning rate of the SGD optimizer attached to the model.

    output_activation: activation of the final layer. The default "sigmoid"
        keeps the original behaviour and bounds every output to (0, 1), which
        suits a discriminator. Pass None or "linear" for unbounded outputs,
        for example a generator that must reproduce standardised features
        that can be negative or larger than one.

    Returns a keras Sequential model compiled with the SGD optimizer only. No
    loss is configured here, so a caller that trains the model directly has to
    compile it again with a loss.
    """
    model = keras.models.Sequential()
    model.add(keras.layers.Flatten())
    for _ in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation="selu"))
        model.add(keras.layers.BatchNormalization())
    # One slightly wider selu layer sits between the hidden stack and the output.
    model.add(keras.layers.Dense(output_dim + 10, activation="selu"))
    model.add(keras.layers.Dense(output_dim, activation=output_activation))
    optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
    model.compile(optimizer=optimizer)
    return model

def _sample_noise(shape):
    """Draw Gaussian noise whose mean and standard deviation are themselves random."""
    return tf.random.normal(
        shape=shape,
        mean=np.random.randn(),
        stddev=np.random.random() * 5,
    )


def _snapshot_model(model, input_shape):
    """Return an independent copy of `model` that carries its current weights."""
    snapshot = keras.models.clone_model(model)
    # A freshly cloned model may not have created its weights yet; build it so
    # that set_weights() has matching variables to fill.
    if not snapshot.built:
        snapshot.build(input_shape)
    snapshot.set_weights(model.get_weights())
    return snapshot


def train_gan(
    generator, discriminator, dataset, n_epochs=100, n_noise=20000, accuracy=0.9999,
    max_iterations=None, verbose=True
):
    """
    Train a generator/discriminator pair adversarially on tabular data.

    Inputs:

    generator, keras model mapping noise rows to synthetic data rows.

    discriminator, keras model mapping data rows to p(row is real). It is
    compiled here with binary cross-entropy and "rmsprop", replacing any
    optimizer it was compiled with before.

    dataset, 2-D NumPy array of real data, one sample per row. Training is
    row-wise: every step draws n_noise random rows (with replacement).

    n_epochs, number of epochs. One snapshot of each network is stored at the
    end of every epoch.

    n_noise, number of real rows, and equally of fake rows, used per step.

    accuracy, an epoch keeps iterating until the mean discriminator score on a
    fresh batch of generated rows reaches this value.

    max_iterations, optional cap on the number of steps per epoch. None (the
    default) keeps the original unbounded behaviour, where an epoch only ends
    once `accuracy` is reached.

    verbose, when True (the default) the statistics of every step are printed.

    Output:

    generators_saved, list with one independent generator copy per epoch.

    discriminators_saved, list with one independent discriminator copy per
    epoch. The copies are not compiled; they are meant for inference:

    for generator, discriminator in zip(generators_saved, discriminators_saved):
        noise = tf.random.normal(shape=dims)
        generated_data = generator(noise)
        judgement = discriminator(generated_data) # probs data is real

    result_log, list of [epoch, iteration, mean, min, max] of the discriminator
    scores on generated rows, one entry per step.
    """
    gan = keras.models.Sequential([generator, discriminator])

    # The discriminator gets its own training setup, then is frozen before the
    # stacked model is compiled so the generator phase is meant to update the
    # generator only.
    discriminator.compile(loss="binary_crossentropy", optimizer="rmsprop")
    discriminator.trainable = False
    gan.compile(loss="binary_crossentropy", optimizer="rmsprop")

    # Noise has the same row width as the data, and the discriminator consumes
    # rows of that same width, so one input shape serves both snapshots.
    input_shape = (None,) + tuple(np.shape(dataset)[1:])

    generators_saved = []
    discriminators_saved = []
    result_log = []
    for epoch in range(n_epochs):
        print(f"~~~~~~~~~~Epoch {epoch} of {n_epochs}~~~~~~~~~~~~~~")

        mean_judgement = 0
        iteration = 0
        while mean_judgement < accuracy:
            if max_iterations is not None and iteration >= max_iterations:
                break
            iteration += 1
            random_index = np.random.randint(len(dataset), size=n_noise)
            X_batch = dataset[random_index, :]

            # phase 1 - training the discriminator (fake rows labelled 0, real rows 1)
            generated_data = generator(_sample_noise(X_batch.shape))
            X_fake_and_real = tf.concat([generated_data, X_batch], axis=0)
            y1 = tf.concat([tf.zeros(n_noise), tf.ones(n_noise)], axis=0)

            discriminator.trainable = True
            discriminator.train_on_batch(X_fake_and_real, y1)

            # phase 2 - training the generator: fakes are labelled "real" so the
            # loss rewards rows that fool the discriminator
            discriminator.trainable = False
            gan.train_on_batch(_sample_noise(X_batch.shape), tf.ones(n_noise))

            # evaluation on a fresh batch of fakes
            generated_data = generator(_sample_noise(X_batch.shape))
            judgement = discriminator(generated_data) # probs data is real

            results = [epoch, iteration, np.mean(judgement), np.min(judgement), np.max(judgement)]
            if verbose:
                print(
                    "\n epoch", results[0],
                    "\n iteration", results[1],
                    "\n mean", results[2],
                    "\n min ", results[3],
                    "\n max ", results[4]
                    )
            # The stopping criterion is the MEAN score, not the minimum.
            mean_judgement = results[2]
            result_log.append(results)

        # Store copies: appending the live models would leave every list entry
        # pointing at the same objects, i.e. at the final weights only.
        generators_saved.append(_snapshot_model(generator, input_shape))
        discriminators_saved.append(_snapshot_model(discriminator, input_shape))

    return generators_saved, discriminators_saved, result_log


Let's consider initializing a new GAN with each epoch, or ... (TODO: finish this thought; the alternative was never written down.)

In [None]:
# Generator: one output per dataset column.
# NOTE(review): train_gan() compiles the stacked GAN and the discriminator with
# "rmsprop", so the SGD learning rates passed here do not appear to drive
# training. Confirm whether that is intended.
generator = build_network(output_dim=df.shape[1], n_hidden=10, n_neurons=500, learning_rate=5e-5)
# Discriminator: a single output, the probability that a row is real.
discriminator = build_network(output_dim=1, n_hidden=10, n_neurons=500, learning_rate=1e-5)

In [None]:
# Train for 100 epochs with 100,000 real and 100,000 generated rows per step.
# An epoch only ends once the mean discriminator score on fresh fakes reaches
# `accuracy`, so a threshold this close to 1 can make epochs very long.
generators_saved, discriminators_saved, result_log = train_gan(
    generator, discriminator, df, n_epochs=100, n_noise=100000, accuracy=0.9999999
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 min  4.8103154e-15 
 max  8.8234356e-07

 epoch 42 
 iteration 4612 
 mean 3.0668247e-13 
 min  3.0668247e-13 
 max  3.0668247e-13

 epoch 42 
 iteration 4613 
 mean 3.003775e-13 
 min  3.0037748e-13 
 max  3.0037748e-13

 epoch 42 
 iteration 4614 
 mean 2.9586945e-13 
 min  2.958695e-13 
 max  2.958695e-13

 epoch 42 
 iteration 4615 
 mean 2.928361e-13 
 min  2.9283612e-13 
 max  2.9283612e-13

 epoch 42 
 iteration 4616 
 mean 2.887595e-13 
 min  2.887595e-13 
 max  2.887595e-13

 epoch 42 
 iteration 4617 
 mean 2.8543834e-13 
 min  2.8543834e-13 
 max  2.8543834e-13

 epoch 42 
 iteration 4618 
 mean 2.826008e-13 
 min  2.8260082e-13 
 max  2.8260082e-13

 epoch 42 
 iteration 4619 
 mean 2.7903634e-13 
 min  2.7903632e-13 
 max  2.7903632e-13

 epoch 42 
 iteration 4620 
 mean 2.7625038e-13 
 min  2.762503e-13 
 max  2.762503e-13

 epoch 42 
 iteration 4621 
 mean 2.736049e-13 
 min  2.7360482e-13 
 max  2.7360482

In [None]:
import os
import re
import pickle

# Saved generator lists are named "generators<N>". Find the highest N already
# on disk so this run can be written as N + 1 without overwriting anything.
models = os.listdir(f"{proj_dir}saved_models/list_of_models/gen")
# Only names that are exactly "generators<digits>" count. Anything else in the
# folder (hidden files, checkpoint directories, ...) is skipped instead of
# crashing the int() conversion.
model_number = [
    int(match.group(1))
    for match in (re.fullmatch(r"generators(\d+)", x) for x in models)
    if match
]
# default=0 lets the very first run (empty folder) save as index 1 instead of
# raising ValueError from max() on an empty list.
last_model = max(model_number, default=0)

In [None]:
# Persist this run under the next free index: the per-epoch generators and
# discriminators go to the saved-models tree, the training log to logs/.
# NOTE: these are pickle files; only load them from a trusted location.
for _path, _obj in (
    (f"{proj_dir}saved_models/list_of_models/gen/generators{last_model+1}", generators_saved),
    (f"{proj_dir}saved_models/list_of_models/disc/discriminators{last_model+1}", discriminators_saved),
    (f"{proj_dir}logs/logGAN{last_model+1}", result_log),
):
    with open(_path, "wb") as fp:
        pickle.dump(_obj, fp)
  

In [None]:
def generated_data_filter(gen, desc, points_to_gen, threashold, dims):
    """
    Generate synthetic rows and keep only those the discriminators accept.

    inputs
    gen, list of generator models (as returned by train_gan).

    desc, list of discriminator models, paired with `gen` by position.

    points_to_gen, number of data points each generator is asked to produce.

    threashold, minimum predicted probability of being real that a row needs
    in order to be kept. With threashold = 0.99 every row that a discriminator
    scores at 0.99 or below is dropped. This value needs tuning.

    dims, shape of the real dataset; only dims[1] (the number of columns) is used.

    output
    float32 NumPy array of shape (n_kept, dims[1]). A row survives only if its
    own generator's paired discriminator scored it above the threshold AND
    every discriminator in `desc` did so as well. The result may be empty.
    """
    n_col = dims[1]
    # Seeding the list with an empty block keeps the result well defined (and
    # float32) even when no generator contributes a single row.
    kept_batches = [np.empty((0, n_col), np.float32)]

    for generator, discriminator in zip(gen, desc):
        noise = tf.random.normal(shape=(points_to_gen, n_col))
        generated_data = generator(noise)
        judgement = discriminator(generated_data) # probs data is real
        fooled = np.ravel(judgement) > threashold
        kept_batches.append(np.compress(fooled, generated_data, axis=0))

    # Join once at the end instead of re-allocating the array on every pass.
    quality_data = np.concatenate(kept_batches, axis=0)

    for discriminator in desc:
        # Nothing left to judge: skip the remaining model calls.
        if len(quality_data) == 0:
            break
        judgement = discriminator(quality_data)
        quality_data = np.compress(np.ravel(judgement) > threashold, quality_data, axis=0)
    return quality_data

In [None]:
# Ask every saved generator for 10 rows and keep only the rows that the saved
# discriminators score above 0.99.
generated_dataset = generated_data_filter(generators_saved, discriminators_saved, points_to_gen=10, threashold=0.99, dims=df.shape)

In [None]:
# Undo the standardisation applied before training and restore the original
# column labels.
generated_data = pd.DataFrame(scaler.inverse_transform(generated_dataset), columns=column_names) # revert data

In [None]:
# Tabulate the per-step training log (one row per GAN iteration).
results = pd.DataFrame(result_log, columns=["epoch", "iter", "mean", "min", "max"])
# Write to an actual file: the bare project directory is not a valid CSV
# target. The name mirrors the pickled log saved for the same run index.
results.to_csv(f"{proj_dir}logs/logGAN{last_model+1}.csv", index=False)