In [67]:
!nvidia-smi -L

GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-8d2d5397-0bb4-fca5-8820-0f076f36f052)


In [68]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [69]:
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  drive.mount('/content/drive')
  proj_dir = "/content/drive/MyDrive/ece884_project/"
else:
  proj_dir = "../"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
import os
import re
import pickle
models = os.listdir(f"{proj_dir}saved_models/list_of_models/gen")
model_number = [int(re.sub("generators", "", x)) for x in models]
last_model = max(model_number)

In [71]:
df = pd.read_csv(f"{proj_dir}data_clean/taxi.csv")
column_names = df.columns
df = df.to_numpy()
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(df)

In [72]:
# TODO lets put this away as a script
import tensorflow as tf

import numpy as np
from tensorflow import keras

def build_network(output_dim, n_hidden, n_neurons, learning_rate):

    """

    output_dim: what do we want this to output
    Generator output n_columns of data
    Discriminator output 1, p(data_real|data_seen)

    n_hiden: number of layers of the neural net

    n_neurons: number of neuros in the network

    learning_rate: duhhh

    This outputs a keras neural net
    
    """
    model = keras.models.Sequential()
    model.add(keras.layers.Flatten())
    for _ in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation="selu"))
        model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Dense(output_dim + 10, activation="selu"))  
    model.add(keras.layers.Dense(output_dim, activation="sigmoid"))
    optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
    model.compile(optimizer=optimizer)
    return model

def train_gan(
    generator, discriminator, dataset, n_epochs=100, n_noise=20000
):
    """
    # TODO: UPDATE ARGS
    Inputs: 

    gan, this is a keras gan object made by combining two neural nets and
    restricting the trainability of one of them.

    dataset, this takes in regular tabular data. now this is training rowwise
    however i may change this to matrix wise like a picture.

    n_epochs, numper of times the gans go though training iterationations

    iterationations, number of times in gan iterationaton loop, 
    it would be a good idea to reduct this after the warmup period

    n_noise, this is the size of fake data generated

    
    Output:

    generators_saved, this is an iterationable list of keras objects that can be used
    
    discriminators_saved, same thing, these can be used to test

    for generator, discriminator in zip(gen, desc):
        noise = tf.random.normal(shape=dims)
        generated_data = generator(noise)
        judgement = discriminator(generated_data) # probs data is real
    """
    gan = keras.models.Sequential([generator, discriminator])
  
    discriminator.compile(loss="binary_crossentropy", optimizer="rmsprop")
    discriminator.trainable = False
    gan.compile(loss="binary_crossentropy", optimizer="rmsprop")
    generator, discriminator = gan.layers
    data_out = np.empty((0, dataset.shape[1]))

    
    for epoch in range(5):

        tf.random.set_seed(epoch*5)
        random_index = tf.random.uniform(shape=(n_noise,), minval=0, maxval=len(dataset), dtype=tf.int32)
        X_batch = dataset[random_index, :]

        for iteration in range(n_noise):

            noise = tf.random.normal(shape=X_batch.shape,
                                     mean=0,
                                     stddev=1) 

            generated_data = generator(noise)
            X_fake_and_real = tf.concat([generated_data, X_batch], axis=0)
            y1 = tf.concat([tf.zeros(n_noise), tf.ones(n_noise)], axis=0)
            
            # training discriminator
            discriminator.trainable = True
            discriminator.train_on_batch(X_fake_and_real, y1)
            # training the generator

            noise = tf.random.normal(shape=X_batch.shape,
                                     mean=0,
                                     stddev=1) 
            
            discriminator.trainable = False
            gan.train_on_batch(noise, tf.ones(n_noise))
            
            # testing quality of model

            # TODO:
            # Compute Covariance matrix
            # Order Stats of generated
            # classifier
            
        noise = tf.random.normal(shape=X_batch.shape,
                                  mean=0,
                                  stddev=1) 
        generated_data = generator(noise)
        rand = tf.random.uniform(shape=(1,), minval=0, maxval=X_batch.shape[0], dtype=tf.int32)

        data_out = np.concatenate([data_out, generated_data[ :5 , :]])
    
    return data_out
    


lets consider initializing a new gan with each epoch or 

In [73]:
from tqdm import tqdm

In [74]:
for i in tqdm(range(500)):

    generator = build_network(output_dim=df.shape[1], n_hidden=3, n_neurons=300+i, learning_rate=5e-3 + i * 1e-5)
    discriminator = build_network(output_dim=1, n_hidden=2, n_neurons=250+i, learning_rate=1e-3 + i * 1e-5) 
    gen_data = train_gan(generator, discriminator, df, n_epochs=10, n_noise=1000)
    output_path = f"{proj_dir}data_generated/gan_gen{last_model}.csv"
    generated_data = pd.DataFrame(scaler.inverse_transform(gen_data), columns=column_names) 
    generated_data.to_csv(output_path, mode='a', header=not os.path.exists(output_path))


  1%|          | 3/250 [06:06<8:23:08, 122.22s/it]


KeyboardInterrupt: ignored

In [None]:

def generated_data_filter(gen, desc, points_to_gen, threashold, dims):
    """
    inputs
    gen, is the list of gans we wrote with the gan.ipynb

    desc, is the list of discriminators in the notebook gan.ipynb
    
    points_to_gen, number of datapoints for each model to generate

    threashold, is what is the discriminator's predicted probability of the data being real
    we need to see to keep the data. 
    with a threashold = 0.99 we will drop every datapoint that the discriminator says has a 
    less than .99 change of being real. 
    we will need to play with this.

    """
    n_col = dims[1]
    quality_data = np.empty((0, n_col), np.float32)

    for generator, discriminator in zip(gen, desc):
        noise = tf.random.normal(shape=(points_to_gen, n_col))
        generated_data = generator(noise)
        judgement = discriminator(generated_data) # probs data is real
        data_fooling_discriminator = np.compress(np.ravel(judgement) > threashold, generated_data, axis=0)

        quality_data = np.append(quality_data, data_fooling_discriminator, axis=0)
    
    for discriminator in desc:
        judgement = discriminator(quality_data)
        quality_data = np.compress(np.ravel(judgement) > threashold, quality_data, axis=0)
    return quality_data