In [1]:
import os, sys
sys.path.append(os.getcwd())

import random
import time

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.compat.v1 as tf
import sklearn.datasets


import tflib as lib
import tflib.ops.linear
import tflib.plot

In [2]:
MODE = 'wgan-gp' # wgan or wgan-gp
DATASET = 'gaussian' # 8gaussians, 25gaussians, swissroll
DIM = 64 # Model dimensionality (number of neurons in the hidden layer(s))
FIXED_GENERATOR = False # whether to hold the generator fixed at real data plus
                        # Gaussian noise, as in the plots in the paper
LAMBDA = .1 # Smaller lambda makes things faster for toy tasks, but isn't
            # necessary if you increase CRITIC_ITERS enough
CRITIC_ITERS = 5 # How many critic iterations per generator iteration
BATCH_SIZE = 256 # Batch size
ITERS = 2500#100000 # how many generator iterations to train for
DATA_DIM = 32
LATENT_DIM = 4
INITIALIZATION = 'he'#'glorot'
COVARIANCE_SCALE = np.sqrt(DATA_DIM)
INITIALIZE_LAST = True
SAMPLE_SIZE = 100000
tf.compat.v1.disable_eager_execution()

In [3]:
lib.print_model_settings(locals().copy())

Uppercase local vars:
	BATCH_SIZE: 256
	COVARIANCE_SCALE: 5.656854249492381
	CRITIC_ITERS: 5
	DATASET: gaussian
	DATA_DIM: 32
	DIM: 64
	FIXED_GENERATOR: False
	INITIALIZATION: he
	INITIALIZE_LAST: True
	ITERS: 2500
	LAMBDA: 0.1
	LATENT_DIM: 4
	MODE: wgan-gp
	SAMPLE_SIZE: 100000


In [4]:
#not needed anymore
ML_cov = np.diag(np.concatenate([np.ones(LATENT_DIM), np.zeros(DATA_DIM-LATENT_DIM)]))/COVARIANCE_SCALE
def get_cov_diff(fake_sample):
    """
    Outputs frobenius norm of the difference between generated data covariance and ML distribution covariance.
    """
    generated_cov = np.cov(fake_sample.T)
    cov_mismatch = np.linalg.norm(generated_cov - ML_cov)
    return cov_mismatch

In [5]:
init_last = INITIALIZATION if INITIALIZE_LAST else None

def ReLULayer(name, n_in, n_out, inputs):
    output = lib.ops.linear.Linear(
        name+'.Linear',
        n_in,
        n_out,
        inputs,
        initialization=INITIALIZATION
    )
    output = tf.nn.relu(output)
    return output

def Generator(n_samples, real_data):
    if FIXED_GENERATOR:
        return real_data + (1.*tf.random_normal(tf.shape(real_data)))
    else:
        noise = tf.random_normal((n_samples, LATENT_DIM))
        output = ReLULayer('Generator1', LATENT_DIM, DIM, noise)
        output = ReLULayer('Generator2', DIM, DIM, output)
        output = ReLULayer('Generator3', DIM, DIM, output)
        output = lib.ops.linear.Linear('Generator4', DIM, DATA_DIM, output, initialization=init_last)#MAYBE THEY DIDN'T DO IT
        return output

def Discriminator(inputs):
    output = ReLULayer('Discriminator1', 32, DIM, inputs)
    output = ReLULayer('Discriminator2', DIM, DIM, output)
    output = ReLULayer('Discriminator3', DIM, DIM, output)
    output = lib.ops.linear.Linear('Discriminator4', DIM, 1, output, initialization=init_last)
    return tf.reshape(output, [-1])

In [6]:
real_data = tf.placeholder(shape=(BATCH_SIZE,DATA_DIM), dtype=tf.float32)
n_samples = BATCH_SIZE
fake_data = Generator(n_samples, real_data)

In [8]:
disc_real = Discriminator(real_data)
disc_fake = Discriminator(fake_data)

In [9]:
# WGAN loss
disc_cost = tf.reduce_mean(disc_fake) - tf.reduce_mean(disc_real)
gen_cost = -tf.reduce_mean(disc_fake)

In [10]:
# WGAN gradient penalty
if MODE == 'wgan-gp':
    alpha = tf.compat.v1.random_uniform(
        shape=[BATCH_SIZE,1], 
        minval=0.,
        maxval=1.
    )
    interpolates = alpha*real_data + ((1-alpha)*fake_data)
    disc_interpolates = Discriminator(interpolates)
    gradients = tf.gradients(disc_interpolates, [interpolates])[0]
    slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients), reduction_indices=[1]))
    gradient_penalty = tf.reduce_mean((slopes-1)**2)
 
    disc_cost += LAMBDA*gradient_penalty

disc_params = lib.params_with_name('Discriminator')
gen_params = lib.params_with_name('Generator')

if MODE == 'wgan-gp':
    disc_train_op = tf.train.AdamOptimizer(
        learning_rate=1e-4, 
        beta1=0.5, 
        beta2=0.9
    ).minimize(
        disc_cost, 
        var_list=disc_params
    )
    if len(gen_params) > 0:
        gen_train_op = tf.train.AdamOptimizer(
            learning_rate=1e-4, 
            beta1=0.5, 
            beta2=0.9
        ).minimize(
            gen_cost, 
            var_list=gen_params
        )
    else:
        gen_train_op = tf.no_op()

else:
    disc_train_op = tf.train.RMSPropOptimizer(learning_rate=5e-5).minimize(
        disc_cost, 
        var_list=disc_params
    )
    if len(gen_params) > 0:
        gen_train_op = tf.train.RMSPropOptimizer(learning_rate=5e-5).minimize(
            gen_cost, 
            var_list=gen_params
        )
    else:
        gen_train_op = tf.no_op()


    # Build an op to do the weight clipping
    clip_ops = []
    for var in disc_params:
        clip_bounds = [-.01, .01]
        clip_ops.append(
            tf.assign(
                var, 
                tf.clip_by_value(var, clip_bounds[0], clip_bounds[1])
            )
        )
    clip_disc_weights = tf.group(*clip_ops)

In [11]:
print("Generator params:")
for var in lib.params_with_name('Generator'):
    print("\t{}\t{}".format(var.name, var.get_shape()))
print("Discriminator params:")
for var in lib.params_with_name('Discriminator'):
    print("\t{}\t{}".format(var.name, var.get_shape()))

frame_index = [0]

Generator params:
	Generator1.Linear/Generator1.Linear.W:0	(4, 64)
	Generator1.Linear/Generator1.Linear.b:0	(64,)
	Generator2.Linear/Generator2.Linear.W:0	(64, 64)
	Generator2.Linear/Generator2.Linear.b:0	(64,)
	Generator3.Linear/Generator3.Linear.W:0	(64, 64)
	Generator3.Linear/Generator3.Linear.b:0	(64,)
	Generator4/Generator4.W:0	(64, 32)
	Generator4/Generator4.b:0	(32,)
Discriminator params:
	Discriminator1.Linear/Discriminator1.Linear.W:0	(32, 64)
	Discriminator1.Linear/Discriminator1.Linear.b:0	(64,)
	Discriminator2.Linear/Discriminator2.Linear.W:0	(64, 64)
	Discriminator2.Linear/Discriminator2.Linear.b:0	(64,)
	Discriminator3.Linear/Discriminator3.Linear.W:0	(64, 64)
	Discriminator3.Linear/Discriminator3.Linear.b:0	(64,)
	Discriminator4/Discriminator4.W:0	(64, 1)
	Discriminator4/Discriminator4.b:0	(1,)


In [12]:
from sklearn.utils import shuffle

In [13]:
def inf_train_gen():
    if DATASET == '25gaussians':
    
        dataset = []
        for i in xrange(100000/25):
            for x in xrange(-2, 3):
                for y in xrange(-2, 3):
                    point = np.random.randn(2)*0.05
                    point[0] += 2*x
                    point[1] += 2*y
                    dataset.append(point)
        dataset = np.array(dataset, dtype='float32')
        np.random.shuffle(dataset)
        dataset /= 2.828 # stdev
        while True:
            for i in xrange(len(dataset)/BATCH_SIZE):
                yield dataset[i*BATCH_SIZE:(i+1)*BATCH_SIZE]

    elif DATASET == 'swissroll':

        while True:
            data = sklearn.datasets.make_swiss_roll(
                n_samples=BATCH_SIZE, 
                noise=0.25
            )[0]
            data = data.astype('float32')[:, [0, 2]]
            data /= 7.5 # stdev plus a little
            yield data

    elif DATASET == '8gaussians':
    
        scale = 2.
        centers = [
            (1,0),
            (-1,0),
            (0,1),
            (0,-1),
            (1./np.sqrt(2), 1./np.sqrt(2)),
            (1./np.sqrt(2), -1./np.sqrt(2)),
            (-1./np.sqrt(2), 1./np.sqrt(2)),
            (-1./np.sqrt(2), -1./np.sqrt(2))
        ]
        centers = [(scale*x,scale*y) for x,y in centers]
        while True:
            dataset = []
            for i in xrange(BATCH_SIZE):
                point = np.random.randn(2)*.02
                center = random.choice(centers)
                point[0] += center[0]
                point[1] += center[1]
                dataset.append(point)
            dataset = np.array(dataset, dtype='float32')
            dataset /= 1.414 # stdev
            yield dataset

    elif DATASET == 'gaussian':
        np.random.seed(1)
        full_dataset = np.random.randn(SAMPLE_SIZE,DATA_DIM) / np.sqrt(COVARIANCE_SCALE) 
        i = 0
        offset = 0
        while True:
            dataset = full_dataset[i*BATCH_SIZE+offset:(i+1)*BATCH_SIZE+offset,:]
            if (i+1)*BATCH_SIZE+offset > SAMPLE_SIZE: 
                offset = (i+1)*BATCH_SIZE+offset - SAMPLE_SIZE
                np.random.shuffle(full_dataset)
                dataset = np.concatenate([dataset,full_dataset[:offset,:]], axis = 0)
                i = -1 
            i+=1
            yield dataset

In [14]:
#full_dataset = np.random.randn(SAMPLE_SIZE,DATA_DIM) / np.sqrt(COVARIANCE_SCALE)  #data_covariance = np.cov(full_dataset.T)
#w, v = np.linalg.eigh(data_covariance)
#w[:-LATENT_DIM] = 0
#ML_covariance = v.dot(np.diag(w)).dot(v.T)

In [15]:
mean_fake_data = tf.reduce_mean(fake_data, axis=0, keep_dims=True)
vx = tf.matmul(tf.transpose(fake_data),fake_data)/tf.cast(tf.shape(fake_data)[0]-1, tf.float32)
mx = tf.matmul(tf.transpose(mean_fake_data), mean_fake_data)
fake_data_covariance = vx - mx

In [16]:
ML_covariance = np.eye(DATA_DIM)/COVARIANCE_SCALE
accuracy_metric = tf.norm(fake_data_covariance - ML_covariance)

In [17]:
# Train loop!
accuracy_history = []
#if COVARIANCE_SCALE == DATA_DIM:
#    model_name = "d_"
#else:
#    model_name = "root_d_"
#model_name = model_name + "initialize_last_" + str(INITIALIZE_LAST) + "_initialization_" + INITIALIZATION
if MODE == 'wgan-gp':
    model_name = "WGAN_GP"
else:
    model_name = "WGAN_WC"
model_name = model_name + "_LATENT_DIM_" + str(LATENT_DIM) + "_initialization_" + INITIALIZATION
model_name = 'tmp_'+model_name #not to spoil the saved results
plt.figure()
with tf.Session() as session:
    session.run(tf.initialize_all_variables())
    gen = inf_train_gen()
    for iteration in range(ITERS):
        # Train generator
        if iteration > 0:
            _ = session.run(gen_train_op)
        # Train critic
        for i in range(CRITIC_ITERS):
            _data = next(gen)
            _disc_cost, _, accuracy, sample = session.run(
                [disc_cost, disc_train_op, accuracy_metric, fake_data],
                feed_dict={real_data: _data}
            )
            if MODE == 'wgan':
                _ = session.run([clip_disc_weights])
        # Write logs and save samples
        #print(np.abs(get_cov_diff(fake_sample)-accuracy)/get_cov_diff(fake_sample))
        lib.plot.plot('disc cost', _disc_cost)
        lib.plot.plot('accuracy', accuracy)
        lib.plot.plot('sample', sample)
        
        accuracy_history.append(accuracy)
        if iteration % 100 == 99:
            lib.plot.flush("./"+model_name+".pkl")
            plt.clf()
            plt.grid("on", "both")
            plt.plot(np.arange(iteration+1), accuracy_history)
            plt.plot(np.arange(iteration+1), np.zeros(iteration+1))
            plt.savefig("./accuracy_history_"+model_name+".png")
        lib.plot.tick()

Instructions for updating:
Use `tf.global_variables_initializer` instead.
iter 99	disc cost	-5.525683403015137	accuracy	6.3435211181640625	sample	0.013422275893390179
iter 199	disc cost	-4.095254898071289	accuracy	2.9959957599639893	sample	-0.022270550951361656
iter 299	disc cost	-1.2447140216827393	accuracy	1.491998314857483	sample	-0.0031557336915284395
iter 399	disc cost	-1.125573754310608	accuracy	1.4293279647827148	sample	0.003502724226564169
iter 499	disc cost	-1.2014950513839722	accuracy	1.4916225671768188	sample	0.00865233689546585
iter 599	disc cost	-1.1076470613479614	accuracy	1.4260931015014648	sample	0.005894728470593691
iter 699	disc cost	-0.9336122870445251	accuracy	1.3409103155136108	sample	0.0017405118560418487
iter 799	disc cost	-0.9271945953369141	accuracy	1.3947192430496216	sample	-0.0017746153753250837
iter 899	disc cost	-1.0307962894439697	accuracy	1.4054521322250366	sample	-0.005566354375332594
iter 999	disc cost	-0.9024841785430908	accuracy	1.29329514503479	sampl

KeyboardInterrupt: 

-- "accuracy_history_rootd_covariance_he_gp_inithelastlayer.png" -- $K_Y = I/\sqrt{D}$, output layer of generator and discriminator was initialized with he initializations cost = 

-- "accuracy_history_rootd_covariance_he_gp_initnonelastlayer.png" -- $K_Y = I/\sqrt{D}$, output layer of generator and discriminator were initialized with random orthogonal matrices cost = 

-- "accuracy_history_d_covariance_he_gp_initnonelastlayer.png" -- $K_Y = I/D$, output layer of generator and discriminator were initialized with random orthogonal matrices cost = 0.19

ADD: $K_Y = I/D$ and error in $K_Y:$ Y is generated with $I/d$ and $K_Y = I/\sqrt{D}$ for ML matrix

In [None]:
step = 10
plt.clf()
plt.grid("on", "both")
accuracy_history = np.array(accuracy_history)
idx = np.max(np.where(accuracy_history > 3.5))
plt.plot(np.arange(idx,iteration+1,step), accuracy_history[idx::step])
plt.plot(np.arange(iteration+1), np.zeros(iteration+1))
plt.savefig("./accuracy_history_"+model_name+".png")
lib.plot.flush("./"+model_name+".pkl")

In [None]:
import pickle as pkl
with open('./log.pkl', 'rb') as f:
    dict_out = pkl.load(f)

In [None]:
x, y = map(np.array,zip(*list(dict_out['accuracy'].items())))
if np.max(y) <= 3.5:
    idx = 0

idx = np.max(np.where(y>3.5))+1

In [None]:
step = 50
plt.figure()
plt.plot(x[idx::step], y[idx::step], linewidth=2, color = 'red')
plt.plot(np.arange(np.max(x)+1), np.zeros(np.max(x)+1), linewidth=2, color = 'green')
plt.grid('on', 'both')
plt.savefig('accuracy.png')

In [None]:
step = 50
plt.clf()
plt.grid("on", "both")
accuracy_history = np.array(accuracy_history)
idx = np.max(np.where(accuracy_history > 3.5))
plt.plot(np.arange(idx,iteration+1,step), accuracy_history[idx::step], linewidth=2, color = 'red')
plt.plot(np.arange(iteration+1), np.zeros(iteration+1), linewidth=2, color = 'green')
plt.savefig("./accuracy_history_"+model_name+".png")
lib.plot.flush("./"+model_name+".pkl")