In [2]:
import tensorflow as tf
import os
import numpy as np
#import cv2
from numpy import expand_dims

#os.environ["CUDA_VISIBLE_DEVICES"]="5"
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2"
cuda_count = 3
global_batch_size = 128*cuda_count

def generate_latent_points(latent_dim, n_samples):
	# generate points in the latent space
	x_input = np.random.uniform(0.0, 1.0, latent_dim * n_samples)
	# reshape into a batch of inputs for the network
	x_input = x_input.reshape(n_samples, latent_dim)
	return x_input

def generate_fake_samples(g_model, latent_dim, n_samples):
	# generate points in latent space
	x_input = generate_latent_points(latent_dim, n_samples)
	# predict outputs
	X = g_model.predict(x_input)
	# create 'fake' class labels (0)
	y = np.zeros((n_samples, 1))
	return X, y

def discriminator(inshape=(28,28,1)):
	inputs = tf.keras.Input(shape=inshape)
	x = tf.keras.layers.Conv2D(64,(3,3),strides=(2,2),padding='same', activation = tf.nn.leaky_relu)(inputs)
	#x = tf.keras.layers.LeakyReLU(alpha=0.2)(x)
	x = tf.keras.layers.Dropout(0.4)(x)
	x = tf.keras.layers.Conv2D(64,(3,3),strides=(2,2),padding='same', activation = tf.nn.leaky_relu)(x)
	#x = tf.keras.layers.LeakyReLU(alpha=0.2)(x)
	x = tf.keras.layers.Dropout(0.4)(x)
	x = tf.keras.layers.Flatten()(x)
	outputs = tf.keras.layers.Dense(100, activation = 'sigmoid')(x)
	#outputs = tf.keras.layers.LeakyReLU(alpha=0.2)(x)
	d = tf.keras.Model(inputs=inputs,outputs=outputs)
	return d

def generator(latent_dim):
	g_inputs = tf.keras.Input(shape=(latent_dim,))
	x = tf.keras.layers.Dense(128*7*7, activation = tf.nn.leaky_relu)(g_inputs)
	#x = tf.keras.layers.LeakyReLU(alpha=0.2)(x)
	x = tf.keras.layers.Reshape( (7,7,128), input_shape=(128*7*7,))(x)
	x = tf.keras.layers.Conv2DTranspose(128,(4,4),strides=(2,2), padding='same', activation = tf.nn.leaky_relu)(x)
	#x = tf.keras.layers.LeakyReLU(alpha=0.2)(x)
	x = tf.keras.layers.Conv2DTranspose(128,(4,4),strides=(2,2), padding='same', activation = tf.nn.leaky_relu)(x)
	#x = tf.keras.layers.LeakyReLU(alpha=0.2)(x)
	g_outputs = tf.keras.layers.Conv2D(1,(7,7), activation='sigmoid',padding='same')(x)
	g = tf.keras.Model(inputs=g_inputs,outputs=g_outputs)
	return g

train, test = tf.keras.datasets.mnist.load_data()
X, L = train
X = X.astype('float32')
X_use = X / 255.0
X_use = np.expand_dims(X_use, axis=-1)
#control = np.array([0,0,0,0,0,1])
#Y = np.tile(control,int(X.shape[0]/6.0/global_batch_size))
#print(X[0])
mirrored_strategy = tf.distribute.MirroredStrategy()

with mirrored_strategy.scope():

	@tf.function
	def train_step(X ,l, epsilon): 
		'''
		def step_1(X, latent_dim, epsilon):

			scale = 10.0
			size = X.shape[0]
			x_input1 = tf.random.uniform(shape=[size, latent_dim],minval=0.0,maxval=1.0)
			X_fake1 = g(x_input1)
			yy2 = d(X_fake1)
			

			with tf.GradientTape() as tape1:
				
				def safe_norm(x, e=1e-15, axis=None):
					return tf.sqrt(tf.reduce_sum( tf.square(x), axis=axis) + e)

				tape1.watch(d.trainable_variables)
				x_input2 = tf.random.uniform(shape=[size, latent_dim],minval=0.0,maxval=1.0)
				yy2_pre = g(x_input2)
				yy1 = d(yy2_pre)
				xx1 = d(X)
				
				d_loss_pre  = (safe_norm(yy1 - yy2, axis = 1) - safe_norm(yy1, axis = 1)) - (safe_norm(xx1 - yy2, axis = 1) - safe_norm(xx1, axis = 1))
				
				x_hat = epsilon * xx1 + (1 - epsilon) * yy1

				ddx_pre = tf.gradients(safe_norm(x_hat - yy2, axis = 1) - safe_norm(x_hat, axis = 1), x_hat)[0]
				
				ddx = tf.square(safe_norm(ddx_pre, axis=1) - 1.0) * scale


				d_loss = (d_loss_pre + ddx)*(1/size)

				#d_loss = d_loss_pre 
				
			grads_d = tape1.gradient(d_loss, d.trainable_variables)
			opt1.apply_gradients(list(zip(grads_d, d.trainable_variables)))

			return d_loss

		def step_2(X, latent_dim):

			size = X.shape[0]	
			xx1 = d(X)
			x_input1 = tf.random.uniform(shape=[size, latent_dim],minval=0.0,maxval=1.0)
			yy2_pre = g(x_input1)
			yy2 = d(yy2_pre)
			

			with tf.GradientTape() as tape2:
				def safe_norm(x, e=1e-15, axis=None):
					return tf.sqrt(tf.reduce_sum( tf.square(x) , axis=axis) + e)

				tape2.watch(g.trainable_variables)
				x_input2 = tf.random.uniform(shape=[size, latent_dim],minval=0.0,maxval=1.0)
				yy1_pre = g(x_input2)
				yy1 = d(yy1_pre)
				g_loss =  (safe_norm(xx1 - yy2, axis = 1) - safe_norm(xx1, axis = 1)) - (safe_norm(yy1 - yy2, axis = 1) - safe_norm(yy1, axis = 1))*(1/size) 
			
			grads_g = tape2.gradient(g_loss, g.trainable_variables)
			#print(grads_g)
			opt2.apply_gradients(list(zip(grads_g, g.trainable_variables)))
			return g_loss
		'''		
		def step_fn(X, latent_dim, epsilon):

			x = tf.split(X, num_or_size_splits=6, axis=0)
			scale = 10.0
			size = x[0].shape[0]

			for i in range(5):

				#x_input1 = tf.random.uniform(shape=[size, latent_dim],minval=0.0,maxval=1.0)
				#X_fake1 = g(x_input1)
				#yy2 = d(X_fake1)

				with tf.GradientTape(persistent=True) as tape1:
					def safe_norm(x, e=1e-15, axis=None):
						return tf.sqrt(tf.reduce_sum( tf.square(x), axis=axis) + e)

					#tape1.watch(d.trainable_variables)
					x_input1 = tf.random.uniform(shape=[size, latent_dim],minval=0.0,maxval=1.0)
					X_fake1 = g(x_input1)
					yy2 = d(X_fake1, training = False)
					x_input2 = tf.random.uniform(shape=[size, latent_dim],minval=0.0,maxval=1.0)
					yy2_pre = g(x_input2)
					yy1 = d(yy2_pre)
					xx1 = d(x[i])
					
					d_loss_pre  = (tf.norm(yy1 - yy2, axis = 1) - tf.norm(yy1, axis = 1)) - (tf.norm(xx1 - yy2, axis = 1) - tf.norm(xx1, axis = 1))
					
					x_hat = epsilon * xx1 + (1 - epsilon) * yy1

					with tf.GradientTape(persistent=True) as itape:
						itape.watch(x_hat)
						ddx_pre = (tf.norm(x_hat - yy2, axis = 1) - tf.norm(x_hat, axis = 1))
					#	ddx_pre = x_hat*x_hat
					ddx_pre2 = itape.gradient(ddx_pre, x_hat)
					#ddx_pre = tf.gradients((tf.norm(x_hat - yy2, axis = 1) - tf.norm(x_hat, axis = 1)) , x_hat)[0]
					
					ddx = tf.square(tf.norm(ddx_pre2, axis=1) - 1.0) * scale


					d_loss = (d_loss_pre + ddx)

					#d_loss = d_loss_pre 
					
				grads_d = tape1.gradient(d_loss, d.trainable_variables)
				d_op = opt1.apply_gradients(list(zip(grads_d, d.trainable_variables)))

			
			#xx1 = d(x[5])
			#x_input1 = tf.random.uniform(shape=[size, latent_dim],minval=0.0,maxval=1.0)
			#yy2_pre = g(x_input1)
			#yy2 = d(yy2_pre)

			with tf.GradientTape() as tape2:
				def safe_norm(x, e=1e-15, axis=None):
					return tf.sqrt(tf.reduce_sum( tf.square(x) , axis=axis) + e)

				#tape2.watch(g.trainable_variables)	
				xx1 = d(x[5])
				x_input1 = tf.random.uniform(shape=[size, latent_dim],minval=0.0,maxval=1.0)
				yy2_pre = g(x_input1, training = False)
				yy2 = d(yy2_pre)

				x_input2 = tf.random.uniform(shape=[size, latent_dim],minval=0.0,maxval=1.0)
				yy1_pre = g(x_input2)
				yy1 = d(yy1_pre)
				g_loss =  ((tf.norm(xx1 - yy2, axis = 1) - tf.norm(xx1, axis = 1)) - (tf.norm(yy1 - yy2, axis = 1) - tf.norm(yy1, axis = 1))) 
			
			grads_g = tape2.gradient(g_loss, g.trainable_variables)
			g_op = opt2.apply_gradients(list(zip(grads_g, g.trainable_variables)))

			return g_loss

		#x = tf.split(X, num_or_size_splits=6, axis=0)

		'''
		for i in range(5):
			d_per_example_losses = mirrored_strategy.experimental_run_v2(step_1, args=(x[i], l, epsilon))
			d_mean_loss = mirrored_strategy.reduce(tf.distribute.ReduceOp.MEAN, d_per_example_losses, axis=0)

		g_per_example_losses = mirrored_strategy.experimental_run_v2(step_2, args=(x[5], l))
		g_mean_loss = mirrored_strategy.reduce(tf.distribute.ReduceOp.MEAN, g_per_example_losses, axis=0)
		'''
		g_per_example_losses = mirrored_strategy.experimental_run_v2(step_fn, args=(X, l, epsilon))
		g_mean_loss = mirrored_strategy.reduce(tf.distribute.ReduceOp.MEAN, g_per_example_losses, axis=0)

		#for var in d.trainable_variables:
		#		var.assign(tf.clip_by_value(var, -0.01, 0.01))
		#for var in gan_model.trainable_variables:
				#var.assign(tf.clip_by_value(var, -0.1, 0.1))
		#d.trainable = False
		
		return g_mean_loss

	d = discriminator()
	g = generator(100)
	epsilon = tf.random.uniform([], 0.0, 1.0)
	#train_acc = tf.keras.metrics.BinaryAccuracy()
	
	dataset = tf.data.Dataset.from_tensor_slices( X_use ).batch(int(global_batch_size*6)).shuffle(1000).repeat(2400)
	dist_dataset = mirrored_strategy.experimental_distribute_dataset(dataset)
	opt1 = tf.keras.optimizers.Adam(lr=2e-4, beta_1=0.5, beta_2=0.9)
	opt2 = tf.keras.optimizers.Adam(lr=2e-4, beta_1=0.5, beta_2=0.9)

	#o1 = tf.compat.v1.train.AdamOptimizer.minimize(op1,lr=2e-4, beta_1=0.5, beta_2=0.9)
	#o2 = tf.compat.v1.train.AdamOptimizer.minimize(op2,lr=2e-4, beta_1=0.5, beta_2=0.9)

	#g.trainable = False	
	#j = 0
	for inputs in dist_dataset:
		#cv2.imwrite( str(j)+'.jpg', X[0, :, :, 0]*255)
		print(train_step(inputs ,100, epsilon))
		#if(j%500==0):
		#	X_re, _ = generate_fake_samples(g, 100, 64)
		#	dd = np.reshape(X_re[:,:,:,0], [8*X_re.shape[1], 8*X_re.shape[1]] )
		#	cv2.imwrite( str(j) +'.jpg', dd*255.0 )
		#j = j+1
		
n_samples = 25
X, _ = generate_fake_samples(g, 100, n_samples)

INFO:tensorflow:batch_all_reduce: 6 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:batch_all_reduce: 6 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:batch_all_reduce: 6 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:batch_all_reduce: 6 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:batch_all_reduce: 6 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:batch_all_reduce: 8 all-reduces with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/jo

ValueError: in converted code:

    <ipython-input-2-27bf35756c77>:215 train_step  *
        g_mean_loss = mirrored_strategy.reduce(tf.distribute.ReduceOp.MEAN, g_per_example_losses, axis=0)
    /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/distribute/distribute_lib.py:852 reduce
        denom = self._extended._reduce(reduce_util.ReduceOp.SUM, denom)  # pylint: disable=protected-access
    /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/distribute/distribute_lib.py:1436 _reduce
        device_util.current() or "/device:CPU:0"))[0]
    /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/distribute/mirrored_strategy.py:701 _reduce_to
        reduce_op, self._device_map, value, destinations)
    /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/distribute/cross_device_ops.py:102 reduce_non_distributed_value
        "the given reduce op %s." % (value, reduce_op))

    ValueError: A non-DistributedValues value 128 cannot be reduced with the given reduce op ReduceOp.SUM.


In [1]:
!python3 -c 'import tensorflow as tf; print(tf.__version__)'

2.0.0-rc0
