Commit: solution for assignment 2 and code for assignment 3
Showing 14 changed files with 365,132 additions and 1 deletion.

File: load_vgg_sol.py (new file, +112 lines; the name matches the import in the training script below)

""" Load VGGNet weights needed for the implementation in TensorFlow | ||
of the paper A Neural Algorithm of Artistic Style (Gatys et al., 2016) | ||
Created by Chip Huyen (chiphuyen@cs.stanford.edu) | ||
CS20: "TensorFlow for Deep Learning Research" | ||
cs20.stanford.edu | ||
For more details, please read the assignment handout: | ||
""" | ||
import numpy as np | ||
import scipy.io | ||
import tensorflow as tf | ||
|
||
import utils | ||
|
||
# VGG-19 parameters file | ||
VGG_DOWNLOAD_LINK = 'http://www.vlfeat.org/matconvnet/models/imagenet-vgg-verydeep-19.mat' | ||
VGG_FILENAME = 'imagenet-vgg-verydeep-19.mat' | ||
EXPECTED_BYTES = 534904783 | ||
|
||
class VGG(object): | ||
def __init__(self, input_img): | ||
utils.download(VGG_DOWNLOAD_LINK, VGG_FILENAME, EXPECTED_BYTES) | ||
self.vgg_layers = scipy.io.loadmat(VGG_FILENAME)['layers'] | ||
self.input_img = input_img | ||
self.mean_pixels = np.array([123.68, 116.779, 103.939]).reshape((1,1,1,3)) | ||
|
||
def _weights(self, layer_idx, expected_layer_name): | ||
""" Return the weights and biases at layer_idx already trained by VGG | ||
""" | ||
W = self.vgg_layers[0][layer_idx][0][0][2][0][0] | ||
b = self.vgg_layers[0][layer_idx][0][0][2][0][1] | ||
layer_name = self.vgg_layers[0][layer_idx][0][0][0][0] | ||
assert layer_name == expected_layer_name | ||
return W, b.reshape(b.size) | ||
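
    # For reference (assuming the MatConvNet VGG-19 file above): conv1_1
    # (layer_idx 0) yields W of shape (3, 3, 3, 64) and b reshaped to a
    # flat vector of 64 biases, one per filter.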

    def conv2d_relu(self, prev_layer, layer_idx, layer_name):
        """ Return the Conv2D layer with ReLU using the weights and
        biases from the VGG model at 'layer_idx'.
        Don't forget to apply ReLU to the output of the convolution.
        Inputs:
            prev_layer: the output tensor from the previous layer
            layer_idx: the index of the current layer in vgg_layers
            layer_name: the string name of the current layer,
                used to specify the variable_scope.
        Note that you first need to obtain W and b from the corresponding
        VGG layer using the function _weights() defined above.
        W and b returned from _weights() are numpy arrays, so you have
        to convert them to TF tensors. One way to do it is with tf.constant.
        Hint for choosing the stride size:
            for small images, you probably don't want to skip any pixels.
        """
        ###############################
        ## TO DO
        with tf.variable_scope(layer_name) as scope:
            W, b = self._weights(layer_idx, layer_name)
            W = tf.constant(W, name='weights')
            b = tf.constant(b, name='bias')
            conv2d = tf.nn.conv2d(prev_layer,
                                  filter=W,
                                  strides=[1, 1, 1, 1],
                                  padding='SAME')
            out = tf.nn.relu(conv2d + b)
        ###############################
        setattr(self, layer_name, out)

    def avgpool(self, prev_layer, layer_name):
        """ Return the average pooling layer. The paper suggests that
        average pooling works better than max pooling.
        Input:
            prev_layer: the output tensor from the previous layer
            layer_name: the string name you want to give the layer,
                used to specify the variable_scope.
        Hint for choosing ksize and strides: choose what you feel is appropriate.
        """
        ###############################
        ## TO DO
        with tf.variable_scope(layer_name):
            out = tf.nn.avg_pool(prev_layer,
                                 ksize=[1, 2, 2, 1],
                                 strides=[1, 2, 2, 1],
                                 padding='SAME')
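            # With ksize and strides of [1, 2, 2, 1], each pooling layer
            # halves the feature map's height and width, as in standard VGG.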
        ###############################
        setattr(self, layer_name, out)

    def load(self):
        self.conv2d_relu(self.input_img, 0, 'conv1_1')
        self.conv2d_relu(self.conv1_1, 2, 'conv1_2')
        self.avgpool(self.conv1_2, 'avgpool1')
        self.conv2d_relu(self.avgpool1, 5, 'conv2_1')
        self.conv2d_relu(self.conv2_1, 7, 'conv2_2')
        self.avgpool(self.conv2_2, 'avgpool2')
        self.conv2d_relu(self.avgpool2, 10, 'conv3_1')
        self.conv2d_relu(self.conv3_1, 12, 'conv3_2')
        self.conv2d_relu(self.conv3_2, 14, 'conv3_3')
        self.conv2d_relu(self.conv3_3, 16, 'conv3_4')
        self.avgpool(self.conv3_4, 'avgpool3')
        self.conv2d_relu(self.avgpool3, 19, 'conv4_1')
        self.conv2d_relu(self.conv4_1, 21, 'conv4_2')
        self.conv2d_relu(self.conv4_2, 23, 'conv4_3')
        self.conv2d_relu(self.conv4_3, 25, 'conv4_4')
        self.avgpool(self.conv4_4, 'avgpool4')
        self.conv2d_relu(self.avgpool4, 28, 'conv5_1')
        self.conv2d_relu(self.conv5_1, 30, 'conv5_2')
        self.conv2d_relu(self.conv5_2, 32, 'conv5_3')
        self.conv2d_relu(self.conv5_3, 34, 'conv5_4')
        self.avgpool(self.conv5_4, 'avgpool5')
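
A minimal usage sketch (a hypothetical snippet, assuming the file above is saved as load_vgg_sol.py, the module name the training script below imports; the input shape mirrors the 333x250 image used in that script's main block):

    import tensorflow as tf
    import load_vgg_sol

    # The input is a variable, not a placeholder, because in style transfer
    # the generated image itself is what gets trained.
    input_img = tf.get_variable('in_img', shape=[1, 250, 333, 3],
                                dtype=tf.float32,
                                initializer=tf.zeros_initializer())
    vgg = load_vgg_sol.VGG(input_img)
    vgg.load()
    content_features = vgg.conv4_2  # each layer's output becomes an attribute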

File: style transfer training script (new file, +244 lines; filename not preserved in this capture; it imports load_vgg_sol above)

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import time

import numpy as np
import tensorflow as tf

import load_vgg_sol
import utils

def setup():
    utils.safe_mkdir('checkpoints')
    # the saver in train() writes into this subdirectory
    utils.safe_mkdir('checkpoints/style_transfer')
    utils.safe_mkdir('outputs')

class StyleTransfer(object):
    def __init__(self, content_img, style_img, img_width, img_height):
        '''
        img_width and img_height are the dimensions we expect from the generated image.
        We will resize the input content image and input style image to match these dimensions.
        Feel free to alter any hyperparameter here and see how it affects your training.
        '''
        self.img_width = img_width
        self.img_height = img_height
        self.content_img = utils.get_resized_image(content_img, img_width, img_height)
        self.style_img = utils.get_resized_image(style_img, img_width, img_height)
        self.initial_img = utils.generate_noise_image(self.content_img, img_width, img_height)

        ###############################
        ## TO DO
        ## create global step (gstep) and hyperparameters for the model
        self.content_layer = 'conv4_2'
        self.style_layers = ['conv1_1', 'conv2_1', 'conv3_1', 'conv4_1', 'conv5_1']
        self.content_w = 0.01
        self.style_w = 1
        self.style_layer_w = [0.5, 1.0, 1.5, 3.0, 4.0]
        self.gstep = tf.Variable(0, dtype=tf.int32,
                                 trainable=False, name='global_step')
        self.lr = 2.0
        ###############################

    def create_input(self):
        '''
        We will use one input_img as a placeholder for the content image,
        style image, and generated image, because:
            1. they have the same dimensions
            2. we have to extract the same set of features from them
        We use a variable instead of a placeholder because we are, at the same time,
        training the generated image to get the desired result.
        Note: image height corresponds to the number of rows, not columns.
        '''
        with tf.variable_scope('input') as scope:
            self.input_img = tf.get_variable('in_img',
                                             shape=([1, self.img_height, self.img_width, 3]),
                                             dtype=tf.float32,
                                             initializer=tf.zeros_initializer())

    def load_vgg(self):
        '''
        Load the saved model parameters of VGG-19, using the input_img
        as the input to compute the output at each layer of VGG.
        During training, VGG-19 mean-centered all images and found the mean pixels
        to be [123.68, 116.779, 103.939] along the RGB dimensions. We have to subtract
        this mean from our images.
        '''
        self.vgg = load_vgg_sol.VGG(self.input_img)
        self.vgg.load()
        self.content_img -= self.vgg.mean_pixels
        self.style_img -= self.vgg.mean_pixels

    def _content_loss(self, P, F):
        ''' Calculate the loss between the feature representations of the
        content image and the generated image.
        Inputs:
            P: content representation of the content image
            F: content representation of the generated image
        Read the assignment handout for more details.
        Note: Don't use the coefficient 0.5 as defined in the paper.
        Use the coefficient defined in the assignment handout.
        '''
        ###############################
        ## TO DO
        self.content_loss = tf.reduce_sum((F - P) ** 2) / (4.0 * P.size)
        ###############################
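        # Note: in losses() below, P is a numpy array evaluated once with
        # sess.run, so P.size is a plain Python int; F stays a symbolic tensor.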

    def _gram_matrix(self, F, N, M):
        """ Create and return the Gram matrix for tensor F
        Hint: you'll first have to reshape F
        """
        ###############################
        ## TO DO
        F = tf.reshape(F, (M, N))
        return tf.matmul(tf.transpose(F), F)
        ###############################
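        # Shape check: a feature map of shape (1, H, W, N) reshaped to (M, N)
        # with M = H * W gives a Gram matrix F^T F of shape (N, N), one entry
        # per pair of filters.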

    def _single_style_loss(self, a, g):
        """ Calculate the style loss at a certain layer
        Inputs:
            a is the feature representation of the style image at that layer
            g is the feature representation of the generated image at that layer
        Output:
            the style loss at that layer (which is E_l in the paper)
        Hint: 1. you'll have to use the function _gram_matrix()
              2. we'll use the same coefficient for style loss as in the paper
              3. a and g are feature representations, not Gram matrices
        """
        ###############################
        ## TO DO
        N = a.shape[3]               # number of filters
        M = a.shape[1] * a.shape[2]  # height times width of the feature map
        A = self._gram_matrix(a, N, M)
        G = self._gram_matrix(g, N, M)
        return tf.reduce_sum((G - A) ** 2 / ((2 * N * M) ** 2))
        ###############################
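        # (2 * N * M) ** 2 equals 4 * N**2 * M**2, i.e. the 1 / (4 N_l^2 M_l^2)
        # coefficient of E_l in Gatys et al. (2016).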

    def _style_loss(self, A):
        """ Calculate the total style loss as a weighted sum
        of the style losses at all style layers
        Hint: you'll have to use _single_style_loss()
        """
        n_layers = len(A)
        E = [self._single_style_loss(A[i], getattr(self.vgg, self.style_layers[i]))
             for i in range(n_layers)]

        ###############################
        ## TO DO
        self.style_loss = sum([self.style_layer_w[i] * E[i] for i in range(n_layers)])
        ###############################

    def losses(self):
        with tf.variable_scope('losses') as scope:
            with tf.Session() as sess:
                # assign the content image to the input variable
                sess.run(self.input_img.assign(self.content_img))
                gen_img_content = getattr(self.vgg, self.content_layer)
                content_img_content = sess.run(gen_img_content)
            self._content_loss(content_img_content, gen_img_content)

            with tf.Session() as sess:
                sess.run(self.input_img.assign(self.style_img))
                style_layers = sess.run([getattr(self.vgg, layer) for layer in self.style_layers])
            self._style_loss(style_layers)

            ##########################################
            ## TO DO: create total loss.
            ## Hint: don't forget the weights for the content loss and style loss
            self.total_loss = self.content_w * self.content_loss + self.style_w * self.style_loss
            ##########################################

    def optimize(self):
        ###############################
        ## TO DO: create optimizer
        self.opt = tf.train.AdamOptimizer(self.lr).minimize(self.total_loss,
                                                            global_step=self.gstep)
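        # Only self.input_img is trainable here: the VGG weights were loaded
        # as tf.constant and gstep has trainable=False, so Adam updates the
        # generated image itself.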
        ###############################

    def create_summary(self):
        ###############################
        ## TO DO: create summaries for all the losses
        ## Hint: don't forget to merge them
        with tf.name_scope('summaries'):
            tf.summary.scalar('content_loss', self.content_loss)
            tf.summary.scalar('style_loss', self.style_loss)
            tf.summary.scalar('total_loss', self.total_loss)
            self.summary_op = tf.summary.merge_all()
        ###############################

    def build(self):
        self.create_input()
        self.load_vgg()
        self.losses()
        self.optimize()
        self.create_summary()

    def train(self, n_iters):
        skip_step = 1
        with tf.Session() as sess:
            ###############################
            ## TO DO:
            ## 1. initialize your variables
            ## 2. create a writer to write your graph
            sess.run(tf.global_variables_initializer())
            writer = tf.summary.FileWriter('graphs/style_transfer', sess.graph)
            ###############################
            sess.run(self.input_img.assign(self.initial_img))

            ###############################
            ## TO DO:
            ## 1. create a saver object
            ## 2. check if a checkpoint exists, and restore the variables if so
            saver = tf.train.Saver()
            ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/style_transfer/checkpoint'))
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
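                # Restoring also brings back the global step, so training
                # resumes where the checkpoint left off (see initial_step below).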
            ##############################

            initial_step = self.gstep.eval()

            start_time = time.time()
            for index in range(initial_step, n_iters):
                if index >= 5 and index < 20:
                    skip_step = 10
                elif index >= 20:
                    skip_step = 20

                sess.run(self.opt)
                if (index + 1) % skip_step == 0:
                    ###############################
                    ## TO DO: obtain generated image, loss, and summary
                    gen_image, total_loss, summary = sess.run([self.input_img,
                                                               self.total_loss,
                                                               self.summary_op])
                    ###############################

                    # add back the mean pixels we subtracted before
                    gen_image = gen_image + self.vgg.mean_pixels
                    writer.add_summary(summary, global_step=index)
                    print('Step {}\n Sum: {:5.1f}'.format(index + 1, np.sum(gen_image)))
                    print(' Loss: {:5.1f}'.format(total_loss))
                    print(' Took: {} seconds'.format(time.time() - start_time))
                    start_time = time.time()

                    filename = 'outputs/%d.png' % (index)
                    utils.save_image(filename, gen_image)

                    if (index + 1) % 20 == 0:
                        ###############################
                        ## TO DO: save the variables into a checkpoint
                        saver.save(sess, 'checkpoints/style_transfer/style_transfer', index)
                        ###############################

if __name__ == '__main__':
    setup()
    machine = StyleTransfer('content/deadpool.jpg', 'styles/guernica.jpg', 333, 250)
    machine.build()
    machine.train(300)
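
To watch the loss summaries during training, point TensorBoard at the directory the FileWriter above writes to:

    tensorboard --logdir graphs/style_transfer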