In [1]:
import torch
import torchvision
from torchvision import transforms, datasets
from torch.utils.data import Dataset, DataLoader
import os
from skimage import io
from skimage.transform import resize, downscale_local_mean
import numpy as np
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init

from Utilities.data import DataWrapper
from Models.SRGanGenerator import Generator


ImportError: No module named 'Utilities'

In [2]:
# Sanity check: nn.AdaptiveAvgPool2d(1) must collapse each (1, 64, 127) sample
# to its plain arithmetic mean, even for a non-square spatial size.
m = nn.AdaptiveAvgPool2d(1)
a = np.random.random((32,1,64,127))

# Wrap the tensor in a Variable, exactly like the Conv2d check in the following
# cell does, so the cell is consistent with the torch 0.3.x code paths that
# this notebook still carries (on newer releases Variable is a no-op wrapper).
myinput = Variable(torch.from_numpy(a))
output = m(myinput)

mean1 = output.view(-1).data.numpy()        # pooled value per sample, shape (32,)
mean2 = np.mean(a.reshape(32,-1),axis=1)    # reference mean computed by NumPy

# Run the comparison instead of leaving it commented out; the tolerance is the
# 1e-4 threshold used by the original (disabled) check.
np.testing.assert_allclose(mean1, mean2, rtol=0, atol=1e-4)

In [3]:
# Shape experiment for a "dense layer written as a convolution":
# a 6x6 kernel without padding slides over a 7x7 map, so every sample yields
# a 2x2 grid of 1024-channel responses.
m = nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=6, padding=0, bias=True)
a = np.random.random(size=(32, 512, 7, 7))

# np.random.random produces float64; cast to float32 before feeding the layer.
myinput = Variable(torch.from_numpy(a).float())
output = m(myinput)

# Flat NumPy view of all activations (32 * 1024 * 2 * 2 values).
myout = output.view(-1).data.numpy()

In [4]:
class DiscMidLego(nn.Module):
    """Single discriminator "lego": Conv2d -> BatchNorm2d -> LeakyReLU.

    Args:
        inp_channels (int): Number of channels of the incoming feature map.
        n (int): Number of channels produced by the convolution.
        k (int): Kernel size of the convolution; the padding is ``int(k / 2)``.
        s (int): Stride of the convolution.
    """
    def __init__(self, inp_channels=64, n=64, k=3, s=1):
        super(DiscMidLego, self).__init__()
        # Registration order (conv, bn, lrelu) is kept so parameter and
        # state_dict ordering stay the same.
        self.conv = nn.Conv2d(inp_channels, n,
                              kernel_size=k,
                              stride=s,
                              padding=int(k / 2),
                              bias=True)
        self.bn = nn.BatchNorm2d(n)
        self.lrelu = nn.LeakyReLU()

    def forward(self,x):
        """Apply convolution, batch normalisation and LeakyReLU to ``x``."""
        features = self.conv(x)
        normalised = self.bn(features)
        return self.lrelu(normalised)

In [5]:
class DiscMidBlocks(nn.Module):
    """Convolutional trunk of the discriminator.

    Layout (all convolutions use kernel size ``k`` and padding ``int(k/2)``):
        - a plain 3 -> ``init_ch`` convolution (stride 1) followed by LeakyReLU,
        - one ``DiscMidLego`` keeping ``init_ch`` channels with stride 2,
        - for every further block ``b`` in ``1 .. B-1``: a stride-1 lego that
          goes from ``init_ch * 2**(b-1)`` to ``init_ch * 2**b`` channels and
          a stride-2 lego that keeps that width.

    The trunk therefore contains ``B`` stride-2 convolutions and ends with
    ``init_ch * 2**(B-1)`` channels.

    Args:
        init_ch (int): Number of channels after the first convolution.
        B (int): Number of blocks; must be at least 1.
        k (int): Kernel size shared by all convolutions of the trunk.
    """
    def __init__(self, init_ch=64, B=4, k=3):
        super(DiscMidBlocks, self).__init__()

        assert B>=1

        # Honour the caller's ``init_ch`` (it used to be overwritten with 64)
        # and size the width table from ``B`` so that B > 4 does not run off
        # the end of the list.
        ch_list=[init_ch*(2**i) for i in range(B)]
        p=int(k/2)
        self.first_conv = nn.Conv2d(3, init_ch, kernel_size=k, padding=p, stride=1, bias=True)
        self.first_lrelu = nn.LeakyReLU()

        # Sub-module names are kept unchanged so existing state_dicts still load.
        self.building_Blocks = nn.Sequential()
        self.building_Blocks.add_module('Middle_Block_Second_0', DiscMidLego(inp_channels=init_ch, n=init_ch, k=k, s=2))

        for b in range(1,B):
            self.building_Blocks.add_module('Middle_Block_First_' + str(b) ,
                                            DiscMidLego(inp_channels=ch_list[b-1], n=ch_list[b], k=k, s=1))
            self.building_Blocks.add_module('Middle_Block_Second_' + str(b) ,
                                            DiscMidLego(inp_channels=ch_list[b]  , n=ch_list[b], k=k, s=2))


    def forward(self,x):
        """Run ``x`` (3-channel input) through the first convolution and all blocks."""
        y = self.first_conv(x)
        z = self.first_lrelu(y)
        # Call the module itself rather than ``.forward`` so registered hooks run.
        out = self.building_Blocks(z)
        return out

In [6]:
class Discriminator(nn.Module):
    """Discriminator Network

    A convolutional trunk (``DiscMidBlocks``) followed by "dense" layers that
    are implemented as convolutions, so inputs of any size can be processed.

    Args:
        init_ch_expansion (int): The number of output channels of the first
            convolution. It is 64 in the original paper, which is the default here.
        B (int): The number of blocks at the beginning of the network.
            The default value is 4, which is the same as the original paper.
            We call each pair of legos a "block".
            In the original paper:
                - There is one pair of legos (i.e. one block) at the very beginning
                    (it is special, since its first lego does not have BN).
                - Afterwards, there are 3 pairs of legos (i.e. 3 blocks).
                    Each of them doubles the number of channels, and every
                    block halves the spatial dimensions.
        k (int): The kernel size for the convolution layers in the first blocks.

        fcn_kernel (int): Kernel size of the first "dense" layer. The dense
            layers of the original paper were substituted by convolution
            layers with the same functionality.

            This was done so that an input image of arbitrary size can be fed to the network.
            Example: in order to create a dense layer with 1024 output neurons, suppose we have:
                - An original input image size of 96*96.
                - This means that the output of the first blocks is 512 channels, each with a size of 6*6.
                - Then, in order to have a completely dense layer, we set the convolutional kernel
                    size equal to 6 and produce 1024 output channels.

            For a completely dense layer, make sure the following condition holds:

                fcn_kernel = input_image_dim / (2^B)

            For instance: 6 = 96 / (2^4)

            Inputs larger than that still go through the network, but the
            result is then a spatial map of scores rather than one score per
            image (with the defaults a 97*97 input gives a 2*2 map). This
            module does NOT average that map; callers that need a single
            value per image have to pool it themselves.

        dense_nuerons (list of ints): Number of neurons of each dense layer; the
            number of layers is the length of this list and must be at least one.
            (Default is [1024], as in the original paper.)
            For example: [1024] :::: 512 Channels -> 1024 Channels (i.e. Neurons) -> 1 Output Channel (i.e. Neuron)
            For example: [1024, 256] :::: 512 Channels -> 1024 Channels (i.e. Neurons)
                                          -> 256 Channels (i.e. Neurons) -> 1 Output Channel (i.e. Neuron)

    Returns (from ``forward``):
        Tensor of shape (N, 1, H', W') holding raw scores; no sigmoid is applied.
    """
    def __init__(self, init_ch_expansion=64, B=4, k=3, fcn_kernel=6, dense_nuerons=[1024]):
        super(Discriminator, self).__init__()

        # Work on a private copy so the shared default list can never be
        # aliased or mutated through this instance.
        dense_nuerons = list(dense_nuerons)
        assert len(dense_nuerons)>0

        self.midblocks = DiscMidBlocks(init_ch=init_ch_expansion, B=B, k=k)
        # Channel count produced by the trunk (doubles in each block after the first).
        self.pre_fcn_channels = init_ch_expansion * (2**(B-1))

        curr_ch = self.pre_fcn_channels
        curr_ker = fcn_kernel
        self.fcn = nn.Sequential()

        for i,layer_ch in enumerate(dense_nuerons):
            self.fcn.add_module('FCN_layer_' + str(i) , nn.Conv2d(curr_ch, layer_ch, padding = 0,
                                                                  kernel_size = curr_ker, bias=True))
            self.fcn.add_module('FCN_layer_lrelu_' + str(i) , nn.LeakyReLU())

            curr_ch = layer_ch
            # Only the first dense layer spans the spatial window; the
            # following ones are 1x1 convolutions.
            curr_ker = 1

        # curr_ch now equals the width of the last dense layer.
        self.fcn.add_module('FCN_Final_layer', nn.Conv2d(curr_ch, 1, padding = 0,
                                                         kernel_size = 1, bias = True))

    def forward(self,x):
        """Return the raw score map for the image batch ``x``."""
        y = self.midblocks(x)
        # Call the module itself rather than ``.forward`` so registered hooks run.
        out = self.fcn(y)
        return out

In [7]:
if torch.__version__.startswith('0.3'):
    from torch.nn.modules.module import Module

    class BCEWithLogitsLoss(Module):
        r"""Back-port of ``nn.BCEWithLogitsLoss`` for torch 0.3.x that also
        accepts a ``reduce`` argument.

        The criterion fuses a `Sigmoid` layer and `BCELoss`, delegating to
        ``F.binary_cross_entropy_with_logits``. For logits `o` and targets `t`
        (numbers between 0 and 1) the loss is:

        .. math:: loss(o, t) = - 1/n \sum_i (t[i] * log(sigmoid(o[i])) + (1 - t[i]) * log(1 - sigmoid(o[i])))

        and, when a weight is given:

        .. math:: loss(o, t) = - 1/n \sum_i weight[i] * (t[i] * log(sigmoid(o[i])) + (1 - t[i]) * log(1 - sigmoid(o[i])))

        Args:
            weight (Tensor, optional): a manual rescaling weight given to the
                loss of each batch element. If given, has to be a Tensor of
                size "nbatch". It is stored as a buffer.
            size_average (bool, optional): forwarded to the functional; when
                ``True`` the losses are averaged over observations, otherwise
                they are summed. Default: ``True``
            reduce (bool, optional): when ``True`` the value returned by the
                functional is additionally passed through ``.sum()``.
                Default: ``True``

        Shape:
            - Input: :math:`(N, *)` where `*` means, any number of additional
              dimensions
            - Target: :math:`(N, *)`, same shape as the input
        """
        def __init__(self, weight=None, size_average=True, reduce=True):
            super(BCEWithLogitsLoss, self).__init__()
            self.register_buffer('weight', weight)
            self.reduce = reduce
            self.size_average = size_average

        def forward(self, input, target):
            # The buffer is a plain tensor; wrap it only when one was supplied.
            weight = None if self.weight is None else Variable(self.weight)
            loss = F.binary_cross_entropy_with_logits(input, target, weight, self.size_average)
            return loss.sum() if self.reduce else loss
else:
    # Newer releases ship a criterion with the same constructor arguments.
    BCEWithLogitsLoss = nn.BCEWithLogitsLoss

In [8]:
if torch.__version__.startswith('0.3'):
    from torch.nn.modules.module import Module

    def _assert_no_grad(variable):
        """Fail if ``variable`` (a loss target) requires gradients."""
        assert not variable.requires_grad, \
            "nn criterions don't compute the gradient w.r.t. targets - please " \
            "mark these variables as volatile or not requiring gradients"

    class _Loss(Module):
        """Base class that just records ``size_average`` and ``reduce``."""
        def __init__(self, size_average=True, reduce=True):
            super(_Loss, self).__init__()
            self.reduce = reduce
            self.size_average = size_average

    class SoftMarginLoss(_Loss):
        r"""Back-port of ``nn.SoftMarginLoss`` for torch 0.3.x that also
        accepts a ``reduce`` constructor argument.

        Two-class logistic loss between input `x` (a 2D mini-batch Tensor)
        and target `y` (a tensor containing either `1` or `-1`)::

            loss(x, y) = sum_i (log(1 + exp(-y[i]*x[i]))) / x.nelement()

        The division by the number of elements is skipped when
        ``size_average`` is ``False``. When ``reduce`` is ``True`` the value
        returned by the functional is additionally passed through ``.sum()``.
        """
        def forward(self, input, target, reduce=True):
            # The ``reduce`` argument of this method is not consulted; the
            # flag stored by the constructor decides.
            _assert_no_grad(target)
            loss = F.soft_margin_loss(input, target, size_average=self.size_average)
            return loss.sum() if self.reduce else loss
else:
    # Newer releases ship a criterion with the same constructor arguments.
    SoftMarginLoss = nn.SoftMarginLoss

In [9]:
def conv_init(m):
    """Initialise one module in place; meant to be used via ``net.apply(conv_init)``.

    Modules are selected by class name:
        - names containing ``'Conv'``: Xavier-uniform weights with gain
          ``sqrt(2)`` and zero bias,
        - names containing ``'BatchNorm'``: weight 1 and bias 0,
        - anything else is left untouched.

    A missing weight or bias (e.g. ``bias=False`` convolutions or
    ``affine=False`` batch norm) is skipped instead of raising.

    Args:
        m (nn.Module): the module to initialise.
    """
    # The in-place initialisers carry a trailing underscore since torch 0.4;
    # the un-suffixed names are deprecated there but are the only ones
    # available on torch 0.3.x, which this notebook still supports.
    xavier_uniform = getattr(init, 'xavier_uniform_', None) or init.xavier_uniform
    constant = getattr(init, 'constant_', None) or init.constant

    classname = m.__class__.__name__
    weight = getattr(m, 'weight', None)
    bias = getattr(m, 'bias', None)

    if classname.find('Conv') != -1:
        if weight is not None:
            xavier_uniform(weight, gain=np.sqrt(2))
        if bias is not None:
            constant(bias, 0)
    elif classname.find('BatchNorm') != -1:
        if weight is not None:
            constant(weight, 1)
        if bias is not None:
            constant(bias, 0)

def count_parameters(model):
    """Return the total number of elements in ``model``'s trainable parameters.

    Parameters whose ``requires_grad`` flag is off are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Use the GPU code paths only when CUDA is actually usable, so the cells that
# call .cuda() do not fail on CPU-only machines. Set to False by hand to force
# a CPU run on a GPU host.
IS_GPU = torch.cuda.is_available()

In [20]:

    
# Instantiate the discriminator defined above with its default hyper-parameters.
net = Discriminator(init_ch_expansion=64, B=4, k=3, fcn_kernel=6, dense_nuerons=[1024])
# Generator variant, kept for reference:
#net = Generator(first_stage_hyperparams={'k':9, 'n':64, 's':1}, 
#                 residual_blocks_hyperparams={'k':3, 'n':64, 's':1, 'B':16}, 
#                 upsample_blocks_hyperparams={'k':3, 'n':256, 's':1, 'B':2, 'f':2}, 
#                 last_stage_hyperparams={'k':9, 's':1} )

# Re-initialise every sub-module with conv_init.
net.apply(conv_init)

# For training on GPU, both the network and the data have to live on the GPU:
# http://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#training-on-gpu
if IS_GPU:
    import torch.backends.cudnn as cudnn
    net = net.cuda()
    # Replicate the model over every visible CUDA device.
    net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
    cudnn.benchmark = True


# Hand-derived parameter count for the Generator, kept for reference:
#a =  (81*3 + 1 ) *64 +64
#a += ( (9*64 + 1)*64*2 + 5*64)*16 + (9*64 + 1)*64 + 2*64
#a += ((9*64 + 1)*256 + 64)*2
#a += (81*64+1)*3

# Hand-derived parameter count for the Discriminator, to cross-check the model.
# First 3x3 convolution: 3 -> 64 channels, weights plus bias, no batch norm.
a = (9 * 3 + 1) * 64
# Seven 3x3 legos: convolution (weights + bias) plus 2 batch-norm parameters
# per output channel.
a += sum((9 * c_in + 1) * c_out + 2 * c_out
         for c_in, c_out in ((64, 64),
                             (64, 128), (128, 128),
                             (128, 256), (256, 256),
                             (256, 512), (512, 512)))
# "Dense" layer: 6x6 convolution from 512 to 1024 channels.
a += (6 * 6 * 512 + 1) * 1024
# Final 1x1 convolution from 1024 channels to a single output.
a += 1024 + 1
print(count_parameters(net))
print(a)


23565505
23565505


In [24]:
########################################################################
# 3. Define a Loss function and optimizer
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# The network is trained with a soft-margin (logistic) loss applied to its
# raw scores, with size_average=False so the per-element losses are not
# averaged, and optimised with Adam.

import torch.optim as optim

# Other criteria that were tried:
#criterion = nn.CrossEntropyLoss()
#criterion = nn.MSELoss()
#criterion = nn.BCEWithLogitsLoss(size_average=True)
criterion = SoftMarginLoss(reduce=True, size_average=False)

# The learning rate still needs tuning.
# SGD with momentum is the alternative that was considered:
#optimizer = optim.SGD(net.parameters(), lr=0.005, momentum=0.9)
optimizer = optim.Adam(params=net.parameters(), lr=0.005)

In [25]:
# Build the project's data wrapper (imported from Utilities.data); later cells
# read `training.dataset` and `training.loader` from it.
training=DataWrapper()
# Smoke test: fetch only the first item of the dataset and stop. The loop
# variables `i_batch` and `sample_batch` stay defined afterwards, so the item
# can be inspected interactively (uncomment the print below).
for i_batch , sample_batch in enumerate(training.dataset):
    #print((sample_batch))
    break

In [26]:
EPOCHS=1
# Start every run of this cell from freshly initialised weights.
net.apply(conv_init)
for epoch in range(EPOCHS):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(training.loader, 0):
        # Only the 'High' entry of the batch is fed to the network
        # (presumably the high-resolution images -- confirm in DataWrapper).
        inputs = data['High']

        if IS_GPU:
            inputs = inputs.cuda()

        # wrap them in Variable (required on torch 0.3.x, harmless afterwards)
        inputs = Variable(inputs)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        # Every score is trained towards the +1 class, so the target is an
        # all-ones tensor shaped like the network output.
        labels = torch.ones_like(outputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        # NumPy re-computation of the summed soft-margin loss,
        # sum(log(1 + exp(-y * x))), to cross-check the criterion.
        # logaddexp(0, -z) equals -log(sigmoid(z)) but does not overflow for
        # large negative margins.
        margins = outputs.cpu().data.numpy() * labels.cpu().data.numpy()
        print(np.sum(np.logaddexp(0, -margins)))

        # Tensor.item() exists from torch 0.4 on, where indexing a 0-dim loss
        # with [0] is no longer supported; torch 0.3.x only offers .data[0].
        running_loss += loss.item() if hasattr(loss, 'item') else loss.data[0]

        print(running_loss)
        # Debug run: stop after the first batch.
        break

47.957386
47.9573860168457
