In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from unet import UNet
from model import  Nowcast
from GRU import RadarNet
from dataloader import DataSet
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import MultiStepLR
from tensorboardX import SummaryWriter
from torch.autograd import Variable
import os
from loss import ComLoss
from dataloader import get_iter_dali
import time
from unet import UNet

class UNet(nn.Module):
    """U-Net style encoder/decoder mapping a 4-channel image to a 1-channel map.

    The encoder halves the spatial size four times (64 -> 128 -> 256 -> 512 ->
    1024 channels); the decoder upsamples back to the input resolution,
    concatenating the matching encoder feature map at every level.

    NOTE: this class shadows the ``UNet`` name imported from ``unet`` at the
    top of the notebook.

    Fixes relative to the previous revision:
      * ``up4`` receives 128 (upsampled) + 64 (skip) = 192 channels, not 384,
        and ``outConv`` receives the 64 channels produced by ``up4``, not 128.
        The old numbers made ``forward`` fail with a channel-mismatch error.
      * Dropout is a registered sub-module, so ``model.eval()`` disables it.
        Building ``nn.Dropout`` inside ``forward`` left it permanently in
        training mode.
    """

    def __init__(self):
        super(UNet, self).__init__()
        # Encoder.
        self.inc = DoubleConv(4, 64)
        self.down1 = DownSample(64, 128)
        self.down2 = DownSample(128, 256)
        self.down3 = DownSample(256, 512)
        self.down4 = DownSample(512, 1024)

        # Applied to the two deepest encoder feature maps.
        self.dropout = nn.Dropout(0.5)

        # Decoder: in_channels = upsampled channels + skip-connection channels.
        self.up1 = UpSample(1024 + 512, 512)
        self.up2 = UpSample(512 + 256, 256)
        self.up3 = UpSample(256 + 128, 128)
        self.up4 = UpSample(128 + 64, 64)

        # Output head: 3x3 conv down to 2 channels, then 1x1 conv to 1 channel.
        self.outConv = OutConv(64, 2)
        self.regout = nn.Conv2d(2, 1, 1)

    def forward(self, x):
        """Run the network.

        Shape comments below assume an input of (B, 4, 640, 800).

        Args:
            x: input tensor with 4 channels, laid out as (B, 4, H, W).

        Returns:
            Tensor of shape (B, 1, H, W).
        """
        x1 = self.inc(x)        # (B,   64, 640, 800)
        x2 = self.down1(x1)     # (B,  128, 320, 400)
        x3 = self.down2(x2)     # (B,  256, 160, 200)
        x4 = self.down3(x3)     # (B,  512,  80, 100)
        x4 = self.dropout(x4)
        x5 = self.down4(x4)     # (B, 1024,  40,  50)
        x5 = self.dropout(x5)

        out = self.up1(x5, x4)  # (B, 512,  80, 100)
        out = self.up2(out, x3) # (B, 256, 160, 200)
        out = self.up3(out, x2) # (B, 128, 320, 400)
        out = self.up4(out, x1) # (B,  64, 640, 800)
        out = self.outConv(out) # (B,   2, 640, 800)
        out = self.regout(out)  # (B,   1, 640, 800)

        return out
        
        
class DoubleConv(nn.Module):
    """Two 3x3 convolutions (stride 1, padding 1), each followed by an in-place ReLU.

    Spatial size is preserved; only the channel count changes
    (``in_channels`` -> ``out_channels`` -> ``out_channels``).
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        stages = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
        ]
        self.doubleConv = nn.Sequential(*stages)

    def forward(self, x):
        """Apply both conv+ReLU stages to ``x`` and return the result."""
        return self.doubleConv(x)
    
class DownSample(nn.Module):
    """Encoder stage: 2x2 max-pooling followed by a ``DoubleConv``."""

    def __init__(self, in_channels, out_channels):
        super(DownSample, self).__init__()
        halve = nn.MaxPool2d(2)
        refine = DoubleConv(in_channels, out_channels)
        # Pool first so the convolutions run on the reduced resolution.
        self.maxpool_conv = nn.Sequential(halve, refine)

    def forward(self, x):
        """Return ``x`` after pooling and the double convolution."""
        pooled_and_convolved = self.maxpool_conv(x)
        return pooled_and_convolved
    
class UpSample(nn.Module):
    """Decoder stage: bilinear 2x upsampling, skip concatenation, ``DoubleConv``.

    ``in_channels`` must equal the channel count of the upsampled tensor plus
    that of the skip tensor, because the two are concatenated before the
    convolution.
    """

    def __init__(self, in_channels, out_channels):
        super(UpSample, self).__init__()
        self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        self.conv = DoubleConv(in_channels, out_channels)

    def forward(self, x1, x2):
        """Upsample ``x1``, pad it to ``x2``'s spatial size, concatenate, convolve.

        Args:
            x1: coarser feature map coming from the layer below.
            x2: skip-connection feature map from the encoder.
        """
        upsampled = self.up(x1)

        # Size differences appear when a pooled dimension was odd; split the
        # padding with the extra pixel going to the bottom/right side.
        gap_h = x2.shape[2] - upsampled.shape[2]
        gap_w = x2.shape[3] - upsampled.shape[3]
        left, top = gap_w // 2, gap_h // 2
        upsampled = F.pad(upsampled, [left, gap_w - left, top, gap_h - top])

        # Skip features come first along the channel axis.
        merged = torch.cat([x2, upsampled], dim=1)
        return self.conv(merged)
    
class OutConv(nn.Module):
    """Output head: a single 3x3 convolution (stride 1, padding 1), no activation."""

    def __init__(self, in_channels, out_channels):
        super(OutConv, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Project ``x`` to ``out_channels`` channels at unchanged resolution."""
        projected = self.conv(x)
        return projected

In [2]:
def num_params(net):
    """Print and return the total number of parameters of ``net``.

    Args:
        net: any object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).

    Returns:
        int: sum of ``numel()`` over all parameters. Earlier revisions only
        printed the value; returning it as well lets callers reuse it.
    """
    # Use a distinct local name; the old accumulator shadowed the function itself.
    total = sum(param.numel() for param in net.parameters())
    print('Total number of parameters: %d' % total)
    return total

In [3]:
def init_weights(m):
    """Initialise ``nn.Conv2d`` layers; intended for ``model.apply(init_weights)``.

    Conv weights get He (Kaiming) normal initialisation, which suits the ReLU
    activations used throughout the network, and biases are zeroed. Modules
    that are not ``nn.Conv2d`` are left untouched.

    The previous version never did anything: it searched the class name for
    ``'conv2d'`` (the class is ``Conv2d``, so the test never matched) and
    called the non-existent ``Tensor.uniform``. Its intended U(0, 1) weights
    are all positive, so activations would grow by orders of magnitude per
    layer and overflow in a network this deep; He initialisation is used
    instead.
    """
    if not isinstance(m, nn.Conv2d):
        return

    nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
    # Conv layers may be built with bias=False.
    if m.bias is not None:
        nn.init.zeros_(m.bias)

# When True, the training cell moves the model, loss and batches to CUDA.
use_gpu = True

# Event identifiers passed to DataSet(event=...); they look like YYYYMMDD dates.
EVENTS = [
    '20170604',
    '20170624',
    '20170709',
    '20170922',
    '20180704',
    '20181207',
    '20190507',
    '20190522',
    '20190823',
    '20190919',
]

# NOTE(review): this repeats the `num_params` definition from the previous
# cell; this later definition is the one in effect once the cell has run.
def num_params(net):
    """Print and return the total number of parameters of ``net``.

    Args:
        net: any object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).

    Returns:
        int: sum of ``numel()`` over all parameters. Earlier revisions only
        printed the value; returning it as well lets callers reuse it.
    """
    # Use a distinct local name; the old accumulator shadowed the function itself.
    total = sum(param.numel() for param in net.parameters())
    print('Total number of parameters: %d' % total)
    return total

def normalizer(x):
    '''Squash input values with a log-then-tanh transform.

    Computes ``tanh(log10(x + 0.01) / 4)`` element-wise, so any tensor shape
    is accepted (the original note describes inputs of size (b,tsize,c,m,n)).
    The 0.01 offset keeps the logarithm finite at x == 0.

    See Sonderby et al. 2020, "MetNet: A Neural Weather Model for
    Precipitation Forecasting".
    '''
    return torch.tanh(torch.log10(x + 0.01) / 4)

def denormalizer(x):
    '''Invert ``normalizer``: map ``tanh(log10(v + 0.01) / 4)`` back to ``v``.

    The forward transform is y = tanh(log10(v + 0.01) / 4), hence
    v = 10 ** (4 * atanh(y)) - 0.01. The previous implementation returned
    ``exp(4 * y) - 0.01``, which undoes neither the tanh nor the base-10
    logarithm and therefore was not an inverse.

    Values are clamped just inside (-1, 1) before ``atanh`` so inputs at or
    beyond the tanh range (e.g. raw network outputs) give large-but-finite
    results instead of inf/NaN.
    '''
    eps = 1e-6
    bounded = torch.clamp(x, -1 + eps, 1 - eps)
    return torch.pow(10.0, 4 * torch.atanh(bounded)) - 0.01



# ---------------------------------------------------------------------------
# Training configuration
# ---------------------------------------------------------------------------
# NOTE: the recorded run of this cell ended with "CUDA out of memory" on a
# ~8 GiB GPU; lower batch_size (or the input resolution) if that recurs.
batch_size = 4
lr = 1e-3
logging_path = 'logging/'
num_epoches = 500
epoch_to_save = 10

# torch.save does not create missing directories.
os.makedirs(logging_path, exist_ok=True)

# Honour `use_gpu`, but fall back to the CPU instead of crashing when CUDA is
# not available.
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')

# ---------------------------------------------------------------------------
# Model, loss, optimiser
# ---------------------------------------------------------------------------
model = UNet()
print(model)
num_params(model)
model.apply(init_weights)

criterion = torch.nn.MSELoss()

model = model.to(device)
criterion = criterion.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# The 800 milestone lies beyond num_epoches and only matters for longer runs.
scheduler = MultiStepLR(optimizer, milestones=[100, 400, 800], gamma=0.2)

# ---------------------------------------------------------------------------
# Training loop
# ---------------------------------------------------------------------------
model.train()
step = 0
for epoch in range(num_epoches):
    start = time.time()

    for param_group in optimizer.param_groups:
        print('learning rate %f' % param_group['lr'])

    # Only the first event is used for training at the moment.
    for e, event in enumerate(EVENTS[:1]):
        # (`get_iter_dali`, imported above, is an alternative DALI input pipeline.)
        dataset_train = DataSet(event=event)
        loader_train = DataLoader(dataset=dataset_train, num_workers=8,
                                  batch_size=batch_size, shuffle=True)

        for i, data in enumerate(loader_train):
            # NOTE(review): the original comments give batch shapes of
            # (4,10,1,200,200); after dropping dim 2 that is 10 channels,
            # whereas UNet's first layer expects 4 -- confirm against DataSet.
            input_train = data[0].squeeze(dim=2)
            target_train = data[1].squeeze(dim=2)

            optimizer.zero_grad()

            # Only the input is normalised; the target stays in raw units
            # (its normalizer call was disabled in the original code).
            input_train = normalizer(input_train).to(device)
            target_train = target_train.to(device)

            out_train = model(input_train)
            # Conventional (prediction, target) argument order.
            loss = criterion(out_train, target_train)

            loss.backward()
            optimizer.step()

            print("[epoch %d/%d][event %d/%d][step %d/%d]  obj: %.4f "%(epoch+1,num_epoches,e, len(EVENTS), i+1,len(loader_train),loss.item()))

            step += 1

    # Periodic checkpoint (epochs 1, 11, 21, ... in 1-based numbering).
    if epoch % epoch_to_save == 0:
        torch.save(model.state_dict(), os.path.join(logging_path, 'net_epoch%d.pth' % (epoch + 1)))
    end = time.time()
    print('One epoch costs %.2f minutes!' % ((end - start) / 60.))

    # Passing the epoch index to step() is deprecated; the scheduler keeps
    # its own counter.
    scheduler.step()

torch.save(model.state_dict(), os.path.join(logging_path, 'newest.pth'))


UNet(
  (inc): DoubleConv(
    (doubleConv): Sequential(
      (0): Conv2d(4, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
    )
  )
  (down1): DownSample(
    (maxpool_conv): Sequential(
      (0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (1): DoubleConv(
        (doubleConv): Sequential(
          (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (1): ReLU(inplace=True)
          (2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (3): ReLU(inplace=True)
        )
      )
    )
  )
  (down2): DownSample(
    (maxpool_conv): Sequential(
      (0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (1): DoubleConv(
        (doubleConv): Sequential(
          (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 

RuntimeError: CUDA out of memory. Tried to allocate 1000.00 MiB (GPU 0; 7.93 GiB total capacity; 6.00 GiB already allocated; 93.00 MiB free; 6.47 GiB reserved in total by PyTorch)