## Imports

In [1]:
# EXPORT
# --- Must haves ---
import os, sys
sys.path.append('..')

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.cuda as cuda
import torch.nn as nn

from surrogates4sims.datasets import MantaFlowDataset, getSingleSim, createMantaFlowTrainTest

from surrogates4sims.utils import create_opt, create_one_cycle, find_lr, printNumModelParams, \
                                    rmse, writeMessage, plotSampleWprediction, plotSampleWpredictionByChannel, \
                                    plotSample, curl, jacobian, stream2uv

from surrogates4sims.models import Generator, Encoder, AE, AE_no_P

import numpy as np
from tqdm import tqdm
from copy import deepcopy



## Settings

In [2]:
# model name, for tensorboard recording and checkpointing purposes.
versionName = "gangam_style_training"

# GPU Numbers to use. Comma seprate them for multi-GPUs.

gpu_ids = "2"
# path to load model weights.
pretrained_path = None

# rate at which to record metrics. (number of batches to average over when recording metrics, e.g. "every 5 batches")
tensorboard_rate = 5

# number of epochs to train. This is defined here so we can use the OneCycle LR Scheduler.
epochs = 100

# Data Directory
dataDirec = '/data/mantaFlowSim/data/smoke_pos21_size5_f200/v'
reverseXY = False 

# checkpoint directory
cps = 'cps'
tensorboard_direc = "tb"

findLRs = False # only do this if you're trying to set the LR of E, G. It blows up the GPU 

# hyper-params
seed = 1234
np.random.seed(seed)
testSplit = .1
bz = 8
numSamplesToKeep = np.infty #if not debugging
latentDim = 16
filters = 16
num_conv = 4
simLen = 200
stack = True
simVizIndex = 0 # sim in the test set to visualize
createStreamFcn = False
doJacobian = True
versionName = versionName + '_latentDim{}_filters{}_bz{}_numConv{}_stream{}_jacobian{}_epochs{}'.format(latentDim,filters,bz,num_conv,createStreamFcn,doJacobian,epochs)
versionName

'gangam_style_training_latentDim16_filters16_bz8_numConv4_streamFalse_jacobianTrue_epochs100'

### Select Personal GPUs

In [3]:
!nvidia-smi

Tue Mar 17 21:32:25 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.40       Driver Version: 430.40       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN Xp            On   | 00000000:02:00.0 Off |                  N/A |
| 23%   21C    P8     8W / 250W |   1042MiB / 12196MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  TITAN Xp            On   | 00000000:03:00.0 Off |                  N/A |
| 23%   21C    P8     8W / 250W |      1MiB / 12196MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN Xp            On   | 00000000:81:00.0 Off |                  N/

In [4]:
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]=gpu_ids

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [6]:
if device.type == 'cuda':
    print(cuda.is_available())
    print(cuda.device_count())
    print(cuda.current_device())
    print(cuda.get_device_name())

True
1
0
TITAN Xp


In [7]:
a = torch.zeros(5, device=device.type)
!nvidia-smi

Tue Mar 17 21:32:28 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.40       Driver Version: 430.40       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN Xp            On   | 00000000:02:00.0 Off |                  N/A |
| 23%   21C    P8     8W / 250W |   1042MiB / 12196MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  TITAN Xp            On   | 00000000:03:00.0 Off |                  N/A |
| 23%   21C    P8     8W / 250W |      1MiB / 12196MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN Xp            On   | 00000000:81:00.0 Off |                  N/

## Datasets & Loaders

In [8]:
trainData, testData = createMantaFlowTrainTest(dataDirec,simLen,testSplit,seed)
print((len(trainData),len(testData)))

(19000, 2000)


In [9]:
# datasets may be smaller because: numSamplesToKeep 
testDataset = MantaFlowDataset(testData, reverseXY=reverseXY, numToKeep=numSamplesToKeep, AE=False)
trainDataset = MantaFlowDataset(trainData, reverseXY=reverseXY,numToKeep=numSamplesToKeep, AE=False)
len(trainDataset), len(testDataset)

100%|██████████| 2000/2000 [00:03<00:00, 568.82it/s]
100%|██████████| 19000/19000 [00:35<00:00, 542.60it/s]


(19000, 2000)

In [10]:
trainDataLoader = DataLoader(dataset=trainDataset, batch_size=bz, shuffle=True, drop_last=True)
testDataLoader = DataLoader(dataset=testDataset, batch_size=bz)

## Model

Currently, the models need to take data to be built. It's kinda weird. I may look into fix this later. 

In [11]:
X,p = next(iter(testDataLoader))
X = X.to(device)
p = p.to(device)
X.shape, p.shape

(torch.Size([8, 2, 128, 96]), torch.Size([8, 3]))

In [12]:
E = Encoder(X,filters,latentDim,num_conv=num_conv).to(device)
E

Encoder(
  (act): LeakyReLU(negative_slope=0.01)
  (conv1): Conv2d(2, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (convs): Sequential(
    (0): convBlock(
      (act): LeakyReLU(negative_slope=0.01)
      (convs): Sequential(
        (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.01)
        (2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.01)
        (4): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.01)
        (6): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (7): LeakyReLU(negative_slope=0.01)
      )
      (downSampleLayer): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    )
    (1): convBlock(
      (act): LeakyReLU(negative_slope=0.01)
      (convs): Sequential(
        (0): Conv2d(32, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1

In [13]:
printNumModelParams(E)

54 layers require gradients (unfrozen) out of 54 layers
99,792 parameters require gradients (unfrozen) out of 99,792 parameters


In [14]:
z = E(X)
z.shape

torch.Size([8, 16])

In [15]:
output_shape = torch.tensor(X[0].shape)
output_shape

tensor([  2, 128,  96])

In [16]:
G = Generator(z, filters, output_shape,
                 num_conv=num_conv, conv_k=3, last_k=3, repeat=0, 
                 skip_connection=False, act=nn.LeakyReLU(), stack=stack)
G = G.to(device)
G

Generator(
  (linear): Linear(in_features=16, out_features=768, bias=True)
  (convTransBlockLayers): Sequential(
    (0): convTransBlock(
      (act): LeakyReLU(negative_slope=0.01)
      (upsample): Upsample(scale_factor=2.0, mode=nearest)
      (seq): Sequential(
        (0): ConvTranspose2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.01)
        (2): ConvTranspose2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.01)
        (4): ConvTranspose2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.01)
        (6): ConvTranspose2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 1))
        (7): LeakyReLU(negative_slope=0.01)
      )
    )
    (1): convTransBlock(
      (act): LeakyReLU(negative_slope=0.01)
      (upsample): Upsample(scale_factor=2.0, mode=nearest)
      (seq): Sequential(
        (0): Con

In [17]:
printNumModelParams(G)

36 layers require gradients (unfrozen) out of 36 layers
65,442 parameters require gradients (unfrozen) out of 65,442 parameters


In [18]:
Xhat = G(z)
Xhat.shape

torch.Size([8, 2, 128, 96])

## Loss Function

In [19]:
def L1_loss(pred, target):
    return torch.mean(torch.abs(pred - target))


def jacobian_loss(pred, target, device='cpu'):
    return L1_loss(jacobian(pred, device), jacobian(target, device))


def curl_loss(pred, target, device):
    return L1_loss(curl(pred, device), curl(target, device))


L = nn.MSELoss()


def p_loss(pred, target):
    return L(pred[:, -target.shape[1]:], target)


def loss(pred, target, device):
    Lj = 0
    if createStreamFcn:
        L1 = L1_loss(stream2uv(pred, device), target)
    else:
        L1 = L1_loss(pred, target)
    if doJacobian:
        Lj = jacobian_loss(pred, target, device)
    return L1 + Lj

In [20]:
loss(Xhat,Xhat,device), loss(Xhat,X,device)

(tensor(0., device='cuda:0', grad_fn=<AddBackward0>),
 tensor(0.0481, device='cuda:0', grad_fn=<AddBackward0>))

In [21]:
p_loss(z,p)

tensor(0.4323, device='cuda:0', grad_fn=<MseLossBackward>)

## Optimizer and LR Scheduler for Encoder

This is a little bit tricky. We have to modify the Datasets so the lr_finder works.

In [22]:
if findLRs:
    opt_E = create_opt(1e-7,E)
    find_lr(E,opt_E,p_loss,device,trainDataLoader) 

In [23]:
max_lr_E = .00001
start_lr_E = max_lr_E/10
opt_E = create_opt(max_lr_E,E)
#opt = torch.optim.Adam(model.parameters(),lr=start_lr,betas=(.5,.999))
lr_scheduler_E = create_one_cycle(opt_E,max_lr_E,epochs,trainDataLoader)

## Optimizer and LR Scheduler for Generator

In [24]:
if findLRs:
    from copy import deepcopy
    
    ae_model = AE_no_P(E,G).to(device)

    AE_dataset = deepcopy(trainDataset)
    AE_dataset.AE = True

    AE_dataLoader = DataLoader(AE_dataset,batch_size=bz,shuffle=True)

    xx,yy = next(iter(AE_dataLoader))

    def loss4LrFinder(pred,target):
        return loss(pred,target,device)

    # put it on the cpu so you can save gpu space for training later.
    opt = create_opt(1e-7,ae_model)
    find_lr(ae_model,opt,loss4LrFinder,device,AE_dataLoader) # this breaks because the output of trainDataLoader is (X,y), X
    # LRFinder does not like the list.

In [25]:
max_lr_G = .00001
start_lr_G = 5*max_lr_G/10
opt_G = create_opt(max_lr_G,G)
#opt = torch.optim.Adam(model.parameters(),lr=start_lr,betas=(.5,.999))
lr_scheduler_G = create_one_cycle(opt_G,max_lr_G,epochs,trainDataLoader)

In [26]:
len(trainDataLoader)

2375

## Train

In [27]:
# EXPORT
def trainEpoch(myDataLoader, tensorboard_writer, E, G, opt_E, opt_G, loss,
               metric, lr_scheduler_E, lr_scheduler_G, tensorboard_rate, device,
               tensorboard_recorder_step, total_steps):

    d_steps = 10
    num_batches = len(myDataLoader)
    
    for bg in range(num_batches//d_steps):
        running_loss = 0.0
        running_rmse = 0.0
        for d_step in range(d_steps):
            Z = []
            XX = []
            cnt = 0
            for i, sampleBatch in enumerate(myDataLoader, start=1):
                cnt += 1
                if cnt >= d_steps:
                    break
                    
                # not sure if this is necessary
                E.zero_grad()

                X,p = sampleBatch[0],sampleBatch[1]
                X = X.to(device)
                p = p.to(device)

                opt_E.zero_grad()

                z = E(X)

                pLoss = p_loss(z,p)
                pLoss.backward()
                opt_E.step()
                
                Z.append(z.detach())
                XX.append(X.detach())

                total_steps += 1

            for z,X in zip(Z,XX):
                opt_G.zero_grad()

                #G
                X_hat = G(z)
                L1 = loss(X_hat,X,device)
                L1.backward()
                opt_G.step()
                
                
            # loss
            batch_loss = pLoss.item() + L1.item()
            running_loss += batch_loss
            lr_scheduler_E.step()
            lr_scheduler_G.step()
            
            # --- Metrics Recording ---

            # metrics
            r = metric(X_hat, X)
            running_rmse += r

            # record lr change
            tensorboard_writer.add_scalar(tag="Encoder_LR", scalar_value=opt_E.param_groups[0]['lr'], global_step=total_steps)
            tensorboard_writer.add_scalar(tag="Generator_LR", scalar_value=opt_G.param_groups[0]['lr'], global_step=total_steps)

            tensorboard_recorder_step += 1
            avg_running_loss = running_loss/d_steps
            avg_running_rmse = running_rmse/d_steps
            tensorboard_writer.add_scalar(tag="Loss", scalar_value=avg_running_loss, global_step=tensorboard_recorder_step)
            tensorboard_writer.add_scalar(tag=metric.__name__, scalar_value=avg_running_rmse, global_step=tensorboard_recorder_step)


    return batch_loss, tensorboard_recorder_step, total_steps


In [28]:
# EXPORT
def validEpoch(myDataLoader, tensorboard_writer, E, G, p_loss, loss, metric,
               device, tensorboard_recorder_step):
    running_loss = 0.0
    running_rmse = 0.0
    for i, sampleBatch in enumerate(myDataLoader, start=1):

        # --- Metrics Recording ---

        # gpu
        X,p = sampleBatch[0],sampleBatch[1]
        X = X.to(device)
        p = p.to(device)
        
        perc = len(X)/len(myDataLoader.dataset)

        # forward, no gradient calculations
        with torch.no_grad():
            z = E(X)
            X_hat = G(z)

        # loss
        pl = p_loss(z,p)
        L1 = loss(X_hat,X,device)
        
        running_loss += perc*(pl.item() + L1.item())

        # metrics
        r = metric(X_hat, X)
        running_rmse += perc*r

    avg_running_loss = running_loss
    avg_running_rmse = running_rmse
    tensorboard_writer.add_scalar(tag="Loss", scalar_value=avg_running_loss, global_step=tensorboard_recorder_step)
    tensorboard_writer.add_scalar(tag=metric.__name__, scalar_value=avg_running_rmse, global_step=tensorboard_recorder_step)

    return running_loss

In [29]:
try:
    os.mkdir(cps)
except:
    print("checkpoints directory already exists :)")

checkpoints directory already exists :)


In [30]:
# create a summary writer.
train_writer = SummaryWriter(os.path.join(tensorboard_direc, versionName,'train'))
test_writer = SummaryWriter(os.path.join(tensorboard_direc, versionName,'valid'))
tensorboard_recorder_step = 0
total_steps = 0

In [31]:
writeMessage('---------- Started Training ----------', versionName)
bestLoss = np.infty

for epoch in tqdm(range(1, epochs+1)):  # loop over the dataset multiple times
    
    writeMessage("--- Epoch {0}/{1} ---".format(epoch, epochs), versionName)
    
    E.train()
    G.train()
    trainLoss, tensorboard_recorder_step, total_steps = trainEpoch(trainDataLoader, 
                                                                   train_writer, E, G, opt_E, opt_G, loss,
                                                                   rmse, lr_scheduler_E, lr_scheduler_G, 
                                                                   tensorboard_rate, device,
                                                                   tensorboard_recorder_step, total_steps)
    
    writeMessage("trainLoss: {:.4e}".format(trainLoss),versionName)
    E.eval()
    G.eval()
    valLoss = validEpoch(testDataLoader, test_writer, E, G, p_loss, loss, rmse, device, tensorboard_recorder_step)
    
    # checkpoint progress
    if valLoss < bestLoss:
        bestLoss = valLoss
        writeMessage("Better valLoss: {:.4e}, Saving models...".format(bestLoss),versionName)
        torch.save(E.state_dict(), os.path.join(cps,versionName +'_Encoder'))
        torch.save(G.state_dict(), os.path.join(cps,versionName +'_Generator'))

writeMessage('---------- Finished Training ----------', versionName)

  0%|          | 0/100 [00:00<?, ?it/s]

---------- Started Training ----------
--- Epoch 1/100 ---
trainLoss: 5.0611e-01


  1%|          | 1/100 [19:15<31:46:43, 1155.59s/it]

Better valLoss: 5.4109e-01, Saving models...
--- Epoch 2/100 ---
trainLoss: 4.0123e-01


  2%|▏         | 2/100 [38:37<31:30:20, 1157.35s/it]

Better valLoss: 4.6458e-01, Saving models...
--- Epoch 3/100 ---
trainLoss: 2.0602e-01


  3%|▎         | 3/100 [56:23<30:26:56, 1130.07s/it]

Better valLoss: 2.4289e-01, Saving models...
--- Epoch 4/100 ---
trainLoss: 1.8882e-01


  4%|▍         | 4/100 [1:13:58<29:32:04, 1107.54s/it]

Better valLoss: 2.0215e-01, Saving models...
--- Epoch 5/100 ---
trainLoss: 1.3942e-01


  5%|▌         | 5/100 [1:31:57<28:59:58, 1098.93s/it]

Better valLoss: 1.7819e-01, Saving models...
--- Epoch 6/100 ---
trainLoss: 2.0445e-01


  6%|▌         | 6/100 [1:50:04<28:36:08, 1095.41s/it]

Better valLoss: 1.5044e-01, Saving models...
--- Epoch 7/100 ---
trainLoss: 1.1228e-01


  7%|▋         | 7/100 [2:08:38<28:26:41, 1101.09s/it]

Better valLoss: 1.5042e-01, Saving models...
--- Epoch 8/100 ---
trainLoss: 9.7600e-01


  8%|▊         | 8/100 [2:26:40<27:59:32, 1095.35s/it]

--- Epoch 9/100 ---
trainLoss: 6.7302e+03


  9%|▉         | 9/100 [2:44:34<27:31:35, 1088.96s/it]

--- Epoch 10/100 ---
trainLoss: 3.0755e+06


 10%|█         | 10/100 [3:02:59<27:20:40, 1093.78s/it]

--- Epoch 11/100 ---
trainLoss: 3.5291e+13


 11%|█         | 11/100 [3:20:41<26:48:05, 1084.11s/it]

--- Epoch 12/100 ---
trainLoss: 5.4205e+20


 12%|█▏        | 12/100 [3:38:45<26:29:49, 1083.97s/it]

--- Epoch 13/100 ---
trainLoss: 8.2457e+20


 13%|█▎        | 13/100 [3:56:41<26:08:23, 1081.65s/it]

--- Epoch 14/100 ---
trainLoss: 1.5513e+21


 14%|█▍        | 14/100 [4:12:31<24:54:00, 1042.34s/it]

--- Epoch 15/100 ---
trainLoss: 1.2779e+21


 15%|█▌        | 15/100 [4:28:27<23:59:55, 1016.41s/it]

--- Epoch 16/100 ---
trainLoss: 6.7506e+21


 16%|█▌        | 16/100 [4:44:21<23:16:48, 997.72s/it] 

--- Epoch 17/100 ---
trainLoss: 6.0581e+21


 17%|█▋        | 17/100 [4:59:45<22:29:23, 975.46s/it]

--- Epoch 18/100 ---
trainLoss: 1.5595e+22


 18%|█▊        | 18/100 [5:15:34<22:02:16, 967.51s/it]

--- Epoch 19/100 ---
trainLoss: 2.3404e+22


 19%|█▉        | 19/100 [5:30:53<21:26:39, 953.09s/it]

--- Epoch 20/100 ---
trainLoss: 1.2210e+22


 20%|██        | 20/100 [5:46:40<21:08:14, 951.19s/it]

--- Epoch 21/100 ---
trainLoss: 1.0341e+22


 21%|██        | 21/100 [6:02:18<20:47:18, 947.32s/it]

--- Epoch 22/100 ---
trainLoss: 1.2051e+22


 22%|██▏       | 22/100 [6:17:44<20:22:53, 940.68s/it]

--- Epoch 23/100 ---


KeyboardInterrupt: 

## Compare: Generated vs. Simulated

In [None]:
E.eval()
G.eval()
sampleBatch = next(iter(testDataLoader))
X,p = sampleBatch
X = X.to(device)
p = p.to(device)

with torch.no_grad():
    z = E(X)
    z[:,-p.shape[1]:] = p
    X_hat = G(z)
    
X.shape, p.shape, z.shape, X_hat.shape

In [None]:
idx = 0 # frame in the batch 
XX = X[idx].detach().cpu().squeeze()
XX_hat = X_hat[idx].detach().cpu().squeeze()
plotSampleWpredictionByChannel(XX, XX_hat)

In [None]:
plotSampleWprediction(XX, XX_hat)

In [None]:
plotSample(XX-XX_hat)

### Visualize full simulation

In [None]:
simData = getSingleSim(sim=simVizIndex,dataDirec=testData)
simDataset = MantaFlowDataset(simData, reverseXY=reverseXY, numToKeep=numSamplesToKeep, AE=False)
simDataLoader = DataLoader(simDataset,batch_size=1)

In [None]:
E.eval()
G.eval()
for i, sampleBatch in enumerate(simDataLoader,start=1):
    with torch.no_grad():
        X,p = sampleBatch
        X = X.to(device)
        p = p.to(device)
        
        z = E(X)
        z[:,-p.shape[1]:] = p
        X_hat = G(z)
        
        X = X.detach().cpu().squeeze()
        X_hat = X_hat.detach().cpu().squeeze()
        plotSampleWprediction(X, X_hat)

