## Imports

In [1]:
# EXPORT
# --- Must haves ---
import os, sys
sys.path.append('..')

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.cuda as cuda
import torch.nn as nn

from surrogates4sims.datasets import MantaFlowDataset, getSingleSim, createMantaFlowTrainTest

from surrogates4sims.utils import create_opt, create_one_cycle, find_lr, printNumModelParams, \
                                    rmse, writeMessage, plotSampleWprediction, plotSampleWpredictionByChannel, \
                                    plotSample

from surrogates4sims.models import Generator, Encoder, AE, AE_no_P

import numpy as np
from tqdm import tqdm
from copy import deepcopy



## Settings

In [2]:
# Model name, used for TensorBoard run directories and checkpoint file names.
versionName = "train_template"

# GPU numbers to use. Comma-separate them for multi-GPU runs.
gpu_ids = "4"

# Path to load model weights from (None when there are none to load).
pretrained_path = None

# Rate at which to record metrics (number of batches to average over when
# recording metrics, e.g. "every 5 batches").
tensorboard_rate = 5

# Number of epochs to train. Defined here so the OneCycle LR scheduler can use it.
epochs = 2

# Data directory
dataDirec = '/data/mantaFlowSim/data/smoke_pos21_size5_f200/v'
reverseXY = False

# Checkpoint and TensorBoard output directories
cps = 'cps'
tensorboard_direc = "tensorboard_runs"

# Hyper-parameters
seed = 1234
# Seed both RNGs: NumPy alone does not cover torch, which drives DataLoader
# shuffling and parameter initialisation.
np.random.seed(seed)
torch.manual_seed(seed)
testSplit = .1
bz = 32
numSamplesToKeep = 256  # np.inf when not debugging (np.infty was removed in NumPy 2.0)
latentDim = 16
filters = 128
num_conv = 4
simLen = 200
stack = True
simVizIndex = 5  # sim in the test set to visualize

# Append the key hyper-parameters so every configuration gets its own run name.
versionName = versionName + '_latentDim{}_filters{}_bz{}_numConv{}'.format(latentDim,filters,bz,num_conv)
versionName

'train_template_latentDim16_filters128_bz32_numConv4'

### Select Personal GPUs

In [3]:
!nvidia-smi

Thu Mar  5 17:01:47 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.74       Driver Version: 418.74       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Quadro M6000        On   | 00000000:04:00.0 Off |                  Off |
| 25%   30C    P8    15W / 250W |      1MiB / 12212MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Quadro M6000        On   | 00000000:05:00.0 Off |                  Off |
| 25%   30C    P8    14W / 250W |      1MiB / 12212MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Quadro M6000        On   | 00000000:08:00.0 Off |                  Off |
| 25%   

In [4]:
# Order GPUs by PCI bus id (see issue #152) and expose only the devices
# listed in gpu_ids to CUDA.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES=gpu_ids,
)

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [6]:
if device.type == 'cuda':
    print(cuda.is_available())
    print(cuda.device_count())
    print(cuda.current_device())
    print(cuda.get_device_name())

True
1
0
Quadro M6000


In [7]:
a = torch.zeros(5, device=device.type)
!nvidia-smi

Thu Mar  5 17:01:53 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.74       Driver Version: 418.74       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Quadro M6000        On   | 00000000:04:00.0 Off |                  Off |
| 25%   30C    P8    15W / 250W |      1MiB / 12212MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Quadro M6000        On   | 00000000:05:00.0 Off |                  Off |
| 25%   30C    P8    14W / 250W |      1MiB / 12212MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Quadro M6000        On   | 00000000:08:00.0 Off |                  Off |
| 25%   

## Datasets & Loaders

In [8]:
trainData, testData = createMantaFlowTrainTest(dataDirec,simLen,testSplit,seed)
print((len(trainData),len(testData)))

(19000, 2000)


In [9]:
# datasets may be smaller because: numSamplesToKeep 
testDataset = MantaFlowDataset(testData, reverseXY=reverseXY, numToKeep=numSamplesToKeep, AE=True)
trainDataset = MantaFlowDataset(trainData, reverseXY=reverseXY,numToKeep=numSamplesToKeep, AE=True)
len(trainDataset), len(testDataset)

100%|██████████| 256/256 [00:00<00:00, 366.28it/s]
100%|██████████| 256/256 [00:00<00:00, 361.29it/s]


(256, 256)

In [10]:
trainDataLoader = DataLoader(dataset=trainDataset, batch_size=bz, shuffle=True, drop_last=True)
testDataLoader = DataLoader(dataset=testDataset, batch_size=bz)

## Model

Currently, the models need to be given a sample batch of data in order to be built. This is a bit awkward; I may look into fixing it later.

In [11]:
(X,p), _ = next(iter(testDataLoader))
X.shape, p.shape

(torch.Size([32, 2, 128, 96]), torch.Size([32, 3]))

In [12]:
E = Encoder(X,filters,latentDim,num_conv=num_conv)
E

Encoder(
  (act): LeakyReLU(negative_slope=0.01)
  (conv1): Conv2d(2, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (convs): Sequential(
    (0): convBlock(
      (act): LeakyReLU(negative_slope=0.01)
      (convs): Sequential(
        (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.01)
        (2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.01)
        (4): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.01)
        (6): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (7): LeakyReLU(negative_slope=0.01)
      )
      (downSampleLayer): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    )
    (1): convBlock(
      (act): LeakyReLU(negative_slope=0.01)
      (convs): Sequential(
        (0): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1

In [13]:
printNumModelParams(E)

54 layers require gradients (unfrozen) out of 54 layers
5,314,064 parameters require gradients (unfrozen) out of 5,314,064 parameters


In [14]:
z_wo_p = E(X)
z_wo_p.shape

torch.Size([32, 16])

In [15]:
z = torch.cat([z_wo_p,p],axis=1)
z.shape

torch.Size([32, 19])

In [16]:
output_shape = torch.tensor(X[0].shape)
output_shape

tensor([  2, 128,  96])

In [17]:
G_wo_P = Generator(z_wo_p, filters, output_shape,
                 num_conv=num_conv, conv_k=3, last_k=3, repeat=0, 
                 skip_connection=False, act=nn.LeakyReLU(), stack=stack)
G_wo_P

Generator(
  (linear): Linear(in_features=16, out_features=6144, bias=True)
  (convTransBlockLayers): Sequential(
    (0): convTransBlock(
      (act): LeakyReLU(negative_slope=0.01)
      (upsample): Upsample(scale_factor=2.0, mode=nearest)
      (seq): Sequential(
        (0): ConvTranspose2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.01)
        (2): ConvTranspose2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.01)
        (4): ConvTranspose2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.01)
        (6): ConvTranspose2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 1))
        (7): LeakyReLU(negative_slope=0.01)
      )
    )
    (1): convTransBlock(
      (act): LeakyReLU(negative_slope=0.01)
      (upsample): Upsample(scale_factor=2.0, mode=nearest)
      (seq): Sequential(
       

In [18]:
G = Generator(z, filters, output_shape,
                 num_conv=num_conv, conv_k=3, last_k=3, repeat=0, 
                 skip_connection=False, act=nn.LeakyReLU(), stack=stack)
G

Generator(
  (linear): Linear(in_features=19, out_features=6144, bias=True)
  (convTransBlockLayers): Sequential(
    (0): convTransBlock(
      (act): LeakyReLU(negative_slope=0.01)
      (upsample): Upsample(scale_factor=2.0, mode=nearest)
      (seq): Sequential(
        (0): ConvTranspose2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): LeakyReLU(negative_slope=0.01)
        (2): ConvTranspose2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): LeakyReLU(negative_slope=0.01)
        (4): ConvTranspose2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (5): LeakyReLU(negative_slope=0.01)
        (6): ConvTranspose2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 1))
        (7): LeakyReLU(negative_slope=0.01)
      )
    )
    (1): convTransBlock(
      (act): LeakyReLU(negative_slope=0.01)
      (upsample): Upsample(scale_factor=2.0, mode=nearest)
      (seq): Sequential(
       

In [19]:
printNumModelParams(G)

36 layers require gradients (unfrozen) out of 36 layers
3,380,482 parameters require gradients (unfrozen) out of 3,380,482 parameters


In [20]:
xhat = G(z)
xhat.shape

torch.Size([32, 2, 128, 96])

In [21]:
model = AE(E,G)
model

AE(
  (encoder): Encoder(
    (act): LeakyReLU(negative_slope=0.01)
    (conv1): Conv2d(2, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (convs): Sequential(
      (0): convBlock(
        (act): LeakyReLU(negative_slope=0.01)
        (convs): Sequential(
          (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (1): LeakyReLU(negative_slope=0.01)
          (2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (3): LeakyReLU(negative_slope=0.01)
          (4): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (5): LeakyReLU(negative_slope=0.01)
          (6): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
          (7): LeakyReLU(negative_slope=0.01)
        )
        (downSampleLayer): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      )
      (1): convBlock(
        (act): LeakyReLU(negative_slope=0.01)
        (convs): Sequential(
     

In [22]:
printNumModelParams(model)

90 layers require gradients (unfrozen) out of 90 layers
8,694,546 parameters require gradients (unfrozen) out of 8,694,546 parameters


In [23]:
xhat = model(X,p)
xhat.shape

torch.Size([32, 2, 128, 96])

In [24]:
ae_wo_p_model = AE_no_P(E,G_wo_P)

In [25]:
xhat = ae_wo_p_model(X)
xhat.shape

torch.Size([32, 2, 128, 96])

## Loss Function

In [27]:
loss_func = torch.nn.MSELoss()
loss_func

MSELoss()

## Optimizer and LR Scheduler

This is a little tricky: for the learning-rate finder we have to use the autoencoder variant that does not take `p`, and we have to modify the data loader to match.

In [28]:
nonAE_dataset = deepcopy(trainDataset)

In [29]:
nonAE_dataset.AE = False
nonAE_dataset.AE

False

In [30]:
nonAE_dataLoader = DataLoader(nonAE_dataset,batch_size=bz,shuffle=True)

In [31]:
# Run the learning-rate range test on the CPU so GPU memory stays free for training later.
opt = create_opt(1e-7,ae_wo_p_model)
find_lr(ae_wo_p_model,opt,loss_func,'cpu',nonAE_dataLoader) # FIXME: this call currently raises the RuntimeError shown in the cell output
# Why a separate loader: trainDataLoader yields ((X, p), X), and LRFinder does not accept the nested list input.
# NOTE(review): the error compares a size-96 dimension with a size-3 one, which suggests the AE=False
# dataset returns p (width 3) as the target rather than X -- confirm against MantaFlowDataset.

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (96) must match the size of tensor b (3) at non-singleton dimension 3

In [None]:
# Learning-rate bounds: the optimizer starts at start_lr, and max_lr is
# handed to the one-cycle scheduler factory.
max_lr = 1e-5
start_lr = 2.5e-6  # a quarter of max_lr

# Adam with beta1 = 0.5, built directly rather than through create_opt.
opt = torch.optim.Adam(
    model.parameters(),
    lr=start_lr,
    betas=(0.5, 0.999),
)
lr_scheduler = create_one_cycle(opt, max_lr, epochs, trainDataLoader)

## Forward Pass Playground

In [None]:
sampleBatch = next(iter(trainDataLoader))

In [None]:
# Move the model to the selected device and push one batch through it
# without tracking gradients.
model = model.to(device)
(X, p), labels = sampleBatch
X, p, labels = X.to(device), p.to(device), labels.to(device)  # labels are the same as X for our AE dataLoader
with torch.no_grad():
    preds = model(X, p)

preds.shape

In [None]:
labels.shape

In [None]:
loss_func(preds, labels)

In [None]:
loss_func(preds, labels).item()

## Train

In [None]:
# EXPORT
def trainEpoch(myDataLoader, tensorboard_writer, model, opt, loss_func,
               metric, lr_scheduler, tensorboard_rate, device,
               tensorboard_recorder_step, total_steps):
    """Train ``model`` for one pass over ``myDataLoader`` and log to TensorBoard.

    Each batch is expected to be ``((X, p), labels)``. For every batch the
    current learning rate is logged under ``"lr_step_based"`` and
    ``lr_scheduler.step()`` is called. Every ``tensorboard_rate`` batches the
    loss and metric averaged over that window are logged. Batches after the
    last full window are trained on but not logged.

    Args:
        myDataLoader: iterable yielding ``((X, p), labels)`` batches.
        tensorboard_writer: object exposing ``add_scalar(tag, scalar_value, global_step)``.
        model: callable invoked as ``model(X, p)``.
        opt: optimizer; ``opt.param_groups[0]['lr']`` is logged each step.
        loss_func: loss module; its ``_get_name()`` is used as the TensorBoard tag.
        metric: callable ``metric(preds, labels)`` returning a scalar; its
            ``__name__`` is used as the TensorBoard tag.
        lr_scheduler: scheduler stepped once per batch.
        tensorboard_rate: number of batches averaged per logged point.
        device: device the batch tensors are moved to.
        tensorboard_recorder_step: running global step for windowed scalars.
        total_steps: running global step for the per-batch learning-rate log.

    Returns:
        Tuple ``(epoch_loss, tensorboard_recorder_step, total_steps)`` where
        ``epoch_loss`` is the mean per-batch loss over the whole epoch
        (0.0 for an empty loader) and the two counters are the updated steps.
    """
    window_loss = 0.0
    window_metric = 0.0
    # Kept separately from the window accumulators: those are zeroed after each
    # TensorBoard write, so they cannot be used to report the epoch loss.
    epoch_loss = 0.0
    num_batches = 0

    for i, sampleBatch in enumerate(myDataLoader, start=1):

        # --- Main Training ---

        # move the batch to the target device
        X, p = sampleBatch[0]
        X = X.to(device)
        p = p.to(device)
        labels = sampleBatch[1].to(device)

        # zero the parameter gradients
        opt.zero_grad()

        # forward
        preds = model(X, p)

        # loss
        loss = loss_func(preds, labels)
        batch_loss = loss.item()
        window_loss += batch_loss
        epoch_loss += batch_loss
        num_batches += 1

        # backward
        loss.backward()

        # update parameters
        opt.step()

        # --- Metrics Recording ---

        # Detach before computing the metric and store it as a plain float so
        # the accumulator never holds on to the autograd graph.
        window_metric += float(metric(preds.detach(), labels))

        # record lr change
        total_steps += 1
        tensorboard_writer.add_scalar(tag="lr_step_based", scalar_value=opt.param_groups[0]['lr'], global_step=total_steps)
        lr_scheduler.step()

        # tensorboard writes
        if i % tensorboard_rate == 0:
            tensorboard_recorder_step += 1
            tensorboard_writer.add_scalar(tag=loss_func._get_name(), scalar_value=window_loss / tensorboard_rate, global_step=tensorboard_recorder_step)
            tensorboard_writer.add_scalar(tag=metric.__name__, scalar_value=window_metric / tensorboard_rate, global_step=tensorboard_recorder_step)
            # reset the window for the next tensorboard_rate batches
            window_loss = 0.0
            window_metric = 0.0

    mean_epoch_loss = epoch_loss / num_batches if num_batches else 0.0
    return mean_epoch_loss, tensorboard_recorder_step, total_steps


In [None]:
# EXPORT
def validEpoch(myDataLoader, tensorboard_writer, model, loss_func, metric,
               device, tensorboard_recorder_step):
    """Evaluate ``model`` on ``myDataLoader`` and log the averages to TensorBoard.

    Each batch is expected to be ``((X, p), labels)``. The loss and metric are
    averaged per sample: each batch is weighted by its size and the total is
    divided by the number of samples actually iterated. That keeps the average
    correct when the loader drops the last batch or samples a subset.

    Args:
        myDataLoader: iterable yielding ``((X, p), labels)`` batches.
        tensorboard_writer: object exposing ``add_scalar(tag, scalar_value, global_step)``.
        model: callable invoked as ``model(X, p)``.
        loss_func: loss module; its ``_get_name()`` is used as the TensorBoard tag.
        metric: callable ``metric(preds, labels)`` returning a scalar; its
            ``__name__`` is used as the TensorBoard tag.
        device: device the batch tensors are moved to.
        tensorboard_recorder_step: global step under which both scalars are logged.

    Returns:
        The sample-weighted average loss (0.0 for an empty loader).
    """
    weighted_loss = 0.0
    weighted_metric = 0.0
    num_samples = 0

    # Nothing here needs gradients, so the whole pass runs under no_grad.
    with torch.no_grad():
        for sampleBatch in myDataLoader:
            # move the batch to the target device
            X, p = sampleBatch[0]
            X = X.to(device)
            p = p.to(device)
            labels = sampleBatch[1].to(device)
            batch_size = len(X)

            preds = model(X, p)

            # weight each batch by its size so a short final batch counts correctly
            weighted_loss += batch_size * loss_func(preds, labels).item()
            weighted_metric += batch_size * float(metric(preds, labels))
            num_samples += batch_size

    avg_loss = weighted_loss / num_samples if num_samples else 0.0
    avg_metric = weighted_metric / num_samples if num_samples else 0.0
    tensorboard_writer.add_scalar(tag=loss_func._get_name(), scalar_value=avg_loss, global_step=tensorboard_recorder_step)
    tensorboard_writer.add_scalar(tag=metric.__name__, scalar_value=avg_metric, global_step=tensorboard_recorder_step)

    return avg_loss

In [None]:
# Create the checkpoint directory (including any missing parents). Only the
# "already exists" case is expected; any other failure, such as a permission
# error, is left to propagate instead of being reported as success.
try:
    os.makedirs(cps)
except FileExistsError:
    print("checkpoints directory already exists :)")

In [None]:
# One TensorBoard writer per split, both stored under
# <tensorboard_direc>/<versionName>/.
run_dir = os.path.join(tensorboard_direc, versionName)
train_writer = SummaryWriter(os.path.join(run_dir, 'train'))
test_writer = SummaryWriter(os.path.join(run_dir, 'valid'))

# Global step counters that are passed into, and returned from, trainEpoch.
tensorboard_recorder_step = total_steps = 0

In [None]:
writeMessage('---------- Started Training ----------', versionName)
# np.inf rather than np.infty: the alias was removed in NumPy 2.0.
bestLoss = np.inf

for epoch in tqdm(range(1, epochs+1)):  # loop over the dataset multiple times

    writeMessage("--- Epoch {0}/{1} ---".format(epoch, epochs), versionName)

    # train for one epoch; the step counters are carried from epoch to epoch
    model.train()
    trainLoss, tensorboard_recorder_step, total_steps = trainEpoch(trainDataLoader,
                                                                   train_writer, model, opt, loss_func,
                                                                   rmse, lr_scheduler, tensorboard_rate,
                                                                   device, tensorboard_recorder_step,
                                                                   total_steps)

    writeMessage("trainLoss: {:.4e}".format(trainLoss),versionName)

    # validate, logging at the same step as the latest training point
    model.eval()
    valLoss = validEpoch(testDataLoader, test_writer, model, loss_func, rmse, device, tensorboard_recorder_step)

    # checkpoint progress: keep only the weights with the best validation loss so far
    if valLoss < bestLoss:
        bestLoss = valLoss
        writeMessage("Better valLoss: {:.4e}, Saving model...".format(bestLoss),versionName)
        torch.save(model.state_dict(), os.path.join(cps,versionName))

writeMessage('---------- Finished Training ----------', versionName)

## Compare: Generated vs. Simulated

In [None]:
# Run the first test batch through the model without tracking gradients.
model.eval()
sampleBatch = next(iter(testDataLoader))
(X, p), labels = sampleBatch
X, p, labels = X.to(device), p.to(device), labels.to(device)
with torch.no_grad():
    preds = model(X, p)

# target shape first, then prediction shape
print(labels.shape, preds.shape, sep='\n')

In [None]:
idx = 12  # frame in the batch
# Take the frame from the batch still held in sampleBatch instead of from X:
# X is overwritten below, so indexing X itself would slice an already-sliced
# frame if this cell were run a second time.
X = sampleBatch[0][0][idx].detach().cpu().squeeze()
P = preds[idx].detach().cpu().squeeze()
plotSampleWpredictionByChannel(X, P)

In [None]:
plotSampleWprediction(X, P)

In [None]:
plotSample(X-P)

### Visualize full simulation

In [None]:
simData = getSingleSim(sim=simVizIndex,dataDirec=testData)
simDataset = MantaFlowDataset(simData, reverseXY=reverseXY, numToKeep=numSamplesToKeep, AE=True)
simDataLoader = DataLoader(simDataset,batch_size=1)

In [None]:
# Push every frame yielded by simDataLoader through the model and plot the
# input next to its prediction.
model.eval()
for sampleBatch in simDataLoader:
    X, p = sampleBatch[0]
    # The targets in sampleBatch[1] are not needed for plotting, so they are
    # not copied to the device on every frame.
    with torch.no_grad():
        preds = model(X.to(device), p.to(device))
    X = X.detach().cpu().squeeze()
    P = preds.detach().cpu().squeeze()
    plotSampleWprediction(X, P)
