## End to End PNNL Surrogate Model Training and Testing

Important parameters:

channel = 1 or 2 # do others later

gridsize = 128 or 512 

w = 10 # any value from 1 to 499 (i.e. simLen - 1) is okay. 

latentDim = 16 

## Imports

In [None]:
# EXPORT
# --- Must haves ---
import os, sys
sys.path.append('..')

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.cuda as cuda
import torch.nn as nn
import torchvision
import torch.nn.functional as F

from surrogates4sims.pnnlDatasets import CCSI_2D

from surrogates4sims.utils import create_opt, create_one_cycle, find_lr, printNumModelParams, \
                                    rmse, writeMessage, plotSampleWprediction, plotSampleWpredictionByChannel, \
                                    plotSample, curl, jacobian, stream2uv, create_movie, convertSimToImage, \
                                    pkl_save, pkl_load, create_1_channel_movie

from surrogates4sims.models import Generator, Encoder, AE_no_P, AE_xhat_z, AE_xhat_zV2

import numpy as np
from tqdm import tqdm
from copy import deepcopy
from glob import glob

## Settings

In [None]:
# --- Run mode ---
eval_only=False  # when True the training loop is skipped
DEBUG = False
# model name, for tensorboard recording and checkpointing purposes.
versionName = "pnnl_end2end_plateau_train"

# GPU numbers to use. Comma-separate them for multi-GPU runs.
gpu_ids = "2"
versionName = versionName + '_GPUs{}'.format(gpu_ids.replace(',',''))
# path to load model weights.
pretrained_path = None

# rate at which to record metrics. (number of batches to average over when recording metrics, e.g. "every 5 batches")
tensorboard_rate = 5

# number of epochs to train. This is defined here so we can use the OneCycle LR Scheduler.
epochs = 1000

# Data Directory
channel = 1
gridsize = 128
dataDirec = '/data/ccsi/pnnl_liquid_inlet/channel_{}/gridsize_{}'.format(channel,gridsize)
preprocess = False # keep this as false until using the long runtime loader
testSplit = .2 # don't change this for now. 
AE = False
numWorkers = 2
physicsDim = 2 # inlet velocity and time sample

# checkpoint directory
cps = 'cps'
tensorboard_direc = "tb"

findLRs = False  

# LIN parameters
hiddenLayers = [128,128]
activation = nn.ELU()

# hyper-params
seed = 1234
np.random.seed(seed)  # NOTE: only NumPy's RNG is seeded here
bz = 8
# np.inf rather than np.infty: the `infty` alias was removed in NumPy 2.0.
numSamplesToKeep = np.inf #if not debugging
latentDim = 16
window_size = 5
filters = 128
num_conv = 4 # breaks when less than 2
simLen = 500
stack = True
simVizIndex = 0 # sim in the test set to visualize
createStreamFcn = False
doJacobian = False
repeat = 0
skip_connection = False
patience = 1
if DEBUG:
    epochs = 10000
    numSamplesToKeep = 2 # keep only the first 2 simulation files of each dataset

# Encode the main data/model choices in the run name so logs and checkpoints
# from different configurations do not collide.
versionName = versionName + '_channel{}_gridsize{}_latentDim{}'.format(channel, gridsize, latentDim)
versionName

'pnnl_end2end_plateau_train_GPUs2_channel1_gridsize128_latentDim16'

## Select GPUs

In [None]:
!nvidia-smi

Thu Oct 29 20:03:50 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN Xp            On   | 00000000:02:00.0 Off |                  N/A |
| 23%   20C    P8     9W / 250W |      1MiB / 12196MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  TITAN Xp            On   | 00000000:03:00.0 Off |                  N/A |
| 29%   52C    P2   106W / 250W |   3208MiB / 12196MiB |     87%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN Xp            On   | 00000000:81:00.0 Off |                  N/A |
| 25%   

In [None]:
# Order CUDA devices by PCI bus ID and expose only the GPUs chosen in the
# settings cell (see issue #152).
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": gpu_ids,
})

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print('Using device:', device)

Using device: cuda


In [None]:
# When running on a GPU, print availability, visible device count, the
# current device index and its name (one value per line).
if device.type == 'cuda':
    for report in (cuda.is_available, cuda.device_count,
                   cuda.current_device, cuda.get_device_name):
        print(report())

True
1
0
TITAN Xp


In [None]:
# Allocate a tiny tensor on the selected device and show nvidia-smi again.
# NOTE(review): presumably this is to confirm the process landed on the
# intended GPU -- confirm.
a = torch.zeros(5, device=device.type)
!nvidia-smi

Thu Oct 29 20:03:50 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN Xp            On   | 00000000:02:00.0 Off |                  N/A |
| 23%   20C    P8     8W / 250W |      1MiB / 12196MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  TITAN Xp            On   | 00000000:03:00.0 Off |                  N/A |
| 29%   52C    P2   153W / 250W |   3208MiB / 12196MiB |     85%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN Xp            On   | 00000000:81:00.0 Off |                  N/

## Datasets & Loaders

In [None]:
# Collect the simulation files in a deterministic order. glob() makes no
# ordering guarantee, so without sorting the train/test split could differ
# between sessions (and a reloaded checkpoint could be evaluated on files it
# was trained on).
sims = sorted(glob(os.path.join(dataDirec,'*.pkl')))
numSims = len(sims)
idx = int(testSplit*numSims)  # number of held-out simulations
# Hold out `idx` simulations spread evenly over the file list; the first and
# last files always stay in the training set.
testInds = np.linspace(1,numSims-2,idx).astype('int')
trainInds = sorted(set(range(numSims)).difference(testInds.tolist()))
# perm = np.random.permutation(numSims)
# testInds = perm[:idx]
# trainInds = perm[idx:]
testSimFiles = [sims[i] for i in testInds]
trainSimFiles = [sims[i] for i in trainInds]
len(testSimFiles), len(trainSimFiles)

(10, 40)

In [None]:
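## NOTE: this definition shadows the CCSI_2D imported from surrogates4sims.pnnlDatasets above.
## TODO: replace the CCSI_2D class in 01_pnnl with this one later.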
from torch.utils.data import Dataset
class CCSI_2D(Dataset):
    """Windowed dataset over CCSI 2D simulations stored one per pickle file.

    Each item pairs one frame ``x`` with the ``w`` frames that follow it in
    the same simulation, together with the physics parameters
    (inlet velocity, normalized time) of those frames.

    Args:
        dataFiles: paths to the simulation pickles. The file stem must be the
            1-based simulation number; it is used to look up the inlet
            velocity in ``txtFile``.
        txtFile: text file with one header line followed by one inlet
            velocity per line.
        channel: stored on the instance; not used by this class.
        gridSize: stored on the instance; not used by this class.
        simLen: number of time steps per simulation.
        w: length of the target window ``Y``.
        AE: if True, ``__getitem__`` returns ``(X, X)`` (autoencoder mode).
        numToKeep: keep at most this many files from ``dataFiles``.
        doPreprocess: if True, ``self.preprocessFcn`` is applied to x and y.
            NOTE(review): no ``preprocessFcn`` is defined in this class, so
            enabling this would raise AttributeError as written -- confirm.
    """

    def __init__(self, 
                 dataFiles,
                 txtFile = '/data/ccsi/pnnl_liquid_inlet/liquid_inlet_velocity.txt',
                 channel=1,
                 gridSize=128,
                 simLen = 500,
                 w = 10, # this is the length of the Y output to predict
                 AE = False, # this only return x,x, i.e. no y.
                 numToKeep=np.inf,doPreprocess=False): 
        
        self.dataFiles = dataFiles
        if numToKeep < len(self.dataFiles):
            self.dataFiles = self.dataFiles[:numToKeep]

        self.channel = channel
        self.gridSize = gridSize
        self.numToKeep = numToKeep
        # Honor the constructor argument (this used to be hard-coded to 500,
        # which disagreed with self.t for any other simLen).
        self.simLen = simLen
        # Normalized time stamp in [0, 1] for every step of a simulation.
        self.t = np.linspace(0,1,simLen).astype('float32')
        self.w = w
        self.AE = AE
        self.doPreprocess = doPreprocess
        
        # Get the inlet velocity (first line of the file is a header).
        with open(txtFile) as fid:
            txt = fid.read().splitlines()
        inletVelocity = np.array(list(map(float,txt[1:]))).astype('float32')
        self.inletMx = np.max(inletVelocity)
        self.inletMn = np.min(inletVelocity)
        
        # Load every simulation eagerly and pair it with its inlet velocity.
        data = []
        for fn in self.dataFiles:
            # File names are 1-based simulation numbers, e.g. ".../12.pkl".
            idx = int(fn.split('/')[-1].replace('.pkl','')) - 1
            D = pkl_load(fn)
            data.append((D,inletVelocity[idx]))
               
        self.data = data
    
    def __len__(self):
        """Return ``simLen`` times the number of loaded simulations."""
        return self.simLen * len(self.data)

    def __getitem__(self, idx):
        """Return ``(X, Y, p_x, p_y)``, or ``(X, X)`` when ``self.AE`` is set.

        With a single loaded simulation, ``idx`` is used directly as the start
        frame (it is not clamped to ``simLen - w``). With several simulations,
        ``idx`` only selects the simulation; the start frame is drawn at random
        from ``[0, simLen - w)``, so the same ``idx`` can return different
        windows on different calls.
        """
        if len(self.data) == 1:
            q = 0
            r_idx = idx
        else:
            q = idx // self.simLen
            r_idx = np.random.randint(0,self.simLen-self.w)
            
        X,p = self.data[q]
        x = X[r_idx:r_idx+1]                 # the input frame, leading axis kept
        y = X[r_idx+1:r_idx+self.w+1]        # the w frames that follow it
        if self.doPreprocess:
            x = self.preprocessFcn(x)
            y = self.preprocessFcn(y)
        
        # Insert a singleton axis after the window axis of y.
        y = np.expand_dims(y,1)
        # Physics vector of the input frame: (inlet velocity, time).
        p_x = np.hstack([p,self.t[r_idx]])
        # One (inlet velocity, time) row per target frame -> shape (w, 2).
        p_y = np.vstack([p*np.ones((self.w,)),self.t[r_idx+1:r_idx+self.w+1]]).T
        X = x.astype('float32')
        Y = y.astype('float32')
        if self.AE:
            return X,X # this allows LR_finder to work
        else:
            return X, Y, p_x, p_y


In [None]:
# Both splits are built with identical options; only the file list differs.
datasetKwargs = dict(doPreprocess=preprocess, numToKeep=numSamplesToKeep,
                     channel=channel, AE=AE, w=window_size)
testDataset = CCSI_2D(testSimFiles, **datasetKwargs)
trainDataset = CCSI_2D(trainSimFiles, **datasetKwargs)
len(testDataset), len(trainDataset)

(5000, 20000)

In [None]:
# Options shared by both loaders; only the training loader shuffles and
# drops the last incomplete batch.
loaderKwargs = dict(batch_size=bz, num_workers=numWorkers, pin_memory=True)
trainDataLoader = DataLoader(trainDataset, shuffle=True, drop_last=True, **loaderKwargs)
testDataLoader = DataLoader(testDataset, **loaderKwargs)
len(trainDataLoader), len(testDataLoader)

(2500, 625)

In [None]:
X,Y,p_x, p_y = next(iter(trainDataLoader))
print(X.shape,Y.shape,p_x.shape, p_y.shape)

torch.Size([8, 1, 128, 128]) torch.Size([8, 5, 1, 128, 128]) torch.Size([8, 2]) torch.Size([8, 5, 2])


## Model

In [None]:
# The sample batch is passed to the constructor, so it has to be on the
# target device first.
X = X.to(device)
AE_model = AE_xhat_zV2(X, filters, latentDim, num_conv, repeat, 
                 skip_connection, stack, conv_k=3, last_k=3, 
                 act=nn.LeakyReLU(), return_z=True, stream=createStreamFcn, device=device)

# The autoencoder is deliberately NOT wrapped in nn.DataParallel here.
# The previous multi-GPU branch called nn.DataParallel(model) with an
# undefined name (NameError), and wrapping AE_model would hide the
# AE_model.encoder / AE_model.generator attributes that the Surrogate is
# built from below. Multi-GPU wrapping is applied to the full surrogate
# once it has been assembled.

[128, 8, 8]


In [None]:
printNumModelParams(AE_model)

154 layers require gradients (unfrozen) out of 154 layers
8,761,361 parameters require gradients (unfrozen) out of 8,761,361 parameters


In [None]:
Xhat,z = AE_model(X)
Xhat.shape, z.shape

(torch.Size([8, 1, 128, 128]), torch.Size([8, 16]))

In [None]:
# AE_model.load_state_dict(torch.load(os.path.join('/home/widemann1/surrogates4sims/cps',
# 'plateau_train_GPUs2_latentDim16_filters128_bz16_numConv4_streamFalse_jacobianFalse_epochs1000_stackTrue_lr0.0001')))

In [None]:
# LIN Model
class MLP(nn.Module):
    """Fully connected network whose output size equals its input size.

    Layout: Linear(in, h0) -> act -> Linear(h0, h1) -> act -> ... ->
    Linear(h_last, in), where ``in`` is the number of features per sample of
    the example tensor ``X``.

    Args:
        X: example input; only ``X.shape[1:]`` is used to size the network.
        hiddenLayerSizes: widths of the hidden layers (not mutated).
        activation: module applied after every layer except the last. The
            same instance is reused at every position.
    """

    def __init__(self, X, hiddenLayerSizes = [1024], activation=nn.ELU()):
        super(MLP,self).__init__()
        
        self.activation = activation
        self.inputSize = X.shape[1:]
        # Plain Python int so nn.Linear is not handed a numpy scalar.
        numFeatures = int(np.prod(self.inputSize))

        # Build the layer list in a local variable. Storing it as
        # `self.modules` would shadow nn.Module.modules() and make
        # `mlp.modules()` raise "'list' object is not callable".
        widths = [numFeatures] + list(hiddenLayerSizes)
        layerList = []
        for fanIn, fanOut in zip(widths[:-1], widths[1:]):
            layerList.append(nn.Linear(fanIn, fanOut))
            layerList.append(self.activation)
        layerList.append(nn.Linear(widths[-1], numFeatures))
        self.layers = nn.Sequential(*layerList)
                                
        
    def forward(self,x):
        """Apply the layer stack to ``x`` (last dimension = feature count)."""
        x = self.layers(x)
        return x

In [None]:
LIN_model = MLP(z, hiddenLayerSizes=hiddenLayers, activation=activation)
LIN_model

MLP(
  (activation): ELU(alpha=1.0)
  (layers): Sequential(
    (0): Linear(in_features=16, out_features=128, bias=True)
    (1): ELU(alpha=1.0)
    (2): Linear(in_features=128, out_features=128, bias=True)
    (3): ELU(alpha=1.0)
    (4): Linear(in_features=128, out_features=16, bias=True)
  )
)

In [None]:
# surrogate class
class Surrogate(nn.Module):
    """Latent-space surrogate: encode one frame, roll the code forward, decode.

    The last ``p_size`` entries of every latent code are overwritten with the
    known physics parameters, so the effective latent state has
    ``z_size + p_size`` entries.

    Args:
        window: default number of steps to roll forward.
        z_size: number of learned latent entries (excluding the physics part).
        p_size: number of physics entries stored at the end of each code.
        LIN: module predicting the change of a code over one step.
        encoder: module mapping a frame batch to codes.
        decoder: module mapping codes back to frames.
    """
    
    def __init__(self, window,
                 z_size, p_size,
                LIN, encoder, decoder):
        super(Surrogate, self).__init__()
        self.window = window
        self.z_size = z_size # this does not include the size of p
        self.p_size = p_size
        self.c_size = z_size + p_size # this does include the size of p
        self.LIN = LIN
        self.encoder = encoder
        self.decoder = decoder
        
    def encode(self, U):
        """Return the encoder output for the frame batch ``U``."""
        return self.encoder(U)
        
    def decode(self, encoding):
        """Return the decoder output for a batch of codes."""
        return self.decoder(encoding)
        
    def predict_next_w_encodings(self, encoding, p_y, window):
        '''
        use the LIN to predict the next w encodings for each 
        encoded U in the batch

        ``encoding`` is indexed as [batch, code]; ``p_y[:, i]`` supplies the
        physics entries for step ``i``. Returns the ``window`` predicted codes
        stacked along a new leading axis (step first, then batch).
        '''
            
        predicted_encodings = []
            
        # given a batch of encodings, advance each encoding window time steps.
        # save the result at each time step
        for i in range(window):
            # The sum creates a fresh tensor, so the in-place write below
            # never touches a code already stored in the list.
            encoding = self.LIN(encoding) + encoding # use LIN to predict delta in encoding
            # Replace the physics slots with the known values for this step.
            encoding[:,-self.p_size:] = p_y[:, i]
            predicted_encodings.append(encoding)
            
        return torch.stack(predicted_encodings)
    
    def forward(self, U, p_x, p_y, window = None):
        """Predict the ``window`` frames that follow ``U``.

        Args:
            U: batch of input frames.
            p_x: physics parameters of the input frames, one row per sample.
            p_y: physics parameters of the frames to predict; its dimension 1
                must have length ``window``.
            window: number of steps; defaults to ``self.window``.

        Returns:
            The decoded predictions with the batch axis first and the step
            axis second.
        """
        
        if window is None:
            window = self.window
        assert p_y.size(1) == window, "p_y.size(1) must equal window"
            
        encoding = self.encode(U)
        # Overwrite the physics slots of the encoder output with the true values.
        encoding[:,-self.p_size:] = p_x # added this on 10/27/2020
        encoding_w = self.predict_next_w_encodings(encoding, p_y, window)
        # encoding_w is stacked step-first; decode each step, then swap the
        # first two axes so the result is batch-first like the targets.
        U = torch.stack([self.decode(encoding_i) for encoding_i in encoding_w])
        return U.transpose(0,1)
    

In [None]:
surrogate = Surrogate(window_size, latentDim - physicsDim, physicsDim, LIN_model, AE_model.encoder, AE_model.generator)

In [None]:
surrogate = surrogate.to(device)

In [None]:
encoding = surrogate.encode(X)
encoding.shape

torch.Size([8, 16])

In [None]:
decoding = surrogate.decode(encoding)
decoding.shape

torch.Size([8, 1, 128, 128])

In [None]:
assert surrogate.c_size == latentDim
assert surrogate.p_size == physicsDim
assert encoding.shape[-1] == surrogate.c_size
assert decoding.shape == X.shape

In [None]:
# Smoke-test the full forward pass. Call the module itself rather than
# .forward() so that nn.Module.__call__ (and any registered hooks) runs, and
# move the physics tensors to the model's device explicitly instead of relying
# on an implicit cross-device copy inside the model.
Xhat = surrogate(X, p_x.to(device), p_y.to(device))
Xhat.shape

torch.Size([8, 5, 1, 128, 128])

In [None]:
# Drop the objects created by the sanity checks above before building the
# surrogate that will actually be trained.
del surrogate, encoding, decoding, X, Y

surrogate = Surrogate(window_size, latentDim - physicsDim, physicsDim,
                      LIN_model, AE_model.encoder, AE_model.generator)
surrogate = surrogate.to(device)

# More than one GPU id was listed -> replicate the model across them.
if ',' in gpu_ids:
    surrogate = nn.DataParallel(surrogate)

In [None]:
# Learning-rate settings. start_lr is not used by the Adam/plateau setup below.
max_lr = .0001
start_lr = 5*max_lr/10
#opt = create_opt(max_lr,model)
#lr_scheduler = create_one_cycle(opt,max_lr,epochs,trainDataLoader)

# Adam over every surrogate parameter, with the LR lowered by
# ReduceLROnPlateau when the monitored value stops improving.
opt = torch.optim.Adam(surrogate.parameters(), lr=max_lr, betas=(.5, .999))
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=patience)

In [None]:
def L1_loss(pred, target):
    """Mean absolute difference between ``pred`` and ``target``."""
    abs_err = torch.abs(pred - target)
    return torch.mean(abs_err)


def jacobian_loss(pred, target, device='cpu'):
    """L1 loss between ``jacobian(pred, device)`` and ``jacobian(target, device)``."""
    pred_jac = jacobian(pred, device)
    target_jac = jacobian(target, device)
    return L1_loss(pred_jac, target_jac)


def curl_loss(pred, target, device):
    """L1 loss between ``curl(pred, device)`` and ``curl(target, device)``."""
    pred_curl = curl(pred, device)
    target_curl = curl(target, device)
    return L1_loss(pred_curl, target_curl)


# Mean-squared-error criterion used for the physics-parameter loss.
L = nn.MSELoss()


def p_loss(pred, target):
    """MSE between the trailing columns of ``pred`` and ``target``.

    As many trailing columns of ``pred`` are compared as ``target`` has columns.
    """
    num_cols = target.shape[1]
    trailing = pred[:, -num_cols:]
    return L(trailing, target)


def loss(pred, target, device):
    """Reconstruction loss: L1, plus the Jacobian L1 term when ``doJacobian`` is set.

    When ``createStreamFcn`` is set, ``pred`` is first passed through
    ``stream2uv`` before being compared with ``target``.
    """
    if createStreamFcn:
        pred = stream2uv(pred, device)

    l1_term = L1_loss(pred, target)
    jac_term = jacobian_loss(pred, target, device) if doJacobian else 0
    return l1_term + jac_term

In [None]:
def trainEpoch(myDataLoader, tensorboard_writer, model, opt, p_loss, loss,
               metric, lr_scheduler, tensorboard_rate, device,
               tensorboard_recorder_step, total_steps):
    """Run one training pass over ``myDataLoader``.

    Args:
        myDataLoader: iterable of ``(U_x, U_y, p_x, p_y)`` batches with a length.
        tensorboard_writer: object exposing ``add_scalar`` and ``flush``.
        model: called as ``model(U_x, p_x, p_y)``.
        opt: optimizer stepped once per batch.
        p_loss: accepted for interface compatibility; the physics loss is
            currently fixed at 0 and this argument is not used.
        loss: called as ``loss(U_hat, U_y, device)``; must return a tensor.
        metric: called as ``metric(U_hat, U_y)``; its ``__name__`` is the tag.
        lr_scheduler: accepted but not stepped here.
        tensorboard_rate: number of batches averaged per tensorboard point.
        device: device the batch tensors are moved to.
        tensorboard_recorder_step: running tensorboard step counter.
        total_steps: running count of optimizer steps (tag "LR").

    Returns:
        ``(mean batch loss, tensorboard_recorder_step, total_steps)`` with both
        counters advanced.
    """
    running_loss = 0.0
    running_rmse = 0.0
    total_loss = 0.0
    running_ploss = 0.0
    for i, sampleBatch in enumerate(myDataLoader, start=1):

        # --- Main Training ---

        U_x, U_y, p_x, p_y = sampleBatch
        U_x = U_x.to(device)
        p_x = p_x.to(device)
        U_y = U_y.to(device)
        p_y = p_y.to(device)

        # zero the parameter gradients
        opt.zero_grad()

        U_hat = model(U_x, p_x, p_y)
        pl = 0  # physics loss placeholder (disabled)
        ll = loss(U_hat, U_y, device)
        combined_loss = pl + ll
        combined_loss.backward()
        opt.step()

        # loss bookkeeping (plain Python floats)
        batch_loss = combined_loss.item()
        running_loss += batch_loss
        total_loss += batch_loss
        running_ploss += pl

        # --- Metrics Recording ---

        # U_hat is still attached to the autograd graph. Evaluate the metric
        # without gradient tracking and keep only its float value, so the
        # running sum does not hold on to graph history between batches.
        with torch.no_grad():
            running_rmse += float(metric(U_hat, U_y))

        # record lr change
        total_steps += 1
        tensorboard_writer.add_scalar(tag="LR", scalar_value=opt.param_groups[0]['lr'], global_step=total_steps)
        #lr_scheduler.step()

        # tensorboard writes: one averaged point every `tensorboard_rate` batches
        if (i % tensorboard_rate == 0):
            tensorboard_recorder_step += 1
            avg_running_loss = running_loss/tensorboard_rate
            avg_running_rmse = running_rmse/tensorboard_rate
            avg_running_ploss = running_ploss/tensorboard_rate
            tensorboard_writer.add_scalar(tag="Loss", scalar_value=avg_running_loss, global_step=tensorboard_recorder_step)
            tensorboard_writer.add_scalar(tag="p_loss", scalar_value=avg_running_ploss, global_step=tensorboard_recorder_step)
            tensorboard_writer.add_scalar(tag=metric.__name__, scalar_value=avg_running_rmse, global_step=tensorboard_recorder_step)
            # reset the running sums for the next group of batches
            running_loss = 0.0
            running_rmse = 0.0
            running_ploss = 0.0
            tensorboard_writer.flush()

    return total_loss/len(myDataLoader), tensorboard_recorder_step, total_steps

In [None]:
def validEpoch(myDataLoader, tensorboard_writer, model, p_loss, loss, metric,
               device, tensorboard_recorder_step):
    """Evaluate ``model`` on ``myDataLoader`` and log the sample-weighted means.

    Args:
        myDataLoader: loader of ``(U_x, U_y, p_x, p_y)`` batches; must expose
            ``.dataset`` with a length.
        tensorboard_writer: object exposing ``add_scalar`` and ``flush``.
        model: called as ``model(U_x, p_x, p_y, window=...)``.
        p_loss: accepted for interface compatibility; not used.
        loss: called as ``loss(U_hat, U_y, device)``; must return a tensor.
        metric: called as ``metric(U_hat, U_y)``; its ``__name__`` is the tag.
        device: device the batch tensors are moved to.
        tensorboard_recorder_step: global step used for both scalars.

    Returns:
        The loss averaged over all samples (each batch weighted by its size).
    """
    running_loss = 0.0
    running_rmse = 0.0
    num_samples = len(myDataLoader.dataset)
    for sampleBatch in myDataLoader:

        U_x, U_y, p_x, p_y = sampleBatch
        U_x = U_x.to(device)
        p_x = p_x.to(device)
        U_y = U_y.to(device)
        p_y = p_y.to(device)

        # Weight of this batch in the dataset-wide mean (the last batch may
        # be smaller than the others).
        perc = len(U_x)/num_samples

        # forward, no gradient calculations. The roll-out length is taken
        # from the batch itself (dimension 1 of p_y) instead of a notebook
        # global, so the function works for any window length.
        with torch.no_grad():
            U_hat = model(U_x, p_x, p_y, window=p_y.size(1))

        # loss
        combined_loss = loss(U_hat, U_y, device)
        running_loss += perc*(combined_loss.item())

        # metric, stored as a plain float
        running_rmse += perc*float(metric(U_hat, U_y))

    # The running sums are already weighted means over the whole dataset.
    tensorboard_writer.add_scalar(tag="Loss", scalar_value=running_loss, global_step=tensorboard_recorder_step)
    tensorboard_writer.add_scalar(tag=metric.__name__, scalar_value=running_rmse, global_step=tensorboard_recorder_step)
    tensorboard_writer.flush()
    
    return running_loss

In [None]:
# Create the checkpoint directory. Only the "already exists" case is treated
# as benign; any other failure (e.g. a permissions error) now surfaces instead
# of being reported as "already exists".
try:
    os.mkdir(cps)
except FileExistsError:
    print("checkpoints directory already exists :)")
    
# create the summary writers: separate runs for training and validation curves.
train_writer = SummaryWriter(os.path.join(tensorboard_direc, versionName,'train'))
test_writer = SummaryWriter(os.path.join(tensorboard_direc, versionName,'valid'))
# Counters threaded through trainEpoch/validEpoch across epochs.
tensorboard_recorder_step = 0
total_steps = 0

checkpoints directory already exists :)


In [None]:
writeMessage('---------- Started Training ----------', versionName)
# np.inf rather than np.infty: the `infty` alias was removed in NumPy 2.0.
bestLoss = np.inf

if not eval_only:
    for epoch in tqdm(range(1, epochs+1)):  # loop over the dataset multiple times

        writeMessage("--- Epoch {0}/{1} ---".format(epoch, epochs), versionName)

        # --- train for one epoch ---
        surrogate.train()
        trainLoss, tensorboard_recorder_step, total_steps = trainEpoch(
            trainDataLoader, train_writer, surrogate,
            opt, p_loss, loss,
            rmse, lr_scheduler,
            tensorboard_rate, device,
            tensorboard_recorder_step, total_steps)

        writeMessage("trainLoss: {:.4e}".format(trainLoss),versionName)
        writeMessage("LR: {:.4e}".format(opt.param_groups[0]['lr']),versionName)

        # --- validate ---
        surrogate.eval()
        valLoss = validEpoch(testDataLoader, test_writer, surrogate, p_loss, loss, rmse, device, tensorboard_recorder_step)
        writeMessage("valLoss: {:.4e}".format(valLoss),versionName)

        # checkpoint progress: keep the weights with the best validation loss
        if valLoss < bestLoss:
            bestLoss = valLoss
            writeMessage("Better valLoss: {:.4e}, Saving models...".format(bestLoss),versionName)
            torch.save(surrogate.state_dict(), os.path.join(cps,versionName))

        # NOTE(review): the plateau scheduler monitors the TRAINING loss while
        # checkpointing uses the validation loss -- confirm this is intended.
        lr_scheduler.step(trainLoss)

        # Stop once the learning rate has decayed below a useful value.
        if opt.param_groups[0]['lr'] < 5e-8:
            break
    writeMessage('---------- Finished Training ----------', versionName)


  0%|          | 0/1000 [00:00<?, ?it/s][A

---------- Started Training ----------
--- Epoch 1/1000 ---
trainLoss: 6.9081e-02
LR: 1.0000e-04



  0%|          | 1/1000 [15:14<253:52:02, 914.84s/it][A

valLoss: 6.2116e-02
Better valLoss: 6.2116e-02, Saving models...
--- Epoch 2/1000 ---
trainLoss: 6.3304e-02
LR: 1.0000e-04



  0%|          | 2/1000 [30:34<254:00:00, 916.23s/it][A

valLoss: 6.0629e-02
Better valLoss: 6.0629e-02, Saving models...
--- Epoch 3/1000 ---
trainLoss: 6.1575e-02
LR: 1.0000e-04



  0%|          | 3/1000 [45:53<253:57:58, 917.03s/it][A

valLoss: 6.0476e-02
Better valLoss: 6.0476e-02, Saving models...
--- Epoch 4/1000 ---
trainLoss: 6.0795e-02
LR: 1.0000e-04



  0%|          | 4/1000 [1:01:12<253:55:43, 917.81s/it][A

valLoss: 6.1229e-02
--- Epoch 5/1000 ---
trainLoss: 6.0317e-02
LR: 1.0000e-04



  0%|          | 5/1000 [1:16:31<253:44:13, 918.04s/it][A

valLoss: 6.2150e-02
--- Epoch 6/1000 ---
trainLoss: 6.0427e-02
LR: 1.0000e-04



  1%|          | 6/1000 [1:31:49<253:30:31, 918.14s/it][A

valLoss: 6.6611e-02
--- Epoch 7/1000 ---
trainLoss: 6.1088e-02
LR: 1.0000e-04



  1%|          | 7/1000 [1:47:07<253:14:27, 918.09s/it][A

valLoss: 6.3687e-02
--- Epoch 8/1000 ---
trainLoss: 5.6602e-02
LR: 1.0000e-05



  1%|          | 8/1000 [2:02:26<253:02:29, 918.30s/it][A

valLoss: 6.2280e-02
--- Epoch 9/1000 ---
trainLoss: 5.5889e-02
LR: 1.0000e-05



  1%|          | 9/1000 [2:17:43<252:38:38, 917.78s/it][A

valLoss: 6.1556e-02
--- Epoch 10/1000 ---
trainLoss: 5.5533e-02
LR: 1.0000e-05



  1%|          | 10/1000 [2:33:01<252:27:46, 918.05s/it][A

valLoss: 6.1708e-02
--- Epoch 11/1000 ---
trainLoss: 5.5134e-02
LR: 1.0000e-05



  1%|          | 11/1000 [2:48:20<252:16:04, 918.27s/it][A

valLoss: 6.1597e-02
--- Epoch 12/1000 ---
trainLoss: 5.4646e-02
LR: 1.0000e-05



  1%|          | 12/1000 [3:03:42<252:19:58, 919.43s/it][A

valLoss: 6.1685e-02
--- Epoch 13/1000 ---
trainLoss: 5.4444e-02
LR: 1.0000e-05



  1%|▏         | 13/1000 [3:19:13<252:58:21, 922.70s/it][A

valLoss: 6.1534e-02
--- Epoch 14/1000 ---
trainLoss: 5.4087e-02
LR: 1.0000e-05



  1%|▏         | 14/1000 [3:34:43<253:22:04, 925.08s/it][A

valLoss: 6.1365e-02
--- Epoch 15/1000 ---
trainLoss: 5.3776e-02
LR: 1.0000e-05



  2%|▏         | 15/1000 [3:50:13<253:30:45, 926.54s/it][A

valLoss: 6.1650e-02
--- Epoch 16/1000 ---
trainLoss: 5.3464e-02
LR: 1.0000e-05



  2%|▏         | 16/1000 [4:05:44<253:36:37, 927.84s/it][A

valLoss: 6.1395e-02
--- Epoch 17/1000 ---
trainLoss: 5.3136e-02
LR: 1.0000e-05



  2%|▏         | 17/1000 [4:21:14<253:29:15, 928.34s/it][A

valLoss: 6.1444e-02
--- Epoch 18/1000 ---
trainLoss: 5.2866e-02
LR: 1.0000e-05



  2%|▏         | 18/1000 [4:36:42<253:16:32, 928.51s/it][A

valLoss: 6.1289e-02
--- Epoch 19/1000 ---
trainLoss: 5.2556e-02
LR: 1.0000e-05



  2%|▏         | 19/1000 [4:52:14<253:15:40, 929.40s/it][A

valLoss: 6.1336e-02
--- Epoch 20/1000 ---
trainLoss: 5.2254e-02
LR: 1.0000e-05



  2%|▏         | 20/1000 [5:07:46<253:12:04, 930.13s/it][A

valLoss: 6.1342e-02
--- Epoch 21/1000 ---
trainLoss: 5.2029e-02
LR: 1.0000e-05



  2%|▏         | 21/1000 [5:23:13<252:40:34, 929.15s/it][A

valLoss: 6.1342e-02
--- Epoch 22/1000 ---
trainLoss: 5.1746e-02
LR: 1.0000e-05



  2%|▏         | 22/1000 [5:38:42<252:24:26, 929.11s/it][A

valLoss: 6.1392e-02
--- Epoch 23/1000 ---
trainLoss: 5.1505e-02
LR: 1.0000e-05



  2%|▏         | 23/1000 [5:54:10<252:06:26, 928.95s/it][A

valLoss: 6.1375e-02
--- Epoch 24/1000 ---
trainLoss: 5.1168e-02
LR: 1.0000e-05



  2%|▏         | 24/1000 [6:09:40<251:54:32, 929.17s/it][A

valLoss: 6.1312e-02
--- Epoch 25/1000 ---
trainLoss: 5.1037e-02
LR: 1.0000e-05



  2%|▎         | 25/1000 [6:25:08<251:32:53, 928.79s/it][A

valLoss: 6.1460e-02
--- Epoch 26/1000 ---
trainLoss: 5.0715e-02
LR: 1.0000e-05



  3%|▎         | 26/1000 [6:40:37<251:19:18, 928.91s/it][A

valLoss: 6.1553e-02
--- Epoch 27/1000 ---
trainLoss: 5.0507e-02
LR: 1.0000e-05



  3%|▎         | 27/1000 [6:56:04<250:54:29, 928.33s/it][A

valLoss: 6.1449e-02
--- Epoch 28/1000 ---
trainLoss: 5.0234e-02
LR: 1.0000e-05



  3%|▎         | 28/1000 [7:11:32<250:36:56, 928.21s/it][A

valLoss: 6.1584e-02
--- Epoch 29/1000 ---
trainLoss: 5.0147e-02
LR: 1.0000e-05



  3%|▎         | 29/1000 [7:27:00<250:23:28, 928.33s/it][A

valLoss: 6.1377e-02
--- Epoch 30/1000 ---
trainLoss: 4.9900e-02
LR: 1.0000e-05



  3%|▎         | 30/1000 [7:42:31<250:18:12, 928.96s/it][A

valLoss: 6.1495e-02
--- Epoch 31/1000 ---
trainLoss: 4.9643e-02
LR: 1.0000e-05



  3%|▎         | 31/1000 [7:58:01<250:10:33, 929.45s/it][A

valLoss: 6.1497e-02
--- Epoch 32/1000 ---
trainLoss: 4.9430e-02
LR: 1.0000e-05



  3%|▎         | 32/1000 [8:13:29<249:43:20, 928.72s/it][A

valLoss: 6.1541e-02
--- Epoch 33/1000 ---
trainLoss: 4.9107e-02
LR: 1.0000e-05



  3%|▎         | 33/1000 [8:28:54<249:14:07, 927.87s/it][A

valLoss: 6.1691e-02
--- Epoch 34/1000 ---
trainLoss: 4.9089e-02
LR: 1.0000e-05



  3%|▎         | 34/1000 [8:44:22<248:57:35, 927.80s/it][A

valLoss: 6.1704e-02
--- Epoch 35/1000 ---
trainLoss: 4.8869e-02
LR: 1.0000e-05



  4%|▎         | 35/1000 [8:59:52<248:52:13, 928.43s/it][A

valLoss: 6.1683e-02
--- Epoch 36/1000 ---
trainLoss: 4.8714e-02
LR: 1.0000e-05



  4%|▎         | 36/1000 [9:15:17<248:22:16, 927.53s/it][A

valLoss: 6.1638e-02
--- Epoch 37/1000 ---
trainLoss: 4.8488e-02
LR: 1.0000e-05



  4%|▎         | 37/1000 [9:30:47<248:17:05, 928.17s/it][A

valLoss: 6.1660e-02
--- Epoch 38/1000 ---
trainLoss: 4.8318e-02
LR: 1.0000e-05



  4%|▍         | 38/1000 [9:46:16<248:06:30, 928.47s/it][A

valLoss: 6.1627e-02
--- Epoch 39/1000 ---
trainLoss: 4.8187e-02
LR: 1.0000e-05



  4%|▍         | 39/1000 [10:01:48<248:04:47, 929.33s/it][A

valLoss: 6.1790e-02
--- Epoch 40/1000 ---
trainLoss: 4.7964e-02
LR: 1.0000e-05



  4%|▍         | 40/1000 [10:17:17<247:49:26, 929.34s/it][A

valLoss: 6.1849e-02
--- Epoch 41/1000 ---
trainLoss: 4.7830e-02
LR: 1.0000e-05



  4%|▍         | 41/1000 [10:32:45<247:30:04, 929.10s/it][A

valLoss: 6.1872e-02
--- Epoch 42/1000 ---
trainLoss: 4.7658e-02
LR: 1.0000e-05



  4%|▍         | 42/1000 [10:48:17<247:24:36, 929.72s/it][A

valLoss: 6.1798e-02
--- Epoch 43/1000 ---
trainLoss: 4.7534e-02
LR: 1.0000e-05



  4%|▍         | 43/1000 [11:03:43<246:54:42, 928.82s/it][A

valLoss: 6.1886e-02
--- Epoch 44/1000 ---
trainLoss: 4.7337e-02
LR: 1.0000e-05



  4%|▍         | 44/1000 [11:19:14<246:47:07, 929.32s/it][A

valLoss: 6.2006e-02
--- Epoch 45/1000 ---
trainLoss: 4.7217e-02
LR: 1.0000e-05



  4%|▍         | 45/1000 [11:34:44<246:38:06, 929.72s/it][A

valLoss: 6.1887e-02
--- Epoch 46/1000 ---
trainLoss: 4.7098e-02
LR: 1.0000e-05



  5%|▍         | 46/1000 [11:50:16<246:31:22, 930.28s/it][A

valLoss: 6.1721e-02
--- Epoch 47/1000 ---
trainLoss: 4.6961e-02
LR: 1.0000e-05



  5%|▍         | 47/1000 [12:05:47<246:20:21, 930.56s/it][A

valLoss: 6.1832e-02
--- Epoch 48/1000 ---
trainLoss: 4.6794e-02
LR: 1.0000e-05



  5%|▍         | 48/1000 [12:21:14<245:45:55, 929.36s/it][A

valLoss: 6.2191e-02
--- Epoch 49/1000 ---
trainLoss: 4.6612e-02
LR: 1.0000e-05



  5%|▍         | 49/1000 [12:36:43<245:30:19, 929.36s/it][A

valLoss: 6.1939e-02
--- Epoch 50/1000 ---
trainLoss: 4.6480e-02
LR: 1.0000e-05



  5%|▌         | 50/1000 [12:52:09<244:55:58, 928.17s/it][A

valLoss: 6.1997e-02
--- Epoch 51/1000 ---
trainLoss: 4.6350e-02
LR: 1.0000e-05



  5%|▌         | 51/1000 [13:07:38<244:45:46, 928.50s/it][A

valLoss: 6.2077e-02
--- Epoch 52/1000 ---
trainLoss: 4.6242e-02
LR: 1.0000e-05



  5%|▌         | 52/1000 [13:23:06<244:29:10, 928.43s/it][A

valLoss: 6.2153e-02
--- Epoch 53/1000 ---


KeyboardInterrupt: 

In [None]:
# Reload the checkpoint saved by the training loop. map_location keeps the
# load working when the checkpoint was written from a different device than
# the one in use now.
surrogate.load_state_dict(torch.load(os.path.join(cps,versionName), map_location=device))

In [None]:
# Dataset whose target window spans a whole simulation (all simLen-1 frames
# after the first one).
testDataset_fullSim = CCSI_2D(testSimFiles,doPreprocess=preprocess,numToKeep=numSamplesToKeep,channel=channel,AE=AE,
                      w=simLen-1)
# One item per test simulation: every simLen-th index. The range is derived
# from the dataset being subset (it previously used len(testDataset), a
# different object that merely has the same length).
first_frame_testDataset = torch.utils.data.Subset(testDataset_fullSim, range(0, len(testDataset_fullSim), simLen))
simulation_testDataLoader = DataLoader(dataset=first_frame_testDataset, batch_size=1)

In [None]:
X,Y, p_x, p_y = next(iter(simulation_testDataLoader))
X.shape,Y.shape, p_x.shape, p_y.shape

(torch.Size([1, 1, 128, 128]),
 torch.Size([1, 499, 1, 128, 128]),
 torch.Size([1, 2]),
 torch.Size([1, 499, 2]))

In [None]:
# Roll the surrogate out over every full test simulation and collect the
# ground truth and the prediction on the CPU.
surrogate.eval()
Us, U_hats = [], []
with torch.no_grad():
    for i, sampleBatch in enumerate(simulation_testDataLoader, start=1):
        U_x, U_y, p_x, p_y = [tensor.to(device) for tensor in sampleBatch]

        U_hat = surrogate(U_x, p_x, p_y, window=simLen-1)

        Us.append(U_y.detach().cpu())
        U_hats.append(U_hat.detach().cpu())

# Stack the per-simulation results along a new leading axis.
Real_U = torch.stack(Us)
#Real_X_img = convertSimToImage(Real_X)

Surr_U = torch.stack(U_hats)
#Surr_X_img = convertSimToImage(Surr_X)

In [None]:
# Relative error of the roll-out for each test simulation:
# ||truth - prediction|| / ||truth||.
for a, b in zip(Us, U_hats):
    err_norm = torch.norm(a - b)
    rel_error = err_norm/torch.norm(a)
    writeMessage("Relative_Error: {:.4e}".format(rel_error),versionName)

Relative_Error: 9.7969e-01
Relative_Error: 9.7679e-01
Relative_Error: 9.7701e-01
Relative_Error: 9.7978e-01
Relative_Error: 9.7920e-01
Relative_Error: 9.7219e-01
Relative_Error: 9.7175e-01
Relative_Error: 9.7451e-01
Relative_Error: 9.7328e-01
Relative_Error: 9.7342e-01


In [None]:
# Relative error over all test simulations at once, written to the log and
# to the validation tensorboard run.
err_norm = torch.norm(Real_U - Surr_U)
rel_error = err_norm/torch.norm(Real_U)
writeMessage("Relative_Error: {:.4e}".format(rel_error),versionName)
test_writer.add_scalar(tag="Relative_Error", scalar_value=rel_error, global_step=tensorboard_recorder_step)
test_writer.flush()