## Imports

In [None]:
# EXPORT
# --- Must haves ---
import os, sys
sys.path.append('..')

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.cuda as cuda
import torch.nn as nn
import torchvision
import torch.nn.functional as F

from surrogates4sims.datasets import MantaFlowDataset, getSingleSim, createMantaFlowTrainTest

from surrogates4sims.utils import create_opt, create_one_cycle, find_lr, printNumModelParams, \
                                    rmse, writeMessage, plotSampleWprediction, plotSampleWpredictionByChannel, \
                                    plotSample, curl, jacobian, stream2uv, create_movie, convertSimToImage

#from surrogates4sims.models import Generator, Encoder, AE_no_P, AE_xhat_z, AE_xhat_zV2

from surrogates4sims.train import trainEpoch, validEpoch

from surrogates4sims.svd import MantaFlowSVDDataset

import numpy as np
from tqdm import tqdm
from copy import deepcopy
import matplotlib.pyplot as plt

## Settings

In [None]:
DEBUG = False
# Model name, used for TensorBoard run names and checkpoint file names.
versionName = "LIN_SVD_only_z_and_p_LSTM_bidirec"

# GPU numbers to use. Comma-separate them for multi-GPU runs.
gpu_ids = "2"#,1,2,3"
versionName = versionName + '_GPUs{}'.format(gpu_ids.replace(',',''))
# Path to load model weights from.
pretrained_path = None

# Rate at which to record metrics (number of batches to average over when
# recording metrics, e.g. "every 5 batches").
tensorboard_rate = 5

# Number of epochs to train. This is defined here so we can use the OneCycle LR Scheduler.
epochs = 1000

# Data directory
dataDirec = '/data/mantaFlowSim/data/smoke_pos21_size5_f200/v'
reverseXY = False

# Checkpoint and TensorBoard directories
cps = 'cps'
tensorboard_direc = "tb"

findLRs = True
patience = 10

# hyper-params
seed = 1234
np.random.seed(seed)
testSplit = .1
bz = 4
# np.inf instead of the np.infty alias, which NumPy 2.0 removed.
numSamplesToKeep = np.inf  # if not debugging
latentDim = 512
simLen = 200
simVizIndex = 0 # sim in the test set to visualize
numComponents = latentDim
transform = True
if DEBUG:
    epochs = 5000
    numSamplesToKeep = 1000

# The run name encodes the main hyper-parameters so runs are distinguishable.
versionName = versionName + '_latentDim{}_bz{}_transform{}_epochs{}'.format(latentDim,bz,transform,epochs)
versionName

### Select Personal GPUs

In [None]:
!nvidia-smi

In [None]:
# Enumerate GPUs in PCI bus order.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# Restrict CUDA to the GPU ids configured in gpu_ids.
os.environ["CUDA_VISIBLE_DEVICES"]=gpu_ids

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

In [None]:
if device.type == 'cuda':
    print(cuda.is_available())
    print(cuda.device_count())
    print(cuda.current_device())
    print(cuda.get_device_name())

In [None]:
a = torch.zeros(5, device=device.type)
!nvidia-smi

## Datasets & Loaders

In [None]:
# A = np.load('svd_out.npz')
# vh = A['arr_2']
# vh.shape
# data = []
# numComp = 512
# for idx in range(105):
#     D = getSingleSim(idx)
#     simDataset = MantaFlowDataset(D)
#     Z = []
#     P = []
#     X0, p = simDataset[0]
#     for sample in simDataset:
#         f, p = sample
#         coeffs = vh[:numComp]@f.flatten()
#         Z.append(coeffs)
#         P.append(p)
#     y = np.array(Z)
#     P = np.array(P)
#     X = (X0,P,y[0])
#     data.append((X,y[1:]))
# with open('simLatentVectors.pkl','wb') as fid:
#     pickle.dump(data,fid)    

In [None]:
import pickle
# Load the per-simulation latent data from disk.
# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# files that come from a trusted source.
with open('simLatentVectors.pkl','rb') as fid:
    data = pickle.load(fid)
len(data)

In [None]:
data[0][0][0].shape, data[0][0][1].shape, data[0][0][2].shape, data[0][1].shape

In [None]:
trainData, testData = createMantaFlowTrainTest(dataDirec,simLen,testSplit,seed)
print((len(trainData),len(testData)))

In [None]:
from glob import glob
# Collect every .npz file in the data directory, in sorted order.
d = glob(os.path.join(dataDirec,'*.npz'))
d = sorted(d)
simLen = 200
# Number of whole simulations — assumes each one contributes simLen files (TODO confirm).
numSims = len(d)//simLen
numTestSamples = int(np.round(testSplit*numSims))
# Re-seed so the permutation below is reproducible for a given seed.
np.random.seed(seed)
perm = np.random.permutation(numSims)
# The first numTestSamples permuted indices form the test set; the rest are training sims.
testSims = perm[:numTestSamples]
trainSims = perm[numTestSamples:]
testSims, trainSims

In [None]:
testDataset = [data[i] for i in testSims]
trainDataset = [data[i] for i in trainSims]
len(testDataset), len(trainDataset)

In [None]:
# Per-simulation column-wise maxima (D) and minima (E) over the training set.
# NOTE: X and y keep their values from the last loop iteration after this cell;
# the transform sanity-check cells below read y.
D = []
E = []
for X,y in trainDataset:
    z = X[2]
    z = z.reshape(1,len(z))
    # Prepend X[2] as an extra first row of y before taking the extrema.
    y = np.concatenate([z,y])
    D.append(y.max(axis=0))
    E.append(y.min(axis=0))
D = np.array(D)
E = np.array(E)
# Reduce across simulations to a single max / min per column.
ymx = D.max(axis=0)
ymn = E.min(axis=0)
ymx, ymn

In [None]:
def transformY(y,ymx,ymn):
    """Rescale ``y`` against the bounds so that ``ymx`` maps to 0 and ``ymn`` to 1."""
    span = ymx - ymn
    distance_from_max = ymx - y
    return distance_from_max / span

def inverseTransformY(y,ymx,ymn):
    """Map a rescaled value back to the original range: 0 -> ``ymx``, 1 -> ``ymn``."""
    span = ymx - ymn
    scaled = y * span
    return ymx - scaled

In [None]:
a = transformY(y,ymx,ymn)
a.max(), a.min()

In [None]:
b = inverseTransformY(a,ymx,ymn)
np.abs(y - b).max()

In [None]:
class LatentSVD(Dataset):
    """Dataset yielding one whole simulation per item.

    Each element of ``data`` is a pair ``(X, y)``. ``X[1]`` is iterated as one
    parameter vector per time step and ``X[2]`` is a latent vector that is
    appended to every parameter vector; ``y`` is a 2-D array of target rows.

    Args:
        data: indexable collection of ``(X, y)`` pairs as described above.
        transform: used only as a flag. When truthy, every row of ``y`` is
            divided by its L2 norm; the object itself is never called.
    """

    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx, return_norm_stats=False):
        """Return ``(X, y)`` or ``(X, y, norm_stats)`` for simulation ``idx``.

        ``X`` is a NumPy array with one row per entry of ``X[1][1:]``, each row
        being that parameter vector followed by ``X[2]``. ``norm_stats`` holds
        the per-row L2 norms of ``y`` with shape ``(len(y), 1)``; when no
        normalisation is applied it is all ones, so ``y * norm_stats`` always
        recovers the unnormalised targets.
        """
        X, y = self.data[idx]
        p = X[1]
        z = X[2]

        norm_stats = None
        if self.transform:
            y = torch.tensor(y)
            norm_stats = y.norm(dim=1).unsqueeze(1)
            y = y / norm_stats

        # The first parameter vector is skipped; every remaining one is paired with z.
        X = np.array([np.concatenate([pp, z]) for pp in p[1:]])

        if not return_norm_stats:
            return X, y

        if norm_stats is None:
            # Previously this path hit an unbound local. Unit norms are the
            # identity for the caller's `y * norm_stats` de-normalisation.
            ref = torch.as_tensor(y)
            norm_stats = torch.ones((ref.shape[0], 1), dtype=ref.dtype)
        return X, y, norm_stats

In [None]:
# Wrap both splits in LatentSVD; the transform is only passed when the
# `transform` setting is enabled (otherwise the default of None applies).
testDataset = LatentSVD(testDataset, transform=transformY if transform else None)
trainDataset = LatentSVD(trainDataset, transform=transformY if transform else None)

In [None]:
import random

class LatentSVDSub(Dataset):
    """Dataset yielding a random fixed-length window from one simulation per item.

    Each element of ``data`` is a pair ``(X, y)``. ``X[1]`` is iterated as one
    parameter vector per time step and ``X[2]`` is a latent vector; ``y`` is a
    2-D array of target rows.

    Args:
        data: indexable collection of ``(X, y)`` pairs as described above.
        transform: used only as a flag. When truthy, every row of ``y`` is
            divided by its L2 norm; the object itself is never called.
        seqLen: number of consecutive steps in each returned window.
    """

    def __init__(self, data, transform=None, seqLen=10):
        self.data = data
        self.transform = transform
        self.seqLen = seqLen

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx, return_norm_stats=False):
        """Return a random window ``(X, y)`` or ``(X, y, norm_stats)`` of simulation ``idx``.

        The window start is drawn with ``random.randint``, so repeated calls
        with the same ``idx`` generally return different windows. In the
        returned ``X`` the columns after the parameters are overwritten with
        the target row immediately preceding the window. ``norm_stats`` holds
        the per-row L2 norms for the window, or ones when no normalisation is
        applied.

        Raises:
            ValueError: if the simulation has too few steps for one window.
        """
        X, y = self.data[idx]
        p = X[1]
        z = X[2]

        norm_stats = None
        if self.transform:
            y = torch.tensor(y)
            norm_stats = y.norm(dim=1).unsqueeze(1)
            y = y / norm_stats

        # The first parameter vector is skipped; every remaining one is paired with z.
        X = np.array([np.concatenate([pp, z]) for pp in p[1:]])

        numSteps = len(X)
        lastStart = numSteps - self.seqLen
        if lastStart < 1:
            raise ValueError(
                "simulation {} has {} steps, need more than seqLen={}".format(
                    idx, numSteps, self.seqLen))

        # Start at 1 or later so that y[rand_i - 1] exists. For 199-step
        # simulations and seqLen=10 this is the original range [1, 189].
        rand_i = random.randint(1, lastStart)
        window = slice(rand_i, rand_i + self.seqLen)
        numParams = X.shape[1] - len(z)

        X = X[window]
        # Replace the latent columns with the target row just before the window.
        X[:, numParams:] = y[rand_i - 1]
        y = y[window]

        if not return_norm_stats:
            return X, y

        if norm_stats is None:
            # Previously this path hit an unbound local. Unit norms are the
            # identity for the caller's `y * norm_stats` de-normalisation.
            ref = torch.as_tensor(y)
            norm_stats = torch.ones((ref.shape[0], 1), dtype=ref.dtype)
        else:
            norm_stats = norm_stats[window]
        return X, y, norm_stats

In [None]:
# LatentSVDSub indexes X[1] / X[2] of the raw (X, y) simulation tuples. If an
# earlier cell (or a re-run of this one) already wrapped the splits in a
# Dataset, unwrap to the underlying `.data` list first; wrapping an existing
# wrapper would hand LatentSVDSub already-processed samples and fail.
testDataset = getattr(testDataset, 'data', testDataset)
trainDataset = getattr(trainDataset, 'data', trainDataset)
if transform:
    testDataset = LatentSVDSub(testDataset, transform=transformY)
    trainDataset = LatentSVDSub(trainDataset, transform=transformY)
else:
    testDataset = LatentSVDSub(testDataset)
    trainDataset = LatentSVDSub(trainDataset)

### Making sure the transform works

In [None]:
# Single-sample loaders for the sanity checks below; both loaders are
# redefined with batch size bz further down.
trainDataLoader = DataLoader(dataset=trainDataset, batch_size=1, shuffle=True, drop_last=True)
testDataLoader = DataLoader(dataset=testDataset, batch_size=1)

In [None]:
X,y = next(iter(trainDataLoader))
X.shape, y.shape

In [None]:
plt.plot(y[0].squeeze().T)
plt.show()

In [None]:
y.max(), y.min()

In [None]:
X.shape

In [None]:
X[:,:,3:].max(),X[:,:,3:].min()

### DataLoaders

In [None]:
# Loaders with batch size bz. The training loader shuffles and drops the
# final incomplete batch; the test loader keeps dataset order and every sample.
trainDataLoader = DataLoader(dataset=trainDataset, batch_size=bz, shuffle=True, drop_last=True)
testDataLoader = DataLoader(dataset=testDataset, batch_size=bz)

In [None]:
len(trainDataLoader)

## Model

In [None]:
# Sizes are read off the last axis of the sample batch (X, y) drawn earlier.
inputSize = X.shape[2]
hiddenSize = y.shape[2]
numLayers = 1
# NOTE(review): versionName contains "bidirec" while this is False — confirm which is intended.
bidirectional = False
batch_first = True
#model = nn.LSTM(inputSize, hiddenSize, numLayers, batch_first=True, bidirectional=bidirectional)

In [None]:
class LSTM(nn.Module):
    """LSTM driven by the leading parameter columns, seeded from the trailing columns.

    ``forward`` indexes its input as ``(batch, time, feature)``. The first
    ``numParams`` features of every step are the LSTM input; the remaining
    features of the *first* step become the initial hidden state, so their
    count must equal ``hiddenSize``.

    Args:
        inputSize: accepted for interface compatibility but not used; the LSTM
            input width is fixed at ``numParams``.
        hiddenSize: hidden width of the LSTM.
        numLayers: number of stacked LSTM layers.
        batch_first: forwarded to ``nn.LSTM``. ``forward`` slices the input as
            batch-first regardless, so other values are unlikely to be
            meaningful (TODO confirm).
        bidirectional: forwarded to ``nn.LSTM``; when True the output's last
            dimension is ``2 * hiddenSize``.
    """

    # Number of leading feature columns fed to the LSTM at every step.
    numParams = 3
    # Upper bound on the number of time steps consumed from the input.
    maxSteps = 100

    def __init__(self,inputSize=3, hiddenSize=512, numLayers=1, batch_first=True, bidirectional=False):
        super(LSTM,self).__init__()
        self.bidirectional = bidirectional
        self.lstm1 = nn.LSTM(self.numParams, hiddenSize, numLayers,
                             batch_first=batch_first, bidirectional=bidirectional)

    def forward(self,x0):
        """Run the LSTM over ``x0`` and return its per-step outputs."""
        latent = x0[:, 0, self.numParams:]
        # nn.LSTM expects one initial state per layer and direction. The same
        # latent seeds each of them; with one unidirectional layer this is
        # exactly the original (1, batch, hidden) state.
        numStates = self.lstm1.num_layers * (2 if self.bidirectional else 1)
        h1_0 = latent.unsqueeze(0).repeat(numStates, 1, 1)
        c1_0 = torch.zeros_like(h1_0)
        x1, _ = self.lstm1(x0[:, :self.maxSteps, :self.numParams], (h1_0, c1_0))
        return x1

In [None]:
model = LSTM(inputSize, hiddenSize, numLayers, batch_first, bidirectional).to(device)
model

In [None]:
printNumModelParams(model)

In [None]:
output = model(X.to(device))
output.shape

In [None]:
if len(gpu_ids.split(',')) > 1:
    model = nn.DataParallel(model)

## Orig Loss Function

In [None]:
output.shape

In [None]:
L = nn.MSELoss()

In [None]:
with torch.no_grad():
    loss = L(output, y.to(device))
loss

## Set LR

In [None]:
# if findLRs and (len(gpu_ids.split(','))==1): # doesn't work for multigpu???
#     opt = create_opt(1e-7,model)
#     find_lr(model,opt,L,device,trainDataLoader)

In [None]:
max_lr = .001
versionName = versionName + '_lr{}'.format(str(max_lr))

versionName

## Train

In [None]:
def trainEpoch(myDataLoader, tensorboard_writer, model, opt, loss,
               metric, lr_scheduler, tensorboard_rate, device,
               tensorboard_recorder_step, total_steps):
    """Run one optimisation pass over ``myDataLoader``.

    Args:
        myDataLoader: iterable of batches; item 0 is the input, item 1 the target.
        tensorboard_writer: object exposing ``add_scalar(tag, scalar_value, global_step)``.
        model: module being trained.
        opt: optimiser stepping ``model``'s parameters.
        loss: callable ``loss(y_hat, y)`` returning a scalar tensor.
        metric: callable ``metric(y_hat, y)``; logged under ``metric.__name__``.
        lr_scheduler: accepted for interface compatibility; not used here.
        tensorboard_rate: number of batches averaged per Loss/metric log entry.
        device: device the batches are moved to.
        tensorboard_recorder_step: running index of Loss/metric log entries.
        total_steps: running count of optimiser steps (used for the LR log).

    Returns:
        ``(mean batch loss, updated tensorboard_recorder_step, updated total_steps)``.
    """
    running_loss = 0.0
    running_rmse = 0.0
    total_loss = 0.0
    for i, sampleBatch in enumerate(myDataLoader, start=1):

        # --- Main Training ---
        X = sampleBatch[0].to(device)
        y = sampleBatch[1].to(device)

        opt.zero_grad()
        y_hat = model(X)
        combined_loss = loss(y_hat, y)
        combined_loss.backward()
        opt.step()

        batch_loss = combined_loss.item()
        running_loss += batch_loss
        total_loss += batch_loss

        # --- Metrics Recording ---
        # Evaluate without autograd and keep a plain float, so no graph is
        # retained across batches and the writer receives a number.
        with torch.no_grad():
            running_rmse += float(metric(y_hat, y))

        # record lr change
        total_steps += 1
        tensorboard_writer.add_scalar(tag="LR", scalar_value=opt.param_groups[0]['lr'], global_step=total_steps)

        # tensorboard writes
        if i % tensorboard_rate == 0:
            tensorboard_recorder_step += 1
            tensorboard_writer.add_scalar(tag="Loss", scalar_value=running_loss/tensorboard_rate,
                                          global_step=tensorboard_recorder_step)
            tensorboard_writer.add_scalar(tag=metric.__name__, scalar_value=running_rmse/tensorboard_rate,
                                          global_step=tensorboard_recorder_step)
            # reset the running sums for the next tensorboard_rate batches
            running_loss = 0.0
            running_rmse = 0.0

    return total_loss/len(myDataLoader), tensorboard_recorder_step, total_steps


In [None]:
def validEpoch(myDataLoader, tensorboard_writer, model, loss, metric,
               device, tensorboard_recorder_step):
    """Evaluate ``model`` over ``myDataLoader`` and log sample-weighted averages.

    Every batch contributes in proportion to its share of the dataset
    (``len(X) / len(dataset)``), so the result is the average over samples
    even when the last batch is smaller than the others.

    Args:
        myDataLoader: loader exposing ``.dataset``; item 0 of each batch is the
            input, item 1 the target.
        tensorboard_writer: object exposing ``add_scalar(tag, scalar_value, global_step)``.
        model: module to evaluate (the caller is responsible for ``model.eval()``).
        loss: callable ``loss(y_hat, y)`` returning a scalar tensor.
        metric: callable ``metric(y_hat, y)``; logged under ``metric.__name__``.
        device: device the batches are moved to.
        tensorboard_recorder_step: global step used for both log entries.

    Returns:
        The sample-weighted average loss.
    """
    running_loss = 0.0
    running_rmse = 0.0
    num_samples = len(myDataLoader.dataset)
    for sampleBatch in myDataLoader:
        X = sampleBatch[0].to(device)
        y = sampleBatch[1].to(device)

        # Weight by the batch's share of the dataset. A fixed 1/num_samples
        # weight would shrink the reported loss by roughly the batch size.
        perc = len(X) / num_samples

        # forward, loss and metric without gradient tracking
        with torch.no_grad():
            y_hat = model(X)
            batch_loss = loss(y_hat, y).item()
            batch_metric = float(metric(y_hat, y))

        running_loss += perc * batch_loss
        running_rmse += perc * batch_metric

    tensorboard_writer.add_scalar(tag="Loss", scalar_value=running_loss, global_step=tensorboard_recorder_step)
    tensorboard_writer.add_scalar(tag=metric.__name__, scalar_value=running_rmse, global_step=tensorboard_recorder_step)

    return running_loss

In [None]:
# Create the checkpoint directory. Only the "already exists" case is
# tolerated; any other failure (permissions, missing parent, ...) propagates
# instead of being reported as an existing directory.
try:
    os.mkdir(cps)
except FileExistsError:
    print("checkpoints directory already exists :)")

In [None]:
# Create one summary writer each for the training and validation curves,
# logging under <tensorboard_direc>/<versionName>/train and .../valid.
train_writer = SummaryWriter(os.path.join(tensorboard_direc, versionName,'train'))
test_writer = SummaryWriter(os.path.join(tensorboard_direc, versionName,'valid'))
# Global counters that the training loop reads and advances.
tensorboard_recorder_step = 0
total_steps = 0

In [None]:
def fit_up_to_trim_idx():
    """Train the global ``model`` for up to ``epochs`` epochs, checkpointing on best validation loss.

    Reads the notebook globals ``model``, ``trainDataLoader``,
    ``testDataLoader``, ``L``, ``rmse``, ``train_writer``, ``test_writer``,
    ``epochs``, ``patience``, ``max_lr``, ``tensorboard_rate``, ``device``,
    ``cps`` and ``versionName``, and advances the global counters
    ``tensorboard_recorder_step`` and ``total_steps``. The model's state dict
    is saved to ``cps/versionName`` whenever the validation loss improves.
    """
    global tensorboard_recorder_step
    global total_steps

    # Use the learning rate that is baked into versionName so the run label
    # and the optimiser agree.
    opt = torch.optim.Adam(model.parameters(), lr=max_lr)
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt,patience=patience)

    writeMessage('---------- Started Training ----------', versionName)
    bestLoss = np.inf  # np.infty alias was removed in NumPy 2.0

    for epoch in tqdm(range(1, epochs+1)):  # loop over the dataset multiple times

        writeMessage("--- Epoch {0}/{1} ---".format(epoch, epochs), versionName)

        model.train()
        trainLoss, tensorboard_recorder_step, total_steps = trainEpoch(
            trainDataLoader, train_writer, model, opt, L,
            rmse, lr_scheduler,
            tensorboard_rate, device,
            tensorboard_recorder_step, total_steps)

        writeMessage("trainLoss: {:.4e}".format(trainLoss),versionName)
        writeMessage("LR: {:.4e}".format(opt.param_groups[0]['lr']),versionName)

        model.eval()
        valLoss = validEpoch(testDataLoader, test_writer, model, L, rmse, device, tensorboard_recorder_step)
        writeMessage("valLoss: {:.4e}".format(valLoss),versionName)

        # checkpoint progress
        if valLoss < bestLoss:
            bestLoss = valLoss
            writeMessage("Better valLoss: {:.4e}, Saving models...".format(bestLoss),versionName)
            torch.save(model.state_dict(), os.path.join(cps,versionName))

        # NOTE(review): the scheduler is never stepped in this function, so the
        # learning rate stays at its initial value and the early exit below
        # cannot trigger unless the LR is changed elsewhere. Re-enable the next
        # line (or step on valLoss) if LR decay is intended.
        #lr_scheduler.step(trainLoss)

        if opt.param_groups[0]['lr'] < 5e-8:
            break
    writeMessage('---------- Finished Training ----------', versionName)

In [None]:
fit_up_to_trim_idx()

## Compare: Generated vs. Simulated

In [None]:
model.load_state_dict(torch.load(os.path.join(cps,versionName)))
model = model.to(device)

In [None]:
model.eval()

In [None]:
X, y, norm_stats = testDataset.__getitem__(0, True)
X = torch.tensor(X).unsqueeze(0)
y = y.to(device)
norm_stats = norm_stats.to(device)
X.shape, y.shape, norm_stats.shape

In [None]:
# NOTE(review): this indexes the dataset a second time. If testDataset is the
# LatentSVDSub defined above, every access draws a new random window, so the
# X and y obtained here may not line up with the norm_stats returned by the
# earlier __getitem__(0, True) call, which later cells still multiply by.
X, y = testDataset[0]
X = torch.tensor(X).unsqueeze(0)
y = torch.tensor(y).to(device)
X.shape, y.shape

In [None]:
batch_out = model(X.to(device))
batch_out.shape

In [None]:
batch_out = batch_out.squeeze()
y = y.squeeze()
err = []
for i in range(y.shape[0]):
    err.append(torch.norm(y[i] - batch_out[i]))
err

In [None]:
batch_out = batch_out.squeeze()*norm_stats
y = y.squeeze()*norm_stats
err = []
for i in range(y.shape[0]):
    err.append(torch.norm(y[i] - batch_out[i]))
err

In [None]:
d = batch_out - y
d.shape

In [None]:
import matplotlib.pyplot as plt
plt.plot(err)

In [None]:
t = np.arange(512)
for dd,yy in zip(batch_out,y):
    plt.plot(t, dd.detach().cpu().numpy(),t, yy.detach().cpu().numpy(),'--')
    plt.show()

In [None]:
tmp = y.detach().cpu().numpy()
tmp.max(), tmp.min()

In [None]:
bb = batch_out.detach().cpu().numpy()
bb.max(), bb.min()

In [None]:
plt.plot(tmp.max(axis=0))
plt.show()

In [None]:
plt.plot(d[-2].detach().cpu().numpy())
plt.show()

In [None]:
from sklearn.manifold import TSNE
X = y.detach().cpu().numpy()
X.shape

In [None]:
X_embedded = TSNE(n_components=2,verbose=3).fit_transform(X)
X_embedded.shape

In [None]:
plt.scatter(X_embedded[:,0],X_embedded[:,1],s=2)
plt.show()

In [None]:
X = batch_out.detach().cpu().numpy()
X_embedded = TSNE(n_components=2,verbose=3).fit_transform(X)
X_embedded.shape

In [None]:
plt.scatter(X_embedded[:,0],X_embedded[:,1],s=2)
plt.show()