In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os
import glob
from urllib.request import urlretrieve
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Imported because Roberto also did it.
# from torch_geometric.data import Data

# Load the data

In [2]:
# Load the DEM, water depth, x-velocity, and y-velocity data
path = "data/raw_datasets/" 

# glob.glob returns matches in arbitrary (filesystem-dependent) order, so each
# list is sorted to make the i-th file of every variable refer to the same
# simulation. NOTE(review): this assumes the file names in the four folders
# share the same naming pattern, so that sorting pairs them up - confirm.
# The order is lexicographic, not numeric, so list index i is not necessarily
# the simulation number that appears in the file name.
allfiles_dem = sorted(glob.glob(path + 'DEM/*.txt'))
allfiles_wd = sorted(glob.glob(path + 'WD/*.txt'))
allfiles_vx = sorted(glob.glob(path + 'VX/*.txt'))
allfiles_vy = sorted(glob.glob(path + 'VY/*.txt'))

# Every simulation needs one file per variable; fail early instead of running
# into an IndexError (or silently ignoring extra files) in the loop below.
if not (len(allfiles_dem) == len(allfiles_wd) == len(allfiles_vx) == len(allfiles_vy)):
    raise ValueError(
        "Expected the same number of DEM, WD, VX and VY files, got "
        f"{len(allfiles_dem)}, {len(allfiles_wd)}, {len(allfiles_vx)} and {len(allfiles_vy)}."
    )

# One array per simulation, in the same order for all four variables
dem = []
wd = []
vx = []
vy = []

for i in range(len(allfiles_dem)):
    dem.append(np.loadtxt(allfiles_dem[i]))
    wd.append(np.loadtxt(allfiles_wd[i]))
    vx.append(np.loadtxt(allfiles_vx[i]))
    vy.append(np.loadtxt(allfiles_vy[i]))


In [3]:
# Grid resolution: number of cells along one axis of the square grid
number_grids = 64

# Dataset dimensions, read from the loaded water depth data:
# number of simulations and length of the first simulation's array
sim_len = len(wd)
t_len = len(wd[0])

# Selected simulation index (valid range: 0 .. sim_len - 1)
# and timestep index (valid range: 0 .. t_len - 1)
sim, t = 0, 0

In [4]:
# DEM of the selected simulation: the third column of the DEM file,
# arranged as a (number_grids x number_grids) grid
dem_sim = dem[sim][:, 2].reshape(number_grids, number_grids)

# Water depth and both velocity components of the selected simulation
# at timestep t, arranged on the same grid
wd_sim = wd[sim][t].reshape(number_grids, number_grids)
vx_sim = vx[sim][t].reshape(number_grids, number_grids)
vy_sim = vy[sim][t].reshape(number_grids, number_grids)

In [5]:
# Stack all the simulations along the first axis for normalization.
# Note that each of the simulations contains t = 0 - 96.
#
# The arrays are concatenated in list order, so simulation k occupies a
# contiguous run of rows in the stacked array. A single concatenate call per
# variable replaces the previous loop, which indexed the lists with a stale
# variable and therefore appended the same simulation repeatedly.
wd_allArray = np.concatenate(wd, axis=0)
vx_allArray = np.concatenate(vx, axis=0)
vy_allArray = np.concatenate(vy, axis=0)

# Data processing and conversion

In [6]:
# def convert_to_pyg(graph, pos, DEM, WD, VX, VY):
#     '''
#     Converts a graph or mesh into a PyTorch Geometric Data type 
#     Then, add position, DEM, and water variables to data object.
#     Adapted from https://github.com/RBTV1/SWE-GNN-paper-repository-/blob/main/database/graph_creation.py
#     '''
#     DEM = DEM.reshape(-1)

#     edge_index = torch.LongTensor(list(graph.edges)).t().contiguous()
#     row, col = edge_index

#     data = Data()

#     delta_DEM = torch.FloatTensor(DEM[col]-DEM[row])
#     coords = torch.FloatTensor(get_coords(pos))
#     edge_relative_distance = coords[col] - coords[row]
#     edge_distance = torch.norm(edge_relative_distance, dim=1)
#     edge_slope = delta_DEM/edge_distance

#     data.edge_index = edge_index
#     data.edge_distance = edge_distance
#     data.edge_slope = edge_slope
#     data.edge_relative_distance = edge_relative_distance

#     data.num_nodes = graph.number_of_nodes()
#     data.pos = torch.tensor(list(pos.values()))
#     data.DEM = torch.FloatTensor(DEM)
#     data.WD = torch.FloatTensor(WD.T)
#     data.VX = torch.FloatTensor(VX.T)
#     data.VY = torch.FloatTensor(VY.T)
        
#     return data

# Data Normalization

In [7]:
# Min-max normalization is used here to normalize each variable (e.g. the water depth) over all simulations at once (0-130)
def scale_sequences(X,scaler=None,scaler_type='minmax'):
    """
    Scale an array with a scikit-learn scaler, treating all entries as one feature.

    The array is flattened into a single column before scaling, so the same
    scaling statistics are shared by every entry, and the result is reshaped
    back to the original shape of ``X``.
    Adapted from exercise notebook on drinking water demand.

    Parameters
    ----------
    X : array-like with ``shape`` and ``reshape``
        Data to scale (a two-dimensional array in this notebook).
    scaler : object with a ``transform`` method, optional
        An already fitted scaler. If given, it is applied as-is and
        ``scaler_type`` is ignored.
    scaler_type : {'minmax', 'standard'}
        Which scaler to create and fit when ``scaler`` is None.

    Returns
    -------
    X_scaled
        Only the scaled array, when a fitted ``scaler`` was passed in.
    (X_scaled, scaler)
        The scaled array and the newly fitted scaler, when ``scaler`` is None.

    Raises
    ------
    ValueError
        If ``scaler`` is None and ``scaler_type`` is not supported.
    """

    Xshape=X.shape

    # Compare against None explicitly instead of relying on the truthiness
    # of the scaler object.
    if scaler is not None:
        return scaler.transform(X.reshape(-1,1)).reshape(Xshape)

    # Validate the requested type before creating any scaler.
    if scaler_type not in ('standard', 'minmax'):
        raise ValueError("Type of scikit-learn scaler not supported. Choose 'standard' or 'minmax'.")

    if scaler_type == 'standard':
        scaler = StandardScaler()
    else:
        scaler = MinMaxScaler()

    X = scaler.fit_transform(X.reshape(-1,1)).reshape(Xshape)
    return X, scaler
    
def denormalize(image_tensor, mean, std):
    """
    Undo a per-channel standardization and clip the result to [0, 1].

    ``mean`` and ``std`` are indexed with ``[:, None, None]`` so that they
    broadcast against the two trailing axes of ``image_tensor``; each value
    is mapped to ``value * std + mean`` and then limited to the range 0..1.
    """
    per_channel_scale = std[:, None, None]
    per_channel_offset = mean[:, None, None]
    restored = image_tensor * per_channel_scale + per_channel_offset
    return restored.clip(0, 1)


In [8]:
# Normalization of water depth and velocity.
# Each variable gets its own fitted scaler so that predictions can later be
# transformed back to physical units per variable; previously a single name
# was overwritten and only the last fitted scaler survived.
# NOTE(review): the scalers are fitted on all simulations, before the
# train/validation/test split, so held-out data influences the scaling -
# consider fitting on the training simulations only.
wd_scale, wd_scaler = scale_sequences(wd_allArray ,scaler=None,scaler_type='minmax')
vx_scale, vx_scaler = scale_sequences(vx_allArray ,scaler=None,scaler_type='minmax')
vy_scale, vy_scaler = scale_sequences(vy_allArray ,scaler=None,scaler_type='minmax')

# Kept for backward compatibility: `scaler` used to end up bound to the
# y-velocity scaler.
scaler = vy_scaler

In [9]:
# Undo the stacking: cut each normalized array row-wise into sim_len equal
# pieces, giving one array per simulation again (lists of arrays).
wd_norm, vx_norm, vy_norm = (
    np.vsplit(stacked, sim_len) for stacked in (wd_scale, vx_scale, vy_scale)
)

# Split the train and testing dataset 

In [40]:
# Split the simulations into training (70%), validation (15%) and testing (15%) sets.
# This is done in two steps: first a 70/30 split, then the 30% hold-out part
# is split 50/50 into validation and test sets.

def _split_train_val_test(sequences, indices, random_state=42):
    """
    Split a list of per-simulation arrays into train/validation/test parts.

    ``indices`` is split alongside ``sequences`` so that the original
    simulation index of every element can be recovered afterwards.
    The same ``random_state`` is used for both steps, so calling this with
    inputs of the same length always selects the same simulations.

    Returns (train, val, test, train_indices, val_indices, test_indices).
    """
    train, holdout, i_train, i_holdout = train_test_split(
        sequences, indices, test_size=0.30, shuffle=True, random_state=random_state)

    val, test, i_val, i_test = train_test_split(
        holdout, i_holdout, test_size=0.5, shuffle=True, random_state=random_state)

    return train, val, test, i_train, i_val, i_test


# Water depth
wd_tra, wd_val, wd_tst, iwd_tra, iwd_val, iwd_tst = _split_train_val_test(
    wd_norm, np.arange(sim_len))

# X velocity
vx_tra, vx_val, vx_tst, ivx_tra, ivx_val, ivx_tst = _split_train_val_test(
    vx_norm, np.arange(sim_len))

# Y velocity
vy_tra, vy_val, vy_tst, ivy_tra, ivy_val, ivy_tst = _split_train_val_test(
    vy_norm, np.arange(sim_len))


# Report the sizes of the x-velocity sets (the labels now match the printed variables)
print(f"len(vx_tra): {len(vx_tra)}")
print(f"len(vx_val): {len(vx_val)}")
print(f"len(vx_tst): {len(vx_tst)}")

# Note that the form of the training dataset is a list containing multiple arrays

len(vx_tra): 91
len(vx_val): 20
len(vx_tst): 20


# Datasets and Data Loaders

In [42]:
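# TODO: not finished yet. The target tensors (Y_tra, Y_val, Y_tst) referenced below are not defined yet, which is why building the datasets raised a NameError.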


# train_dataset = TensorDataset(torch.tensor(wd_tra, dtype=torch.float32), torch.tensor(Y_tra, dtype=torch.float32))
# val_dataset = TensorDataset(torch.tensor(wd_val, dtype=torch.float32), torch.tensor(Y_val, dtype=torch.float32))
# test_dataset = TensorDataset(torch.tensor(wd_tst, dtype=torch.float32), torch.tensor(Y_tst, dtype=torch.float32 ))

  train_dataset = TensorDataset(torch.tensor(wd_tra, dtype=torch.float32), torch.tensor(Y_tra, dtype=torch.float32))


NameError: name 'Y_tra' is not defined