## Test modules during dev


In [1]:
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr

In [3]:
import sys
sys.path.append('../modules/')

In [4]:
%reload_ext autoreload
%autoreload 2
import datasets
import ML_classes

2024-09-28 18:22:00.601553: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-28 18:22:00.601614: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-28 18:22:00.605119: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
#dtree = datasets.read_filtered_datatree()
#dtree = datasets.calculate_magnitudes(dtree)

## Load data for different simulations and prepare it with the right variables and non-dimensionalizations

In this step, any variables and extra dimensions that were not originally present may be created. 
Each input and output variable gets its own containing variable (e.g. if the top and bottom layers were two separate inputs, they may be separated out in this step). 
As further examples, rotated variables or magnitudes may become their own variables in this step. 

In [6]:
DT = datasets.SimulationData(simulation_names=['P2L', 'DG'], filter_scales=['50','100','200','400'])

In [7]:
DT.preprocess_simulation_data(window_size=3)

## ML data
Go from simulation data to batches. 
In this step, operations such as dropping extra points and masking the data happen. 

In [8]:
# Train and test datasets are built from the same simulation data and the same
# input/output variables; they differ only in `time_range`.
# NOTE(review): slice(-50, None) and slice(-100, -75) look like positional windows
# (the last 50 snapshots vs. an earlier, non-overlapping window of 25) — confirm how
# MLDataset applies `time_range`.
ML_DT_train = datasets.MLDataset(simulation_data=DT,input_variables=['dudx_widened','filter_scale'],
                                 output_variables=['uphp'], time_range=slice(-50, None))
# Held-out window used for evaluation.
ML_DT_test = datasets.MLDataset(simulation_data=DT,input_variables=['dudx_widened','filter_scale'],
                                 output_variables=['uphp'], time_range=slice(-100, -75))

In [9]:
ML_DT_train.create_xr_ML_variables()

choose_ml_variables took: 0.0047 seconds
Time subsampling took: 0.0193 seconds
Horizontal subsampling took: 0.0149 seconds
h_mask_ml_variables took: 0.0654 seconds
stack_physical_dimensions took: 0.0570 seconds
will load upto: 0.05146 gb
load took: 2.7550 seconds
drop_nans took: 0.0562 seconds
Seed set as:42
randomize_along_points took: 0.0681 seconds
Picked: 23400points
pick_uniform_points took: 0.0081 seconds
concat_datatree_nodes took: 0.0290 seconds
randomize_concatenated_ml_dataset took: 0.0248 seconds
generate_batches took: 0.0002 seconds


In [10]:
ML_DT_test.create_xr_ML_variables()

choose_ml_variables took: 0.0049 seconds
Time subsampling took: 0.0178 seconds
Horizontal subsampling took: 0.0135 seconds
h_mask_ml_variables took: 0.0639 seconds
stack_physical_dimensions took: 0.0443 seconds
will load upto: 0.02573 gb
load took: 1.5915 seconds
drop_nans took: 0.0251 seconds
Seed set as:42
randomize_along_points took: 0.0301 seconds
Picked: 11700points
pick_uniform_points took: 0.0090 seconds
concat_datatree_nodes took: 0.0202 seconds
randomize_concatenated_ml_dataset took: 0.0126 seconds
generate_batches took: 0.0002 seconds


In [11]:
ML_DT_train.ml_dataset

In [12]:
len(ML_DT_train.ml_batches), len(ML_DT_test.ml_batches) 

(100, 100)

In [13]:
ML_DT_train.ml_batches[0]

### Convert these xbatcher batches to processed ones for training

This step needs to take care of extra dimensions, such as `Xn` and `Yn`.

In [14]:
import jax.numpy as jnp

In [15]:
batch_xr = ML_DT_train.ml_batches[0]

In [16]:
def preprocess_batch(batch: xr.Dataset, input_channels, output_channels):
    """Turn one xarray batch into a dict of JAX arrays ready for training.

    Parameters
    ----------
    batch : xr.Dataset
        A single batch; it must have a ``'points'`` dimension, which is kept
        as the sample axis.
    input_channels : list of str
        Names of the variables in ``batch`` used as model inputs.
    output_channels : list of str
        Names of the variables in ``batch`` used as targets.

    Returns
    -------
    dict
        ``'X'``: inputs, with every non-``'points'`` dimension of every input
        variable stacked into a single ``"input_features"`` axis.
        ``'y'``: targets, with the ``'variable'`` axis moved last.
        ``'Xp'``: the very same array as ``'y'``.
        NOTE(review): ``'Xp'`` aliasing ``'y'`` looks like a placeholder —
        confirm what the downstream loss expects for ``Xp``.
    """
    # Inputs may differ in dimensionality (e.g. a windowed field next to a
    # scalar), so they are stacked rather than broadcast into a common shape.
    features_xr = batch[input_channels].to_stacked_array(
        "input_features", sample_dims=['points']
    )
    # Targets: one column per output variable, variable axis last.
    targets_xr = batch[output_channels].to_array().transpose(..., 'variable')

    features = jnp.asarray(features_xr.data)
    targets = jnp.asarray(targets_xr.data)

    return {'X': features, 'y': targets, 'Xp': targets}

In [17]:
batch = preprocess_batch(batch_xr, 
                       ['dudx_widened','filter_scale'],
                                 ['uphp'])

In [18]:
batch

{'X': Array([[-5.0976166e-07, -3.1233489e-07,  4.5745463e-08, ...,
         -3.6830841e-07,  1.9943457e-07,  2.0000000e+05],
        [-1.9215232e-08, -9.1745868e-08, -9.4928147e-08, ...,
         -2.8925214e-07, -2.1935111e-07,  2.0000000e+05],
        [ 1.1917033e-06,  1.0966531e-06,  8.7268972e-07, ...,
          1.0959418e-06,  1.2412154e-06,  1.0000000e+05],
        ...,
        [-1.3792079e-07, -6.1557678e-08,  2.2218057e-08, ...,
         -4.7250381e-08,  2.8943131e-08,  5.0000000e+04],
        [-6.9139065e-07, -1.0232109e-06, -6.3984248e-07, ...,
          3.0391622e-08,  9.1356310e-07,  1.0000000e+05],
        [-1.2656824e-06, -1.4003823e-06, -1.5208605e-06, ...,
         -1.0549345e-06, -1.2478831e-06,  5.0000000e+04]], dtype=float32),
 'y': Array([[ 9.0428972e+00],
        [ 5.4557592e-02],
        [ 1.3160020e-01],
        ...,
        [-6.0020429e-03],
        [-2.4853182e+00],
        [-2.0141811e+00]], dtype=float32),
 'Xp': Array([[ 9.0428972e+00],
        [ 5.4557592e-0

In [19]:
class MLData:
    """In-memory collection of preprocessed batches for one ML dataset.

    Every batch in ``ML_dataset.ml_batches`` is passed through
    ``preprocess_batch`` once, at construction time, and the results are kept
    in ``preprocessed_data`` in the same order.
    """

    def __init__(self, ML_dataset, input_channels, output_channels):
        """Preprocess and store all batches of ``ML_dataset``.

        Parameters
        ----------
        ML_dataset
            Object exposing an iterable ``ml_batches`` attribute.
        input_channels, output_channels
            Variable names forwarded unchanged to ``preprocess_batch``.
        """
        self.input_channels = input_channels
        self.output_channels = output_channels

        # Convert every batch once, up front; get_batches() then only replays
        # the stored results.
        # TODO: input normalization (mean/std of X) was sketched here but is
        # not implemented.
        self.preprocessed_data = [
            preprocess_batch(raw_batch, input_channels, output_channels)
            for raw_batch in ML_dataset.ml_batches
        ]

    def get_batches(self):
        """Yield the stored preprocessed batches in order.

        Each call returns a fresh generator, so the data can be iterated
        again (e.g. once per epoch).
        """
        yield from self.preprocessed_data

In [20]:
train_ML_data = MLData(ML_DT_train, ['dudx_widened','filter_scale'],
                                 ['uphp'])
test_ML_data = MLData(ML_DT_test, ['dudx_widened','filter_scale'],
                                 ['uphp'])

In [21]:
len(train_ML_data.preprocessed_data), len(test_ML_data.preprocessed_data)

(100, 100)

In [22]:
# Sanity check: print the array shapes of the first preprocessed training batch.
for batch_out in train_ML_data.get_batches():
    print("Input (X) batch shape:", batch_out['X'].shape)
    print("Output (y) batch shape:", batch_out['y'].shape)
    
    # Uncomment to dump the arrays themselves (this prints the whole batch, not one sample)
    #print("Sample X:", batch_out['X'])  # Full input array of this batch
    #print("Sample y:", batch_out['y'])  # Matching targets
    break  # Remove break to print more batches

Input (X) batch shape: (1872, 10)
Output (y) batch shape: (1872, 1)


In [23]:
# Sanity check: print the array shapes of the first preprocessed test batch.
for batch_out in test_ML_data.get_batches():
    print("Input (X) batch shape:", batch_out['X'].shape)
    print("Output (y) batch shape:", batch_out['y'].shape)
    
    # Uncomment to dump the arrays themselves (this prints the whole batch, not one sample)
    #print("Sample X:", batch_out['X'])  # Full input array of this batch
    #print("Sample y:", batch_out['y'])  # Matching targets
    break  # Remove break to print more batches

Input (X) batch shape: (936, 10)
Output (y) batch shape: (936, 1)


In [24]:
train_ML_data.get_batches()

<generator object MLData.get_batches at 0x7e00b5268e80>

In [25]:
ML_data_combo = {'train_gen':train_ML_data, 'test_gen':test_ML_data}

In [26]:
ML_data_combo

{'train_gen': <__main__.MLData at 0x7e00a7bf28a0>,
 'test_gen': <__main__.MLData at 0x7e00a6c190d0>}

In [27]:
# Sanity check: the training batches are reachable through the combined dict.
for batch_out in ML_data_combo['train_gen'].get_batches():
    print("Input (X) batch shape:", batch_out['X'].shape)
    print("Output (y) batch shape:", batch_out['y'].shape)
    
    # Uncomment to dump the arrays themselves (this prints the whole batch, not one sample)
    #print("Sample X:", batch_out['X'])  # Full input array of this batch
    #print("Sample y:", batch_out['y'])  # Matching targets
    break  # Remove break to print more batches
# NOTE: `batch_out` stays bound to this first training batch after the loop;
# the loss/step cells further down use it.

Input (X) batch shape: (1872, 10)
Output (y) batch shape: (1872, 1)


## The ML part


In [29]:
# num_in=10 matches the feature width of X printed above (batch shape (1872, 10));
# presumably it has to be kept in sync if the input variables change.
# shape=[24,24,1] is consistent with the 889 parameters reported below for dense
# layers with biases: (10*24+24) + (24*24+24) + (24*1+1) = 889.
ANN_model = ML_classes.PointwiseANN(num_in=10,shape=[24,24,1])

In [30]:
ANN_model.count_parameters()

889


In [31]:
regress_sys = ML_classes.AnnRegressionSystem(ANN_model, learning_rate=1.)

In [32]:
%time
regress_sys.mse(regress_sys.state.params, batch_out['X'], batch_out['y'], batch_out['Xp'])

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 9.3 µs


Array(6.1180625e+10, dtype=float32)

In [33]:
%time
regress_sys.step(batch_out, kind='test')

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.82 µs


Array(6.1180625e+10, dtype=float32)

In [34]:
%time
regress_sys.step(batch_out, kind='train')

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 9.06 µs


Array(6.1180625e+10, dtype=float32)

In [36]:
%%time
regress_sys.train_system(ML_data_combo, num_epoch=10, print_freq=2)

Train loss step 0:  119.38364799499512 test loss: 96.05960159301758
Train loss step 2:  101.61752700805664 test loss: 95.91249671936035
Train loss step 4:  101.44505531311034 test loss: 95.74408870697022
Train loss step 6:  101.24851387023926 test loss: 95.55253093719483
Train loss step 8:  101.02640197753907 test loss: 95.33656520843506
CPU times: user 1min 33s, sys: 13.3 s, total: 1min 46s
Wall time: 1min 28s
