In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv3D, UpSampling3D, Add
from tensorflow.keras.models import Model


# Clear tensor sessions to free up memory

tf.keras.backend.clear_session()




2023-12-05 08:36:24.844352: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [51]:
import netCDF4 as nc
from netCDF4 import num2date
import numpy as np

# Load the data for both years
#data_2020 = nc.Dataset('/Users/heyj/Desktop/sql project/2020data.nc')
#data_2021 = nc.Dataset('/Users/heyj/Desktop/sql project/2020data.nc')
data_2020 = nc.Dataset('2020data00UTC.nc') #(choose a specific timestamp: 00UTC for each day)
data_2021 = nc.Dataset('2021data00UTC.nc')

# Combine data from both years
#z_combined = np.concatenate((data_2020['z'][:], data_2021['z'][:]), axis=0)
u_combined = np.concatenate((data_2020['u'][:], data_2021['u'][:]), axis=0)
v_combined = np.concatenate((data_2020['v'][:], data_2021['v'][:]), axis=0)

data_2020.variables

{'longitude': <class 'netCDF4._netCDF4.Variable'>
 float32 longitude(longitude)
     units: degrees_east
     long_name: longitude
 unlimited dimensions: 
 current shape = (101,)
 filling on, default _FillValue of 9.969209968386869e+36 used,
 'latitude': <class 'netCDF4._netCDF4.Variable'>
 float32 latitude(latitude)
     units: degrees_north
     long_name: latitude
 unlimited dimensions: 
 current shape = (81,)
 filling on, default _FillValue of 9.969209968386869e+36 used,
 'level': <class 'netCDF4._netCDF4.Variable'>
 int32 level(level)
     units: millibars
     long_name: pressure_level
 unlimited dimensions: 
 current shape = (3,)
 filling on, default _FillValue of -2147483647 used,
 'time': <class 'netCDF4._netCDF4.Variable'>
 int32 time(time)
     units: hours since 1900-01-01 00:00:00.0
     long_name: time
     calendar: gregorian
 unlimited dimensions: 
 current shape = (365,)
 filling on, default _FillValue of -2147483647 used,
 'z': <class 'netCDF4._netCDF4.Variable'>
 int

In [3]:
times_2020 = data_2020['time'][:]
dates_2020 = nc.num2date(times_2020, units=data_2020['time'].units)

times_2021 = data_2021['time'][:]
dates_2021 = nc.num2date(times_2021, units=data_2021['time'].units)

all_dates = np.concatenate((dates_2020, dates_2021))



In [4]:
# Preprocessing the data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Normalize the data
scaler = MinMaxScaler()

#z_normalized = scaler.fit_transform(z_combined.reshape(-1, 1)).reshape(z_combined.shape)
u_normalized = scaler.fit_transform(u_combined.reshape(-1, 1)).reshape(u_combined.shape)
v_normalized = scaler.fit_transform(v_combined.reshape(-1, 1)).reshape(v_combined.shape)

# Combine the parameters to form a single dataset
#data_combined = np.stack((z_normalized, u_normalized, v_normalized), axis=-1)
data_combined = np.stack((u_normalized, v_normalized), axis=-1)
# Pad the data to get even dimensions
padded_data = np.pad(data_combined, ((0, 0), (0, 1), (0, 1), (0, 1), (0, 0)), mode='constant')
np.info(padded_data)

class:  ndarray
shape:  (731, 4, 82, 102, 2)
strides:  (535296, 133824, 1632, 16, 8)
itemsize:  8
aligned:  True
contiguous:  True
fortran:  False
data pointer: 0x7f0c2e420010
byteorder:  little
byteswap:  False
type: float64


In [7]:
# Preprocessing the data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Normalize the data
scaler = MinMaxScaler()

#z_normalized = scaler.fit_transform(z_combined.reshape(-1, 1)).reshape(z_combined.shape)
u_normalized = scaler.fit_transform(u_combined.reshape(-1, 1)).reshape(u_combined.shape)
v_normalized = scaler.fit_transform(v_combined.reshape(-1, 1)).reshape(v_combined.shape)

# Combine the parameters to form a single dataset
#data_combined = np.stack((z_normalized, u_normalized, v_normalized), axis=-1)
data_combined = np.stack((u_normalized, v_normalized), axis=-1)

# Pad the data to get even dimensions
padded_data = np.pad(data_combined, ((0, 0), (0, 1), (0, 1), (0, 1), (0, 0)), mode='constant')


# Split the data into training and validation sets

X_train, X_val, dates_train, dates_val = train_test_split(padded_data, all_dates, test_size=0.3, random_state=42)
#X_train, X_val = train_test_split(padded_data, test_size=0.3, random_state=42)

np.info(padded_data)

class:  ndarray
shape:  (731, 4, 82, 102, 2)
strides:  (535296, 133824, 1632, 16, 8)
itemsize:  8
aligned:  True
contiguous:  True
fortran:  False
data pointer: 0x7f0bf6537010
byteorder:  little
byteswap:  False
type: float64


In [8]:
import numpy as np
np.save ('X_train.npy', X_train)
np.save ('dates_train.npy', dates_train)

In [9]:
print("Training data shape", X_train.shape)
print("Validation data shape", X_val.shape)


Training data shape (511, 4, 82, 102, 2)
Validation data shape (220, 4, 82, 102, 2)


In [14]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv3D, UpSampling3D, Add
from tensorflow.keras.models import Model


# Clear tensor sessions to free up memory

tf.keras.backend.clear_session()
# Define ResNet block
def resnet_block(input_tensor, filters, kernel_size=(3, 3, 3), strides=(1, 1, 1)):
    x = Conv3D(filters, kernel_size, strides=strides, padding='same')(input_tensor)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)
    
    x = Conv3D(filters, kernel_size, padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    
    # Adjust the shortcut connection
    if strides != (1, 1, 1) or input_tensor.shape[-1] != filters:
        shortcut = Conv3D(filters, (1, 1, 1), strides=strides, padding='same')(input_tensor)
    else:
        shortcut = input_tensor
    
    x = Add()([x, shortcut])
    x = tf.keras.layers.Activation('relu')(x)
    return x


# Define the model architecture
input_shape_padded = (4, 82, 102, 2)
inputs = Input(shape=input_shape_padded)

# Encoder
x = resnet_block(inputs, 32)
x = resnet_block(x, 64)
encoded = resnet_block(x, 128, strides=(2, 2, 2))

# Decoder
x = UpSampling3D((2, 2, 2))(encoded)
x = resnet_block(x, 64)
x = UpSampling3D((1, 1, 1))(x)
x = resnet_block(x, 32)
decoded = Conv3D(2, (3, 3, 3), activation='sigmoid', padding='same')(x)

# Compile the autoencoder
autoencoder = Model(inputs, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Print the model summary
autoencoder.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 4, 82, 102,  0                                            
__________________________________________________________________________________________________
conv3d (Conv3D)                 (None, 4, 82, 102, 3 1760        input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 4, 82, 102, 3 128         conv3d[0][0]                     
__________________________________________________________________________________________________
activation (Activation)         (None, 4, 82, 102, 3 0           batch_normalization[0][0]        
______________________________________________________________________________________________

In [15]:
# Train the model 
history = autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, validation_data=(X_val, X_val))


2023-12-05 08:46:32.177753: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2023-12-05 08:46:32.196288: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2100000000 Hz


Epoch 1/50


2023-12-05 08:46:35.491106: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8
2023-12-05 08:46:36.140816: I tensorflow/stream_executor/cuda/cuda_dnn.cc:359] Loaded cuDNN version 8905
2023-12-05 08:46:36.819216: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2023-12-05 08:46:36.819714: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [16]:
autoencoder.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 4, 82, 102,  0                                            
__________________________________________________________________________________________________
conv3d (Conv3D)                 (None, 4, 82, 102, 3 1760        input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 4, 82, 102, 3 128         conv3d[0][0]                     
__________________________________________________________________________________________________
activation (Activation)         (None, 4, 82, 102, 3 0           batch_normalization[0][0]        
______________________________________________________________________________________________

In [17]:
# 1. Reconstruction Error:Evaluate how well the autoencoder can reconstruct the input data

from sklearn.metrics import mean_squared_error, mean_absolute_error

decoded_val = autoencoder.predict(X_val)

# Mean Squared Error (MSE)
mse = mean_squared_error(X_val.flatten(), decoded_val.flatten())

# Mean Absolute Error (MAE)
mae = mean_absolute_error(X_val.flatten(), decoded_val.flatten())

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)


Mean Squared Error: 0.00011903555899868252
Mean Absolute Error: 0.007710221525083468


In [18]:
# Save Summary text file
original_stdout = sys.stdout  # Save a reference to the original standard output

with open('model_summary.txt', 'w') as f:
    sys.stdout = f  # Change the standard output to the file we created.
    print(autoencoder.summary())
    sys.stdout = original_stdout  # Reset the standard output to its original value




In [20]:
# Save the history object 
import pandas as pd
import pickle

# Convert the history.history dict to a pandas DataFrame
hist_df = pd.DataFrame(history.history)

# Save to csv
hist_csv_file = 'history.csv'
with open(hist_csv_file, mode='w') as f:
    hist_df.to_csv(f)

# Save to pickle
with open('history.pkl', 'wb') as file:
    pickle.dump(history.history, file)


In [21]:
# Load the saved history 

loaded_history = pd.read_csv('history.csv')

with open('history.pkl', 'rb') as file:
    loaded_history = pickle.load(file)

In [22]:
# Define the encoder model. Find the low dimension layer according to the model summary. 'activation_15' should be the encoder layer
encoder = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('activation_5').output)




In [23]:
encoder_output = autoencoder.get_layer('activation_5').output

In [24]:
# Save the model
autoencoder.save('autoencoder_yjmodel.h5')



In [25]:
# Save the weights
autoencoder.save_weights('autoencoder_yjweights.h5') 

In [26]:
# Save whole training database encoded_database
encoded_database = encoder.predict(X_train)

# Save the encoded data as np and h5py

np. save('encoded_database.npy', encoded_database)

import h5py

with h5py.File('encoded_database', 'w') as h5f:
    h5f.create_dataset('dataset_1', data=encoded_database)

In [27]:
# Load the model
from tensorflow.keras.models import load_model

autoencoder = load_model('autoencoder_yjmodel.h5')

In [28]:
# Preprocess and Encode the ##Input Day$$ Data, here use 28th Oct 2023 


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Normalize the data
scaler = MinMaxScaler()

Oct_28_2023 = nc.Dataset('Oct_28_00UTC.nc')
#Oct_15_2023 = nc.Dataset('/home/jianhong/Desktop/Analogue-Nowcasting-Model (copy)/15_Oct.nc')
z_input = Oct_28_2023['z'][:]
u_input = Oct_28_2023['u'][:]
v_input = Oct_28_2023['v'][:]

# Normalize the data
scaler = MinMaxScaler()
z_input_normalized = scaler.fit_transform(z_input.reshape(-1, 1)).reshape(z_input.shape)
u_input_normalized = scaler.fit_transform(u_input.reshape(-1, 1)).reshape(u_input.shape)
v_input_normalized = scaler.fit_transform(v_input.reshape(-1, 1)).reshape(v_input.shape)


# Combine the parameters to form a single dataset
data_combined = np.stack((z_input_normalized, u_input_normalized , v_input_normalized), axis=-1)
print(data_combined.shape)
np.info(data_combined)


# Pad the data to get even dimensions
input_day_padded = np.pad(data_combined, ((0,0),(0, 1), (0, 1), (0, 1), (0, 0)), mode='constant')


np.info(input_day_padded)
# Use the encoder to generate the encoded representation of this input day’s data
#input_day_encoded = encoder.predict(input_day_padded)




(1, 3, 81, 101, 3)
class:  ndarray
shape:  (1, 3, 81, 101, 3)
strides:  (589032, 196344, 2424, 24, 8)
itemsize:  8
aligned:  True
contiguous:  True
fortran:  False
data pointer: 0x562f6841f480
byteorder:  little
byteswap:  False
type: float64
class:  ndarray
shape:  (1, 4, 82, 102, 3)
strides:  (802944, 200736, 2448, 24, 8)
itemsize:  8
aligned:  True
contiguous:  True
fortran:  False
data pointer: 0x562f68820040
byteorder:  little
byteswap:  False
type: float64


In [25]:
encoded_input = encoder.predict(input_day_padded)
np.info(encoded_input)

class:  ndarray
shape:  (1, 2, 41, 51, 128)
strides:  (2141184, 1070592, 26112, 512, 4)
itemsize:  4
aligned:  True
contiguous:  True
fortran:  False
data pointer: 0x55afbb4349d0
byteorder:  little
byteswap:  False
type: float32


In [26]:
np.info(encoded_database)

class:  ndarray
shape:  (511, 2, 41, 51, 128)
strides:  (2141184, 1070592, 26112, 512, 4)
itemsize:  4
aligned:  True
contiguous:  True
fortran:  False
data pointer: 0x7fa211914010
byteorder:  little
byteswap:  False
type: float32


In [40]:
# Calculate similarities and find the most similar day:

# Use <Euclidean distances> to calculate the Euclidean distances between the input day's encoded representation and the encoded representations of all the days in the database.



input_flattened = encoded_input.reshape(1,-1)
database_flattened = encoded_database.reshape(encoded_database.shape[0], -1)


#np.info(input_flattened)
#np.info(database_flattened)

from sklearn.metrics.pairwise import euclidean_distances
import numpy as np


# Calculate Euclidean distances 
distances = euclidean_distances(input_flattened, database_flattened)



sorted_data= np.argsort(distances)


most_similar_dates = dates_train[sorted_data]
# Calculate Euclidean distances 
#distances = np.array([euclidean(input_flattened,sample) for sample in database_flattened])

#indices_of_smallest = np.argsort(distance)[:3]
most_similar_dates 


masked_array(data=[[cftime.DatetimeGregorian(2021, 7, 27, 0, 0, 0, 0, has_year_zero=False),
                    cftime.DatetimeGregorian(2020, 9, 26, 0, 0, 0, 0, has_year_zero=False),
                    cftime.DatetimeGregorian(2021, 10, 10, 0, 0, 0, 0, has_year_zero=False),
                    cftime.DatetimeGregorian(2021, 10, 11, 0, 0, 0, 0, has_year_zero=False),
                    cftime.DatetimeGregorian(2020, 9, 30, 0, 0, 0, 0, has_year_zero=False),
                    cftime.DatetimeGregorian(2021, 5, 23, 0, 0, 0, 0, has_year_zero=False),
                    cftime.DatetimeGregorian(2020, 8, 4, 0, 0, 0, 0, has_year_zero=False),
                    cftime.DatetimeGregorian(2020, 8, 5, 0, 0, 0, 0, has_year_zero=False),
                    cftime.DatetimeGregorian(2020, 10, 1, 0, 0, 0, 0, has_year_zero=False),
                    cftime.DatetimeGregorian(2020, 10, 19, 0, 0, 0, 0, has_year_zero=False),
                    cftime.DatetimeGregorian(2021, 6, 11, 0, 0, 0, 0, has_year_

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Normalize the data
scaler = MinMaxScaler()

Nov_28_2023 = nc.Dataset('Nov_28_2023.nc')
#Oct_15_2023 = nc.Dataset('/home/jianhong/Desktop/Analogue-Nowcasting-Model (copy)/15_Oct.nc')
#z_input = Nov_28_2023['z'][:]
u_input = Nov_28_2023['u'][:]
v_input = Nov_28_2023['v'][:]

# Normalize the data
scaler = MinMaxScaler()
#z_input_normalized = scaler.fit_transform(z_input.reshape(-1, 1)).reshape(z_input.shape)
u_input_normalized = scaler.fit_transform(u_input.reshape(-1, 1)).reshape(u_input.shape)
v_input_normalized = scaler.fit_transform(v_input.reshape(-1, 1)).reshape(v_input.shape)


# Combine the parameters to form a single dataset
#data_combined = np.stack((z_input_normalized, u_input_normalized , v_input_normalized), axis=-1)
data_combined = np.stack((u_input_normalized , v_input_normalized), axis=-1)
#print(data_combined.shape)
#np.info(data_combined)


# Pad the data to get even dimensions
input_day_padded = np.pad(data_combined, ((0,0),(0, 1), (0, 1), (0, 1), (0, 0)), mode='constant')


#np.info(input_day_padded)
# Use the encoder to generate the encoded representation of this input day’s data
#input_day_encoded = encoder.predict(input_day_padded)


In [49]:
encoded_input = encoder.predict(input_day_padded)
input_flattened = encoded_input.reshape(1,-1)
database_flattened = encoded_database.reshape(encoded_database.shape[0], -1)


#np.info(input_flattened)
#np.info(database_flattened)

from sklearn.metrics.pairwise import euclidean_distances
import numpy as np


# Calculate Euclidean distances 
distances = euclidean_distances(input_flattened, database_flattened).flatten()

sorted_data= np.argsort(distances)


most_similar_dates = dates_train[sorted_data][:5]

most_similar_dates
# Calculate Euclidean distances 
#distances = np.array([euclidean(input_flattened,sample) for sample in database_flattened])

#indices_of_smallest = np.argsort(distance)[:3]


masked_array(data=[cftime.DatetimeGregorian(2021, 11, 29, 0, 0, 0, 0, has_year_zero=False),
                   cftime.DatetimeGregorian(2021, 4, 2, 0, 0, 0, 0, has_year_zero=False),
                   cftime.DatetimeGregorian(2021, 12, 6, 0, 0, 0, 0, has_year_zero=False),
                   cftime.DatetimeGregorian(2021, 1, 16, 0, 0, 0, 0, has_year_zero=False),
                   cftime.DatetimeGregorian(2020, 12, 9, 0, 0, 0, 0, has_year_zero=False)],
             mask=False,
       fill_value='?',
            dtype=object)