In [1]:
import netCDF4 as nc
import numpy as np

# Load the data for both years
data_2020 = nc.Dataset('/Users/heyj/Desktop/sql project/2020data.nc')
data_2021 = nc.Dataset('/Users/heyj/Desktop/sql project/2020data.nc')

# Combine data from both years
z_combined = np.concatenate((data_2020['z'][:], data_2021['z'][:]), axis=0)
u_combined = np.concatenate((data_2020['u'][:], data_2021['u'][:]), axis=0)
v_combined = np.concatenate((data_2020['v'][:], data_2021['v'][:]), axis=0)


In [2]:
# Preprocessing the data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Normalize the data
scaler = MinMaxScaler()

z_normalized = scaler.fit_transform(z_combined.reshape(-1, 1)).reshape(z_combined.shape)
u_normalized = scaler.fit_transform(u_combined.reshape(-1, 1)).reshape(u_combined.shape)
v_normalized = scaler.fit_transform(v_combined.reshape(-1, 1)).reshape(v_combined.shape)

# Combine the parameters to form a single dataset
data_combined = np.stack((z_normalized, u_normalized, v_normalized), axis=-1)

# Pad the data to get even dimensions
padded_data = np.pad(data_combined, ((0, 0), (0, 1), (0, 1), (0, 1), (0, 0)), mode='constant')

# Split the data into training and validation sets
X_train_padded, X_val_padded = train_test_split(padded_data, test_size=0.3, shuffle=False)


In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv3D, UpSampling3D, Add
from tensorflow.keras.models import Model

# Define ResNet block
def resnet_block(input_tensor, filters, kernel_size=(3, 3, 3), strides=(1, 1, 1)):
    x = Conv3D(filters, kernel_size, strides=strides, padding='same')(input_tensor)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('relu')(x)
    
    x = Conv3D(filters, kernel_size, padding='same')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    
    # Adjust the shortcut connection
    if strides != (1, 1, 1) or input_tensor.shape[-1] != filters:
        shortcut = Conv3D(filters, (1, 1, 1), strides=strides, padding='same')(input_tensor)
    else:
        shortcut = input_tensor
    
    x = Add()([x, shortcut])
    x = tf.keras.layers.Activation('relu')(x)
    return x


# Define the model architecture
input_shape_padded = (4, 42, 102, 3)
inputs = Input(shape=input_shape_padded)

# Encoder
x = resnet_block(inputs, 32)
x = resnet_block(x, 64)
encoded = resnet_block(x, 128, strides=(2, 2, 2))

# Decoder
x = UpSampling3D((2, 2, 2))(encoded)
x = resnet_block(x, 64)
x = UpSampling3D((1, 1, 1))(x)
x = resnet_block(x, 32)
decoded = Conv3D(3, (3, 3, 3), activation='sigmoid', padding='same')(x)

# Compile the autoencoder
autoencoder = Model(inputs, decoded)
autoencoder.compile(optimizer='adam', loss='mse')


2023-10-22 14:45:55.796493: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Train the model 
history = autoencoder.fit(X_train_padded, X_train_padded, epochs=5, batch_size=32, validation_data=(X_val_padded, X_val_padded))


Epoch 1/5

KeyboardInterrupt: 

In [None]:
autoencoder.summary()

In [6]:
# Save the history object 
import pandas as pd
import pickle

# Convert the history.history dict to a pandas DataFrame
hist_df = pd.DataFrame(history.history)

# Save to csv
hist_csv_file = 'history.csv'
with open(hist_csv_file, mode='w') as f:
    hist_df.to_csv(f)

# Save to pickle
with open('history.pkl', 'wb') as file:
    pickle.dump(history.history, file)


NameError: name 'history' is not defined

In [7]:
# Load the saved history 

loaded_history = pd.read_csv('history.csv')

with open('history.pkl', 'rb') as file:
    loaded_history = pickle.load(file)

FileNotFoundError: [Errno 2] No such file or directory: 'history.csv'

In [8]:
# Generate Encoded Representations for the Database

encoder = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer(name='encoder').output)
encoded_database = encoder.predict(padded_data)


ValueError: No such layer: encoder. Existing layers are: ['input_1', 'conv3d', 'batch_normalization', 'activation', 'conv3d_1', 'batch_normalization_1', 'conv3d_2', 'add', 'activation_1', 'conv3d_3', 'batch_normalization_2', 'activation_2', 'conv3d_4', 'batch_normalization_3', 'conv3d_5', 'add_1', 'activation_3', 'conv3d_6', 'batch_normalization_4', 'activation_4', 'conv3d_7', 'batch_normalization_5', 'conv3d_8', 'add_2', 'activation_5', 'up_sampling3d', 'conv3d_9', 'batch_normalization_6', 'activation_6', 'conv3d_10', 'batch_normalization_7', 'conv3d_11', 'add_3', 'activation_7', 'up_sampling3d_1', 'conv3d_12', 'batch_normalization_8', 'activation_8', 'conv3d_13', 'batch_normalization_9', 'conv3d_14', 'add_4', 'activation_9', 'conv3d_15'].

In [None]:
# Preprocess and Encode the ##Input Day$$ Data, here use 28th Sep 2023 

Sep_28_2023 = nc.Dataset('/Users/heyj/Desktop/sql project/2020data.nc')
z_Sep_28_2023 = data_2020['z'][:]
u_Sep_28_2023 = data_2020['u'][:]
v_Sep_28_2023 = data_2020['v'][:]

# Normalize the data
scaler = MinMaxScaler()
z_Sep_28_2023_normalized = scaler.fit_transform(z_Sep_28_2023.reshape(-1, 1)).reshape(z_Sep_28_2023.shape)
u_Sep_28_2023_normalized = scaler.fit_transform(u_Sep_28_2023.reshape(-1, 1)).reshape(u_Sep_28_2023.shape)
v_Sep_28_2023_normalized = scaler.fit_transform(v_Sep_28_2023.reshape(-1, 1)).reshape(v_Sep_28_2023.shape)


# Combine the parameters to form a single dataset
data_combined = np.stack((z_Sep_28_2023_normalized, u_Sep_28_2023_normalized , v_Sep_28_2023_normalized), axis=-1)

# Pad the data to get even dimensions
input_day_padded = np.pad(data_combined, ((0, 0), (0, 1), (0, 1), (0, 1), (0, 0)), mode='constant')

# Use the encoder to generate the encoded representation of this input day’s data
input_day_encoded = encoder.predict(input_day_padded)


In [None]:
# Calculate similarities and find the most similar day:

# Use <Euclidean distances> to calculate the Euclidean distances between the input day's encoded representation and the encoded representations of all the days in the database.

from sklearn.metrics.pairwise import euclidean_distances
import numpy as np

# Calculate Euclidean distances 
distances = euclidean_distances(input_day_encoded, encoded_database)

# Find the index of the most similar day (smallest distance)
most_similar_day_index = np.argmin(distances)

# Retrieve the data of the most similar day
most_similar_day_data = original_data_library[most_similar_day_index]




# Use <cosine similarity> to calculate the similarities between the encoded representation of the input day and all the days in the encoded database
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(input_day_encoded, encoded_database)
most_similar_day_index = np.argmax(similarities)
most_similar_day_data = original_data_library[most_similar_day_index]


In [None]:
# This part for model interpretation 

# 1. Visualise Filters

filters = autoencoder.layers[1].get_weights()[0]
# Assuming the filters are 3D
f_min, f_max = filters.min(), filters.max()
filters = (filters - f_min) / (f_max - f_min)

n_filters = 8
for i in range(n_filters):
    f = filters[:, :, :, i]
    plt.subplot(2, 4, i+1)
    plt.imshow(f[:, :, 0], cmap='gray')  # Adjust the last index if needed
    plt.axis('off')
plt.show()


In [None]:
# 2. Visualizing Activations

from tensorflow.keras.models import Model
import matplotlib.pyplot as plt

# Select a sample from your dataset
sample = X_train_padded[10]
sample = np.expand_dims(sample, axis=0)

# Define a model to fetch activations
activations_model = Model(inputs=autoencoder.input, outputs=[layer.output for layer in autoencoder.layers])
activations = activations_model.predict(sample)

# Visualize activations of the first convolutional layer
first_layer_activations = activations[1]
n_filters = first_layer_activations.shape[-1]

for i in range(n_filters):
    plt.subplot(2, 4, i+1)
    plt.imshow(first_layer_activations[0, :, :, i], cmap='gray')
    plt.axis('off')
plt.show()


In [None]:
# 3. Visualizing Encoded Representations

encoder_model = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('encoder').output)
encoded_representation = encoder_model.predict(sample)

plt.imshow(encoded_representation[0, :, :, 0], cmap='gray')  # Adjust indices as needed
plt.title('Encoded Representation')
plt.axis('off')
plt.show()


In [None]:
# 4. Dimensionality Reduction on Encoded Representations (Optional)

from sklearn.manifold import TSNE
import seaborn as sns

# Assuming `encoded_data` contains encoded representations of your entire dataset
tsne = TSNE(n_components=2, perplexity=30, n_iter=300)
tsne_results = tsne.fit_transform(encoded_data.reshape(len(encoded_data), -1))

sns.scatterplot(x=tsne_results[:,0], y=tsne_results[:,1], alpha=0.5)
plt.title('t-SNE of Encoded Representations')
plt.show()


In [None]:
# Evaluation Part Below


In [None]:
# 1. Reconstruction Error:Evaluate how well the autoencoder can reconstruct the input data

from sklearn.metrics import mean_squared_error, mean_absolute_error

decoded_val = autoencoder.predict(X_val_padded)

# Mean Squared Error (MSE)
mse = mean_squared_error(X_val_padded.flatten(), decoded_val.flatten())

# Mean Absolute Error (MAE)
mae = mean_absolute_error(X_val_padded.flatten(), decoded_val.flatten())

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)


In [None]:
# 2. Visual Inspection of Reconstructions:

import matplotlib.pyplot as plt
import numpy as np

index = np.random.randint(X_val_padded.shape[0])
plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
plt.imshow(X_val_padded[index, :, :, 0], cmap='gray')  # Adjust indexing if necessary
plt.title('Original')

plt.subplot(1, 2, 2)
plt.imshow(decoded_val[index, :, :, 0], cmap='gray')
plt.title('Reconstructed')

plt.show()


In [None]:
# 3. Latent Space Visualization:

from sklearn.manifold import TSNE
import seaborn as sns

encoder = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('encoder').output)
encoded_data = encoder.predict(X_val_padded)

tsne = TSNE(n_components=2, perplexity=30, n_iter=300)
tsne_results = tsne.fit_transform(encoded_data.reshape(len(encoded_data), -1))

sns.scatterplot(x=tsne_results[:, 0], y=tsne_results[:, 1], alpha=0.5)
plt.title('t-SNE of Encoded Representations')
plt.show()
