# Pre-Processing Pipeline for ERA5 Dataset  

## Overview

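This notebook imports the raw ERA5 dataset and prepares it for use with torchcde, LSTM, and RNN implementations. The pipeline reshapes the raw table into fixed-length sequences, appends a time channel, splits the sequences into train and test sets, normalizes each sequence, and saves the resulting tensors.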

In [46]:
import os
import numpy as np
import torch
import pandas as pd

# Read the raw ERA5 training table from CSV into a pandas DataFrame
raw_csv_path = '../../../data/raw/ERA5/train_weather_data.csv'
data = pd.read_csv(raw_csv_path)

In [47]:
# Drop the 'date' column, convert the remaining columns to a NumPy array and
# reshape it to (10, 26, 2).
# Presumably (samples, sequence steps, variables) -- TODO confirm against the CSV layout.
data = data.drop(columns=['date']).to_numpy().reshape(10, 26, 2)

In [48]:
# Add time channel
# Read the sizes from `data` rather than repeating the literals 10 and 26, so
# this cell stays consistent with whatever shape the array actually has.
n_samples, seq_len = data.shape[:2]
# Time stamps 0, 1, ..., seq_len - 1 as a (seq_len, 1) float column
time = np.linspace(0, seq_len - 1, seq_len).reshape(seq_len, 1)
# Repeat the same column once per sample -> (n_samples, seq_len, 1)
time = np.tile(time, (n_samples, 1, 1))
# Append the time stamps after the existing channels (along axis 2).
# NOTE: `data` is reassigned here, so re-running this cell appends a further
# time channel each time.
data = np.concatenate((data, time), axis=2)

In [49]:
# Copy the array into a torch tensor, then cast it to 32-bit floats
data = torch.tensor(data).to(torch.float32)

In [50]:
def _normalize_per_sequence(batch):
    """Standardise each channel of each sequence to zero mean and unit std.

    The mean and standard deviation are computed along dim=1 of a
    3-D tensor, separately for every index along dim=0 and dim=2.
    A new tensor is returned; `batch` (and any tensor it is a view of)
    is left unmodified.
    """
    mean = batch.mean(dim=1, keepdim=True)
    std = batch.std(dim=1, keepdim=True)
    # A channel that is constant within a sequence has std == 0, which would
    # turn the whole channel into NaN (0 / 0). Divide by 1 there instead, so
    # the centred channel is simply all zeros.
    safe_std = torch.where(std > 0, std, torch.ones_like(std))
    return (batch - mean) / safe_std


# Split into train and test (first 8 samples / remaining samples) and
# delete the last 2 sequence elements from each sample
train_slice = data[:8, :-2, :]
test_slice = data[8:, :-2, :]

# Normalize each sequence.
# The slices above are views of `data`; normalising out of place keeps `data`
# intact and yields compact tensors, so torch.save below writes only the
# trimmed, normalised values rather than the storage of the full array.
X_train = _normalize_per_sequence(train_slice)
X_test = _normalize_per_sequence(test_slice)

# Save the data
path = '../../../data/processed/ERA5/'
os.makedirs(path, exist_ok=True)
torch.save(X_train, os.path.join(path, 'X_train.pt'))
torch.save(X_test, os.path.join(path, 'X_test.pt'))