# Pre-Processing Pipeline for ERA5 Dataset  

## Overview

This notebook imports the raw ERA5 dataset and prepares it for use with torchcde, LSTM, and RNN implementations. The resulting train and test tensors are saved as `X_train.pt` and `X_test.pt`.

In [37]:
import os
import numpy as np
import torch
import pandas as pd

# Read the raw ERA5 training CSV into a DataFrame.
raw_csv_path = '../../../data/raw/ERA5/train_weather_data.csv'
data = pd.read_csv(raw_csv_path)

In [38]:
# Drop the 'date' column and fold the remaining values into a (10, 26, 2)
# array. The later cells treat axis 0 as samples, axis 1 as sequence steps
# and axis 2 as channels.
data = (
    data
    .drop(columns=['date'])
    .to_numpy()
    .reshape(10, 26, 2)
)

In [39]:
# Add time channel
# Append the step index (0, 1, ..., n_steps - 1) as an extra trailing channel.
# The sizes are read from `data` instead of being hard-coded, so this cell
# stays consistent with whatever sample count / sequence length `data` has.
# NOTE: `data` is overwritten, so re-running this cell appends another
# time channel.
n_samples, n_steps = data.shape[:2]
time = np.linspace(0, n_steps - 1, n_steps)
# (n_steps,) -> (n_steps, 1) -> (n_samples, n_steps, 1)
time = np.tile(time.reshape(n_steps, 1), (n_samples, 1, 1))
data = np.concatenate((data, time), axis=2)

In [40]:
# Normalize
# Standardize each channel on its own (statistics taken over the sample and
# step axes). A single global mean/std would pool all channels -- including
# the appended time index -- so no individual channel would end up with zero
# mean and unit variance.
# NOTE(review): the statistics are computed over every sample, including the
# ones the split cell later assigns to X_test -- confirm this is acceptable.
channel_mean = data.mean(axis=(0, 1), keepdims=True)
channel_std = data.std(axis=(0, 1), keepdims=True)
# A constant channel has zero spread; divide by 1 there to avoid NaN/inf.
channel_std = np.where(channel_std == 0, 1.0, channel_std)
data = (data - channel_mean) / channel_std

In [41]:
# Cast to single precision in NumPy, then wrap the result as a torch tensor.
data = torch.from_numpy(data.astype(np.float32))

In [42]:
# Train/test split, windowing, and export.
def _to_windows(batch):
    """Trim the last 2 steps of every sequence, cut the remainder into
    consecutive chunks of 12 steps along dim 1, and stack the chunks
    along dim 0."""
    trimmed = batch[:, :-2, :]
    chunks = torch.split(trimmed, 12, dim=1)
    return torch.cat(chunks, dim=0)


# The first 8 samples form the training set; the remaining ones the test set.
X_train = _to_windows(data[:8])
X_test = _to_windows(data[8:])

# Write both tensors to disk, creating the output directory if it is missing.
path = '../../../data/processed/ERA5/regression'
os.makedirs(path, exist_ok=True)
train_file = os.path.join(path, 'X_train.pt')
test_file = os.path.join(path, 'X_test.pt')
torch.save(X_train, train_file)
torch.save(X_test, test_file)