In [1]:
import os
import xarray as xr
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, IterableDataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
d = xr.open_dataset('/scratch/smrserraoseabr/Projects/FluvialCO2/results32/simulation_results_realization_32x32_1.nc')

In [3]:
d

In [21]:
folder = "/scratch/smrserraoseabr/Projects/FluvialCO2/results32"
input_vars = ['Por', 'Perm', 'Pressure']
output_vars = ['Pressure']
input_vars.append('x_encoding')
input_vars.append('y_encoding')
input_vars.append('time_encoding')
output_vars = output_vars




data = d
# Add the meshgrids as new data variables in the xarray
X = data['X'].values
Y = data['Y'].values
TIME = data['time'].values

# Create meshgrids for X and Y dimensions
x_mesh, y_mesh = np.meshgrid(data.X, data.Y, indexing='ij')
TIME_MESH = np.meshgrid(data.time, data.X, data.Y, indexing='ij')
data = data.assign(x_encoding=xr.DataArray(x_mesh, coords=[("X", X), ("Y", Y)]))
data = data.assign(y_encoding=xr.DataArray(y_mesh, coords=[("X", X), ("Y", Y)]))
data = data.assign(time_encoding=xr.DataArray(TIME_MESH[0], coords=[("time", TIME), ("X", X), ("Y", Y)]))




In [22]:
data

In [28]:
data.isel(X=0).isel(Y=0)['time_encoding'].values

array([   0,   30,   60,   90,  120,  150,  180,  210,  240,  270,  300,
        330,  360,  390,  420,  450,  480,  510,  540,  570,  600,  630,
        660,  690,  720,  750,  780,  810,  840,  870,  900,  930,  960,
        990, 1020, 1050, 1080, 1110, 1140, 1170, 1200, 1230, 1260, 1290,
       1320, 1350, 1380, 1410, 1440, 1470, 1500, 1530, 1560, 1590, 1620,
       1650, 1680, 1710, 1740, 1770, 1800])

In [11]:
data

In [32]:
class XarrayDataset(Dataset):
    def __init__(self, folder, input_vars, output_vars):
        self.folder = folder
        self.file_list = os.listdir(folder)
        self.input_vars = input_vars
        self.input_vars.append('x_encoding')
        self.input_vars.append('y_encoding')
        self.input_vars.append('time_encoding')
        self.output_vars = output_vars

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_path = os.path.join(self.folder, self.file_list[idx])
        data = xr.open_dataset(file_path)
        # Add the meshgrids as new data variables in the xarray
        X = data['X'].values
        Y = data['Y'].values
        TIME = data['time'].values

        # Create meshgrids for X and Y dimensions
        x_mesh, y_mesh = np.meshgrid(data.X, data.Y, indexing='ij')
        TIME_MESH = np.meshgrid(data.time, data.X, data.Y, indexing='ij')
        data = data.assign(x_encoding=xr.DataArray(x_mesh, coords=[("X", X), ("Y", Y)]))
        data = data.assign(y_encoding=xr.DataArray(y_mesh, coords=[("X", X), ("Y", Y)]))
        data = data.assign(time_encoding=xr.DataArray(TIME_MESH[0], coords=[("time", TIME), ("X", X), ("Y", Y)]))

        # Concatenate input variables along a new dimension
        input_data_list = []
        for var in self.input_vars:
            if 'time' in data[var].dims:
                input_data_list.append(torch.tensor(data[var].values, dtype=torch.float32).unsqueeze(0))
            else:
                scalar_matrix = torch.tensor(data[var].values, dtype=torch.float32).expand(data.time.size, -1, -1)
                input_data_list.append(scalar_matrix.unsqueeze(0))

        input_data = torch.cat(input_data_list, dim=0)
        input_data = input_data.permute(1, 2, 3, 0)  # Reorder dimensions to (time, shape[0], shape[1], channels)

        # Concatenate output variables along a new dimension
        output_data_list = [torch.tensor(data[var].values, dtype=torch.float32).unsqueeze(0) for var in self.output_vars]
        output_data = torch.cat(output_data_list, dim=0)
        output_data = output_data.permute(1, 2, 3, 0)  # Reorder dimensions to (time, shape[0], shape[1], channels)

        return input_data, output_data


In [33]:
folder = "/scratch/smrserraoseabr/Projects/FluvialCO2/results32"
input_vars = ['Por', 'Perm', 'Pressure'] # Porosity, Permeability, Pressure + x, y, time encodings 
output_vars = ['Pressure']

# Create a dataset and DataLoader
dataset = XarrayDataset(folder, input_vars, output_vars)
dataloader = DataLoader(dataset, batch_size=25, shuffle=True, num_workers=1)

# Test the dataloader by loading a few batches
num_batches_to_check = 2
for i, (input_data, output_data) in enumerate(dataloader):
    if i >= num_batches_to_check:
        break

    print(f"Batch {i+1}:")
    print(f"Input data shape: {input_data.shape}")
    print(f"Output data shape: {output_data.shape}")

Batch 1:
Input data shape: torch.Size([25, 61, 32, 32, 6])
Output data shape: torch.Size([25, 61, 32, 32, 1])
Batch 2:
Input data shape: torch.Size([25, 61, 32, 32, 6])
Output data shape: torch.Size([25, 61, 32, 32, 1])


##Reading from the zip file - Still need to figure out how to read from the zip file

In [31]:
import zipfile
import io
class XarrayDataset(Dataset):
    def __init__(self, zip_file_path, input_vars, output_vars):
        self.zip_file_path = zip_file_path
        self.input_vars = input_vars
        self.input_vars.append('x_encoding')
        self.input_vars.append('y_encoding')
        self.input_vars.append('time_encoding')
        self.output_vars = output_vars

        with zipfile.ZipFile(self.zip_file_path, 'r') as zip_ref:
            self.file_list = [f for f in zip_ref.namelist() if f.endswith('.nc')]

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        with zipfile.ZipFile(self.zip_file_path, 'r') as zip_ref:
            file_path = self.file_list[idx]
            with zip_ref.open(file_path, 'r') as file:
                data = xr.open_dataset(io.BytesIO(file.read()) , engine='scipy')
        # Add the meshgrids as new data variables in the xarray
        X = data['X'].values
        Y = data['Y'].values
        TIME = data['time'].values

        # Create meshgrids for X and Y dimensions
        x_mesh, y_mesh = np.meshgrid(data.X, data.Y, indexing='ij')
        TIME_MESH = np.meshgrid(data.time, data.X, data.Y, indexing='ij')
        data = data.assign(x_encoding=xr.DataArray(x_mesh, coords=[("X", X), ("Y", Y)]))
        data = data.assign(y_encoding=xr.DataArray(y_mesh, coords=[("X", X), ("Y", Y)]))
        data = data.assign(time_encoding=xr.DataArray(TIME_MESH[0], coords=[("time", TIME), ("X", X), ("Y", Y)]))

        # Concatenate input variables along a new dimension
        input_data_list = []
        for var in self.input_vars:
            if 'time' in data[var].dims:
                input_data_list.append(torch.tensor(data[var].values, dtype=torch.float32).unsqueeze(0))
            else:
                scalar_matrix = torch.tensor(data[var].values, dtype=torch.float32).expand(data.time.size, -1, -1)
                input_data_list.append(scalar_matrix.unsqueeze(0))

        input_data = torch.cat(input_data_list, dim=0)
        input_data = input_data.permute(1, 2, 3, 0)  # Reorder dimensions to (time, shape[0], shape[1], channels)

        # Concatenate output variables along a new dimension
        output_data_list = [torch.tensor(data[var].values, dtype=torch.float32).unsqueeze(0) for var in self.output_vars]
        output_data = torch.cat(output_data_list, dim=0)
        output_data = output_data.permute(1, 2, 3, 0)  # Reorder dimensions to (time, shape[0], shape[1], channels)

        return input_data, output_data

In [77]:
zip_file_path = "/scratch/smrserraoseabr/Projects/FluvialCO2/results32.zip"
input_vars = ['Por', 'Perm']
output_vars = ['Pressure']

# Create a dataset and DataLoader
dataset = XarrayDataset(zip_file_path, input_vars, output_vars)
dataloader = DataLoader(dataset, batch_size=5, shuffle=True, num_workers=1)

# Test the dataloader by loading a few batches
num_batches_to_check = 5
for i, (input_data, output_data) in enumerate(dataloader):
    if i >= num_batches_to_check:
        break

    print(f"Batch {i+1}:")
    print(f"Input data shape: {input_data.shape}")
    print(f"Output data shape: {output_data.shape}")


TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/smrserraoseabr/.local/lib/python3.8/site-packages/xarray/backends/scipy_.py", line 102, in _open_scipy_netcdf
    return scipy.io.netcdf_file(filename, mode=mode, mmap=mmap, version=version)
  File "/home/smrserraoseabr/.local/lib/python3.8/site-packages/scipy/io/_netcdf.py", line 277, in __init__
    self._read()
  File "/home/smrserraoseabr/.local/lib/python3.8/site-packages/scipy/io/_netcdf.py", line 598, in _read
    raise TypeError("Error: %s is not a valid NetCDF 3 file" %
TypeError: Error: None is not a valid NetCDF 3 file

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/smrserraoseabr/.local/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/smrserraoseabr/.local/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/smrserraoseabr/.local/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/tmp/ipykernel_2981016/2060069648.py", line 21, in __getitem__
    data = xr.open_dataset(io.BytesIO(file.read()) , engine='scipy')
  File "/home/smrserraoseabr/.local/lib/python3.8/site-packages/xarray/backends/api.py", line 531, in open_dataset
    backend_ds = backend.open_dataset(
  File "/home/smrserraoseabr/.local/lib/python3.8/site-packages/xarray/backends/scipy_.py", line 279, in open_dataset
    store = ScipyDataStore(
  File "/home/smrserraoseabr/.local/lib/python3.8/site-packages/xarray/backends/scipy_.py", line 154, in __init__
    scipy_dataset = _open_scipy_netcdf(
  File "/home/smrserraoseabr/.local/lib/python3.8/site-packages/xarray/backends/scipy_.py", line 113, in _open_scipy_netcdf
    raise TypeError(errmsg)
TypeError: Error: None is not a valid NetCDF 3 file
            If this is a NetCDF4 file, you may need to install the
            netcdf4 library, e.g.,

            $ pip install netcdf4
            


In [78]:
!pip install netCDF4


Defaulting to user installation because normal site-packages is not writeable


In [None]:
import os
import zipfile
import xarray as xr
import torch
from torch.utils.data import IterableDataset

class XarrayDataset(IterableDataset):
    def __init__(self, zip_file, input_vars_time_varying, input_vars_time_fixed, output_vars):
        self.zip_file = zipfile.ZipFile(zip_file, 'r')
        self.file_list = self.zip_file.namelist()
        self.input_vars_time_varying = input_vars_time_varying
        self.input_vars_time_fixed = input_vars_time_fixed
        self.output_vars = output_vars

    def __len__(self):
        return len(self.file_list)

    def read_xarray_file(self, file_path):
        with self.zip_file.open(file_path) as file:
            data = xr.open_dataset(file, engine='netcdf4', backend_kwargs={'mode': 'r'})

            input_data_time_varying = torch.cat([torch.tensor(data[var].values, dtype=torch.float32).unsqueeze(-1) for var in self.input_vars_time_varying], dim=-1)
            input_data_time_fixed = torch.cat([torch.tensor(data[var].values, dtype=torch.float32).unsqueeze(-1) for var in self.input_vars_time_fixed], dim=-1)
            output_data = torch.cat([torch.tensor(data[var].values, dtype=torch.float32).unsqueeze(-1) for var in self.output_vars], dim=-1)

        return input_data_time_varying, input_data_time_fixed, output_data

    def __iter__(self):
        for file_path in self.file_list:
            yield self.read_xarray_file(file_path)


In [10]:
#read from the zip folder
class XarrayDatasetZip(Dataset):
    def __init__(self, zip_file, input_vars, output_vars):
        self.zip_file = zipfile.ZipFile(zip_file, 'r')
        self.file_list = self.zip_file.namelist()
        self.input_vars = input_vars
        self.output_vars = output_vars


    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_path = self.file_list[idx]

        # Read xarray file directly from the zip file
        with self.zip_file.open(file_path) as file:
            data = xr.open_dataset(file, engine='netcdf4', backend_kwargs={'mode': 'r'})



            # Concatenate input variables along a new dimension
            input_data = torch.cat([torch.tensor(data[var].values, dtype=torch.float32).unsqueeze(-1) for var in input_vars], dim=-1)

            # Concatenate output variables along a new dimension
            output_data = torch.cat([torch.tensor(data[var].values, dtype=torch.float32).unsqueeze(-1) for var in output_vars], dim=-1)

        return input_data, output_data

In [35]:
class XarrayDatasetZipIter(IterableDataset):
    def __init__(self, zip_file, input_vars, output_vars):
        self.zip_file = zipfile.ZipFile(zip_file, 'r')
        self.file_list = self.zip_file.namelist()
        self.input_vars = input_vars
        self.output_vars = output_vars

    def __len__(self):
        return len(self.file_list)

    def read_xarray_file(self, file_path):
        with self.zip_file.open(file_path) as file:
            data = xr.open_dataset(file, backend_kwargs={'mode': 'r'})



            input_data = torch.cat([torch.tensor(data[var].values, dtype=torch.float32).unsqueeze(-1) for var in input_vars], dim=-1)
            output_data = torch.cat([torch.tensor(data[var].values, dtype=torch.float32).unsqueeze(-1) for var in output_vars], dim=-1)

        return input_data, output_data

    def __iter__(self):
        for file_path in self.file_list:
            yield self.read_xarray_file(file_path)

In [36]:
""""
Note that we set batch_size=None in the DataLoader. This is because IterableDataset does not support automatic batching, so you'll need to handle batching manually in your training loop.
"""
input_vars = ['Por', 'Perm']
output_vars = ['GasSat', 'Pressure', 'CO_2']
folder_zip="/scratch/smrserraoseabr/Projects/FluvialCO2/results32.zip"
#dataset = XarrayDataset(folder, input_vars, output_vars)
#dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)
dataset = XarrayDatasetZipIter(folder_zip, input_vars, output_vars)
dataloader = DataLoader(dataset, batch_size=None, num_workers=4)

In [37]:
# Test the dataloader by loading a few batches
num_batches_to_check = 5

for i, (input_data, output_data) in enumerate(dataloader):
    print(f"Batch {i+1}:")
    print(f"Input data shape: {input_data.shape}")
    print(f"Output data shape: {output_data.shape}")
    print(f"Input data: {input_data}")
    print(f"Output data: {output_data}")
    print("\n")

    if i >= num_batches_to_check - 1:
        break


ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/smrserraoseabr/.local/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/smrserraoseabr/.local/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 39, in fetch
    data = next(self.dataset_iter)
  File "/tmp/ipykernel_2941125/2706523513.py", line 24, in __iter__
    yield self.read_xarray_file(file_path)
  File "/tmp/ipykernel_2941125/2706523513.py", line 13, in read_xarray_file
    data = xr.open_dataset(file, backend_kwargs={'mode': 'r'})
  File "/home/smrserraoseabr/.local/lib/python3.8/site-packages/xarray/backends/api.py", line 515, in open_dataset
    engine = plugins.guess_engine(filename_or_obj)
  File "/home/smrserraoseabr/.local/lib/python3.8/site-packages/xarray/backends/plugins.py", line 155, in guess_engine
    raise ValueError(error_msg)
ValueError: did not find a match in any of xarray's currently installed IO backends ['netcdf4', 'scipy']. Consider explicitly selecting one of the installed engines via the ``engine`` parameter, or installing additional IO dependencies, see:
https://docs.xarray.dev/en/stable/getting-started-guide/installing.html
https://docs.xarray.dev/en/stable/user-guide/io.html


In [30]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x15553b2da580>

In [None]:
""""
def train_epoch(dataloader, model, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    n_batches = 0

    batch_inputs = []
    batch_outputs = []
    batch_size = 32

    for inputs, outputs in dataloader:
        batch_inputs.append(inputs.to(device))
        batch_outputs.append(outputs.to(device))

        if len(batch_inputs) == batch_size:
            inputs_batch = torch.stack(batch_inputs)
            outputs_batch = torch.stack(batch_outputs)

            optimizer.zero_grad()
            predictions = model(inputs_batch)
            loss = criterion(predictions, outputs_batch)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            n_batches += 1

            batch_inputs.clear()
            batch_outputs.clear()

    return running_loss / n_batches
"""

In [None]:
import zipfile
import torch
from torchvision import transforms
from PIL import Image
import io

def load_image_from_zip(zip_path, file_path):
    with zipfile.ZipFile(zip_path, 'r') as z:
        with z.open(file_path) as f:
            img = Image.open(io.BytesIO(f.read()))
            transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
            ])
            img_tensor = transform(img)
            return img_tensor

zip_path = "path/to/your/zipfile.zip"
file_path = "path/to/your/image.jpg"

img_tensor = load_image_from_zip(zip_path, file_path)
print(img_tensor.shape)


In [15]:
import zipfile
import xarray as xr
import torch
from torch.utils.data import IterableDataset



In [16]:
zip_path = r'/scratch/smrserraoseabr/Projects/FluvialCO2/results32.zip'

In [18]:
zip_file = zipfile.ZipFile(zip_path, 'r')
    

In [21]:
len(zip_file.namelist()[1:])

1000

In [24]:
zipfile.ZipFile(zip_path, 'r')

<zipfile.ZipFile filename='/scratch/smrserraoseabr/Projects/FluvialCO2/results32.zip' mode='r'>

In [25]:
with zip_file.open(file_path) as file:
        data = xr.open_dataset(file, engine='netcdf4', backend_kwargs={'mode': 'r'})




NameError: name 'file_path' is not defined

In [None]:
file_list = zip_file.namelist()