In [1]:
import torch
# from torchvision.models import resnet18, ResNet18_Weights
import torch.nn as nn

import h5py
from pathlib import Path
import pandas as pd
import os
import numpy as np
from pathlib import Path
from torch.utils.data import Dataset
from torchvision import transforms

from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader


  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [2]:


class WDMGalaxiesDataset(Dataset):
    """
    file_path: path to hdf file
    dataset_name: specify train or test datasets
    transform, target_transforms: optional transforms
    """
    
    def __init__(self, file_path, dataset_name, transform = None,
                 target_transform = None):
        self.file_path = file_path
        self.dataset_name = dataset_name
        self.transform = transform 
        self.target_transform = target_transform
        self.length = None
        self._idx_to_name = {} #data_dict 

        with h5py.File(self.file_path, 'r') as hf:
            for gname, group in hf.items():
                if gname == self.dataset_name:
                    sample_id_idx = 0
                    for sim_id, dd in (group.items()):
                        for Mgas_id, ee in enumerate(dd.items()):
                            self._idx_to_name[sample_id_idx] = [sim_id, ee[0]]
                            sample_id_idx+=1 
                    self.length = sample_id_idx
        # print(self._idx_to_name)

    def __len__(self):
        assert self.length is not None
        return self.length
    
    def _open_hdf5(self):
        self._hf = h5py.File(self.file_path, 'r')

    def __getitem__(self, idx):
        # if (torch.is_tensor(idx)):
        #     idx = idx.tolist()
        if not hasattr(self, '_hf'):
            self._open_hdf5()

        sim_id, Mgas_id = self._idx_to_name[idx]
        data = self._hf[self.dataset_name][sim_id][Mgas_id]
        image = np.array(data)
        # label = torch.tensor(data.attrs['WDM'])
        label = torch.tensor(self._hf[self.dataset_name][sim_id].attrs['WDM'])

        if self.transform:
            image = self.transform(image)

        # if self.target_transform:
            # label = self.target_transform(label)

        return image, label
# import wandb

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(42)

<torch._C.Generator at 0x155549ba0210>

In [4]:

class CNNRegression(nn.Module):
    def __init__(self, image_size = (1, 512, 512)):
        super().__init__()
        
        self.image_size = image_size
        self.conv1 = nn.Conv2d(in_channels=self.image_size[0], out_channels=4, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=4, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.linear_line_size = int(16*(image_size[1]//4)*(image_size[2]//4))
        self.fc1 = nn.Linear(in_features=self.linear_line_size, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=1)
        
    def forward(self, x):

        x = self.conv1(x)
        print('Size of tensor after each layer')
        print(f'conv1 {x.size()}')
        x = nn.functional.relu(x)
        print(f'relu1 {x.size()}')
        x = self.pool1(x)
        print(f'pool1 {x.size()}')
        x = self.conv2(x)
        print(f'conv2 {x.size()}')
        x = nn.functional.relu(x)
        print(f'relu2 {x.size()}')
        x = self.pool2(x)
        print(f'pool2 {x.size()}')
        x = x.view(-1, self.linear_line_size)
        print(f'view1 {x.size()}')
        x = self.fc1(x)
        print(f'fc1 {x.size()}')
        x = nn.functional.relu(x)
        print(f'relu2 {x.size()}')
        x = self.fc2(x)
        print(f'fc2 {x.size()}')
        return x
       
        
    


In [5]:
img_dir =  "/mnt/ceph/users/dmohan/dreams/data/dreams/mwzooms_test.hdf5"

mu = 498244.
sigma = 1235061.2500

transforms = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize(mean = mu, std = sigma)])

trainset = WDMGalaxiesDataset(img_dir, 'Train', transforms) 
valset = WDMGalaxiesDataset(img_dir, 'Val', transforms)
testset = WDMGalaxiesDataset(img_dir, 'Test', transforms)

print(len(trainset))
print(len(valset))
print(len(testset))

train_dataloader = DataLoader(trainset, batch_size=2, shuffle=True, num_workers = 1)
val_dataloader = DataLoader(valset, batch_size=2, shuffle=True, num_workers = 1)
test_dataloader = DataLoader(testset, batch_size=2, shuffle=True, num_workers = 1)

20
20
20


In [6]:
model = CNNRegression(image_size = (1, 512, 512)).to(device)
print(model)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)

CNNRegression(
  (conv1): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(4, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=262144, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)


In [7]:
t = torch.zeros((1, 512, 512))
t.shape

torch.Size([1, 512, 512])

In [8]:
model(t.to(device))

Size of tensor after each layer
conv1 torch.Size([4, 512, 512])
relu1 torch.Size([4, 512, 512])
pool1 torch.Size([4, 256, 256])
conv2 torch.Size([16, 256, 256])
relu2 torch.Size([16, 256, 256])
pool2 torch.Size([16, 128, 128])
view1 torch.Size([1, 262144])
fc1 torch.Size([1, 128])
relu2 torch.Size([1, 128])
fc2 torch.Size([1, 1])


tensor([[-0.0094]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [11]:
def train(model, optimizer, criterion, train_loader, device):

    running_loss = 0.0
    for i, (x_train, y_train) in enumerate(train_loader):
        x_train, y_train = x_train.to(device), y_train.to(device)
        optimizer.zero_grad()
        outputs = model(x_train)
        print(outputs.flatten(), y_train)
        loss = criterion(outputs.to(torch.double), y_train)
        # print(outputs.to(torch.double).dtype, y_train.dtype)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss/(i+1)

In [12]:
#training loop
epochs = 10
train_loss = np.zeros(epochs)
for epoch in range(epochs):
    model.train(True)
    avg_loss_train = train(model, optimizer, criterion, train_dataloader, device)
    train_loss[epoch] = avg_loss_train

Size of tensor after each layer
conv1 torch.Size([2, 4, 512, 512])
relu1 torch.Size([2, 4, 512, 512])
pool1 torch.Size([2, 4, 256, 256])
conv2 torch.Size([2, 16, 256, 256])
relu2 torch.Size([2, 16, 256, 256])
pool2 torch.Size([2, 16, 128, 128])
view1 torch.Size([2, 262144])
fc1 torch.Size([2, 128])
relu2 torch.Size([2, 128])
fc2 torch.Size([2, 1])
tensor([ 63066.0352, 257999.6094], device='cuda:0', grad_fn=<ViewBackward0>) tensor([0.1912, 0.1130], device='cuda:0', dtype=torch.float64)


Could not load library libcudnn_cnn_train.so.8. Error: libcudnn_cnn_train.so.8: cannot open shared object file: No such file or directory
Could not load library libcudnn_cnn_train.so.8. Error: libcudnn_cnn_train.so.8: cannot open shared object file: No such file or directory
Could not load library libcudnn_cnn_train.so.8. Error: libcudnn_cnn_train.so.8: cannot open shared object file: No such file or directory
Could not load library libcudnn_cnn_train.so.8. Error: libcudnn_cnn_train.so.8: cannot open shared object file: No such file or directory
Could not load library libcudnn_cnn_train.so.8. Error: libcudnn_cnn_train.so.8: cannot open shared object file: No such file or directory
Could not load library libcudnn_cnn_train.so.8. Error: libcudnn_cnn_train.so.8: cannot open shared object file: No such file or directory
Could not load library libcudnn_cnn_train.so.8. Error: libcudnn_cnn_train.so.8: cannot open shared object file: No such file or directory
Could not load library libcudnn_cn

RuntimeError: GET was unable to find an engine to execute this computation

In [9]:
!nvidia-smi

Tue Oct  8 17:03:35 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-SXM2-32GB           Off |   00000000:1C:00.0 Off |                    0 |
| N/A   30C    P0             57W /  300W |     744MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [12]:
print(torch.version.cuda)

12.3


In [13]:
torch. __version__

'2.2.2'

In [14]:
torch.cuda.is_available()

True

In [13]:
!module load cuda

/bin/bash: module: command not found
