In [1]:
import os
import mlflow
import torch
import random
import numpy as np
from tqdm import tqdm
from os.path import join as pjoin
from datetime import datetime
import sys

# Root of the project checkout. The default is the original hard-coded location;
# set the ECG_DENOISING_PROJECT_PATH environment variable to run the notebook
# from a different checkout without editing this cell.
PROJECT_PATH = os.environ.get(
    'ECG_DENOISING_PROJECT_PATH',
    'C:\\Users\\galiger.gergo\\Desktop\\ecg-denoising\\workspace',
)
# Pass each path component separately so os.path.join inserts the separator of
# the current platform instead of relying on literal backslashes.
PROJECT_SRC_PATH = os.path.join(PROJECT_PATH, 'src', 'refactored')
PROJECT_DCDL_PATH = os.path.join(PROJECT_SRC_PATH, 'src', 'models', 'DCDicL')

# Make the project packages importable. Entries already on sys.path are skipped
# so that re-running this cell does not keep appending duplicates.
for _path in (PROJECT_SRC_PATH, PROJECT_DCDL_PATH):
    if _path not in sys.path:
        sys.path.append(_path)

from src.trainers.DCDicLTrainer import DCDicLTrainer
from src.utils.loader import DataSplit
from src.models.DCDicL.models.model import Model as DCDicL

# Single seed value reused wherever this notebook seeds a random number generator.
RANDOM_SEED = 42

# Direct the MLflow client to the tracking server at this address.
mlflow.set_tracking_uri(uri='http://localhost:8080')

In [2]:
# Directory that holds all input data for this notebook.
DATA_DIR = os.path.join(PROJECT_PATH, 'data')
# Data file locations. DATA_FILE_BPDN_MAX and DICT_FILE_BW are joined onto
# DATA_DIR when loaded; the remaining entries are handed to DataSplit together
# with DATA_DIR (presumably resolved relative to it as well — TODO confirm).
DATA_FILE_GEN = 'generated/BW_master_10000_2024-04-07-12-43-32.pkl'
DATA_FILE_SIGS = 'steinbrinker/testing_data_mvg_avg.npy'
# Candidate noise sources; NOISE_TYPE decides which one becomes DATA_FILE_NOISE.
DATA_FILE_BW = 'mit-bih/bw'
DATA_FILE_GAUSS = 'generated/gaussian_noise.npy'
# NOTE: DATA_FILE_BPDN_MAX and DATA_FILE_BPDN_FINAL currently point at the same file.
DATA_FILE_BPDN_MAX = 'generated/BW_alphas-BPDN_10000_2024-04-07-12-43-32.npy'
# Active BPDN file; the commented-out lines below are alternative files
# (the file names suggest 3 and 5 iterations instead of 1 — TODO confirm).
DATA_FILE_BPDN = 'generated/BW_alphas-BPDN-1iters_10000_2024-04-07-12-43-32.npy'
# DATA_FILE_BPDN = 'generated/BW_alphas-BPDN-3iters_10000_2024-04-07-12-43-32.npy'
# DATA_FILE_BPDN = 'generated/BW_alphas-BPDN-5iters_10000_2024-04-07-12-43-32.npy'
DATA_FILE_BPDN_FINAL = 'generated/BW_alphas-BPDN_10000_2024-04-07-12-43-32.npy'
# Dictionary file; loaded with np.load when the training cell runs.
DICT_FILE_BW = 'steinbrinker/dictionary_BW_real_data.npy'
# Which noise source to use; must be one of the keys of _NOISE_FILES below.
NOISE_TYPE = 'bw'
# Supported noise types mapped to their data files.
_NOISE_FILES = {
    'bw': DATA_FILE_BW,
    'gauss': DATA_FILE_GAUSS,
}
# Fail immediately on an unsupported value instead of leaving DATA_FILE_NOISE
# undefined and hitting a NameError later, when the data is loaded.
if NOISE_TYPE not in _NOISE_FILES:
    raise ValueError(
        f"Unsupported NOISE_TYPE {NOISE_TYPE!r}; expected one of {sorted(_NOISE_FILES)}"
    )
DATA_FILE_NOISE = _NOISE_FILES[NOISE_TYPE]
# DATA_SIZE is not referenced in the visible cells (it matches the 10000 that
# appears in the data file names); BATCH_SIZE is passed to DataSplit and
# recorded in the run parameters.
DATA_SIZE, BATCH_SIZE = 10_000, 10
# Train / validation / test proportions handed to DataSplit
# (the values sum to 100, so presumably percentages — TODO confirm).
TVT_SPLIT = dict(train=80, valid=10, test=10)


# Forwarded to the trainer as params['load_model_run'] / params['load_model_epoch'].
LOAD_MODEL_RUN, LOAD_MODEL_EPOCH = 'bdb03e3d83e94eccbebcaee555cb0da1', None

# Learning-rate settings, forwarded to the trainer as params['lr'],
# params['lr_dec_after'] and params['lr_dec_every'].
LEARNING_RATE = 0.1
LR_DEC_AFTER = 15_000
LR_DEC_EVERY = 10

# Option dictionary passed to the DCDicL model constructor and also logged with
# the run parameters. Key order is kept as-is because the dict is stringified
# when it is logged.
opt = {
    'task': 'train',                                  # taskname
    'is_train': True,
    'gpu_ids': [0],                                   # gpu id
    'path': {
        'root': 'debug/denoising',
        'pretrained_netG': None,                      # pretrained path
    },
    'data': {
        'type': 'denoising',
        'n_channels': 1,                              # image channels
        'train': {
            'sigma': [0, 50],
            'dataroot_H': '~/data/denoising/train/',
            'H_size': 128,                            # patch size
            'num_workers': 8,
            'batch_size': 32,                         # batch size
        },
        'test': {
            'sigma': [15, 25, 50],
            'dataroot_H': '~/data/denoising/test',    # test path
        },
    },
    'netG': {
        'type': 'denoising',
        'd_size': 3,                                  # dictionary size
        'n_iter': 2,                                  # stages
        'in_nc': 1,                                   # image channel
        'out_nc': 1,
        'nc_x': [64, 128, 256, 512],
        'nb': 2,                                      # number of blocks
    },
    'train': {
        'manual_seed': RANDOM_SEED,
        'reload_broadcast': False,
        'G_optimizer_lr': 1e-4,                       # lr
        'G_scheduler_milestones': [200000, 400000, 600000, 800000],  # milestones
        'G_scheduler_gamma': 0.5,
        'checkpoint_test': 10,
        'checkpoint_savemodel': 5000,
        'checkpoint_log': 100,
        'checkpoint_saveimage': 5000,
        'checkpoint_visual': 5000,
    },
    'test': {
        'visualize': True,
    },
}

In [3]:
# Make 'dcdicl-initial-test' the active MLflow experiment and keep the returned handle.
experiment = mlflow.set_experiment(experiment_name='dcdicl-initial-test')

In [4]:
# One MLflow run covering seeding, data loading, model construction, training
# and evaluation on the test loader.
with mlflow.start_run(log_system_metrics=True) as run:
    # Seed every random number generator in use (Python, NumPy, PyTorch CPU and
    # CUDA) and switch cuDNN to deterministic mode with benchmarking disabled.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    torch.manual_seed(RANDOM_SEED)
    torch.cuda.manual_seed(RANDOM_SEED)
    torch.cuda.manual_seed_all(RANDOM_SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Dedicated, seeded generator handed to DataSplit.
    generator = torch.Generator()
    generator.manual_seed(RANDOM_SEED)

    # Define PyTorch device
    # NOTE(review): the saved output of this cell ends in a RuntimeError from
    # cudaGetDeviceCount(); opt['gpu_ids'] stays [0] whatever device is chosen
    # here — check whether the model selects its device from opt.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Read and log train, validation and test datasets
    trn_ldr, val_ldr, tst_ldr = DataSplit(DATA_DIR, NOISE_TYPE, DATA_FILE_GEN, DATA_FILE_SIGS, DATA_FILE_NOISE,
                                          DATA_FILE_BPDN, DATA_FILE_BPDN_FINAL, TVT_SPLIT, BATCH_SIZE, generator=generator)

    # Read the dictionary file once and derive both representations from it.
    # torch.from_numpy shares memory with the NumPy array, so clone() gives Psi
    # its own storage before it is moved to the target device.
    dictionary = np.load(pjoin(DATA_DIR, DICT_FILE_BW))
    Psi = torch.from_numpy(dictionary).clone().detach().to(device=device)
    bpdn_est = np.load(pjoin(DATA_DIR, DATA_FILE_BPDN_MAX))

    # Load model
    model = DCDicL(opt)
    model.init()

    # Specify and log training parameters
    # `dt` is a timestamp string; it is not referenced again within this cell.
    dt = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    params = {
        'device': device,
        'batch_size': BATCH_SIZE,
        'lr': LEARNING_RATE,
        'lr_dec_after': LR_DEC_AFTER,
        'lr_dec_every': LR_DEC_EVERY,
        'opt': opt,
        'load_model_run': LOAD_MODEL_RUN,
        'load_model_epoch': LOAD_MODEL_EPOCH
    }
    mlflow.log_params(params)

    # Define, train and evaluate model
    trainer = DCDicLTrainer(model, Psi, bpdn_est, dictionary, params)
    trainer.train(trn_ldr, val_ldr, 15000, start_epoch=0, log_model_every=100, log_comp_fig_every=10)
    # Evaluate twice on the test loader: once with the trainer's default
    # criterion, once with Huber loss (labelled 'huber').
    trainer.evaluate(tst_ldr)
    trainer.evaluate(tst_ldr, criterion=torch.nn.HuberLoss(), crit_text='huber')

2024/11/29 14:04:15 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
  return torch._C._cuda_getDeviceCount() > 0


Initialization method [orthogonal + uniform], gain is [0.20]


2024/11/29 14:04:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run rare-snail-464 at: http://localhost:8080/#/experiments/123604823160901751/runs/05302b2c093444aea9b4d666bca42de4.
2024/11/29 14:04:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/123604823160901751.
2024/11/29 14:04:16 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/11/29 14:04:16 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


RuntimeError: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 1: invalid argument