In [1]:
import os
import mlflow
import torch
import random
import numpy as np
from tqdm import tqdm
import sys

# Locations of the project code and of the shared project directory.
# The defaults are the original deployment paths; set the environment
# variables below to run the notebook from another checkout or machine
# without editing it.
PROJECT_PATH = os.environ.get(
    'ROAD_SAFETY_PROJECT_PATH',
    '/home/gergogaliger/PROJECTS/road_safety_analysis/frequency_analysis/model_development',
)
SHARED_PROJECT_PATH = os.environ.get(
    'ROAD_SAFETY_SHARED_PROJECT_PATH',
    '/opt/jupyterhub/SHARED_PROJECTS/road_safety_analysis',
)

# Make the project's `src` package importable. The guard keeps sys.path free
# of duplicate entries when this cell is executed more than once.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

# Project-local imports; they must come after the sys.path update above.
from src.utils.data_utils import read_mlflow_dataset
from src.trainers.AETrainer import AETrainer

# Seed used for the random number generators in the training cell.
RANDOM_SEED = 42

# Point the MLflow client at the tracking server.
mlflow.set_tracking_uri('http://clijupyter01.kozlek.local:4242')

In [2]:
# Select the MLflow experiment that subsequent runs are recorded under.
experiment = mlflow.set_experiment(experiment_name='01-01_denoising_autoencoder')

In [3]:
with mlflow.start_run(log_system_metrics=True) as run:
    # Seed every random number generator in use and force deterministic
    # cuDNN algorithm selection so that repeated runs are comparable.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    torch.manual_seed(RANDOM_SEED)
    torch.cuda.manual_seed(RANDOM_SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Read the train, validation and test splits of one dataset snapshot.
    # NOTE(review): the original comment says the datasets are also logged;
    # that presumably happens inside read_mlflow_dataset - confirm there.
    data_date = '18-06-2024'
    targets = 'acc_no_23-24'
    X_train, y_train, _ = read_mlflow_dataset(SHARED_PROJECT_PATH, data_date, 'train', targets=targets, device=device)
    X_valid, y_valid, _ = read_mlflow_dataset(SHARED_PROJECT_PATH, data_date, 'validate', targets=targets, device=device)
    X_test, y_test, non_accident_dim = read_mlflow_dataset(SHARED_PROJECT_PATH, data_date, 'test', targets=targets, device=device)

    # Constructor arguments of the trainer.
    params = {
        'inp_dim': non_accident_dim,
        'noise_factor': 0.5,
        'enc_dim': 4,
        'learning_rate': 1e-2,
        'weight_decay': 1e-8
    }

    # Settings that influence the result but are not trainer constructor
    # arguments. They are kept out of `params` (which is unpacked into
    # AETrainer) and logged alongside it, so that the run can be reproduced
    # from its recorded parameters alone.
    n_epochs = 1500
    run_settings = {
        'epochs': n_epochs,
        'random_seed': RANDOM_SEED,
        'data_date': data_date,
        'targets': targets,
    }
    mlflow.log_params({**params, **run_settings})

    # Define, train and evaluate model
    trainer = AETrainer(**params)
    trainer.train(X_train, y_train, X_valid, y_valid, n_epochs)
    trainer.evaluate(X_test, y_test)

2024/06/24 18:15:19 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
  return _dataset_source_registry.resolve(
  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  return _dataset_source_registry.resolve(
  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  return _dataset_source_registry.resolve(
  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
100%|██████████| 1500/1500 [00:22<00:00, 66.25it/s]
2024/06/24 18:15:47 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/06/24 18:15:47 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!
