## Load packages and data

In [1]:
from upath import UPath as Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nbvv

from serotiny.io.image import image_loader
from cytodata_aics.io_utils import rescale_image



In [2]:
# Load the per-cell manifest; `mitocells` is joined onto the embeddings later on.
mitocells_csv = '/home/aicsuser/cytodata-hackathon-base/data/mitocells.csv'
mitocells = pd.read_csv(mitocells_csv)

# Model

In [3]:
from serotiny.transforms.dataframe.transforms import split_dataframe

# Directory holding the hackathon data files.
data_dir = "/home/aicsuser/cytodata-hackathon-base/data"

# NOTE(review): this is the same file that is loaded into `mitocells` in the
# previous cell, so the manifest is parsed twice under two names.
manifest_csv = f"{data_dir}/mitocells.csv"
df = pd.read_csv(manifest_csv)

# Report the size of the manifest, one line per figure.
for summary_line in (
    f'Number of cells: {len(df)}',
    f'Number of columns: {len(df.columns)}',
):
    print(summary_line)

Number of cells: 8179
Number of columns: 79


In [4]:
# Count how many rows carry each `cell_stage` label (most frequent first).
cell_stage_counts = df["cell_stage"].value_counts()
cell_stage_counts

M0               2000
M1M2             2000
M4M5             2000
M6M7_complete    1198
M3                981
Name: cell_stage, dtype: int64

In [5]:
import os

# Switch the working directory to the repository root before the `serotiny`
# commands below are launched.
repo_root = "/home/aicsuser/cytodata-hackathon-base"
os.chdir(repo_root)

In [6]:
from datetime import datetime


def now_str():
    """Return the current local time formatted as ``YYYYMMDD_HHMMSS``.

    Helper for building run names, so that the same run is not referred to
    unintentionally.
    """
    return datetime.now().strftime("%Y%m%d_%H%M%S")

# Load and train

In [7]:
run_name = f"teamx_single_channel_zproj_{now_str()}"
print(run_name)

!serotiny train \
    model=Single_channel_VAE_model \
    data=Single_channel_VAE_dataloader \
    mlflow.experiment_name=vae_single_channel \
    mlflow.run_name={run_name} \
    trainer.gpus=[0] \
    trainer.max_epochs=30

teamx_single_channel_zproj_20221019_230020
[2022-10-19 23:00:24,665][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42
[2022-10-19 23:00:24,665][serotiny.ml_ops.ml_ops][INFO] - Instantiating datamodule
[2022-10-19 23:00:27,216][serotiny.ml_ops.ml_ops][INFO] - Instantiating trainer
[2022-10-19 23:00:27,251][pytorch_lightning.utilities.rank_zero][INFO] - GPU available: True, used: True
[2022-10-19 23:00:27,251][pytorch_lightning.utilities.rank_zero][INFO] - TPU available: False, using: 0 TPU cores
[2022-10-19 23:00:27,251][pytorch_lightning.utilities.rank_zero][INFO] - IPU available: False, using: 0 IPUs
[2022-10-19 23:00:27,251][pytorch_lightning.utilities.rank_zero][INFO] - HPU available: False, using: 0 HPUs
[2022-10-19 23:00:27,251][serotiny.ml_ops.ml_ops][INFO] - Instantiating model
[2022-10-19 23:00:27,283][serotiny.networks.basic_cnn.basic_cnn][INFO] - Determined 'compressed size': 41496 for CNN
[2022-10-19 23:00:28,218][serotiny.networks.basic_cnn.basic_cnn][INFO] -

# Test and visualize latent space

In [8]:
# Run `serotiny test` with the same model/data configs, MLflow experiment name
# and run name as the training cell; IPython fills in `{run_name}` from the
# Python variable of that name.
# NOTE(review): `trainer/callbacks=vae` presumably selects the callbacks that
# write the CSV artifacts downloaded in the next cell — confirm against the
# serotiny config.
# NOTE(review): no `trainer.gpus` override is passed here, and the recorded log
# for this cell reports "GPU available: True, used: False" — confirm that
# running the test step without the GPU is intended.
!serotiny test \
    model=Single_channel_VAE_model \
    data=Single_channel_VAE_dataloader \
    mlflow.experiment_name=vae_single_channel \
    mlflow.run_name={run_name} \
    trainer/callbacks=vae \
    ++force=True

[2022-10-19 23:01:31,229][pytorch_lightning.utilities.seed][INFO] - Global seed set to 42
[2022-10-19 23:01:31,230][serotiny.ml_ops.ml_ops][INFO] - Instantiating datamodule
[2022-10-19 23:01:33,702][serotiny.ml_ops.ml_ops][INFO] - Instantiating trainer
  rank_zero_warn(
[2022-10-19 23:01:33,862][pytorch_lightning.utilities.rank_zero][INFO] - GPU available: True, used: False
[2022-10-19 23:01:33,863][pytorch_lightning.utilities.rank_zero][INFO] - TPU available: False, using: 0 TPU cores
[2022-10-19 23:01:33,863][pytorch_lightning.utilities.rank_zero][INFO] - IPU available: False, using: 0 IPUs
[2022-10-19 23:01:33,863][pytorch_lightning.utilities.rank_zero][INFO] - HPU available: False, using: 0 HPUs
  rank_zero_warn(
[2022-10-19 23:01:33,864][serotiny.ml_ops.ml_ops][INFO] - Instantiating model
[2022-10-19 23:01:33,895][serotiny.networks.basic_cnn.basic_cnn][INFO] - Determined 'compressed size': 41496 for CNN
[2022-10-19 23:01:34,850][serotiny.networks.basic_cnn.basic_cnn][INFO] - Deter

In [9]:
import mlflow
from serotiny.ml_ops.mlflow_utils import download_artifact

mlflow.set_tracking_uri("http://mlflow.mlflow.svc.cluster.local")


def _read_run_csv(artifact_path):
    """Download one CSV artifact of the run named `run_name` and load it with pandas."""
    with download_artifact(artifact_path, experiment_name="vae_single_channel", run_name=run_name) as path:
        return pd.read_csv(path)


# NOTE(review): in the saved notebook this cell ended in an MlflowException
# (the tracking server kept answering HTTP 500 for embeddings.csv), so the
# cells that follow were never executed there.
embeddings = _read_run_csv("dataframes/embeddings.csv")
kl_per_dimension = _read_run_csv("dataframes/stats_per_dim_test.csv")

  for _experiment in mlflow.list_experiments():
  return MlflowClient().list_experiments(
  for run_info in mlflow.list_run_infos(experiment_id=experiment_id):
  return MlflowClient().list_run_infos(


MlflowException: The following failures occurred while downloading one or more artifacts from http://mlflow.mlflow.svc.cluster.local/api/2.0/mlflow-artifacts/artifacts/10/cd94aa32c69e4f99b4995e7edbf21139/artifacts: {'dataframes/embeddings.csv': 'MlflowException("API request to http://mlflow.mlflow.svc.cluster.local/api/2.0/mlflow-artifacts/artifacts/10/cd94aa32c69e4f99b4995e7edbf21139/artifacts/dataframes/embeddings.csv failed with exception HTTPConnectionPool(host=\'mlflow.mlflow.svc.cluster.local\', port=80): Max retries exceeded with url: /api/2.0/mlflow-artifacts/artifacts/10/cd94aa32c69e4f99b4995e7edbf21139/artifacts/dataframes/embeddings.csv (Caused by ResponseError(\'too many 500 error responses\'))")'}

In [None]:
# Rank the latent dimensions from the per-dimension KLD statistics.
# NOTE(review): the positional arguments 0 and 8 are passed straight to the
# helper; presumably they bound how many ranked dimensions come back (8 are
# used below) — confirm against get_ranked_dims.
from cytodata_aics.vae_utils import get_ranked_dims

ranked_dims = get_ranked_dims(kl_per_dimension, 0, 8)
ranked_z_dim_list, mu_std_list, mu_mean_list = ranked_dims

In [None]:
# Turn each ranked dimension into its "mu_<dim>" label, the form that is matched
# against the embedding column names below.
# Entries that already carry the prefix are left untouched, so re-running this
# cell does not produce "mu_mu_<dim>" labels that match no column (which would
# silently reduce `embeddings` to just `CellId`).
ranked_z_dim_list = [
    dim if str(dim).startswith("mu_") else f"mu_{dim}"
    for dim in ranked_z_dim_list
]

# Target labels mu_1 .. mu_8, paired with the ranked dimensions in order.
updated_ranks = [f"mu_{rank}" for rank in range(1, 9)]

# Keep only the ranked latent columns (in their existing column order) plus the
# cell identifier.
ranked_columns = set(ranked_z_dim_list)
embeddings = embeddings[[col for col in embeddings.columns if col in ranked_columns] + ['CellId']]

In [None]:
# Relabel the selected latent columns by rank: the n-th entry of
# `ranked_z_dim_list` becomes the n-th entry of `updated_ranks` (mu_1 .. mu_8).
rename_cols = dict(zip(ranked_z_dim_list, updated_ranks))
embeddings.rename(columns=rename_cols, inplace=True)

# Put the columns in sorted label order. The sort is a plain string sort, which
# matches rank order only because the ranks are single-digit; 'CellId' sorts
# ahead of the lowercase 'mu_*' labels.
ordered_columns = sorted(embeddings.columns)
embeddings = embeddings.reindex(columns=ordered_columns)

In [None]:
# Columns taken from the manifest: every column whose name contains
# "shape_mode", plus three nuclear size measurements.
shape_mode_columns = [col for col in mitocells.columns if "shape_mode" in col]
nuclear_columns = ['nuclear_volume', 'nuclear_height', 'nuclear_surface_area']
cell_features = mitocells[['CellId'] + shape_mode_columns + nuclear_columns]

# Attach those features to the embeddings by cell id (pandas' default inner
# join, so only ids present in both frames are kept).
embeddings = embeddings.merge(cell_features, on='CellId')

In [None]:
# Pairwise correlations between all columns except the cell identifier, drawn
# on a fixed [-1, 1] colour scale.
feature_table = embeddings.drop(columns='CellId')
correlation = feature_table.corr()
sns.heatmap(correlation, vmin=-1, vmax=1, cmap='RdBu_r')