In [1]:
condition_key = 'dataset'
cell_type_key = 'cell_type_tumor'

# Ray Tune

## Data & libs

In [1]:
import pickle

import ray
import scanpy as sc
import scvi
import seaborn as sns
import torch
from ray import tune
from scvi import autotune

scvi.settings.seed = 0
print("Last run with scvi-tools version:", scvi.__version__)

sc.set_figure_params(figsize=(6, 6), frameon=False)
sns.set_theme()
# torch.set_float32_matmul_precision("high")
save_dir = '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner'
scvi.settings.logging_dir = save_dir

%config InlineBackend.print_figure_kwargs={"facecolor": "w"}
%config InlineBackend.figure_format="retina"

Global seed set to 0


Last run with scvi-tools version: 1.1.5


In [2]:
def extract_model_params_to_dict(pickle_file_path):
    # Load the pickled object
    with open(pickle_file_path, 'rb') as f:
        data = pickle.load(f)

    # Initialize a dictionary to store the extracted model parameters
    model_params_dict = {}

    # Navigate through the object structure to get model_params
    try:
        model_params = data['experiment'].spec['config']['model_params']
        
        # Extract the categories for n_hidden, n_layers, and gene_likelihood
        for key, value in model_params.items():
            model_params_dict[key] = value.categories
        # model_params_dict['n_layers'] = model_params['n_layers'].categories
        # model_params_dict['gene_likelihood'] = model_params['gene_likelihood'].categories
        
    except KeyError as e:
        print(f"Key error: {e} - Please check if the structure of the pickled object is correct.")
    except AttributeError as e:
        print(f"Attribute error: {e} - Please ensure that the objects have a 'categories' attribute.")

    return model_params_dict

# Example usage:
# Replace 'path_to_your_pickle_file.pkl' with the actual path to your pickle file.
model_params_dict = extract_model_params_to_dict('/tmp/ray/session_2024-08-08_02-12-53_330111_1230/artifacts/2024-08-08_03-25-39/scvi1024/driver_artifacts/search_gen_state-2024-08-08_03-25-39.json')
model_params_dict

{'n_hidden': [256, 512, 1024],
 'n_layers': [3, 4, 5, 6],
 'gene_likelihood': ['nb', 'zinb']}

In [3]:
adata = sc.read_h5ad('/root/datos/maestria/netopaas/luca/data/atlas/extended_tumor_hvg.h5ad')

In [4]:
adata.obs.study.cat.categories

Index(['Chen_Zhang_2020', 'He_Fan_2021', 'Kim_Lee_2020',
       'Lambrechts_Thienpont_2018', 'Laughney_Massague_2020',
       'Maynard_Bivona_2020', 'UKIM-V', 'Wu_Zhou_2021', 'Zilionis_Klein_2019'],
      dtype='object')

For testing if configs work without loading a gigantic adata

In [6]:
# from scvi.data import synthetic_iid

# adata = synthetic_iid()
# adata

## Model and tune

In [5]:
model_cls = scvi.model.SCVI
model_cls.setup_anndata(adata, batch_key='dataset',
                        # labels_key='cell_type_tumor'
                       )

# for testing
# model_cls.setup_anndata(adata, batch_key='batch', labels_key='labels')

In [8]:
import torch
torch.cuda.is_available()

True

In [6]:
# We have seen before that smalle batch size always gives better predictions and generalizations, so we leave it at the 128 default.

search_space = {
    "model_params":{
        "n_hidden": tune.choice([ 256, 512 ]),
        "n_layers": tune.choice([3,4]),
        "gene_likelihood": 'nb',
        # 'encode_covariates':True,
        # 'deeply_inject_covariates':False,
        # 'use_layer_norm':"both",
        # 'use_batch_norm':"none",
    },
    # Doesnt make sense to change plateu vales as we dont ever get to that
    # "train_params":{
    #     'plan_kwargs':{
    #         "reduce_lr_on_plateau": tune.choice([True,False]),
    #         "lr_patience": 8,
    #         "lr_factor": tune.uniform(0.01,0.1),
    #     }
    # }
}

initial_points = [
        {
            'model_params':{
                "n_hidden": 256,
                "n_layers": 3,
                "gene_likelihood": 'nb',
            },
            #  "train_params":{
            #     'plan_kwargs':{
            #         "reduce_lr_on_plateau": True,
            #         'lr_factor': 0.1
            #     }
            # }
        }
    ]

In [7]:
# We have seen before that smalle batch size always gives better predictions and generalizations, so we leave it at the 128 default.

search_space = {
    "model_params":{
        "n_hidden": tune.choice([ 512, 1024, 2048 ]),
        "n_layers": tune.choice([5,6,7]),
        "gene_likelihood": tune.choice(['nb','zinb']),
    },
    "train_params":{
        'plan_kwargs':{
            "reduce_lr_on_plateau": tune.choice([True,False])
        }
    }
}

initial_points = [
        {
            'model_params':{
                "n_hidden": 2048,
                "n_layers": 7,
                "gene_likelihood": 'nb',
            },
             "train_params":{
                'plan_kwargs':{
                    "reduce_lr_on_plateau": True
                }
            }
        }
    ]

In [None]:
import ray
ray.init(log_to_driver=False, ignore_reinit_error=True, num_cpus=20)

scvi_tuner = autotune.run_autotune(model_cls, adata,  metrics="validation_loss", mode='min',
    search_space=search_space,
    num_samples=50,
    resources={"cpu": 20, 'gpu':0.25},
    experiment_name='test_encode2048',
    searcher_kwargs={'points_to_evaluate':initial_points},
)


0,1
Current time:,2024-08-22 12:46:39
Running for:,07:01:32.22
Memory:,647.4/1006.6 GiB

Trial name,status,loc,model_params/gene_li kelihood,model_params/n_hidde n,model_params/n_layer s,...rams/plan_kwargs/ reduce_lr_on_plateau
_trainable_4f149f2c,PENDING,,nb,2048,7,True




In [65]:
ray.shutdown()

## Model explore

In [14]:
results = scvi_tuner.result_grid

NameError: name 'scvi_tuner' is not defined

In [21]:
scvi_tuner.__dict__

{'_model_cls': scvi.model._scvi.SCVI,
 '_data': AnnData object with n_obs × n_vars = 402634 × 5989
     obs: 'sample', 'uicc_stage', 'ever_smoker', 'age', 'donor_id', 'origin', 'dataset', 'ann_fine', 'cell_type_predicted', 'doublet_status', 'leiden', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'ann_coarse', 'cell_type_tumor', 'tumor_stage', 'EGFR_mutation', 'TP53_mutation', 'ALK_mutation', 'BRAF_mutation', 'ERBB2_mutation', 'KRAS_mutation', 'ROS_mutation', 'origin_fine', 'study', 'platform', 'cell_type_major', 'cell_type_neutro', 'cell_type_neutro_coarse', 'suspension_type', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_sta

In [12]:
best_result = results.get_best_result('validation_loss', mode='min')

NameError: name 'results' is not defined

In [28]:
best_result.__dict__

{'metrics': {'validation_loss': 748.1448364257812,
  'timestamp': 1723090185,
  'checkpoint_dir_name': None,
  'done': True,
  'training_iteration': 20,
  'trial_id': '15780a2d',
  'date': '2024-08-08_04-09-45',
  'time_this_iter_s': 96.944420337677,
  'time_total_s': 2443.7222259044647,
  'pid': 11872,
  'hostname': '6ec9f87ec9e7',
  'node_ip': '172.17.0.3',
  'config': {'model_params': {'n_hidden': 1024,
    'n_layers': 5,
    'gene_likelihood': 'zinb'},
   'train_params': {'plan_kwargs': {}}},
  'time_since_restore': 2443.7222259044647,
  'iterations_since_restore': 20,
  'experiment_tag': '5_gene_likelihood=zinb,n_hidden=1024,n_layers=5'},
 'checkpoint': None,
 'error': None,
 'path': '/tmp/tmphrzogsut/scvi1024/scvi1024/_trainable_15780a2d_5_gene_likelihood=zinb,n_hidden=1024,n_layers=5_2024-08-08_03-26-25',
 'metrics_dataframe':     validation_loss   timestamp checkpoint_dir_name   done  \
 0        790.597290  1723087906                None  False   
 1        768.241699  1723088

## Tuning comparison

In [38]:
!find /tmp -type d -name 'scvi*'

/tmp/ray/session_2024-08-08_13-37-30_046242_94/artifacts/2024-08-08_13-37-46/scvi2048_2
/tmp/ray/session_2024-08-21_04-12-28_260517_83/artifacts/2024-08-21_04-12-39/scvi_lr_1024
/tmp/ray/session_2024-08-21_04-29-31_381905_100/artifacts/2024-08-21_04-29-42/scvi_lr_1024
/tmp/ray/session_2024-08-08_02-12-53_330111_1230/artifacts/2024-08-08_05-50-51/scvi2048
/tmp/ray/session_2024-08-08_02-12-53_330111_1230/artifacts/2024-08-08_03-25-39/scvi1024
/tmp/ray/session_2024-08-21_14-39-26_471113_83/artifacts/2024-08-21_14-39-32/scvi_lr_1024
/tmp/ray/session_2024-08-21_14-39-26_471113_83/artifacts/2024-08-21_23-15-17/scvi_lr_1024


In [41]:
! ls /root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner

scvi1024  scvi2048  scvi2048_2	scvi_lr_1024  test_encode  test_lr


In [40]:
!find /root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner -type d -name 'scvi*'

/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi1024
/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi_lr_1024
/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi_lr_1024/scvi_lr_1024
/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi2048
/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi2048_2


In [36]:
import glob
paths = glob.glob(f'/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi*')
paths

['/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi1024',
 '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi_lr_1024',
 '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi2048',
 '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi2048_2']

In [48]:
import glob
from ray.tune.analysis import ExperimentAnalysis

# paths = glob.glob(f'/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi*')
paths = [
    # '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/test_encode/test_encode/',
    # '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/test_lr/test_lr',
    # '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi_lr_1024/scvi_lr_1024',
    '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi2048',
    '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi1024',
    '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi2048_2',
]

for path in paths:

    # Load the experiment analysis
    analysis = ExperimentAnalysis(path)
    files = glob.glob(f'{path}/search_gen*')
    search_dict = extract_model_params_to_dict(files[0])
    
    
    best_trial = analysis.get_best_trial(metric="validation_loss", mode="min")
    
    best_config = best_trial.config
    
    # Get the best result (e.g., accuracy)
    best_result = best_trial.metric_analysis['validation_loss']['min']

    print('Path:', path)
    print("Search space:", search_dict)
    print("Best Configuration:", best_config)
    print("Best Validation:", best_result)
    print("Epochs:", best_trial.last_result['training_iteration'])
    print("Time:", best_trial.last_result['time_this_iter_s'])
    print('\n\n')

- _trainable_6ddb8648: FileNotFoundError('Could not fetch metrics for _trainable_6ddb8648: both result.json and progress.csv were not found at /root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/test_encode/test_encode/_trainable_6ddb8648_2_gene_likelihood=nb,n_hidden=256,n_layers=3,use_batch_norm=none,use_layer_norm=both_2024-08-21_23-39-11')
- _trainable_9074ac84: FileNotFoundError('Could not fetch metrics for _trainable_9074ac84: both result.json and progress.csv were not found at /root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/test_lr/test_lr/_trainable_9074ac84_10_gene_likelihood=nb,n_hidden=1024,n_layers=6,lr_factor=0.0306,lr_patience=8,reduce_lr_on_plateau=True_2024-08-21_04-02-53')
- _trainable_79143b31: FileNotFoundError('Could not fetch metrics for _trainable_79143b31: both result.json and progress.csv were not found at /root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi_lr_1024/scvi_lr_1024/_trainable_79143b31_6_deeply_inject_covariates=Fa

Attribute error: 'str' object has no attribute 'categories' - Please ensure that the objects have a 'categories' attribute.
Search space: {'n_hidden': [256, 512], 'n_layers': [3, 4]}
Best Configuration: {'model_params': {'n_hidden': 256, 'n_layers': 3, 'gene_likelihood': 'nb', 'use_layer_norm': 'both', 'use_batch_norm': 'none'}}
Best Validation: 1491.1842041015625
Epochs: 3
Time: 141.40493392944336



Search space: {'n_hidden': [512, 1024, 2048], 'n_layers': [5, 6, 7], 'gene_likelihood': ['nb', 'zinb']}
Best Configuration: {'model_params': {'n_hidden': 512, 'n_layers': 5, 'gene_likelihood': 'nb'}, 'train_params': {'plan_kwargs': {'reduce_lr_on_plateau': False, 'lr_patience': 8, 'lr_factor': 0.07004252699581417}}}
Best Validation: 331.71368408203125
Epochs: 16
Time: 0.09638857841491699



Attribute error: 'str' object has no attribute 'categories' - Please ensure that the objects have a 'categories' attribute.
Search space: {'n_hidden': [512, 1024], 'n_layers': [3, 4, 5, 6]}
Best Config

In [17]:
analysis.__dict__

{'default_metric': None,
 'default_mode': None,
 '_fs': <pyarrow._fs.LocalFileSystem at 0x7f39524b00f0>,
 '_experiment_fs_path': '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi_lr_1024/scvi_lr_1024',
 '_experiment_json_fs_path': '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi_lr_1024/scvi_lr_1024/experiment_state-2024-08-21_14-39-32.json',
 'trials': [_trainable_e331c8cc,
  _trainable_3e46ca2c,
  _trainable_9b8ab6f3,
  _trainable_26b19390,
  _trainable_d9f63e8a,
  _trainable_ba9bf592,
  _trainable_11bc74f4,
  _trainable_72c36eee,
  _trainable_479fba9a,
  _trainable_e0b8cb47,
  _trainable_3caf076c,
  _trainable_76734887,
  _trainable_e66242cf,
  _trainable_cf9f3299,
  _trainable_82eb0d03,
  _trainable_8245645b,
  _trainable_46d394fa,
  _trainable_246c5262,
  _trainable_203d339d,
  _trainable_0dd58be6,
  _trainable_cb9da8ab],
 '_trial_dataframes': {'e331c8cc':    validation_loss   timestamp checkpoint_dir_name  done  training_iteration  \
  0      19