In [1]:
# Names of the `adata.obs` columns for the real atlas data.
# NOTE(review): neither variable is referenced elsewhere in the visible notebook;
# the commented-out `setup_anndata` call further down hard-codes the same two
# strings as batch_key / labels_key — presumably these were meant to feed it.
condition_key = 'dataset'
cell_type_key = 'cell_type_tumor'

# Ray Tune

## Data & libs

In [2]:
import pickle

import ray
import scanpy as sc
import scvi
import seaborn as sns
import torch
from ray import tune
from scvi import autotune

# Fix scvi-tools' global seed so runs are repeatable, and record the library
# version this notebook was last executed with.
scvi.settings.seed = 0
print("Last run with scvi-tools version:", scvi.__version__)

# Plot defaults: square 6x6 figures without frames, seaborn's default theme.
sc.set_figure_params(figsize=(6, 6), frameon=False)
sns.set_theme()
# Select PyTorch's "high" float32 matmul precision setting
# (presumably to trade a little precision for speed on GPU — see the PyTorch docs).
torch.set_float32_matmul_precision("high")
# Directory used as scvi's logging dir; the tuning log further down shows
# experiment results being written beneath it.
# NOTE(review): absolute, machine-specific path.
save_dir = '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner'
scvi.settings.logging_dir = save_dir

%config InlineBackend.print_figure_kwargs={"facecolor": "w"}
%config InlineBackend.figure_format="retina"

Global seed set to 0


Last run with scvi-tools version: 1.1.5


In [4]:
def extract_model_params_to_dict(pickle_file_path):
    """Return the categorical choices recorded for each tuned model parameter.

    The file is unpickled and the search space is read from
    ``data['experiment'].spec['config']['model_params']``. Every entry of that
    mapping that exposes a ``categories`` attribute is copied into the result
    as ``{parameter_name: categories}``.

    Extraction is best-effort and never raises for an unexpected layout:

    * if the expected keys/attributes leading to ``model_params`` are missing,
      a message is printed and an empty dict is returned;
    * if an individual parameter has no ``categories`` attribute, a message is
      printed and only that parameter is skipped, so the remaining parameters
      are still extracted regardless of their order in the mapping.

    Args:
        pickle_file_path: Path of the pickled state file to read.

    Returns:
        dict mapping parameter name to its ``categories`` value.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point this at state files produced by your own tuning runs.
    with open(pickle_file_path, 'rb') as f:
        data = pickle.load(f)

    model_params_dict = {}

    # Navigate through the object structure to get model_params
    try:
        model_params = data['experiment'].spec['config']['model_params']
        param_items = list(model_params.items())
    except KeyError as e:
        print(f"Key error: {e} - Please check if the structure of the pickled object is correct.")
        return model_params_dict
    except AttributeError as e:
        print(f"Attribute error: {e} - Please check if the structure of the pickled object is correct.")
        return model_params_dict

    # Handle each parameter on its own so that one entry without `categories`
    # does not discard the parameters that follow it.
    for key, value in param_items:
        try:
            model_params_dict[key] = value.categories
        except AttributeError as e:
            print(f"Attribute error: {e} - parameter {key!r} has no 'categories' attribute and was skipped.")

    return model_params_dict

# Example usage: read back the model-parameter search space stored in the
# search-generator state file of the `scvi1024` run.
# Note: the file carries a `.json` suffix but is read with `pickle.load`
# inside `extract_model_params_to_dict`.
model_params_dict = extract_model_params_to_dict('/tmp/ray/session_2024-08-08_02-12-53_330111_1230/artifacts/2024-08-08_03-25-39/scvi1024/driver_artifacts/search_gen_state-2024-08-08_03-25-39.json')
model_params_dict

{'n_hidden': [256, 512, 1024],
 'n_layers': [3, 4, 5, 6],
 'gene_likelihood': ['nb', 'zinb']}

In [146]:
# Load the extended atlas AnnData from disk.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so in the
# recorded session this load did not complete.
adata = sc.read_h5ad('/root/datos/maestria/netopaas/luca/data/atlas/extended.h5ad')

KeyboardInterrupt: 

In [None]:
adata

In [None]:
# UICC stages to keep.
# stages = ['III','III or IV', 'IV']
stages = ['I', 'II', 'III', 'III or IV', 'IV']

# Studies to drop: some are enriched for endothelial cells and some for
# immune cells.
studies = ['Goveia_Carmeliet_2020', 'Leader_Merad_2021', 'Guo_Zhang_2018']

# Cell-level filter: primary-tumor origin, stage in `stages`, study not in
# `studies`. The three criteria are row-wise, so they are applied as one mask.
adata = adata[
    (adata.obs.origin == 'tumor_primary')
    & adata.obs.uicc_stage.isin(stages)
    & ~adata.obs.study.isin(studies)
]

# Gene-level filter: keep genes flagged as highly variable.
# NOTE(review): the flag is compared with the *string* 'True' — presumably the
# column is stored as text/categorical; confirm against the h5ad.
adata = adata[:, adata.var.is_highly_variable == 'True']

In [5]:
from scvi.data import synthetic_iid

# Rebind `adata` to scvi's synthetic dataset (400 cells x 100 genes per the
# output below), replacing whatever `adata` held before.
# NOTE(review): presumably a quick stand-in for the atlas while testing the
# tuning pipeline — confirm before running on real data.
adata = synthetic_iid()
adata

AnnData object with n_obs × n_vars = 400 × 100
    obs: 'batch', 'labels'
    uns: 'protein_names'
    obsm: 'protein_expression', 'accessibility'

In [14]:
# Rebind `adata` to an independent copy of itself.
# NOTE(review): recorded as In [14], i.e. executed out of order relative to
# the neighbouring cells (In [5], In [6]); check it is still needed on a
# fresh Restart & Run All.
adata = adata.copy()

## Model and tune

In [6]:
# Model class to tune.
model_cls = scvi.model.SCVI
# Registration for the real atlas data (kept for reference):
# model_cls.setup_anndata(adata, batch_key='dataset', labels_key='cell_type_tumor')
# Registration for the synthetic dataset, whose obs columns are 'batch' and 'labels'.
model_cls.setup_anndata(adata, batch_key='batch', labels_key='labels')

In [10]:
# Hyperparameter search space: architecture / likelihood choices for the
# model, plus whether the training plan reduces the learning rate on plateau.
search_space = {
    'model_params': {
        'n_hidden': tune.choice([512, 1024, 2048]),
        'n_layers': tune.choice([5, 6, 7]),
        'gene_likelihood': tune.choice(['nb', 'zinb']),
    },
    'train_params': {
        'plan_kwargs': {
            'reduce_lr_on_plateau': tune.choice([True, False]),
        },
    },
}

# Configuration(s) handed to the searcher as `points_to_evaluate` in the
# tuning cell: the largest n_hidden / n_layers of the space, 'nb' likelihood,
# with learning-rate reduction on plateau enabled.
initial_points = [
    {
        'model_params': {
            'n_hidden': 2048,
            'n_layers': 7,
            'gene_likelihood': 'nb',
        },
        'train_params': {
            'plan_kwargs': {'reduce_lr_on_plateau': True},
        },
    },
]

In [18]:
import ray

# Start (or reuse) a local Ray runtime capped at 10 CPUs; worker logs are not
# forwarded to the notebook.
ray.init(
    num_cpus=10,
    log_to_driver=False,
    ignore_reinit_error=True,
)

# Tune `model_cls` on `adata`, minimising the validation loss over 50 sampled
# configurations. Each trial requests 2 CPUs and a quarter of a GPU, and the
# searcher is seeded with `initial_points`.
scvi_tuner = autotune.run_autotune(
    model_cls,
    adata,
    metrics='validation_loss',
    mode='min',
    search_space=search_space,
    num_samples=50,
    resources={'cpu': 2, 'gpu': 0.25},
    experiment_name='test_lr',
    searcher_kwargs={'points_to_evaluate': initial_points},
)




0,1
Current time:,2024-08-21 03:53:33
Running for:,00:00:59.51
Memory:,522.6/1006.6 GiB

Trial name,status,loc,model_params/gene_li kelihood,model_params/n_hidde n,model_params/n_layer s,...rams/plan_kwargs/ reduce_lr_on_plateau,iter,total time (s),validation_loss
_trainable_69079fee,PENDING,,zinb,2048,7,False,,,
_trainable_ee9fcca4,TERMINATED,172.17.0.2:18914,nb,2048,7,True,100.0,17.6035,332.432
_trainable_8e5bf1a6,TERMINATED,172.17.0.2:19094,nb,512,6,True,1.0,3.16776,333.064
_trainable_2f76f84b,TERMINATED,172.17.0.2:19265,zinb,2048,7,False,100.0,17.5648,304.047
_trainable_bcfd18a9,TERMINATED,172.17.0.2:19436,nb,1024,6,True,1.0,3.28943,332.809
_trainable_6869e8c0,TERMINATED,172.17.0.2:19577,nb,1024,5,False,1.0,3.39749,332.907


2024-08-21 03:53:33,654	INFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/test_lr/test_lr' in 0.0237s.
2024-08-21 03:53:36,148	INFO tune.py:1039 -- Total run time: 62.03 seconds (59.48 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/test_lr/test_lr", trainable=...)
- _trainable_69079fee: FileNotFoundError('Could not fetch metrics for _trainable_69079fee: both result.json and progress.csv were not found at /root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/test_lr/test_lr/_trainable_69079fee_6_gene_likelihood=zinb,n_hidden=2048,n_layers=7,reduce_lr_on_plateau=False_2024-08-21_03-53-26')


In [14]:
# Shut down the Ray runtime started for tuning.
ray.shutdown()

## Explore tuning results

In [26]:
scvi_tuner.result_grid

ResultGrid<[
  Result(
    metrics={'validation_loss': 754.237548828125},
    path='/tmp/tmphrzogsut/scvi1024/scvi1024/_trainable_b87f55fe_1_gene_likelihood=nb,n_hidden=256,n_layers=4_2024-08-08_03-25-39',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    metrics={'validation_loss': 790.5972900390625},
    path='/tmp/tmphrzogsut/scvi1024/scvi1024/_trainable_899e4360_2_gene_likelihood=zinb,n_hidden=1024,n_layers=5_2024-08-08_03-25-51',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    metrics={'validation_loss': 750.9778442382812},
    path='/tmp/tmphrzogsut/scvi1024/scvi1024/_trainable_0a38590b_3_gene_likelihood=nb,n_hidden=512,n_layers=5_2024-08-08_03-26-02',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    metrics={'validation_loss': 772.2880859375},
    path='/tmp/tmphrzogsut/scvi1024/scvi1024/_trainable_af8dd8f4_4_gene_likelihood=nb,n_hidden=512,n_layers=4_2024-08-08_03-26-14',
    filesystem='local',
    checkpoint=None
  ),
  Result(
    me

In [21]:
scvi_tuner.__dict__

{'_model_cls': scvi.model._scvi.SCVI,
 '_data': AnnData object with n_obs × n_vars = 402634 × 5989
     obs: 'sample', 'uicc_stage', 'ever_smoker', 'age', 'donor_id', 'origin', 'dataset', 'ann_fine', 'cell_type_predicted', 'doublet_status', 'leiden', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'ann_coarse', 'cell_type_tumor', 'tumor_stage', 'EGFR_mutation', 'TP53_mutation', 'ALK_mutation', 'BRAF_mutation', 'ERBB2_mutation', 'KRAS_mutation', 'ROS_mutation', 'origin_fine', 'study', 'platform', 'cell_type_major', 'cell_type_neutro', 'cell_type_neutro_coarse', 'suspension_type', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_sta

In [27]:
# Pick the trial with the lowest validation loss from the tuner's result grid.
# (`results` was never defined in this notebook, so the original line raised
# NameError on a fresh kernel; the grid lives on `scvi_tuner.result_grid`.)
best_result = scvi_tuner.result_grid.get_best_result(metric='validation_loss', mode='min')

In [28]:
best_result.__dict__

{'metrics': {'validation_loss': 748.1448364257812,
  'timestamp': 1723090185,
  'checkpoint_dir_name': None,
  'done': True,
  'training_iteration': 20,
  'trial_id': '15780a2d',
  'date': '2024-08-08_04-09-45',
  'time_this_iter_s': 96.944420337677,
  'time_total_s': 2443.7222259044647,
  'pid': 11872,
  'hostname': '6ec9f87ec9e7',
  'node_ip': '172.17.0.3',
  'config': {'model_params': {'n_hidden': 1024,
    'n_layers': 5,
    'gene_likelihood': 'zinb'},
   'train_params': {'plan_kwargs': {}}},
  'time_since_restore': 2443.7222259044647,
  'iterations_since_restore': 20,
  'experiment_tag': '5_gene_likelihood=zinb,n_hidden=1024,n_layers=5'},
 'checkpoint': None,
 'error': None,
 'path': '/tmp/tmphrzogsut/scvi1024/scvi1024/_trainable_15780a2d_5_gene_likelihood=zinb,n_hidden=1024,n_layers=5_2024-08-08_03-26-25',
 'metrics_dataframe':     validation_loss   timestamp checkpoint_dir_name   done  \
 0        790.597290  1723087906                None  False   
 1        768.241699  1723088

## Tuning comparison

In [73]:
!find /tmp -type d -name 'scvi*'

/tmp/ray/session_2024-08-08_13-37-30_046242_94/artifacts/2024-08-08_13-37-46/scvi2048_2
/tmp/ray/session_2024-08-08_02-12-53_330111_1230/artifacts/2024-08-08_05-50-51/scvi2048
/tmp/ray/session_2024-08-08_02-12-53_330111_1230/artifacts/2024-08-08_03-25-39/scvi1024


In [129]:
import pickle
# Searcher state saved among the driver artifacts of the `scvi1024` run.
file_path = '/tmp/ray/session_2024-08-08_02-12-53_330111_1230/artifacts/2024-08-08_03-25-39/scvi1024/driver_artifacts/searcher-state-2024-08-08_03-25-39.pkl'


# SECURITY: pickle.load can execute arbitrary code; only open state files
# produced by your own runs.
with open(file_path, 'rb') as f:
    data = pickle.load(f)

# Inspect one search-space entry. As the output shows, the searcher stores a
# hyperopt expression object here rather than a plain list of choices.
data['_space']['model_params']['n_hidden']

<hyperopt.pyll.base.Apply at 0x7f4fe87e6b90>

In [143]:
import glob
from ray.tune.analysis import ExperimentAnalysis

# Experiment directories of the tuning runs to compare.
paths = [
    '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi1024',
    '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi2048',
    '/root/datos/maestria/netopaas/lung_scRNA/LUCA_model/RayTuner/scvi2048_2',
]

for path in paths:

    # Load the experiment analysis
    analysis = ExperimentAnalysis(path)

    # Recover the search space from the search-generator state file.
    # sorted() makes the pick deterministic when several files match, and the
    # empty case is reported instead of failing with IndexError.
    files = sorted(glob.glob(f'{path}/search_gen*'))
    if files:
        search_dict = extract_model_params_to_dict(files[0])
    else:
        search_dict = {}
        print(f"No search_gen* state file found in {path}; search space unknown.")

    best_trial = analysis.get_best_trial(metric="validation_loss", mode="min")
    if best_trial is None:
        # No trial reported the metric; nothing to summarise for this run.
        print(f"No trial with a validation_loss found in {path}; skipping.")
        print('\n\n')
        continue

    best_config = best_trial.config

    # Lowest validation loss the best trial reached. Stored under its own name
    # so the `best_result` object from the exploration section is not
    # overwritten with a float.
    best_loss = best_trial.metric_analysis['validation_loss']['min']

    print("Search space:", search_dict)
    print("Best Configuration:", best_config)
    # The metric is a loss (lower is better), not an accuracy.
    print("Best validation loss:", best_loss)
    print("Epochs:", best_trial.last_result['training_iteration'])
    # NOTE(review): `time_this_iter_s` is the duration of the last reported
    # iteration, not the whole trial (`time_total_s`) — confirm which is wanted.
    print("Time:", best_trial.last_result['time_this_iter_s'])
    print('\n\n')

Search space: {'n_hidden': [256, 512, 1024], 'n_layers': [3, 4, 5, 6], 'gene_likelihood': ['nb', 'zinb']}
Best Configuration: {'model_params': {'n_hidden': 1024, 'n_layers': 5, 'gene_likelihood': 'zinb'}, 'train_params': {'plan_kwargs': {}}}
Best Accuracy: 748.1448364257812
Epochs: 20
Time: 96.944420337677



Search space: {'n_hidden': [512, 1024, 2048], 'n_layers': [5, 6, 7], 'gene_likelihood': ['nb', 'zinb']}
Best Configuration: {'model_params': {'n_hidden': 1024, 'n_layers': 5, 'gene_likelihood': 'nb'}, 'train_params': {'plan_kwargs': {}}}
Best Accuracy: 749.8037109375
Epochs: 20
Time: 84.13480496406555



Search space: {'n_hidden': [512, 1024, 2048], 'n_layers': [5, 6, 7], 'gene_likelihood': ['nb', 'zinb']}
Best Configuration: {'model_params': {'n_hidden': 1024, 'n_layers': 6, 'gene_likelihood': 'nb'}}
Best Accuracy: 749.8407592773438
Epochs: 20
Time: 104.96014261245728



