## Experiment 1: base network selection

This experiment studies different options for the base GRN provided to CellOracle.

In [1]:
# Identifier for this experiment; later cells use it to name the results folder.
EXPERIMENT_NAME = "baseNetwork_v1"

In [9]:
import warnings
warnings.filterwarnings('ignore')
import importlib
import os
import gc
import sys
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import celloracle as co

#      visualization settings
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

In [10]:
# Deal with various file paths specific to this project.
# NOTE: PROJECT_PATH is an absolute, machine-specific path; edit it when
# running this notebook on another machine.
PROJECT_PATH = '/home/ekernf01/Desktop/jhu/research/projects/perturbation_prediction/cell_type_knowledge_transfer/'
os.chdir(PROJECT_PATH + "benchmarking/")

# Create the results folder for this experiment. exist_ok=True tolerates a
# folder left over from a previous run, while genuine failures (permissions,
# bad path) still raise instead of being silently swallowed.
os.makedirs("results/" + EXPERIMENT_NAME, exist_ok=True)

# Make the project-local helper modules importable. These imports must stay
# below the sys.path additions, so they cannot live in the top import cell.
sys.path.append(os.path.expanduser(PROJECT_PATH + 'networks/load_networks'))
sys.path.append(os.path.expanduser(PROJECT_PATH + 'perturbations/load_perturbations'))
sys.path.append(os.path.expanduser(PROJECT_PATH + 'benchmarking/evaluator'))
import evaluator
import load_networks
import load_perturbations

# Reload so that edits made to the helper modules are picked up without
# restarting the kernel.
importlib.reload(evaluator)
importlib.reload(load_networks)
importlib.reload(load_perturbations)

# Data locations are exported as environment variables; a later cell in this
# notebook reads PERTURBATION_PATH to locate the expression data.
os.environ["GRN_PATH"]           = PROJECT_PATH + "networks/networks"
os.environ["PERTURBATION_PATH"]  = PROJECT_PATH + "perturbations/perturbations"

Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2


In [17]:
# Sanity check: a random network with density 1 over three named TFs.
X = evaluator.makeRandomNetwork(TFs=["foo", "bar", "baz"], density=1)
# Undo the sparse representation for every column from position 2 onward.
X.iloc[:, 2:] = np.array(X.iloc[:, 2:])
# For each row, list the column names whose entry is positive.
dict(
    X.iloc[:, 2:].apply(
        lambda row: row[row > 0].index.values,
        axis=1,
    )
)

{0: array(['foo', 'bar', 'baz'], dtype=object),
 1: array(['foo', 'bar', 'baz'], dtype=object),
 2: array(['foo', 'bar', 'baz'], dtype=object),
 3: array(['foo', 'bar', 'baz'], dtype=object),
 4: array(['foo', 'bar', 'baz'], dtype=object),
 5: array(['foo', 'bar', 'baz'], dtype=object),
 6: array(['foo', 'bar', 'baz'], dtype=object),
 7: array(['foo', 'bar', 'baz'], dtype=object),
 8: array(['foo', 'bar', 'baz'], dtype=object),
 9: array(['foo', 'bar', 'baz'], dtype=object),
 10: array(['foo', 'bar', 'baz'], dtype=object),
 11: array(['foo', 'bar', 'baz'], dtype=object),
 12: array(['foo', 'bar', 'baz'], dtype=object),
 13: array(['foo', 'bar', 'baz'], dtype=object),
 14: array(['foo', 'bar', 'baz'], dtype=object),
 15: array(['foo', 'bar', 'baz'], dtype=object),
 16: array(['foo', 'bar', 'baz'], dtype=object),
 17: array(['foo', 'bar', 'baz'], dtype=object),
 18: array(['foo', 'bar', 'baz'], dtype=object),
 19: array(['foo', 'bar', 'baz'], dtype=object),
 20: array(['foo', 'bar', 'baz

### Networks setup

This experiment aims to test a variety of published sparse regulatory network structures. 

In [4]:
# Baseline networks, keyed by name: three random networks of decreasing
# density, followed by CellOracle's prebuilt human promoter base GRN.
# Construction order matches the original dict literal.
networks = {
    name: evaluator.makeRandomNetwork(density=density)
    for name, density in [
        ('dense', 1),
        ('random0.1', 0.1),
        ('random0.01', 0.01),
    ]
}
networks['cellOracle'] = co.data.load_human_promoter_base_GRN()
gc.collect()

Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2


0

In [5]:
# Size of each network object as reported by sys.getsizeof, divided by 1e6.
pd.DataFrame(
    {name: sys.getsizeof(net) / 1e6 for name, net in networks.items()},
    index=["memory"],
)

Unnamed: 0,dense,random0.1,random0.01,cellOracle
memory,2.636366,125.36657,14.906018,329.156543


In [None]:
# Load each published network by name, skipping any that are already in
# `networks`, and run the garbage collector after every iteration.
for network in (
    # Networks used in GWAS-related projects
    'gtex_rna',
    'magnum_compendium_32',
    'magnum_compendium_394',
    'magnum_compendium_ppi',
    'humanbase',
    # Networks used in reprogramming-related and differentiation-related projects
    'cellnet_human_Hg1332',
    'cellnet_human_Hugene',
    'MARA_FANTOM4',
    'STRING',
    'ANANSE_0.5',
    'ANANSE_tissue_0.5',
):
    print(f"Loading {network}")
    if network not in networks:
        networks[network] = evaluator.networkEdgesToMatrix(
            load_networks.load_grn_all_subnetworks(network)
        )
    gc.collect()

# One more network used in a reprogramming-related project: the row-wise
# concatenation of the MARA_FANTOM4 and STRING entries.
networks["mogrify"] = pd.concat([networks['MARA_FANTOM4'], networks['STRING']])

# One row per network: its name and the edge count from evaluator.countMatrixEdges.
network_sizes = (
    pd.DataFrame(
        {name: evaluator.countMatrixEdges(net) for name, net in networks.items()},
        index=["numEdges"],
    )
    .T
    .reset_index()
    .rename(columns={"index": "network"})
)
network_sizes

Loading gtex_rna
Loading magnum_compendium_32
Loading magnum_compendium_394
Loading magnum_compendium_ppi
Loading humanbase
Loading cellnet_human_Hg1332
Loading cellnet_human_Hugene
Loading MARA_FANTOM4
Loading STRING
Loading ANANSE
Loading ANANSE_tissue


### Memory consumption

This experiment has recently been somewhat problematic in terms of memory consumption, so we briefly check the size of each network here.

In [None]:
# Size of every network now in `networks`, as reported by sys.getsizeof,
# divided by 1e6.
pd.DataFrame(
    {name: sys.getsizeof(net) / 1e6 for name, net in networks.items()},
    index=["memory"],
)

### Data setup

We use the Nakatake et al. data. This experiment is on per-cluster versus shared regression models, so we run Leiden clustering at many different resolutions.

In [None]:
# Read the expression data from the "nakatake" folder under the directory
# named by the PERTURBATION_PATH environment variable.
ko_lab_esc_data = sc.read_h5ad(
    os.environ["PERTURBATION_PATH"] + "/nakatake/" + "test.h5ad"
)

In [None]:
# Show which per-observation metadata columns the loaded data provides.
ko_lab_esc_data.obs.columns

In [None]:
# Keep only the regulators that appear as a column in every network.
allowedRegulators = set.intersection(
    *(set(net.columns) for net in networks.values())
)
# Split into training and held-out sets, plus the perturbations to predict.
(
    ko_lab_esc_data_train,
    ko_lab_esc_data_heldout,
    perturbationsToPredict,
) = evaluator.splitData(ko_lab_esc_data, allowedRegulators, minTestSetSize=250)

### Experimental metadata

In [None]:
# One experiment per network. "p" and "pruning" are held constant, and
# "threshold_number" is set to the largest edge count found in network_sizes.
n_networks = len(networks)
experiments = pd.DataFrame(
    {
        "network": list(networks),
        "p": [1] * n_networks,
        "threshold_number": [int(network_sizes['numEdges'].max())] * n_networks,
        "pruning": ["none"] * n_networks,
    }
)
experiments["index"] = experiments.index
# Save the experiment table alongside the results.
experiments.to_csv(f"results/{EXPERIMENT_NAME}/networkExperiments.csv")
experiments

In [None]:
# Name of the `.obs` column holding cluster labels for the training data.
# FIXME(review): the original call read `clusterColumnName = ,` with no value,
# which is a SyntaxError. "leiden" is an assumption (scanpy's default key for
# Leiden clustering, which the Data setup text mentions); confirm the intended
# column before trusting results.
CLUSTER_COLUMN_NAME = "leiden"
if CLUSTER_COLUMN_NAME not in ko_lab_esc_data_train.obs.columns:
    raise KeyError(
        "Cluster column '" + CLUSTER_COLUMN_NAME + "' not found in "
        "ko_lab_esc_data_train.obs; set CLUSTER_COLUMN_NAME to an existing column. "
        "Available columns: " + ", ".join(map(str, ko_lab_esc_data_train.obs.columns))
    )

# Train one model per row of `experiments` (i.e. per base network) and predict
# the held-out perturbations. Results are keyed by the experiment index, and
# each run gets its own memoization name under results/<EXPERIMENT_NAME>/.
predictions = {
    i: evaluator.trainCausalModelAndPredict(
        expression=ko_lab_esc_data_train,
        baseNetwork=networks[experiments.loc[i, 'network']],
        memoizationName="results/" + EXPERIMENT_NAME + "/" + str(i) + ".celloracle.oracle",
        perturbations=perturbationsToPredict,
        clusterColumnName=CLUSTER_COLUMN_NAME,
        pruningParameters={
            "p": experiments.loc[i, 'p'],
            "threshold_number": experiments.loc[i, 'threshold_number'],
        },
    )
    for i in experiments.index
}


In [None]:
# Inspect the predictions for the first experiment (index 0) as a quick check.
predictions[0]

### Evaluation

We compute the correlation of the predictions with held-out perturbations.

In [None]:
# Boolean mask selecting the training samples labelled "Control".
controlIndex = ko_lab_esc_data_train.obs["perturbation"] == "Control"

# Evaluate every experiment's predictions against the held-out data. The
# baseline is the mean over rows (axis 0) of the control samples' .X.
# NOTE(review): `cellFateClassifier` is not defined in any visible cell of this
# notebook; confirm where it comes from, otherwise this raises a NameError on a
# fresh kernel.
evaluator.evaluateCausalModel(
    heldout=ko_lab_esc_data_heldout,
    predictions=predictions,
    baseline=ko_lab_esc_data_train[controlIndex, :].X.mean(axis=0),
    classifier=cellFateClassifier,
    experiments=experiments,
    factor_varied="network",
    experiment_name=EXPERIMENT_NAME,
)