In [None]:
# Load modules

from inferelator import utils
from inferelator.distributed.inferelator_mp import MPControl

from inferelator import single_cell_cv_workflow
from inferelator import workflow
from inferelator.preprocessing import single_cell
from inferelator.preprocessing.tfa import NoTFA
from inferelator.postprocessing.results_processor_mtl import ResultsProcessorMultiTask

# Set the inferelator debug verbosity to level 1 ("Talky")
# This is a process-wide setting made once, before any workflow is built
utils.Debug.set_verbose_level(1)

In [None]:
# Locations: where the input data is read from and where results are written
# NOTE(review): set_up_workflow() appends 'jackson_2019' to the output path again,
# so results land in a nested jackson_2019/jackson_2019 folder - confirm this is intended

DATA_DIR = "../data/yeast"
OUTPUT_DIR = "~/jackson_2019/"

# Input file names handed to the workflow by set_up_workflow()
EXPRESSION_FILE_NAME = "101718_SS_Subset_Data.tsv.gz"
GENE_METADATA_FILE_NAME = "orfs.tsv"
TF_LIST_FILE_NAME = "tf_names_restrict.tsv"

In [None]:
# Start Multiprocessing Engine
# Two engines are configured here; by default only the local (single computer) one is enabled.
# Setting up a cluster is left as an exercise to the reader.

# Dask cluster settings
# Dask requires a slurm controller in an HPC environment.
n_cores_dask = 200
activate_path = '~/.local/anaconda3/bin/activate'
dask_engine = False

# Local multiprocessing settings
n_cores_local = 3
local_engine = True

# Checking that __name__ is __main__ protects against runaway multiprocessing
if __name__ == '__main__':

    if dask_engine:
        MPControl.set_multiprocess_engine("dask-cluster")
        MPControl.client.minimum_cores = n_cores_dask
        MPControl.client.maximum_cores = n_cores_dask
        MPControl.client.walltime = '48:00:00'

        # The conda or venv activate script is necessary to set the worker environment
        # These lines do NOT set the environment for the current process, only for workers
        for env_line in ('module load slurm',
                         'module load gcc/8.3.0',
                         'source ' + activate_path):
            MPControl.client.add_worker_env_line(env_line)

        MPControl.client.cluster_controller_options.append("-p ccb")
        MPControl.connect()

    # Multiprocessing uses the pathos implementation of multiprocessing (with dill instead of cPickle)
    # This is suited for a single computer, but will likely be too slow for the example here
    if local_engine:
        MPControl.set_multiprocess_engine("multiprocessing")
        MPControl.client.processes = n_cores_local
        MPControl.connect()

In [None]:
# Define the general run parameters used for all figures

def set_up_workflow(wkf):
    """
    Apply the run parameters shared by all figures to a workflow object.

    :param wkf: Workflow object to configure; it is modified in place
    :return: The same workflow object that was passed in
    """

    # Input and output locations; results go under a 'jackson_2019' subpath of OUTPUT_DIR
    wkf.input_dir = DATA_DIR
    wkf.output_dir = OUTPUT_DIR
    wkf.append_to_path('output_dir', 'jackson_2019')

    # Everything else is a plain attribute assignment, applied in the order listed
    shared_settings = (
        ('expression_matrix_file', EXPRESSION_FILE_NAME),
        ('gene_metadata_file', GENE_METADATA_FILE_NAME),
        ('tf_names_file', TF_LIST_FILE_NAME),
        ('expression_matrix_columns_are_genes', True),
        ('extract_metadata_from_expression_matrix', True),
        ('split_gold_standard_for_crossvalidation', True),
        ('cv_split_ratio', 0.2),
        ('num_bootstraps', 5),
    )
    for attribute, value in shared_settings:
        setattr(wkf, attribute, value)

    # Register single_cell.log2_data as a preprocessing step
    wkf.add_preprocess_step(single_cell.log2_data)

    return wkf


def set_up_fig5a():
    """
    Build the size-sampling workflow shared by the Figure 5A runs.

    A SingleCellSizeSampling workflow is passed through set_up_workflow() and then
    given ten seeds (42 through 51) and a single sampling size of 1, drawn without
    replacement.

    :return: The configured workflow object
    """
    sampler = single_cell_cv_workflow.SingleCellSizeSampling()
    sampler = set_up_workflow(sampler)

    sampler.random_seed = 1
    sampler.seeds = [seed for seed in range(42, 52)]
    sampler.sizes = [1]
    sampler.sample_with_replacement = False

    return sampler


def set_up_fig5b():
    """
    Build the size-sampling workflow shared by the Figure 5B runs.

    A SingleCellSizeSampling workflow is passed through set_up_workflow() and then
    given ten seeds (42 through 51) and eight sampling sizes from 0.01 up to 1,
    drawn without replacement.

    :return: The configured workflow object
    """
    sampler = single_cell_cv_workflow.SingleCellSizeSampling()
    sampler = set_up_workflow(sampler)

    sampler.random_seed = 1
    sampler.seeds = [seed for seed in range(42, 52)]
    sampler.sizes = [
        0.01, 0.025, 0.05, 0.1,
        0.25, 0.5, 0.75, 1,
    ]
    sampler.sample_with_replacement = False

    return sampler

In [None]:
# Figure 5A: Shuffled Priors
# Figure 5A settings with shuffle_prior_axis set to 0; output goes under 'figure_5a_shuffled'
shuffled_wkf = set_up_fig5a()
shuffled_wkf.append_to_path('output_dir', 'figure_5a_shuffled')
shuffled_wkf.shuffle_prior_axis = 0
shuffled_wkf.run()

# Release the workflow once the run is finished
del shuffled_wkf

In [None]:
# Figure 5A: Random Data
# Figure 5A settings, reading '110518_SS_NEG_Data.tsv.gz' instead of the default expression file
neg_data_wkf = set_up_fig5a()
neg_data_wkf.append_to_path('output_dir', 'figure_5a_neg_data')
neg_data_wkf.expression_matrix_file = '110518_SS_NEG_Data.tsv.gz'
neg_data_wkf.run()

# Release the workflow once the run is finished
del neg_data_wkf

In [None]:
# Figure 5A: No Imputation
# Figure 5A settings unchanged apart from the 'figure_5a_no_impute' output subpath
no_impute_wkf = set_up_fig5a()
no_impute_wkf.append_to_path('output_dir', 'figure_5a_no_impute')
no_impute_wkf.run()

# Release the workflow once the run is finished
del no_impute_wkf

In [None]:
# Figure 5A: MAGIC
# Figure 5A settings, reading 'MAGIC_DATA.tsv.gz' as the expression file
magic_wkf = set_up_fig5a()
magic_wkf.append_to_path('output_dir', 'figure_5a_magic')
magic_wkf.expression_matrix_file = 'MAGIC_DATA.tsv.gz'

# Replace the preprocessing workflow with an empty list
# NOTE(review): presumably this drops the log2 step registered in set_up_workflow() - confirm
magic_wkf.preprocessing_workflow = []
magic_wkf.run()

# Release the workflow once the run is finished
del magic_wkf

In [None]:
# Figure 5A: scImpute
# Figure 5A settings, reading 'SCIMPUTE_DATA.tsv.gz' as the expression file
scimpute_wkf = set_up_fig5a()
scimpute_wkf.append_to_path('output_dir', 'figure_5a_scImpute')
scimpute_wkf.expression_matrix_file = 'SCIMPUTE_DATA.tsv.gz'
scimpute_wkf.run()

# Release the workflow once the run is finished
del scimpute_wkf

In [None]:
# Figure 5A: VIPER
# Figure 5A settings, reading 'VIPER_DATA.tsv.gz' as the expression file
viper_wkf = set_up_fig5a()
viper_wkf.append_to_path('output_dir', 'figure_5a_VIPER')
viper_wkf.expression_matrix_file = 'VIPER_DATA.tsv.gz'

# Unlike the other Figure 5A runs, the value returned by run() is kept
fig5a_viper = viper_wkf.run()

# Release the workflow once the run is finished
del viper_wkf

In [None]:
# Figure 5B: ATAC-Seq prior
# Figure 5B settings with 'yeast-motif-prior.tsv' as priors and 'gold_standard.tsv' as gold standard
atac_wkf = set_up_fig5b()
atac_wkf.append_to_path('output_dir', 'figure_5b_atac')
atac_wkf.priors_file = "yeast-motif-prior.tsv"
atac_wkf.gold_standard_file = "gold_standard.tsv"
atac_wkf.run()

# Release the workflow once the run is finished
del atac_wkf

In [None]:
# Figure 5B: Bussemaker
# Figure 5B settings with 'Bussemaker_pSAM_priors.tsv' as priors and 'gold_standard.tsv' as gold standard
bussemaker_wkf = set_up_fig5b()
bussemaker_wkf.append_to_path('output_dir', 'figure_5b_bussemaker')
bussemaker_wkf.priors_file = "Bussemaker_pSAM_priors.tsv"
bussemaker_wkf.gold_standard_file = "gold_standard.tsv"
bussemaker_wkf.run()

# Release the workflow once the run is finished
del bussemaker_wkf

In [None]:
# Figure 5B: No Priors
# Figure 5B settings with the workflow's tfa_driver replaced by NoTFA
# NoTFA is imported with the other modules at the top of the notebook
worker = set_up_fig5b()
worker.append_to_path('output_dir', 'figure_5b_no_priors')
worker.tfa_driver = NoTFA
worker.run()

# Release the workflow once the run is finished
del worker

In [None]:
# Figure 5B: Gold Standard
# Figure 5B settings unchanged apart from the 'figure_5b_gold_standard_cv' output subpath
# (no priors or gold standard file is set here, so the workflow's own values are used)
gold_standard_wkf = set_up_fig5b()
gold_standard_wkf.append_to_path('output_dir', 'figure_5b_gold_standard_cv')
gold_standard_wkf.run()

# Release the workflow once the run is finished
del gold_standard_wkf

In [None]:
# Figure 5B: YEASTRACT
# Figure 5B settings with 'YEASTRACT_Both_20181118.tsv' as priors and 'gold_standard.tsv' as gold standard
yeastract_wkf = set_up_fig5b()
yeastract_wkf.append_to_path('output_dir', 'figure_5b_yeastract')
yeastract_wkf.priors_file = "YEASTRACT_Both_20181118.tsv"
yeastract_wkf.gold_standard_file = "gold_standard.tsv"
yeastract_wkf.run()

# Release the workflow once the run is finished
del yeastract_wkf

In [None]:
# Figure 5C: Condition Specific
# A SingleCellDropoutConditionSampling workflow with the shared run parameters applied;
# output goes under 'figure_5c_conditions'
worker = set_up_workflow(single_cell_cv_workflow.SingleCellDropoutConditionSampling())
worker.append_to_path('output_dir', 'figure_5c_conditions')

# Sampling settings: batches sampled to size 500, using the "Condition" column, seeds 42 through 51
worker.sample_batches_to_size = 500
worker.drop_column = "Condition"
worker.model_dropouts = False
worker.seeds = list(range(42, 52))
worker.run()

# Release the workflow once the run is finished, as every other run cell does;
# without this the workflow object stays bound after the cell completes
del worker

In [None]:
# Figure 5D: Single Task Learning
# Figure 5A settings with 'YEASTRACT_Both_20181118.tsv' as priors and seeds 52 through 61
worker = set_up_fig5a()
worker.append_to_path('output_dir', 'figure_5d_stl')
worker.priors_file = "YEASTRACT_Both_20181118.tsv"
worker.seeds = [seed for seed in range(52, 62)]
worker.run()

# Release the workflow once the run is finished
del worker

In [None]:
# Figure 5D: Multi Task Learning
# Figure 5A settings with 'YEASTRACT_Both_20181118.tsv' as priors; output goes under 'figure_5d_mtl'
mtl_wkf = set_up_fig5a()
mtl_wkf.append_to_path('output_dir', 'figure_5d_mtl')
mtl_wkf.priors_file = "YEASTRACT_Both_20181118.tsv"

# Use "amusr" for both the workflow and the regression, with the multi-task result processor
mtl_wkf.cv_workflow_type = "amusr"
mtl_wkf.cv_regression_type = "amusr"
mtl_wkf.cv_result_processor_type = ResultsProcessorMultiTask

# Seeds 52 through 61
mtl_wkf.seeds = [seed for seed in range(52, 62)]
mtl_wkf.task_expression_filter = "intersection"
mtl_wkf.run()

# Release the workflow once the run is finished
del mtl_wkf

In [None]:
# Figure 6: Final Network
# An "amusr" workflow with "amusr" regression, given the shared run parameters
final_wkf = workflow.inferelator_workflow(regression="amusr", workflow="amusr")
final_wkf = set_up_workflow(final_wkf)
final_wkf.append_to_path('output_dir', 'figure_6_final')

# The same YEASTRACT file is used as both priors and gold standard
final_wkf.priors_file = "YEASTRACT_Both_20181118.tsv"
final_wkf.gold_standard_file = "YEASTRACT_Both_20181118.tsv"

# Override the cross-validation settings applied by set_up_workflow()
final_wkf.split_gold_standard_for_crossvalidation = False
final_wkf.split_priors_for_gold_standard = False
final_wkf.cv_split_ratio = None

# 50 bootstraps instead of the shared 5, with a fixed random seed
final_wkf.num_bootstraps = 50
final_wkf.random_seed = 100
final_wkf.task_expression_filter = "intersection"

# The value returned by run() is kept
final_network = final_wkf.run()

# Release the workflow once the run is finished
del final_wkf