In [None]:
# This is the code used to generate the figures in: 
# Gene regulatory network reconstruction using single-cell RNA sequencing of barcoded genotypes in diverse environments
# https://doi.org/10.1101/581678

# The data files to run this script are located on Zenodo
# https://zenodo.org/record/3354412

In [None]:
# Load modules
from inferelator import inferelator_workflow, inferelator_verbose_level, MPControl, CrossValidationManager

# Set the inferelator verbosity to level 1 (the "Talky" level)
inferelator_verbose_level(1)

In [None]:
# Set the location of the input data and the desired location of the output files

# Input directory and base output directory passed to set_file_paths() in set_up_workflow()
DATA_DIR = '../data/yeast'
OUTPUT_DIR = '~/jackson_2019/'

# Names of the expression matrix file and the gene metadata file
EXPRESSION_FILE_NAME = '103118_SS_Data.tsv.gz'
GENE_METADATA_FILE_NAME = 'orfs.tsv'
# Metadata column names handed to set_file_properties(expression_matrix_metadata=...)
# NOTE(review): the Figure 5C cell groups cells on a "Condition" column, which is not in this list —
# confirm these names match the metadata columns actually present in the expression matrix file
METADATA_COLUMNS = ['TF', 'strain', 'date', 'restriction', 'mechanism', 'time']

# Prior network file used by yeastract()
YEASTRACT_PRIOR = "YEASTRACT_20190713_BOTH.tsv"

# TF name lists: TF_NAMES is the default set in set_up_workflow(); YEASTRACT_TF_NAMES is swapped in by yeastract()
TF_NAMES = "tf_names_gold_standard.txt"
YEASTRACT_TF_NAMES = "tf_names_yeastract.txt"

In [None]:
# Start Multiprocessing Engine

# Number of local processes to request, and a switch that enables the local engine below
n_cores_local = 10
local_engine = True

# Multiprocessing uses the pathos implementation of multiprocessing (with dill instead of cPickle)
# This is suited for a single computer, but will likely be too slow for the example here

# Only runs when executed as the main module and local_engine is True.
# The engine is selected first, then its process count is set, then it is connected.
if __name__ == '__main__' and local_engine:
    MPControl.set_multiprocess_engine("multiprocessing")
    MPControl.client.processes = n_cores_local
    MPControl.connect()

In [None]:
# Define the general run parameters used for all figures

def set_up_workflow(wkf):
    """
    Apply the run parameters shared by every figure to a workflow object.

    Sets the file paths (the gold standard file is used as both the gold standard and the prior),
    the expression matrix file properties, the cross-validation split settings, the number of
    bootstraps, the count minimum, and a "log2" preprocessing step. Individual figure cells
    override these settings afterwards where they need to.

    :param wkf: Workflow object created by inferelator_workflow()
    :return: The same workflow object that was passed in, so the call can be nested or chained
    """
    # File names come from the configuration constants so that editing the config cell takes effect here
    wkf.set_file_paths(input_dir=DATA_DIR,
                       output_dir=OUTPUT_DIR,
                       expression_matrix_file=EXPRESSION_FILE_NAME,
                       gene_metadata_file=GENE_METADATA_FILE_NAME,
                       gold_standard_file='gold_standard.tsv',
                       priors_file='gold_standard.tsv',
                       tf_names_file=TF_NAMES)
    wkf.set_file_properties(extract_metadata_from_expression_matrix=True,
                            expression_matrix_metadata=METADATA_COLUMNS,
                            expression_matrix_columns_are_genes=True,
                            gene_list_index="SystematicName")
    wkf.set_crossvalidation_parameters(split_gold_standard_for_crossvalidation=True,
                                       cv_split_ratio=0.5)
    wkf.set_run_parameters(num_bootstraps=5)
    wkf.set_count_minimum(0.05)
    wkf.add_preprocess_step("log2")
    return wkf

def set_up_fig5a(wkf):
    """
    Wrap a workflow in a CrossValidationManager for the Figure 5A runs.

    :param wkf: Configured workflow object
    :return: CrossValidationManager with a grid search over random seeds 42 through 51
    """
    seeds = list(range(42, 52))
    manager = CrossValidationManager(wkf)
    manager.add_gridsearch_parameter('random_seed', seeds)
    return manager


def set_up_fig5b(wkf):
    """
    Wrap a workflow in a CrossValidationManager for the Figure 5B runs.

    :param wkf: Configured workflow object
    :return: CrossValidationManager with a grid search over random seeds 42 through 51
        and size subsampling (seed 86) at the listed ratios
    """
    seeds = list(range(42, 52))
    size_ratios = [0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 1]

    manager = CrossValidationManager(wkf)
    manager.add_gridsearch_parameter('random_seed', seeds)
    manager.add_size_subsampling(size_ratios, seed=86)
    return manager

def yeastract(wkf):
    """
    Point a workflow at the YEASTRACT TF name list and the YEASTRACT prior file.

    :param wkf: Workflow object to modify in place
    :return: The same workflow object, matching set_up_workflow() so the call can be chained
    """
    wkf.set_file_paths(tf_names_file=YEASTRACT_TF_NAMES, priors_file=YEASTRACT_PRIOR)
    return wkf

In [None]:
# Figure 5A: Shuffled Priors
# BBSR single-cell workflow with the shared settings, plus prior shuffling on axis 0
worker = inferelator_workflow(regression="bbsr", workflow="single-cell")
set_up_workflow(worker)
worker.set_shuffle_parameters(shuffle_prior_axis=0)
worker.append_to_path('output_dir', 'figure_5a_shuffled')

# Run under the Figure 5A cross-validation manager
set_up_fig5a(worker).run()

In [None]:
# Figure 5A: Random Data
# BBSR single-cell workflow with the shared settings, reading the NEG expression file instead
worker = inferelator_workflow(regression="bbsr", workflow="single-cell")
set_up_workflow(worker)
worker.set_file_paths(expression_matrix_file='110518_SS_NEG_Data.tsv.gz')
worker.append_to_path('output_dir', 'figure_5a_neg_data')

# Run under the Figure 5A cross-validation manager
set_up_fig5a(worker).run()

In [None]:
# Figure 5A: No Imputation
# BBSR single-cell workflow with the shared settings left unchanged
worker = inferelator_workflow(regression="bbsr", workflow="single-cell")
set_up_workflow(worker)
worker.append_to_path('output_dir', 'figure_5a_no_impute')

# Run under the Figure 5A cross-validation manager
set_up_fig5a(worker).run()

In [None]:
# Figure 5A: MAGIC
# BBSR single-cell workflow with the shared settings, reading the MAGIC expression file
worker = inferelator_workflow(regression="bbsr", workflow="single-cell")
set_up_workflow(worker)
worker.set_file_paths(expression_matrix_file='MAGIC_DATA.tsv.gz')
# Reset the preprocessing list to empty (presumably dropping the "log2" step added by set_up_workflow)
worker.preprocessing_workflow = []
worker.append_to_path('output_dir', 'figure_5a_magic')

# Run under the Figure 5A cross-validation manager
set_up_fig5a(worker).run()

In [None]:
# Figure 5A: scImpute
# BBSR single-cell workflow with the shared settings, reading the scImpute expression file
worker = inferelator_workflow(regression="bbsr", workflow="single-cell")
set_up_workflow(worker)
worker.set_file_paths(expression_matrix_file='SCIMPUTE_DATA.tsv.gz')
worker.append_to_path('output_dir', 'figure_5a_scImpute')

# Run under the Figure 5A cross-validation manager
set_up_fig5a(worker).run()

In [None]:
# Figure 5A: VIPER
# BBSR single-cell workflow with the shared settings, reading the VIPER expression file
worker = inferelator_workflow(regression="bbsr", workflow="single-cell")
set_up_workflow(worker)
worker.set_file_paths(expression_matrix_file='VIPER_DATA.tsv.gz')
worker.append_to_path('output_dir', 'figure_5a_VIPER')

# Run under the Figure 5A cross-validation manager
set_up_fig5a(worker).run()

In [None]:
# Figure 5B: ATAC-Seq prior
# BBSR single-cell workflow with the shared settings, using the motif prior file against the gold standard
worker = inferelator_workflow(regression="bbsr", workflow="single-cell")
set_up_workflow(worker)
worker.set_file_paths(priors_file="yeast-motif-prior.tsv", gold_standard_file="gold_standard.tsv")
worker.append_to_path('output_dir', 'figure_5b_atac')

# Run under the Figure 5B cross-validation manager
set_up_fig5b(worker).run()

In [None]:
# Figure 5B: Bussemaker
# BBSR single-cell workflow with the shared settings, using the Bussemaker pSAM prior against the gold standard
worker = set_up_workflow(inferelator_workflow(regression="bbsr", workflow="single-cell"))
worker.set_file_paths(priors_file="Bussemaker_pSAM_priors.tsv", gold_standard_file="gold_standard.tsv")
# Write to a dedicated directory; this cell previously reused 'figure_5b_atac', the ATAC-Seq run's output path
worker.append_to_path('output_dir', 'figure_5b_bussemaker')

# Run under the Figure 5B cross-validation manager
set_up_fig5b(worker).run()

In [None]:
# Figure 5B: No Priors
# BBSR single-cell workflow with the shared settings, with tfa_driver switched off
worker = inferelator_workflow(regression="bbsr", workflow="single-cell")
set_up_workflow(worker)
worker.set_tfa(tfa_driver=False)
worker.append_to_path('output_dir', 'figure_5b_no_priors')

# Run under the Figure 5B cross-validation manager
set_up_fig5b(worker).run()

In [None]:
# Figure 5B: Gold Standard
# BBSR single-cell workflow with the shared settings left unchanged (gold standard file used as the prior)
worker = inferelator_workflow(regression="bbsr", workflow="single-cell")
set_up_workflow(worker)
worker.append_to_path('output_dir', 'figure_5b_gold_standard_cv')

# Run under the Figure 5B cross-validation manager
set_up_fig5b(worker).run()

In [None]:
# Figure 5B: YEASTRACT
# BBSR single-cell workflow with the shared settings, switched to the YEASTRACT prior and TF names
worker = inferelator_workflow(regression="bbsr", workflow="single-cell")
set_up_workflow(worker)
yeastract(worker)
worker.append_to_path('output_dir', 'figure_5b_yeastract')

# Run under the Figure 5B cross-validation manager
set_up_fig5b(worker).run()

In [None]:
# Figure 5C: Condition Specific

# BBSR single-cell workflow with the shared settings left unchanged
worker = inferelator_workflow(regression="bbsr", workflow="single-cell")
set_up_workflow(worker)
worker.append_to_path('output_dir', 'figure_5c_conditions')

# Grid search over random seeds 42 through 51, with a grouping drop-in on "Condition" (group_size=500)
# NOTE(review): "Condition" is not one of the names in METADATA_COLUMNS — confirm the column exists
seeds = list(range(42, 52))
cv_wrap = CrossValidationManager(worker)
cv_wrap.add_gridsearch_parameter('random_seed', seeds)
cv_wrap.add_grouping_dropin("Condition", group_size=500)

cv_wrap.run()
# Drop the notebook-level reference to the manager once the run is done
del cv_wrap
del seeds

In [None]:
# Figure 5D: Single Task Learning

# BBSR single-cell workflow with the shared settings, switched to the YEASTRACT prior and TF names.
# Uses the names imported at the top of the notebook; the `workflow` and `crossvalidation_workflow`
# module names used here previously are never imported and raise NameError on a fresh kernel.
worker = set_up_workflow(inferelator_workflow(regression="bbsr", workflow="single-cell"))
yeastract(worker)
worker.append_to_path('output_dir', 'figure_5d_stl')

# Grid search over random seeds 52 through 61
cv_wrap = CrossValidationManager(worker)
cv_wrap.add_gridsearch_parameter('random_seed', list(range(52, 62)))
cv_wrap.run()

del cv_wrap

In [None]:
# Figure 5D: BBSR By Task Learning

# Multitask workflow using "bbsr-by-task" regression, with the shared settings and the YEASTRACT prior
worker = inferelator_workflow(regression="bbsr-by-task", workflow="multitask")
set_up_workflow(worker)
yeastract(worker)
worker.append_to_path('output_dir', 'figure_5d_mtl_bbsr')

# Grid search over random seeds 52 through 61
cv_wrap = CrossValidationManager(worker)
cv_wrap.add_gridsearch_parameter('random_seed', [*range(52, 62)])
cv_wrap.run()

# Drop the notebook-level reference to the manager once the run is done
del cv_wrap

In [None]:
# Figure 5D: Multi Task Learning

# Multitask workflow using "amusr" regression, with the shared settings and the YEASTRACT prior
worker = inferelator_workflow(regression="amusr", workflow="multitask")
set_up_workflow(worker)
yeastract(worker)
worker.append_to_path('output_dir', 'figure_5d_mtl_amusr')

# Grid search over random seeds 52 through 61
cv_wrap = CrossValidationManager(worker)
cv_wrap.add_gridsearch_parameter('random_seed', [*range(52, 62)])
cv_wrap.run()

# Drop the notebook-level reference to the manager once the run is done
del cv_wrap

In [None]:
# Figure 6: Final Network

# Multitask workflow using "amusr" regression, with the shared settings and the YEASTRACT prior
worker = inferelator_workflow(regression="amusr", workflow="multitask")
set_up_workflow(worker)
yeastract(worker)
# NOTE(review): this gold standard file name carries a different date than YEASTRACT_PRIOR — confirm it is intended
worker.set_file_paths(gold_standard_file="YEASTRACT_Both_20181118.tsv")
# Turn off the gold standard split set by set_up_workflow; raise the bootstrap count to 50 with a fixed seed
worker.set_crossvalidation_parameters(split_gold_standard_for_crossvalidation=False, cv_split_ratio=None)
worker.set_run_parameters(num_bootstraps=50, random_seed=100)
worker.append_to_path('output_dir', 'figure_6_final')

# Run the workflow directly (no CrossValidationManager) and keep the result
final_network = worker.run()
del worker