In [1]:
# Load the inferelator entry points used throughout this notebook:
# the workflow factory, the verbosity setter, the multiprocessing controller,
# and the cross-validation wrapper
from inferelator import inferelator_workflow, inferelator_verbose_level, MPControl, CrossValidationManager

# Set verbosity level to "Normal" (level 0)
inferelator_verbose_level(0)

In [2]:
# Notebook configuration: where the inputs live, where results go,
# which files to read, and which seeds to use for cross-validation

# Input directory (relative to this notebook) and output directory
DATA_DIR = "../data/yeast"
OUTPUT_DIR = "~/yeast_inference/"

# Input files, given as names that are handed to the workflow together with DATA_DIR
EXPRESSION_FILE_NAME = "yeast_microarray_expression.tsv.gz"
META_DATA_FILE_NAME = "yeast_microarray_meta_data.tsv"
TF_LIST_FILE_NAME = "tf_names_restrict.tsv"

# The priors and the gold standard are read from the same file;
# set_up_workflow() asks the workflow to split it for cross-validation
PRIORS_FILE_NAME = "gold_standard.tsv"
GOLD_STANDARD_FILE_NAME = "gold_standard.tsv"

# Gene metadata file and the column to use from it
# NOTE(review): neither name is referenced by the cells visible in this notebook
GENE_METADATA_FILE_NAME = "orfs.tsv"
GENE_METADATA_COLUMN = "SystematicName"

# Ten random seeds (42 through 51), one per cross-validation run
CV_SEEDS = list(range(42, 52))


In [3]:
# Parallelization settings for running on a single machine
# The "multiprocessing" engine uses the pathos implementation of multiprocessing (with dill instead of cPickle)
# This is suited for a single computer but will not work on a distributed cluster

# Switch for the local engine, and the process count handed to its client
local_engine = True
n_cores_local = 10

# Multiprocessing needs to be protected with the if __name__ == '__main__' guard
if local_engine and __name__ == '__main__':
    MPControl.set_multiprocess_engine("multiprocessing")
    MPControl.client.processes = n_cores_local
    MPControl.connect()

 Regression on YKR009C [0 / 5716]
 Regression on YNR038W [100 / 5716]
 Regression on YBR002C [200 / 5716]
 Regression on YLR137W [300 / 5716]
 Regression on YBR141C [400 / 5716]
 Regression on YMR301C [500 / 5716]
 Regression on YGL143C [600 / 5716]
 Regression on YGL169W [700 / 5716]
 Regression on YPL204W [800 / 5716]
 Regression on YLR063W [900 / 5716]
 Regression on YHR155W [1000 / 5716]
 Regression on YGR208W [1100 / 5716]
 Regression on YGR223C [1200 / 5716]
 Regression on YGR253C [1300 / 5716]
 Regression on YJR025C [1400 / 5716]
 Regression on YDL066W [1500 / 5716]
 Regression on YAL023C [1600 / 5716]
 Regression on YEL046C [1700 / 5716]
 Regression on YGL131C [1800 / 5716]
 Regression on YIL092W [1900 / 5716]
 Regression on YOL123W [2000 / 5716]
 Regression on YLL066W-B [2100 / 5716]
 Regression on YGL197W [2200 / 5716]
 Regression on YIL064W [2300 / 5716]
 Regression on YGR256W [2400 / 5716]
 Regression on YAR015W [2500 / 5716]
 Regression on YPR108W-A [2600 / 5716]
 Regressi

In [4]:
# Shared run configuration
# Every workflow in this notebook is passed through this function so that
# file paths, bootstrap count, and gold standard holdout are set the same way

def set_up_workflow(wkf):
    """Apply the notebook-wide settings to a workflow and return it.

    Sets the input/output locations and file names from the configuration
    constants, declares that expression matrix columns are not genes, uses
    5 bootstraps, and turns on splitting of the gold standard for
    cross-validation with a split ratio of 0.2.

    :param wkf: Workflow object created by inferelator_workflow
    :return: The same workflow object, configured in place
    """
    path_settings = dict(
        input_dir=DATA_DIR,
        output_dir=OUTPUT_DIR,
        tf_names_file=TF_LIST_FILE_NAME,
        meta_data_file=META_DATA_FILE_NAME,
        priors_file=PRIORS_FILE_NAME,
        gold_standard_file=GOLD_STANDARD_FILE_NAME,
    )
    wkf.set_file_paths(**path_settings)

    wkf.set_expression_file(tsv=EXPRESSION_FILE_NAME)
    wkf.set_file_properties(expression_matrix_columns_are_genes=False)

    wkf.set_run_parameters(num_bootstraps=5)
    wkf.set_crossvalidation_parameters(
        split_gold_standard_for_crossvalidation=True,
        cv_split_ratio=0.2,
    )
    return wkf

In [None]:
# Inference with BBSR (crossvalidation)
# The crossvalidation wrapper runs the worker once per seed in CV_SEEDS (10 runs)
# Each run holds 20% of the gold standard out of the priors for testing
# Different seeds give different holdouts

# Create a BBSR worker with the shared settings and append "bbsr" to its output_dir path
worker = set_up_workflow(inferelator_workflow(regression="bbsr", workflow="tfa"))
worker.append_to_path("output_dir", "bbsr")

# Wrap the worker for crossvalidation and grid search over the random seed
cv_wrap = CrossValidationManager(worker)
cv_wrap.add_gridsearch_parameter('random_seed', CV_SEEDS)

# Run
cv_wrap.run()

In [None]:
# Inference with LASSO (crossvalidation)
# The crossvalidation wrapper runs the worker once per seed in CV_SEEDS (10 runs)
# Each run holds 20% of the gold standard out of the priors for testing
# Different seeds give different holdouts

from sklearn.linear_model import Lasso

# Create a scikit-learn regression worker with the shared settings
# and append "lasso" to its output_dir path
worker = set_up_workflow(inferelator_workflow(regression="sklearn", workflow="tfa"))
worker.append_to_path("output_dir", "lasso")

# Use sklearn.linear_model.Lasso as the regression model
# max_iter=2000 is passed along with the model
# NOTE(review): presumably forwarded to the Lasso constructor - confirm against the inferelator docs
worker.set_regression_parameters(model=Lasso, max_iter=2000)

# Wrap the worker for crossvalidation and grid search over the random seed
cv_wrap = CrossValidationManager(worker)
cv_wrap.add_gridsearch_parameter('random_seed', CV_SEEDS)

# Run
cv_wrap.run()

 Loading expression data file yeast_microarray_expression.tsv.gz
 Loading metadata file yeast_microarray_meta_data.tsv
 Loaded yeast_microarray_expression.tsv.gz:
Data loaded: InferelatorData [float64 (2577, 5716), Metadata (2577, 5)] Memory: 117.84 MB
 Setting crossvalidation param random_seed to 42
 Resampling GS ((993, 98)) for crossvalidation
 CV prior (794, 98) and gold standard (199, 98)
 Metadata ((2577, 5)) and expression data ((5716, 2577)) alignment off
 Bootstrap 1 of 5
 Calculating betas using SKLearn model Lasso


In [None]:
# Final network
# Build a BBSR worker with the shared settings and append 'final' to its output_dir path
worker = set_up_workflow(inferelator_workflow(regression="bbsr", workflow="tfa"))
worker.append_to_path('output_dir', 'final')

# Override the crossvalidation defaults from set_up_workflow:
# do not split the gold standard, and use 50 bootstraps with a fixed seed
worker.set_crossvalidation_parameters(
    split_gold_standard_for_crossvalidation=False,
    cv_split_ratio=None,
)
worker.set_run_parameters(num_bootstraps=50, random_seed=100)

# Keep the returned results for inspection in the following cells
final_network_results = worker.run()

In [None]:
# Inspect the final network results
# final_network_results is the value returned by worker.run() in the final network cell
# The workflow returns an InferelatorResults object

# There is a dataframe with an edge table for the final network
# head() displays only its first rows
final_network_results.network.head()

In [None]:
# There is a list of dataframes with model coefficients
# Each list element is a dataframe with the results from one bootstrap
# The dataframes are genes x TFs

# Display the first 5 rows and 5 columns of the first bootstrap's coefficients
final_network_results.betas[0].iloc[0:5, 0:5]

In [None]:
# The confidence scores for each network edge are also accessible
# This dataframe is genes x TFs

# Display the first 5 rows and 5 columns of the confidence scores
final_network_results.combined_confidences.iloc[0:5, 0:5]