## A walkthrough of the SPEER pipeline.

In [1]:
import pandas as pd
import numpy as np

# import src code
import ase_evaluation as ae
#import benchmark_posteriors as bp
import bootstrap as btstrp
import cross_validation as cv
import logistic_regression as lr
import naive_bayes as nb
import network as ntwk
import process as prcs
#import RIVER as rvr
import simulate_data as sim
import benchmark_posteriors as bnchmk
import RIVER as river
import sklearn
from sklearn import metrics

### Simulate data

In [27]:
# Build the simulator and run it.
# NOTE(review): the meaning of 'with_transfer' and of the numeric arguments
# (0.4, 0.6, 0.01) is defined in simulate_data.SimulateData — confirm there.
s = sim.SimulateData(
    "./test_output/",
    'with_transfer',
    0.4,
    0.6,
    0.01,
)
s._run()

# Build a Process object over the same directory and process the simulated
# data; later cells read p.train_list, p.test_list, p.tissues and
# p.genomic_features from it.
# NOTE(review): the second argument (0.1) is not explained here — confirm
# against process.Process.
p = prcs.Process(
    './test_output/',
    0.1,
)
p._process_simulated_data()

In [4]:
# Display the tissue names held by the Process object.
p.tissues

['brain', 'group1', 'muscle', 'epithelial', 'digestive']

In [8]:
# Preview the first rows of the first entry of the training list.
p.train_list[0].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,expr_label,median_expr_label,z_label,median_z_label,SPEER,SPEER without transfer,RIVER,shared tissue genome only,tissue specific genome only,tissue
8159,1.0,0.233208,0.941775,0.344153,0.241151,0.454677,0.008107,0.318218,0.962903,0.620152,...,0,0,0,0,-1,-1,-1,-1,-1,brain
6332,1.0,0.24259,0.018061,0.695752,0.948027,0.013553,0.5582,0.556575,0.847465,0.306071,...,1,0,0,0,-1,-1,-1,-1,-1,brain
8895,1.0,0.829874,0.614525,0.005478,0.047789,0.169802,0.8981,0.920814,0.585368,0.296906,...,0,0,0,0,-1,-1,-1,-1,-1,brain
5351,1.0,0.639045,0.474632,0.202959,0.501118,0.68819,0.512928,0.020292,0.673802,0.435826,...,1,0,0,0,-1,-1,-1,-1,-1,brain
4314,1.0,0.519206,0.691351,0.406914,0.211778,0.866877,0.226372,0.672933,0.108786,0.142141,...,1,0,0,0,-1,-1,-1,-1,-1,brain


### Bootstrap $\{\lambda_{1:M}, \Lambda\}$

In [9]:
# Set up the bootstrap over the training data, then run it to obtain the
# tissue-specific transfer factors (a dict) and the global transfer factor.
b = btstrp.Bootstrap(
    p.train_list,
    p.tissues,
    p.genomic_features,
    num_simulations=2,
    num_folds=3,
)
(lambda_hp_children_dict, lambda_hp_parent) = b._run_bootstrap()

tissue:  0
tissue:  1
tissue:  2
tissue:  3
tissue:  4


In [11]:
# Display the bootstrapped hyperparameters as a labelled tuple.
(
    "tissue-specific transfer factors: ",
    lambda_hp_children_dict,
    "global transfer factor: ",
    lambda_hp_parent,
)

('tissue-specific transfer factors: ',
 {'brain': 7.4172971971631014,
  'digestive': 7.3886612982907165,
  'epithelial': 11.435711558323156,
  'group1': 17.323054124623582,
  'muscle': 6.1983994384518892},
 'global transfer factor: ',
 32.038242453902434)

### Compute SPEER scores

In [13]:
# Fit the SPEER network with transfer enabled on the processed train/test data.
# NOTE(review): both lambda hyperparameters are passed as None, so the values
# estimated in the bootstrap cell are not forwarded explicitly — confirm how
# ntwk.Network treats None.
n = ntwk.Network(
    p.train_list,
    p.test_list,
    p.tissues,
    p.genomic_features,
    with_transfer=True,
    output_dir="SPEER_output",
    lambda_hp_parent=None,
    lambda_hp_children_dict=None,
    e_distribution='cat',
)
(train_list, test_list, beta_parent, beta_children, phi) = n.run()

### Compute SPEER scores without transfer

In [16]:
# Fit the SPEER network again with transfer disabled, feeding it the lists
# returned by the previous run.
#
# Every tissue gets the same fixed hyperparameter value (0.01). The dict is
# derived from p.tissues so it cannot drift from the tissue list used by the
# rest of the pipeline. Note that this overwrites the bootstrapped
# lambda_hp_children_dict.
lambda_hp_children_dict = {tissue: 0.01 for tissue in p.tissues}

# NOTE(review): this run uses the same output_dir as the with-transfer run —
# confirm that nothing needed from the first run is overwritten on disk.
n = ntwk.Network(
    train_list,
    test_list,
    p.tissues,
    p.genomic_features,
    with_transfer=False,
    output_dir="SPEER_output",
    lambda_hp_parent=None,
    lambda_hp_children_dict=lambda_hp_children_dict,
    e_distribution='cat',
)
# beta_parent, beta_children and phi from the with-transfer run are replaced here.
train_list, test_list, beta_parent, beta_children, phi = n.run()

### Compute RIVER scores

In [17]:
# Fit RIVER and collect its outputs; train_list/test_list are rebound to the
# lists returned by this run.
# NOTE(review): this cell passes p.train_list / p.test_list rather than the
# train_list / test_list returned by the SPEER runs, yet later cells read the
# SPEER columns from the result — presumably the earlier runs fill those
# columns in place; confirm.
n = river.River(
    p.train_list,
    p.test_list,
    p.genomic_features,
    output_dir='RIVER_output',
)
(
    train_list,
    test_list,
    beta_parent_river,
    beta_children_river,
    phi_river,
) = n.run()

In [24]:
# Sanity check: training-set ROC AUC of the "SPEER" score against "z_label"
# for the entry at index 4 of train_list.
sklearn.metrics.roc_auc_score(
    train_list[4]["z_label"],
    train_list[4]["SPEER"],
)

0.94205400661967387

### Compute other benchmark scores (tissue-specific and shared-tissue models that use genomic annotations only)

In [25]:
# Fit the benchmark models on the current train/test lists and rebind both
# lists to what fit_models() returns.
bn = bnchmk.BenchmarkPosteriors(
    train_list,
    test_list,
    p.genomic_features,
)
(train_list, test_list) = bn.fit_models()

## Evaluation

In [26]:

# Test-set ROC AUC against "z_label" for each score column, one printed row
# per entry of test_list. Columns appear in the order listed below.
score_columns = [
    "SPEER",
    "SPEER without transfer",
    "tissue specific genome only",
    "shared tissue genome only",
]

# Iterate over however many entries test_list holds instead of a hard-coded 5.
for i in range(len(test_list)):
    tissue_test = test_list[i]
    # Unpacking into print keeps the space-separated format of the original cell.
    print(*[
        sklearn.metrics.roc_auc_score(tissue_test["z_label"], tissue_test[column])
        for column in score_columns
    ])

0.95047487657 0.890294492783 0.813036406413 0.6107610649
0.95631492253 0.759544636676 0.763091388747 0.598111371825
0.956264995565 0.784310469004 0.752917098734 0.608893084034
0.953893473163 0.522345026131 0.383862746657 0.610069808413
0.951750000962 0.760310220633 0.698271041987 0.60107907311
