# Simulation pipeline

This is the pipeline version of the simulation and data analysis for gene-based CNV analysis methods.
## Large scale Bayesian variable selection
This pipeline implements the simulation and data analysis using the variational Bayesian variable selection method (`varbvs`) of Carbonetto & Stephens (2012).

In [None]:
%set -j 4
%run simulate
[simulate: shared = {'simulated_data': 'output'}]
# Simulate one dataset per combination of gamma, sample_size, n_causal_gene and seed.
# The step output is shared with later steps under the name `simulated_data`.
parameter: gamma = [(None, None), (3,1), (5,1)]
parameter: sample_size = [50, 100, 500]
parameter: n_causal_gene = [50, 100]
parameter: seed = [999]
parameter: name = 'data/del_sim'
input: for_each = ['gamma', 'sample_size', 'n_causal_gene', 'seed']
# NOTE(review): the file name below labels gamma[0] as "scale" and gamma[1] as "shape",
# while the script stores gamma[0] under 'shape' and gamma[1] under 'scale' -- confirm which is intended.
# NOTE(review): five arguments are passed to a format string with four placeholders, so the
# seed is silently ignored and never appears in the file name.
# Presumably this name must match the file written by run_simulation -- verify before changing it.
output: "${name}_%s.pkl" % "scale{}_shape{}_N{}_M{}".format("${_gamma[0]}", "${_gamma[1]}", "${_sample_size}", "${_n_causal_gene}", "${_seed}")
task:
python:
import sys, os
# make the project-local analysis code importable
sys.path.append(os.path.expanduser('~/GIT/cnv-gene-mapping/analysis'))
# Environment and run_simulation are presumably provided by this star import -- TODO confirm
from simulation import *
args = Environment()
# override the default simulation settings with the values of the current loop iteration
args['odds_ratio_params']['shape'] = ${_gamma[0]}
args['odds_ratio_params']['scale'] = ${_gamma[1]}
# same number of cases and controls
args['n_case'] = args['n_ctrl'] = ${_sample_size}
args['n_causal_gene'] = ${_n_causal_gene}
args['output'] = ${name!r}
args.seed = ${_seed}
# the second argument is the index of the current for_each iteration
run_simulation(args, ${_index})

[varbvs_1]
# Convert each simulated dataset into a sample-by-gene indicator matrix and save it as a feather file.
depends: sos_variable("simulated_data"), Py_Module("feather")
input: simulated_data, group_by = 'single'
output: "${_input!n}.feather"
task:
python:
# os and sys are required by the path setup below; they were previously missing from this script
import os
import sys
import pickle
import pandas as pd
import numpy as np
import feather
# make the project-local analysis code importable
sys.path.append(os.path.expanduser('~/GIT/cnv-gene-mapping/analysis'))
from simulation import *
args = Environment()
# NOTE: pickle must only be used on trusted files; the input here is the output of the simulate step
with open(${_input!r}, "rb") as fh:
    dat = pickle.load(fh)
ref = load_reference_gene(args["refgene_file"])
# create regression data: one row per sample (cases first, then controls),
# one 0/1 column per reference gene marking whether the gene is listed for that sample
genes = pd.Series(ref['gene_name'])
samples = dat['case'] + dat['ctrl']
gene_indicators = np.array([np.array(genes.isin(item["gene_name"]), dtype = float) for item in samples])
# first column is the phenotype: 1 for cases, 0 for controls
phenotype = np.array([1.0] * len(dat['case']) + [0.0] * len(dat['ctrl'])).reshape(-1, 1)
regression_data = np.hstack((phenotype, gene_indicators))
# drop columns that are zero for every sample
regression_data = regression_data[:, ~(regression_data == 0).all(axis = 0)]
# save to disk
feather.write_dataframe(pd.DataFrame(regression_data), ${_output!r})

[varbvs_2]
# Fit a varbvs logistic regression to each regression matrix and save the fit as an RDS file.
depends: R_library("varbvs"), R_library("feather")
input: group_by = "single"
output: "${_input!n}.varbvs.rds"
task:
R:
# read only the file of the current input group (the step-level variable would list every file)
dat = as.matrix(feather::read_feather(${_input!r}))
# column 1 is the case/control outcome written by varbvs_1; the remaining columns are the predictors
# FIXME: need to check if the usage is correct
fit = varbvs::varbvs(dat[,-1], NULL, dat[,1], family = "binomial", logodds = seq(-3.5,-1,0.1), sa = 1, verbose = FALSE)
# write to the output of the current group only
saveRDS(fit, ${_output!r})

Set sos options to "-j 4"


0,1
,e88151fc6fbc45fa3068fb4160e3f299


0,1
,04d4faa702537e7bb1fd6a85d8b7ecc4


0,1
,105e6f0f40abaee394ad86b5c64d6bc9


0,1
,29ff60815ce0c1e7be526ecb26bdd0c0


0,1
,a9877f74aabdd9ac7d68f4cde9e055dd


0,1
,7cfa584dd037ebac85d03a06b1ef0c24


0,1
,52abb89e170a3d4f3275138401189147


0,1
,aa104a0388f91ee81ae29c81150defac


0,1
,8d44261a06e35f40e8a8b444efbd8a39


0,1
,4fe8f9cb53afa3e5dbd7ec91b6defbac


0,1
,4b6355d7280ef82317510c9c990045a7


0,1
,40c9843a45e3e8bd4ddc84d478643305


0,1
,1669a8018dbd3c4b9335fcf27b9f2b5c


0,1
,39b22431d867cd8fcedd18a44d741f37


0,1
,eef15edc907ac42d402112a83566a92d


0,1
,32b2dafb601dd01f4efefa5a76cb596e


0,1
,49b3160710cf87ef095133f3af2fbeb3


0,1
,9e133a517e0c71c86f6683fcf216cc60
