# Copy model simulation and analysis workflow

In [1]:
!sos run 20190717_workflow.ipynb -h

usage: sos run 20190717_workflow.ipynb
               [workflow_name | -t targets] [options] [workflow_options]
  workflow_name:        Single or combined workflows defined in this script
  targets:              One or more targets to generate
  options:              Single-hyphen sos parameters (see "sos run -h" for details)
  workflow_options:     Double-hyphen workflow-specific parameters

Workflows:
  get_hist
  simulate
  analyze
  default
  get_data_hist

Global Workflow Options:
  --cnv-type deletion
  --cwd /home/min/GIT/github/cnv-gene-mapping/data (as path)
  --genotype-file  path(f"{cwd:a}/{cnv_type}.X.gz")

  --phenotype-file  path(f"{cwd:a}/{cnv_type}.y") # real CNV data phenotype


Sections
  get_hist_1, simulate_1, analyze_1:
    Workflow Options:
      --n-gene-in-block 1 (as int)
                        For simulation: get real deletion/duplication CNV data
                        and its block n_gene_in_block: get_hist: 1, simulate:
                        20~50, anal

## Run this workflow
### Simulation:
```
sos run dsc/20190717_workflow.ipynb simulate:1-5 --n_gene_in_block 30 --shape 1 --scale 0.5 -s build
```
### Get histogram
- For simulation
```
sos run dsc/20190717_workflow.ipynb get_hist:1-2 --genotype_file /home/min/GIT/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.X.gz \
--n_gene_in_block 1 -s build
```
- For real data
```
sos run dsc/20190717_workflow.ipynb get_hist:1-2 --n_gene_in_block 1 -s build
```

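### Analyze
The analysis steps are defined under the `susie` and `varbvs` workflow names (sections `susie_1`/`susie_2` and `varbvs_1`/`varbvs_2`); no `analyze` section is defined in this notebook. For example, to run SuSiE:
```
sos run dsc/20190717_workflow.ipynb susie:1-2 --genotype_file /home/min/GIT/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.X.gz \
--phenotype_file /home/min/GIT/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.y --n_gene_in_block 1 -s build
```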

In [None]:
[global]
parameter: cnv_type = "deletion"
parameter: cwd = path("~/GIT/github/cnv-gene-mapping/data")
parameter: genotype_file = path(f"{cwd:a}/{cnv_type}.X.gz")
parameter: phenotype_file = path(f"{cwd:a}/{cnv_type}.y") # real CNV data phenotype
def fmtP(x):
    """Format a value as a token for use inside an output file name.

    The value is converted with ``str`` and then: ``.`` becomes ``p``,
    spaces and ``-`` become ``_``, and single/double quotes are dropped.
    For example ``fmtP(0.005)`` gives ``"0p005"``.
    """
    # One translation pass; a None target deletes the character.
    table = str.maketrans({".": "p", " ": "_", "-": "_", '"': None, "'": None})
    return str(x).translate(table)

In [None]:
[get_hist_1, simulate_1, susie_1, varbvs_1]
# For simulation: get real deletion/duplication CNV data and its block
# n_gene_in_block: get_hist: 1, simulate: 20~50, analyze: 1
parameter: n_gene_in_block = 1
input: genotype_file
output: f"{_input:nn}.genes.block{n_gene_in_block}.gz", f"{_input:nn}.block{n_gene_in_block}.start.end.csv"
python: expand = '${ }'
    import pandas as pd

    # The genotype file has no header row, so column labels are the 0-based
    # column positions of the original matrix.
    data = pd.read_csv(${_input:r}, compression = "gzip", sep = "\t", header = None)

    # Output 1: the matrix restricted to columns with at least one non-zero
    # entry. The header row keeps the original column labels so that later
    # steps can map block boundaries back to columns.
    data_clean = data.loc[:, (data != 0).any(axis = 0)]
    data_clean.to_csv(${_output[0]:r}, compression = "gzip", sep = "\t", header = True, index = False)

    groups = list(data_clean.columns)
    if not groups:
        raise ValueError("No column with a non-zero entry was found; cannot build blocks")
    last_nonzero = groups[-1]

    # Add the first / last column of the full matrix as boundary markers when
    # they are not themselves non-zero columns.
    first_col = list(data.columns)[0]
    last_col = list(data.columns)[-1]
    if groups[0] != first_col:
        groups = [first_col] + groups
    if groups[-1] != last_col:
        groups = groups + [last_col]

    # Close the current block at groups[j-1] when (a) the span from the block
    # start to groups[j] has reached n_gene_in_block and (b) there is a gap of
    # all-zero columns right before groups[j].
    bound = list()
    i = 0
    for j in range(1, len(groups)):
        if groups[j] - groups[i] >= ${n_gene_in_block} and groups[j] - groups[j-1] > 1:
            bound.append([groups[i], groups[j-1]])
            i = j

    # The loop only closes a block when it meets a later gap, so the block that
    # is still open at the end was previously lost whenever the last non-zero
    # column is the last column of the matrix or is adjacent to it. Emit it
    # here, ending at the last non-zero column. This trailing block may span
    # fewer than n_gene_in_block columns. When the end marker already closed
    # the final block, groups[i] is the marker itself and nothing is added.
    if groups[i] <= last_nonzero:
        bound.append([groups[i], last_nonzero])

    # Drop a block that consists of column 0 only.
    bound = [item for item in bound if item[1] != 0]
    pd.DataFrame(bound, columns = ["block_start", "block_end"]).to_csv(${_output[1]:r}, sep = "\t", header = True, index = False)

In [None]:
[simulate_2]
output: f"{_input[0]:n}.for_simu.gz"
python: expand = '${ }'
    import pandas as pd
    genotypes = pd.read_csv(${genotype_file:r}, compression = "gzip", header = None, sep = "\t")
    blocks = pd.read_csv(${_input[1]:r}, header = 0, sep = "\t").values

    # Stretch every block so that it ends right before the next block starts;
    # the final block (identified by its start value) keeps its own end.
    final_start = blocks[-1][0] if len(blocks) else None
    spans = list()
    for pos, (start, end) in enumerate(blocks):
        if start == final_start:
            spans.append([start, end])
        else:
            spans.append([start, blocks[pos + 1][0] - 1])

    # One output row per input row, one cell per block. Each cell holds the
    # list of values in that block (label-based, inclusive slice) and is
    # written to the file as its text representation.
    per_row = [
        [genotypes.loc[row, first:last].tolist() for first, last in spans]
        for row in range(genotypes.shape[0])
    ]
    pd.DataFrame(per_row).to_csv(${_output:r}, sep = "\t", header = False, index = False)

In [None]:
[simulate_3]
parameter: sample_size = 100000 # sample size: default 100000, test: 1000
parameter: n_batch = 200 # number of batches (one job each); every batch simulates sample_size / n_batch samples. default: 200, test: 20
assert sample_size % n_batch == 0
batches = [x+1 for x in range(n_batch)]
input: for_each = ['batches']
output: f"{cwd:a}/{cnv_type}_simu/{_input[0]:bn}.sample.{_batches}.gz"
python: expand = '${ }'
    import pandas as pd, numpy as np
    import random, itertools, ast

    # Each cell of the input is the text form of a list: the values of one
    # block for one input row.
    data = pd.read_csv(${_input:r}, compression = "gzip", header = None, sep = "\t")
    cells = data.values
    row_ids = data.index.tolist()
    n_blocks = data.shape[1]
    block_ids = np.arange(n_blocks)

    # The assert above guarantees an exact division.
    size = ${sample_size} // ${n_batch}
    # Seeding with the batch number makes every batch reproducible and distinct.
    random.seed(${_batches})

    samples_genome = list()
    for _ in range(size):
        # Draw one distinct input row per block, then take block k from the
        # k-th drawn row. Picking the cells directly avoids building an
        # n_blocks x n_blocks frame only to read its diagonal.
        # random.sample raises ValueError when there are fewer rows than blocks.
        order = random.sample(row_ids, n_blocks)
        picked = cells[order, block_ids]
        # Parse each cell back into a list and concatenate the blocks.
        s = list(itertools.chain.from_iterable(ast.literal_eval(cell) for cell in picked))
        samples_genome.append(s)

    samples_genome_df = pd.DataFrame(samples_genome) # row: sample, column: gene
    samples_genome_df.to_csv(${_output:r}, compression = "gzip", sep = "\t", header = False, index = False)

In [None]:
[simulate_4]
input: group_by = 'all'
output: f'{_input[0]:nn}.combined.gz'
bash: expand = "${ }"
    # Decompress every input file, in the order given, and re-compress the
    # concatenated stream into one file.
    gzip -dc ${_input} | gzip > ${_output}

In [None]:
[simulate_5]
# shape = 3; scale = 1 for gamma
# shape = 2.191013; scale = 0.2682398 for normal
parameter: shape = 3.0 # mean for normal (1), shape for gamma (3)
parameter: scale = 1.0 # se for normal (0.5), scale for gamma (1)
# 'gamma' or 'normal'
parameter: beta_method = 'normal'
parameter: penetrance = 0.05
parameter: seed = 999999
parameter: ctrl_case_ratio = 1.0
parameter: pi0 = 0.95
output: f'{_input:nn}.X.gz', f'{_input:nn}.y', f'{_input:nn}.beta'
python: expand = "${ }"
    import pandas as pd, numpy as np
    np.random.seed(${seed})
    # For normal distribution the -3*sigma to 3*sigma on x-axis should correspond to
    # log(4) and log(20). The shape and scale parameters are thus:
    # mu = (log(20) + log(4))/2 = 2.191013; sigma = (log(20) - mu) / 3 = 0.2682398
    def logor_gamma(shape, scale, n):
        """Return n log odds ratios: the log of Gamma(shape, scale) draws."""
        return np.log(np.random.gamma(shape, scale, n))

    def logor_normal(mean, se, n):
        """Return n log odds ratios drawn from Normal(mean, se)."""
        return np.random.normal(mean, se, n)

    data = pd.read_csv(${_input:r}, compression = "gzip", sep = "\t", header = None)

    # Intercept: the log odds that correspond to the baseline penetrance.
    beta0 = np.log(${penetrance} / (1-${penetrance}))
    # One effect per column; each is kept with probability 1 - pi0 and set to
    # zero otherwise.
    beta1s = [x for x in logor_${beta_method}(${shape}, ${scale}, data.shape[1])]
    beta1s = [np.random.binomial(1, 1-${pi0}) * i for i in beta1s]
    with open(${_output[2]:r}, 'w') as f:
        f.write("\n".join([str(b) for b in beta1s]))

    # Logistic model. 1 / (1 + exp(-x)) is used instead of exp(x) / (1 + exp(x))
    # because the latter evaluates to inf / inf = NaN for large x.
    logit_y = np.matmul(data.values, beta1s) + beta0
    ys_p = 1 / (1 + np.exp(-logit_y))
    ys = np.random.binomial(1, ys_p)

    case_index = np.ravel(np.where(ys == 1))
    ctrl_pool = np.ravel(np.where(ys == 0))
    n_ctrl = int(len(case_index) * ${ctrl_case_ratio})
    # Draw controls without replacement so that no individual is duplicated in
    # the output. np.random.choice samples with replacement by default, which
    # is only used here as a fallback when more controls are requested than
    # are available.
    ctrl_index = sorted(np.random.choice(ctrl_pool, n_ctrl, replace = n_ctrl > len(ctrl_pool)))

    # Cases first, then controls; the phenotype file follows the same order.
    genotype = data.iloc[case_index.tolist() + ctrl_index, :]
    genotype.to_csv(${_output[0]:r}, compression = "gzip", sep = "\t", header = False, index = False)
    with open(${_output[1]:r}, 'w') as f:
        f.write('\n'.join(['1'] * len(case_index) + ['0'] * len(ctrl_index)))

In [None]:
[susie_2]
depends: R_library("data.table"), R_library('susieR'), R_library("reticulate"), R_library("varbvs")
parameter: L = 1
parameter: pve = 0.005
parameter: method = "optim"
suffix = f"SuSiE.L_{L}.prior_{fmtP(pve)}"
output: f"{_input[0]:n}.{suffix}.susie.rds", f"{_input[0]:n}.{suffix}.varbvs.rds"
R: expand = '${ }', stderr = f'{_input[0]:n}.stderr', stdout = f'{_input[0]:n}.stdout'
    library(susieR)
    library(data.table)
    library(reticulate)
    library(varbvs)
    # The gene matrix written by step 1 has a header row holding the original
    # column indices. It must be read as a header: otherwise it becomes a data
    # row and the columns cannot be selected by index name below.
    X <- as.matrix(data.table::fread(${_input[0]:r}, header = T))
    # Block table with columns block_start / block_end.
    bound <- as.matrix(data.table::fread(${_input[1]:r}, header = T))
    y <- as.matrix(data.table::fread(${phenotype_file:r}))
    storage.mode(X) = 'double'
    storage.mode(y) = 'double'
    res <- list()
    res_varbvs <- list()
    # seq_len() gives an empty loop for an empty block table (1:0 would not).
    for (row in seq_len(nrow(bound))) {
        # A block is a range of original column indices. Step 1 removes
        # all-zero columns from X, so keep only the columns still present.
        block_cols <- intersect(as.character(bound[row,1]:bound[row,2]), colnames(X))
        x <- X[, block_cols, drop = FALSE]
        # SuSiE fit for this block.
        res[[row]] <- susie(x, y, L = ${L}, scaled_prior_variance = ${pve}, estimate_prior_method = '${method}')
        # varbvs fit for the same block over a grid of prior log-odds.
        logodds <- seq(-log10(ncol(x)), 1, length.out = 40)
        fit <- varbvs::varbvs(x, NULL, y, logodds = logodds, verbose = FALSE)
        b <- as.vector(coef(fit)[, "averaged"])
        res_varbvs[[row]] <- list(fit = fit, mu = b[1], beta = b[-1])
    }
    # Two result files are written, so two outputs are declared above.
    saveRDS(res, ${_output[0]:r})
    saveRDS(res_varbvs, ${_output[1]:r})

In [None]:
[varbvs_2]
depends: R_library("data.table"), R_library("reticulate"), R_library("varbvs")
output: f"{_input[0]:n}.varbvs.rds"
R: expand = '${ }', stderr = f'{_input[0]:n}.varbvs.stderr', stdout = f'{_input[0]:n}.varbvs.stdout'
    library(data.table)
    library(varbvs)
    # The gene matrix written by step 1 has a header row holding the original
    # column indices; columns are selected by those names below.
    X <- as.matrix(data.table::fread(${_input[0]:r}, header = T))
    # Block table with columns block_start / block_end.
    bound <- as.matrix(data.table::fread(${_input[1]:r}, header = T))
    y <- as.matrix(data.table::fread(${phenotype_file:r}))
    storage.mode(X) = 'double'
    storage.mode(y) = 'double'
    res_varbvs <- list()
    for (row in seq_len(nrow(bound))) {
        # A block is a range of original column indices. Step 1 removes
        # all-zero columns from X, so keep only the columns still present.
        block_cols <- intersect(as.character(bound[row,1]:bound[row,2]), colnames(X))
        x <- X[, block_cols, drop = FALSE]
        # Fit varbvs on this block over a grid of prior log-odds and keep the
        # averaged coefficients (first entry is stored as mu, the rest as beta).
        logodds <- seq(-log10(ncol(x)), 1, length.out = 40)
        fit <- varbvs::varbvs(x, NULL, y, logodds = logodds, verbose = FALSE)
        b <- as.vector(coef(fit)[, "averaged"])
        res_varbvs[[row]] <- list(fit = fit, mu = b[1], beta = b[-1])
    }
    saveRDS(res_varbvs, ${_output:r})


In [None]:
[get_hist_2]
output: f"{_input[0]:n}.histogram.pdf"
python: expand = '${ }'
    import pandas as pd, matplotlib.pyplot as plt
    block_table = pd.read_csv(${_input[1]:r}, sep = "\t", header = 0)
    # Inclusive length of every block.
    lengths = block_table["block_end"] - block_table["block_start"] + 1
    # How many blocks have each length (a length of 0 is ignored).
    tally = lengths[lengths != 0].value_counts()
    fig, ax = plt.subplots(figsize = (8,6))
    ax.bar(tally.index.tolist(), tally.values.tolist(), width = 0.8)
    ax.set_title("Histogram of number of genes in blocks")
    fig.savefig(${_output:r})

## Note
```
cd ~/GIT/cnv-gene-mapping
sos run dsc/20190717_workflow.ipynb get_hist:1-2 -s build
sos run dsc/20190717_workflow.ipynb get_hist:1-2 --genotype_file /home/min/GIT/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.X.gz
sos run dsc/20190717_workflow.ipynb analyze:1-2 -s build
sos run dsc/20190717_workflow.ipynb analyze:1-2 --simu_pheno /home/min/GIT/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.y -s build
sos run dsc/20190717_workflow.ipynb simulate:1-5 --n_gene_in_block 30 --shape 1 --scale 0.5 -s build
sos run dsc/20190717_workflow.ipynb -s build -j 6
```
```
sinteractive --time=01:00:00 --partition=bigmem2 --nodes=1 --ntasks-per-node=1 --mem-per-cpu=100G
sos run dsc/20190717_workflow.ipynb simulate:1-5 --n_gene_in_block 30 --shape 1 --scale 0.5 -s build

sos run dsc/20190717_workflow.ipynb get_hist:1-2 --genotype_file /home/gaow/GIT/github/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.X.gz \
--phenotype_file /home/gaow/GIT/github/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.y --n_gene_in_block 1 -s build

sos run dsc/20190717_workflow.ipynb analyze:1-2 --genotype_file /home/gaow/GIT/github/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.X.gz \
--phenotype_file /home/gaow/GIT/github/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.y --n_gene_in_block 1 \
--simu_pheno /home/gaow/GIT/github/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.y --real "FALSE" -s build
```