```
cd ~/GIT/cnv-gene-mapping
sos run dsc/20190717_workflow.ipynb default:1-4 -s build
sos run dsc/20190717_workflow.ipynb -s build -j 6
```

In [None]:
[global]
# Workflow-wide settings shared by the steps below.
# cwd: base data directory; steps in this file build input/output paths from it
# (e.g. the deletion matrix read by default_1 and the simu_data_test folder written by default_2).
parameter: cwd = path("~/GIT/github/cnv-gene-mapping/data")
def fmtP(x):
    """Turn a value into a filename-safe token.

    The value is converted with str(); "." becomes "p", spaces and "-" become "_",
    and single/double quotes are removed (e.g. 0.005 -> "0p005").
    """
    substitutions = str.maketrans({".": "p", " ": "_", "-": "_", '"': None, "'": None})
    return str(x).translate(substitutions)

In [None]:
[default_1]
# Split the gene-by-sample deletion matrix into blocks of consecutive genes.
# Block boundaries are placed on genes whose sample columns sum to zero,
# keeping boundaries at least 50 rows apart. The output has one row per gene block
# and one column per sample; each cell is the list of that sample's values in the block.
input: f"{cwd:a}/deletion.gz"
output: f'{_input:n}_geneblock.gz'
python: expand = '${ }'
    import pandas as pd
    data = pd.read_table(${_input:r}, compression = "gzip", sep = "\t", header = 0)
    data = data.rename(columns = {"Unnamed: 0": "gene"})
    n_genes, n_cols = data.shape
    # Rows whose sample columns (everything after the first column) sum to zero.
    # skipna = False keeps a row containing NaN from being counted as zero.
    row_total = data.iloc[:, 1:].sum(axis = 1, skipna = False)
    all0 = [i for i, total in enumerate(row_total) if total == 0]
    # Walk the zero-sum rows and accept one as a boundary only when it lies
    # at least 50 rows after the previously accepted one (starting from all0[0]).
    bound = list()
    last = 0
    for j in range(1, len(all0)):
        if all0[j] - all0[last] >= 50:
            bound.append(all0[j])
            last = j
    # Drop the final boundary so that the last block runs to the end of the matrix.
    bound = [0] + bound[:-1] + [n_genes]
    fill = list()
    for k in range(len(bound)-1):
        # Only triggers when the matrix has no rows at all.
        if bound[k]+1 > n_genes:
            print (k, bound[k]+1, n_genes, "break")
            break
        # One list per sample column holding that sample's values for rows in this block.
        fill.append([data.iloc[bound[k]:bound[k+1], col].tolist() for col in range(1, n_cols)])
    res = pd.DataFrame(fill) # row: gene block, column: sample name
    # Fixed: the original called to_csv on the .iloc indexer, which raises AttributeError.
    res.to_csv(${_output:r}, compression = "gzip", sep = "\t", header = False, index = False)

In [None]:
[default_2]
# Simulate genomes by drawing, for every gene block, the block of a randomly chosen sample.
# The work is split into n_batch jobs; each job writes sample_size / n_batch simulated samples.
parameter: sample_size = 100000 # total number of simulated samples: default 100000, test: 1000
parameter: n_batch = 200 # number of batches (jobs), each simulating sample_size / n_batch samples; default: 200, test: 20
assert sample_size % n_batch == 0
batches = [x+1 for x in range(n_batch)]
input: for_each = ['batches']
output: f'{cwd:a}/simu_data_test/{_input:bnn}.sample.{_batches}.gz'
python: expand = "${ }"
    import pandas as pd, numpy as np
    import random, itertools, ast
    size = ${sample_size} // ${n_batch}
    # The batch number seeds the generator so every batch is reproducible and distinct.
    random.seed(${_batches})
    data = pd.read_table(${_input:r}, compression = "gzip", header = None, sep = "\t")
    # header = None gives integer column labels 0..m-1, so labels double as positions.
    columns = data.columns.tolist()
    cells = data.to_numpy()
    n_blocks = data.shape[0]
    block_idx = np.arange(n_blocks)
    samples_genome = list()
    for _ in range(size):
        # One distinct source sample per gene block (requires at least as many samples as blocks).
        order = random.sample(columns, n_blocks)
        # Block i is taken from sample order[i]; this is the diagonal of data.loc[:, order]
        # without materialising the n_blocks x n_blocks frame.
        picked = cells[block_idx, order]
        # Each cell is the text form of a list; parse and concatenate into one genome.
        sample_genome = list(itertools.chain.from_iterable(ast.literal_eval(cell) for cell in picked))
        samples_genome.append(sample_genome)
    samples_genome_df = pd.DataFrame(samples_genome) # row: sample name, column: genes
    samples_genome_df.to_csv(${_output:r}, compression = "gzip", sep = "\t", header = False, index = False)

In [None]:
[default_3]
# Concatenate every input file into a single gzip file.
# Presumably the inputs are the per-batch files written by default_2 -- confirm against the workflow order.
# group_by = 'all' makes the step run once with all input files available in _input.
input: group_by = 'all'
# Output name: the first input with its last two extensions removed, plus ".combined.gz".
output: f'{_input[0]:nn}.combined.gz'
bash: expand = "${ }"
    zcat ${_input} | gzip > ${_output}

In [None]:
[default_4]
# Simulate phenotypes from the combined genotype matrix with a logistic model, then
# build a case/control data set.
#   shape, scale : parameters of the gamma distribution the per-gene odds ratios are drawn from
#   penetrance   : baseline probability; sets the intercept beta0 = log(p / (1 - p))
#   seed         : seed for the odds-ratio draw and for the control sub-sampling
#   percentile   : samples whose probability is at or above this percentile are cases
#   fold         : number of controls drawn = fold * number of cases
# Outputs: X (genotypes), y (the logistic probability), and the row indices of cases and controls.
parameter: shape = 3
parameter: scale = 1
parameter: penetrance = 0.05
parameter: seed = 999999
parameter: percentile = 95
parameter: fold = 1.0
output: f'{_input:nn}.X.gz', f'{_input:nn}.y.gz', f'{_input:nn}.ycase.index.gz', f'{_input:nn}.yctrl.index.gz'
python: expand = "${ }"
    import pandas as pd, numpy as np
    np.random.seed(${seed})
    data = pd.read_table(${_input:r}, compression = "gzip", sep = "\t", header = None)
    d1 = data.values
    beta0 = np.log(${penetrance}/(1-${penetrance}))
    # Log odds ratios; the ORs follow gamma(shape, scale).
    beta1s = np.log(np.random.gamma(${shape}, ${scale}, data.shape[1]))
    # Linear predictor per sample.
    ys = np.matmul(d1, beta1s) + beta0
    # Logistic transform written as exp(-log(1 + exp(-ys))): stays finite for large |ys|,
    # whereas exp(ys) / (1 + exp(ys)) overflows to NaN.
    ys_logit = np.exp(-np.logaddexp(0, -ys))
    # Column 0: probability, column 1: linear predictor, columns 2+: genotypes.
    res = pd.DataFrame(np.column_stack([ys_logit, ys, d1]))
    cutoff = np.percentile(ys_logit, ${percentile})
    case = res[res[0] >= cutoff]
    ctrl = res[res[0] < cutoff].sample(n = int(case.shape[0] * ${fold}), random_state = ${seed})
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    y_x = pd.concat([case, ctrl]).sort_index()
    case_index = pd.DataFrame(sorted(case.index.values))
    ctrl_index = pd.DataFrame(sorted(ctrl.index.values))
    y_x.iloc[:, 2:].to_csv(${_output[0]:r}, compression = "gzip", sep = "\t", header = False, index = False)
    y_x.iloc[:, [0]].to_csv(${_output[1]:r}, compression = "gzip", sep = "\t", header = False, index = False)
    case_index.to_csv(${_output[2]:r}, compression = "gzip", sep = "\t", header = False, index = False)
    ctrl_index.to_csv(${_output[3]:r}, compression = "gzip", sep = "\t", header = False, index = False)

In [None]:
[default_5]
# Fit SuSiE (susieR::susie) and save the fitted object as an RDS file.
# _input[0] is read as the matrix X and _input[1] as y; presumably these are the
# .X.gz and .y.gz files written by default_4 -- confirm against the workflow order.
depends: R_library("data.table"), R_library('susieR')
# L: passed to susie() as L
parameter: L = 10
# pve: passed to susie() as scaled_prior_variance
parameter: pve = 0.005
# method: passed to susie() as estimate_prior_method
parameter: method = "optim"
# NOTE(review): `method` is not part of the suffix, so runs that differ only in
# `method` write to the same output file.
suffix = f'SuSiE.L_{L}.prior_{fmtP(pve)}'
print (suffix)
output: f'{_input[0]:nn}.{suffix}.susie.rds'
# The R script's stderr/stdout are redirected to files named after the first input.
R: expand = '${ }', stderr = f'{_input[0]:nn}.stderr', stdout = f'{_input[0]:nn}.stdout'
    library(susieR)
    library(data.table)
    X = as.matrix(data.table::fread("zcat ${_input[0]}"))
    y = as.matrix(data.table::fread("zcat ${_input[1]}"))
    storage.mode(X) = 'double'
    storage.mode(y) = 'double'
    res = susie(X, y, L = ${L}, scaled_prior_variance = ${pve}, estimate_prior_method = '${method}')
    saveRDS(res, ${_output:r})

In [1]:
import numpy as np
import pandas as pd

# Load the simulated X matrix (tab-separated, gzip-compressed, no header row) and show its dimensions.
x_path = "/home/min/GIT/cnv-gene-mapping/data/simu_data_test/deletion_geneblock.sample.X.gz"
data = pd.read_table(x_path, compression = "gzip", sep = "\t", header = None)
data.shape

(100, 23343)

In [2]:
# Seed BEFORE drawing the effect sizes; seeding after the draw (as before) left
# beta1s different on every run, so the cell was not reproducible.
np.random.seed(9999999)
beta0 = np.log(0.05/(1-0.05))
beta1s = [np.log(x) for x in np.random.gamma(5, 1, data.shape[1])]
# Explicit per-row dot product, rounded to 6 decimals; kept as a plain loop so it can
# serve as a reference for the matrix-product version computed further down.
ys = [np.around(sum([x*y for x,y in zip(beta1s, data.iloc[i, :].tolist())]) + beta0, 6) for i in range(data.shape[0])]

In [3]:
y_df = pd.DataFrame(ys)

In [4]:
y_df.tail()

Unnamed: 0,0
95,-2.944439
96,43.804725
97,-2.944439
98,-0.131875
99,-2.944439


In [5]:
ys_1 = np.around(np.matmul(data.values, beta1s) + beta0, 6)
y_df_1 = pd.DataFrame(ys_1)

In [6]:
y_df_1.tail()

Unnamed: 0,0
95,-2.944439
96,43.804725
97,-2.944439
98,-0.131875
99,-2.944439


In [7]:
df = y_df == y_df_1

In [8]:
sum(df.iloc[:, 0].tolist())

100

In [34]:
susie_res = readRDS("/home/min/GIT/cnv-gene-mapping/data/simu_data_test/deletion_geneblock.sample.SuSiE.L_10.prior_0p005.susie.rds")

In [35]:
names(susie_res)

In [36]:
mean(susie_res$pip)

In [37]:
tail(sort(susie_res$pip), 5)

In [38]:
# %put susie_res

In [40]:
susie_res["pip"].index(max(susie_res["pip"]))

In [150]:
# Print every position whose PIP equals the maximum PIP.
# The maximum is computed once; recomputing it inside the loop made this quadratic.
max_pip = max(susie_res["pip"])
for i,j in enumerate(susie_res["pip"]):
    if j == max_pip:
        print (i)

16007
16008
16009
16010
16011
16012


In [8]:
susie_res.keys()

dict_keys(['alpha', 'mu', 'mu2', 'Xr', 'KL', 'lbf', 'sigma2', 'V', 'pi', 'null_index', 'converged', 'elbo', 'niter', 'intercept', 'fitted', 'sets', 'pip', 'X_column_scale_factors'])

In [153]:
sorted(susie_res["pip"])[-10:]

[0.000665602401835996,
 0.000665783181054036,
 0.000665783181054036,
 0.000665783181054036,
 0.000761617759273969,
 0.000761617759273969,
 0.000761617759273969,
 0.000761617759273969,
 0.000761617759273969,
 0.000761617759273969]

In [154]:
susie_res["fitted"][:10]

[0.622747786415506,
 0.62230394639043,
 0.622833298789739,
 0.62230394639043,
 0.626508087810574,
 0.622663406869125,
 0.623389166799308,
 0.623124755851558,
 0.623217382940279,
 0.62230394639043]

In [155]:
susie_res["sets"].keys()

dict_keys(['cs', 'coverage'])

In [156]:
susie_res["sets"]

{'coverage': 0.95, 'cs': None}

In [26]:
import pandas as pd

In [32]:
d = pd.read_table("/home/min/GIT/cnv-gene-mapping/data/simu_data_test/deletion_geneblock.sample.y.gz", compression = "gzip", sep = "\t", header = None)

In [33]:
d.head()

Unnamed: 0,0
0,0.226804
1,0.999957
2,0.05
3,1.0
4,0.999973


In [34]:
d1 = pd.read_table("/home/min/GIT/cnv-gene-mapping/data/simu_data_test/v1/deletion_geneblock.sample.y.gz", compression = "gzip", sep = "\t", header = None)

In [35]:
d1.head()

Unnamed: 0,0
0,0.986416
1,0.05
2,0.999957
3,0.05
4,1.0
