```
cd ~/GIT/cnv-gene-mapping
sos run dsc/20190717_workflow.ipynb default:1-4 -s build
sos run dsc/20190717_workflow.ipynb -s build -j 6
```

In [None]:
[global]
# Root data directory used by the workflow steps. It is declared as an SoS
# parameter, so it can be overridden at run time instead of editing this file.
parameter: cwd = path("~/GIT/cnv-gene-mapping/data")
def fmtP(x):
    """Render a value as a token that is safe to embed in a file name.

    The value is converted with ``str`` and then, character by character:
    ``.`` becomes ``p``, spaces and ``-`` become ``_``, and single or double
    quotes are dropped.
    """
    substitutions = str.maketrans({
        ".": "p",
        " ": "_",
        "-": "_",
        '"': None,
        "'": None,
    })
    return str(x).translate(substitutions)

In [None]:
[default_1]
# Split the gene-by-sample deletion table into gene blocks. Block boundaries are
# placed on genes that carry no deletion in any sample, at least 50 rows apart.
input: f"{cwd:a}/deletion.gz"
output: f'{_input:n}_geneblock.gz'
python: expand = '${ }'
    import pandas as pd
    data = pd.read_table("${_input}", compression = "gzip", sep = "\t", header = 0)
    data = data.rename(columns = {"Unnamed: 0": "gene"})
    n_genes, n_cols = data.shape
    # Rows whose sample columns (everything after the first column) sum to zero.
    all0 = [i for i in range(n_genes) if sum(data.iloc[i, 1:]) == 0]
    # Walk the all-zero rows and keep one as a boundary whenever it lies at least
    # min_gap rows past the previous anchor (initially the first all-zero row).
    min_gap = 50
    bound = list()
    if all0:
        anchor = all0[0]
        for row in all0[1:]:
            if row - anchor >= min_gap:
                bound.append(row)
                anchor = row
    # The last detected boundary is dropped, so the final block extends to the
    # end of the table.
    bound = [0] + bound[:-1] + [n_genes]
    fill = list()
    for k in range(len(bound)-1):
        # Every boundary is a valid row index, so this guard only fires when the
        # table has no rows at all.
        if bound[k]+1 > n_genes:
            print (k, bound[k]+1, n_genes, "break")
            break
        # One list of genotypes per sample column, restricted to this block's rows.
        fill.append([data.iloc[bound[k]:bound[k+1], col].tolist() for col in range(1, n_cols)])
    res = pd.DataFrame(fill) # row: gene block, column: sample name
    # to_csv is a DataFrame method; calling it on the .iloc indexer raises
    # AttributeError and the step never writes its output.
    res.to_csv(${_output:r}, compression = "gzip", sep = "\t", header = False, index = False)

In [None]:
[default_2]
# Simulate genomes in parallel batches: for every gene block, copy the genotypes
# of a randomly drawn column of the gene-block table.
parameter: sample_size = 100000 # total number of genomes to simulate, default: 100000
parameter: n_batch = 200 # number of batches (jobs); each simulates sample_size / n_batch genomes, default: 200
assert sample_size % n_batch == 0
batches = list(range(1, n_batch + 1))
input: for_each = ['batches']
output: f'{cwd:a}/simu_data/{_input:bnn}.sample.{_batches}.gz'
python: expand = "${ }"
    import pandas as pd, numpy as np
    import random, itertools, ast
    n_per_batch = int(${sample_size} / ${n_batch})
    # The batch number doubles as the seed, so each batch draws a different
    # but repeatable set of genomes.
    random.seed(${_batches})
    blocks = pd.read_table(${_input:r}, compression = "gzip", header = None, sep = "\t")
    n_blocks = blocks.shape[0]
    donors = blocks.columns.tolist()

    def simulate_genome():
        # Draw one distinct column per gene block; the diagonal of the reordered
        # table pairs block k with the k-th drawn column.
        picked = random.sample(donors, n_blocks)
        cells = np.diag(blocks.loc[:, picked])
        # Each cell holds the text form of a list; parse it and join all blocks.
        return list(itertools.chain.from_iterable(ast.literal_eval(cell) for cell in cells))

    genomes = [simulate_genome() for _ in range(n_per_batch)]
    genomes_df = pd.DataFrame(genomes) # row: sample name, column: genes
    genomes_df.to_csv(${_output:r}, compression = "gzip", sep = "\t", header = False, index = False)

In [None]:
[default_3]
# Merge all input files into a single gzip file. The inputs are presumably the
# per-batch outputs of default_2 (no explicit input is declared) - confirm.
# group_by = 'all' hands every input file to one execution of the script.
input: group_by = 'all'
output: f'{_input[0]:nn}.combined.gz'
bash: expand = "${ }"
    # Decompress the inputs in order into one stream and recompress it once.
    zcat ${_input} | gzip > ${_output}

In [None]:
[default_4]
# Simulate a phenotype from the combined genotype table and write a case/control
# subset: genotypes to the first output, phenotype values to the second.
parameter: shape = 3 # shape of the gamma distribution the odds ratios are drawn from
parameter: scale = 1 # scale of the gamma distribution the odds ratios are drawn from
parameter: penetrance = 0.05 # baseline probability; sets the intercept beta0
parameter: seed = 999999 # seed for both numpy and the stdlib random module
parameter: percentile = 95 # samples at or above this percentile of y are cases
parameter: fold = 1.0 # number of controls drawn per case
output: f'{_input:nn}.X.gz', f'{_input:nn}.y.gz'
python: expand = "${ }"
    import pandas as pd, numpy as np
    import random
    # Seed both generators: numpy draws the effect sizes, while the stdlib
    # random module picks the controls. Seeding only numpy left the control
    # selection different on every run.
    np.random.seed(${seed})
    random.seed(${seed})
    data = pd.read_table(${_input:r}, compression = "gzip", sep = "\t", header = None)
    d1 = data.values
    beta0 = np.log(${penetrance}/(1-${penetrance}))
    # ORs follow gamma(shape, scale); the effect sizes are their logs.
    beta1s = np.log(np.random.gamma(${shape}, ${scale}, data.shape[1]))
    # FIXME: simulate y from logistic regression
    ys = np.matmul(d1, beta1s) + beta0
    # Logistic transform written as exp(-log(1 + exp(-y))). The direct form
    # exp(y) / (1 + exp(y)) turns into NaN once exp(y) overflows, which would
    # make the percentile cutoff NaN and leave the case set empty.
    ys_logit = np.exp(-np.logaddexp(0, -ys))
    y_df = pd.DataFrame(ys_logit)
    cutoff = np.percentile(ys_logit, ${percentile})
    case_index = [i for i,x in enumerate(ys_logit) if x >= cutoff]
    # Set membership keeps the control scan linear in the number of samples.
    case_set = set(case_index)
    ctrl_all = [i for i in list(data.index.values) if i not in case_set]
    # select cases and fold times as many controls for fine-mapping
    ctrl_index = random.sample(ctrl_all, int(len(case_index) * ${fold}))
    keep = sorted(case_index + ctrl_index)
    y_all = y_df.iloc[keep, :]
    x_all = data.iloc[keep, :]
    x_all.to_csv(${_output[0]:r}, compression = "gzip", sep = "\t", header = False, index = False)
    y_all.to_csv(${_output[1]:r}, compression = "gzip", sep = "\t", header = False, index = False)

In [None]:
[default_5]
# Fit SuSiE to the simulated genotype matrix and phenotype, and save the fitted
# object as an RDS file.
depends: R_library("data.table"), R_library('susieR')
parameter: L = 10 # passed to susie() as L
parameter: pve = 0.005 # passed to susie() as scaled_prior_variance
parameter: method = "optim" # passed to susie() as estimate_prior_method
suffix = f'SuSiE.L_{L}.prior_{fmtP(pve)}'
print (suffix)
output: f'{_input[0]:nn}.{suffix}.susie.rds'
R: expand = '${ }', stderr = f'{_input[0]:nn}.stderr', stdout = f'{_input[0]:nn}.stdout'
    library(susieR)
    library(data.table)
    # Read a table through the given shell command and return it as a matrix
    # with storage mode double.
    read_double_matrix <- function(cmd) {
        m <- as.matrix(data.table::fread(cmd))
        storage.mode(m) <- 'double'
        m
    }
    X <- read_double_matrix("zcat ${_input[0]}")
    y <- read_double_matrix("zcat ${_input[1]}")
    fit <- susie(X, y, L = ${L}, scaled_prior_variance = ${pve}, estimate_prior_method = '${method}')
    saveRDS(fit, ${_output:r})

In [1]:
import pandas as pd, numpy as np
# Load a single-column, header-less table of y values. Judging by the file name
# these are the values before the logistic transform - confirm.
# NOTE(review): absolute path tied to one user's home directory.
y_df = pd.read_table("/home/min/GIT/cnv-gene-mapping/data/copies/deletion_geneblock.sample.y.beforelogit.gz", compression = "gzip", sep = "\t", header = None)

In [2]:
y_df.head()

Unnamed: 0,0
0,-2.944439
1,-4.242502
2,10.062224
3,-0.363442
4,8.770897


In [3]:
y = y_df.iloc[:, 0].tolist()

In [4]:
y[:5]

[-2.9444389791664403,
 -4.2425019404130255,
 10.0622238091021,
 -0.3634422433225977,
 8.77089684344651]

In [5]:
from datetime import datetime
# Print a timestamp before and after the logistic transform to see how long
# the vectorised computation takes.
print (datetime.now().isoformat())
exp_y = np.exp(y)
y_logit = list(exp_y / (1 + exp_y))
print (datetime.now().isoformat())

2019-08-01T18:02:50.036399
2019-08-01T18:02:50.064510


In [6]:
y_logit_df = pd.DataFrame(y_logit)

In [7]:
y_logit_df.head()

Unnamed: 0,0
0,0.05
1,0.014168
2,0.999957
3,0.410127
4,0.999845


In [10]:
y_logit_df.to_csv("/home/min/GIT/cnv-gene-mapping/data/simu_data/deletion_geneblock.sample.y.gz", compression = "gzip", sep = "\t", header = False, index = False)

In [21]:
import pandas as pd, numpy as np
data = pd.read_table("/home/min/GIT/cnv-gene-mapping/data/simu_data_test/deletion_geneblock.sample.X.gz", compression = "gzip", sep = "\t", header = None)
data.shape

(100, 23343)

In [22]:
# Seed before drawing: calling np.random.seed after np.random.gamma leaves the
# simulated effect sizes different on every run.
np.random.seed(9999999)
beta0 = np.log(0.05/(1-0.05))
# Effect sizes are logs of gamma(5, 1) draws, one per column of data.
beta1s = [np.log(x) for x in np.random.gamma(5, 1, data.shape[1])]
# Row-by-row linear predictor, rounded to 6 decimals.
ys = [np.around(sum([x*y for x,y in zip(beta1s, data.iloc[i, :].tolist())]) + beta0, 6) for i in range(data.shape[0])]

In [23]:
y_df = pd.DataFrame(ys)

In [24]:
y_df.head()

Unnamed: 0,0
0,0.061631
1,15.16388
2,4.538161
3,90.675717
4,19.550117


In [25]:
# Same linear predictor as the row-by-row loop, computed with one matrix product.
linear_predictor = np.matmul(data.values, beta1s) + beta0
ys_1 = np.around(linear_predictor, 6)
y_df_1 = pd.DataFrame(ys_1)

In [26]:
y_df_1.head()

Unnamed: 0,0
0,0.061631
1,15.16388
2,4.538161
3,90.675717
4,19.550117


In [27]:
df = y_df == y_df_1

In [28]:
sum(df.iloc[:, 0].tolist())

100

In [10]:
susie_res = readRDS("/home/min/GIT/cnv-gene-mapping/data/simu_data/deletion_geneblock.sample.SuSiE.L_10.prior_0p005.susie.rds")

In [2]:
names(susie_res)

In [4]:
mean(susie_res$pip)

In [5]:
tail(sort(susie_res$pip), 5)

In [6]:
%put susie_res

  #!/usr/bin/env python3


In [7]:
susie_res["pip"].index(max(susie_res["pip"]))

17052

In [8]:
# Print every index whose PIP equals the maximum. The maximum is computed once
# instead of on every iteration.
top_pip = max(susie_res["pip"])
for i,j in enumerate(susie_res["pip"]):
    if j == top_pip:
        print (i)

17052


In [9]:
susie_res.keys()

dict_keys(['alpha', 'mu', 'mu2', 'Xr', 'KL', 'lbf', 'sigma2', 'V', 'pi', 'null_index', 'converged', 'elbo', 'niter', 'intercept', 'fitted', 'sets', 'pip', 'X_column_scale_factors'])

In [10]:
sorted(susie_res["pip"])[-10:]

[0.102526022504687,
 0.102526022504687,
 0.102526022504687,
 0.102526022504687,
 0.473374034567583,
 0.473374034567583,
 0.5,
 0.5,
 0.64628289593692,
 1]

In [21]:
susie_res["fitted"][:10]

[4.1306497206848,
 4.1306497206848,
 4.1306497206848,
 4.1306497206848,
 13.6094415971896,
 4.1306497206848,
 4.1306497206848,
 10.9087392585089,
 4.1306497206848,
 4.1306497206848]

In [12]:
susie_res["sets"].keys()

dict_keys(['cs', 'purity', 'cs_index', 'coverage'])

In [14]:
susie_res["sets"]["cs"]["L3"]

17053