```
cd ~/GIT/cnv-gene-mapping
sos run dsc/20190717_workflow.ipynb default:1 -s build
sos run dsc/20190717_workflow.ipynb -s build -j 6
```

In [None]:
[global]
# Base data directory shared by the workflow steps; input and output paths
# in the steps below are built from it. Override with --cwd on the command line.
parameter: cwd = path("~/GIT/cnv-gene-mapping/data")

In [None]:
[default_1]
# Split the gene-by-sample deletion matrix into contiguous gene blocks.
# Block boundaries are placed on rows whose sample values sum to zero
# (presumably genes with no deletion in any sample), keeping at least
# `min_block_size` rows between consecutive boundaries. The output has one row
# per gene block and one column per sample; each cell is the list of that
# sample's values for the genes in the block.
parameter: min_block_size = 50
input: f"{cwd:a}/deletion.gz"
output: f'{cwd:a}/deletion_geneblock.gz'
python: expand = '${ }'
    import pandas as pd
    data = pd.read_table(${_input:r}, compression = "gzip", sep = "\t", header = 0)
    data = data.rename(columns = {"Unnamed: 0": "gene"})
    n_genes = data.shape[0]
    # Candidate boundaries: rows whose values across all sample columns sum to 0.
    # skipna = False keeps NaN rows from being counted as all-zero.
    row_sums = data.iloc[:, 1:].sum(axis = 1, skipna = False)
    all0 = [i for i, total in enumerate(row_sums) if total == 0]
    # Greedy walk: accept a candidate once it is at least min_block_size rows
    # past the previously accepted one (the first candidate is the initial anchor).
    bound = list()
    anchor = all0[0] if all0 else None
    for row in all0[1:]:
        if row - anchor >= ${min_block_size}:
            bound.append(row)
            anchor = row
    # The last accepted boundary is dropped so the final block extends to the
    # end of the table instead of leaving a short trailing block.
    bound = [0] + bound[:-1] + [n_genes]
    fill = list()
    for k, (start, stop) in enumerate(zip(bound[:-1], bound[1:])):
        if start + 1 > n_genes:
            # Only reachable when the input table has no rows.
            print (k, start + 1, n_genes, "break")
            break
        block = data.iloc[start:stop, 1:]
        # One list per sample column, holding that sample's values in this block.
        fill.append([block.iloc[:, c].tolist() for c in range(block.shape[1])])
    res = pd.DataFrame(fill) # row: gene block, column: sample name
    # Write the frame itself; the .iloc indexer has no to_csv method.
    res.to_csv(${_output:r}, compression = "gzip", sep = "\t", header = False, index = False)

In [None]:
[default_2]
# Simulate genomes by recombining gene blocks across the observed samples.
# For every simulated genome, each gene block is taken from a different
# randomly chosen column of the gene-block table (sampling without
# replacement), and the per-block value lists are concatenated.
# The work is split into n_batch jobs, each seeded with its batch number.
parameter: sample_size = 100000 # total number of simulated samples over all batches
parameter: n_batch = 200 # number of batches (jobs); each simulates sample_size / n_batch samples
assert sample_size % n_batch == 0
batches = [x+1 for x in range(n_batch)]
input: for_each = ['batches']
output: f'{cwd:a}/simu_data/{_input:bnn}.sample.{_batches}.gz'
python: expand = "${ }"
    import pandas as pd, numpy as np
    import random, itertools, ast
    # Exact integer division; divisibility is asserted above.
    size = ${sample_size} // ${n_batch}
    random.seed(${_batches})
    data = pd.read_table(${_input:r}, compression = "gzip", header = None, sep = "\t")
    # With header = None the column labels are 0..n-1, so labels double as positions.
    blocks = data.values
    n_blocks = blocks.shape[0]
    block_rows = np.arange(n_blocks)
    donors = data.columns.tolist()
    samples_genome = list()
    for _ in range(size):
        # One distinct donor column per gene block; raises ValueError if there
        # are fewer columns than gene blocks.
        order = random.sample(donors, n_blocks)
        # Pick cell (block k, donor order[k]) directly instead of building an
        # n_blocks x n_blocks sub-table only to read its diagonal.
        picked = blocks[block_rows, order]
        # Cells are stored as text such as "[0, 1, 0]"; parse and flatten them.
        sample_genome = list(itertools.chain.from_iterable(ast.literal_eval(cell) for cell in picked))
        samples_genome.append(sample_genome)
    samples_genome_df = pd.DataFrame(samples_genome) # row: sample name, column: genes
    samples_genome_df.to_csv(${_output:r}, compression = "gzip", sep = "\t", header = False, index = False)

In [None]:
[default_3]
# Merge all per-batch simulation files into one gzip-compressed file.
input: group_by = 'all'
output: f'{_input[0]:nn}.combined.gz'
bash: expand = "${ }"
    # Decompress every batch file in order and recompress the joined stream.
    gzip -dc ${_input} | gzip -c > ${_output}

In [None]:
[default_4]
# Simulate a log-odds score for every row of the combined matrix and flag cases.
# Per-column effects are the logs of odds ratios drawn from Gamma(shape, scale);
# the intercept is the log-odds of `penetrance`. Rows whose score reaches the
# given percentile of all scores are written out as case indices.
parameter: shape = 3
parameter: scale = 1
parameter: penetrance = 0.05
parameter: seed = 999999
parameter: percentile = 95
output: f'{_input:nn}.y.gz', f'{_input:nn}.ycase.gz'
python: expand = "${ }"
    import pandas as pd, numpy as np
    np.random.seed(${seed})
    genotypes = pd.read_table(${_input:r}, compression = "gzip", header = None, sep = "\t").values
    # Intercept: log-odds of the baseline penetrance.
    beta0 = np.log(${penetrance}/(1-${penetrance}))
    # One effect per column: log of an odds ratio drawn from Gamma(shape, scale).
    odds_ratios = np.random.gamma(${shape}, ${scale}, genotypes.shape[1])
    beta1s = np.log(odds_ratios)
    ys = np.matmul(genotypes, beta1s) + beta0
    pd.DataFrame(ys).to_csv(${_output[0]:r}, compression = "gzip", sep = "\t", header = False, index = False)
    # Cases: row positions whose score is at or above the percentile cutoff.
    cutoff = np.percentile(ys, ${percentile})
    case_index = np.flatnonzero(ys >= cutoff).tolist()
    pd.DataFrame(case_index).to_csv(${_output[1]:r}, compression = "gzip", sep = "\t", header = False, index = False)

In [1]:
import pandas as pd, numpy as np
# Load the combined simulated matrix from the test run (tab-separated, no header row).
data = pd.read_table(
    "~/GIT/cnv-gene-mapping/data/simu_data_test/deletion_geneblock.sample.combined.gz",
    header = None,
    sep = "\t",
    compression = "gzip",
)

In [2]:
# Dimensions (rows, columns) of the loaded matrix.
data.shape

(10, 23343)

In [3]:
# Seed BEFORE drawing the effects; seeding after the gamma draw leaves
# beta1s (and therefore ys) different on every run.
np.random.seed(9999999)
# Intercept: log-odds of a 5% baseline penetrance.
beta0 = np.log(0.05/(1-0.05))
# One effect per column: log of an odds ratio drawn from Gamma(5, 1).
beta1s = [np.log(x) for x in np.random.gamma(5, 1, data.shape[1])]
# Score of each row: intercept plus the effect-weighted sum of its values.
ys = [sum([x*y for x,y in zip(beta1s, data.iloc[i, :].tolist())]) + beta0 for i in range(data.shape[0])]

In [4]:
# Wrap the per-row scores in a single-column DataFrame.
y_df = pd.DataFrame(ys)

In [5]:
# Display the score table.
y_df

Unnamed: 0,0
0,2.786281
1,-2.944439
2,-2.944439
3,10.642412
4,2.972804
5,6.935966
6,-2.944439
7,2.219492
8,-2.944439
9,14.617213


In [6]:
# Underlying NumPy array of the loaded matrix, used for the matrix product below.
d1 = data.values

In [7]:
# Same scores as the explicit loop, computed as one matrix-vector product plus the intercept.
ys = np.matmul(d1, beta1s) + beta0

In [16]:
# Compute the cutoff once; evaluating np.percentile inside the comprehension
# would recompute it for every element of ys.
cutoff = np.percentile(ys, 88)
# Row positions whose score is at or above the 88th percentile.
np.flatnonzero(ys >= cutoff).tolist()

[3, 9]

In [9]:
# Inspect the container type of ys.
type(ys)

numpy.ndarray

In [10]:
# Display the scores as a single-column DataFrame.
pd.DataFrame(ys)

Unnamed: 0,0
0,2.786281
1,-2.944439
2,-2.944439
3,10.642412
4,2.972804
5,6.935966
6,-2.944439
7,2.219492
8,-2.944439
9,14.617213
