```
cd ~/GIT/cnv-gene-mapping
sos run dsc/20190717_workflow.ipynb get_hist:1-2 -s build
sos run dsc/20190717_workflow.ipynb get_hist:1-2 --genotype_file /home/min/GIT/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.X.gz
sos run dsc/20190717_workflow.ipynb analyze:1-2 -s build
sos run dsc/20190717_workflow.ipynb analyze:1-2 --simu_pheno /home/min/GIT/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.y -s build
sos run dsc/20190717_workflow.ipynb simulate:1-5 --n_gene_in_block 30 --shape 1 --scale 0.5 -s build
sos run dsc/20190717_workflow.ipynb -s build -j 6
```
```
sinteractive --time=01:00:00 --partition=bigmem2 --nodes=1 --ntasks-per-node=1 --mem-per-cpu=100G
sos run dsc/20190717_workflow.ipynb simulate:1-5 --n_gene_in_block 30 --shape 1 --scale 0.5 -s build

sos run dsc/20190717_workflow.ipynb get_hist:1-2 --genotype_file /home/gaow/GIT/github/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.X.gz \
--phenotype_file /home/gaow/GIT/github/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.y --n_gene_in_block 1 -s build

sos run dsc/20190717_workflow.ipynb analyze:1-2 --genotype_file /home/gaow/GIT/github/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.X.gz \
--phenotype_file /home/gaow/GIT/github/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.y --n_gene_in_block 1 \
--simu_pheno /home/gaow/GIT/github/cnv-gene-mapping/data/deletion_simu/deletion.genes.block30.for_simu.sample.y --real "FALSE" -s build
```

In [None]:
[global]
# Workflow-wide parameters shared by the steps below.
# CNV type; used as the file-name prefix of the default input files and of the simulation output folder.
parameter: cnv_type = "deletion"
# Base data directory: default inputs are read from it and simulated samples are written beneath it.
parameter: cwd = path("~/GIT/github/cnv-gene-mapping/data")
# Genotype matrix; the steps below read it as a gzipped, tab-separated table without a header row.
parameter: genotype_file = path(f"{cwd:a}/{cnv_type}.X.gz")
parameter: phenotype_file = path(f"{cwd:a}/{cnv_type}.y") # real CNV data phenotype
def fmtP(x):
    # Turn a value into a file-name friendly token:
    # "." becomes "p", spaces and "-" become "_", single and double quotes are dropped.
    table = str.maketrans({".": "p", " ": "_", "-": "_", '"': None, "'": None})
    return str(x).translate(table)

In [None]:
[get_hist_1, simulate_1, analyze_1]
# Remove genes (columns) that are zero in every sample and partition the genome into blocks.
# Outputs:
#   1. the genotype matrix restricted to genes with at least one non-zero entry, written with
#      the original 0-based column index as header;
#   2. a tab-separated table with one row per block: `block_start`, `block_end` (column indices).
# For simulation: get real deletion/duplication CNV data and its block
# n_gene_in_block: get_hist: 1, simulate: 20~50, analyze: 1
parameter: n_gene_in_block = 1
input: genotype_file
output: f"{_input:nn}.genes.block{n_gene_in_block}.gz", f"{_input:nn}.block{n_gene_in_block}.start.end.csv"
python: expand = '${ }'
    import pandas as pd
    data = pd.read_csv(${_input:r}, compression = "gzip", sep = "\t", header = None)
    data_clean = data.loc[:, (data != 0).any(axis = 0)]
    data_clean.to_csv(${_output[0]:r}, compression = "gzip", sep = "\t", header = True, index = False)
    # Column indices of genes that are non-zero in at least one sample, in genomic order.
    indices = list(data_clean.columns)
    nonzero = set(indices)
    first_col = data.columns[0]
    last_col = data.columns[-1]
    # Pad with the first / last column of the full matrix (sentinels) when they are all-zero,
    # so that the scan below always starts and ends at the matrix boundaries.
    groups = list(indices)
    if not groups or groups[0] != first_col:
        groups = [first_col] + groups
    if groups[-1] != last_col:
        groups = groups + [last_col]
    # Greedy scan: close the current block just before groups[j] when groups[j] lies at least
    # `n_gene_in_block` columns after the block start AND is separated from the previous
    # retained column by a gap of all-zero columns.
    bound = list()
    i = 0
    for j in range(1, len(groups)):
        if groups[j] - groups[i] >= ${n_gene_in_block} and groups[j] - groups[j-1] > 1:
            bound.append([groups[i], groups[j-1]])
            i = j
    # The scan never closes the block that is still open when it reaches the end. Emit it,
    # ending at its last non-zero column, so that trailing genes are not silently lost.
    tail = [g for g in groups[i:] if g in nonzero]
    if tail:
        bound.append([groups[i], tail[-1]])
    # Drop a block that ends on an all-zero sentinel column (the leading [0, 0] block);
    # a genuine single-gene block at column 0 is kept.
    bound = [item for item in bound if item[1] in nonzero]
    # NOTE(review): when column 0 is all-zero but column 1 is not, the first block starts at the
    # sentinel column 0, which is absent from the cleaned matrix written above -- confirm whether
    # the per-block analysis can encounter this.
    pd.DataFrame(bound, columns = ["block_start", "block_end"]).to_csv(${_output[1]:r}, sep = "\t", header = True, index = False)

In [None]:
[simulate_2]
# Cut the full genotype matrix into the blocks found by the previous step.
# Every block except the last is extended up to the column before the next block's start.
# Output: one row per sample, one column per block; each cell holds the list of that sample's
# genotypes within the block (written as its Python list representation).
output: f"{_input[0]:n}.for_simu.gz"
python: expand = '${ }'
    import pandas as pd
    data = pd.read_csv(${genotype_file:r}, compression = "gzip", header = None, sep = "\t")
    bound = pd.read_csv(${_input[1]:r}, header = 0, sep = "\t")
    # Materialise the block table once instead of on every iteration.
    blocks = bound.values
    last_start = blocks[-1][0] if len(blocks) else None
    bound2 = [[item[0], item[1]] if item[0] == last_start else [item[0], blocks[j+1][0]-1] for j, item in enumerate(blocks)]
    fill = list()
    for l in range(data.shape[0]):
        # label-based slice: both block ends are inclusive
        fill.append([data.loc[l, k[0]:k[1]].tolist() for k in bound2])
    res = pd.DataFrame(fill)
    # Compress explicitly: the next step reads this file with compression = "gzip".
    res.to_csv(${_output:r}, compression = "gzip", sep = "\t", header = False, index = False)

In [None]:
[simulate_3]
# Simulate genomes batch by batch: for every block, copy the genotypes of a randomly chosen
# real sample (a different real sample for each block of the same simulated genome).
parameter: sample_size = 100000 # total number of simulated samples: default 100000, test: 1000
parameter: n_batch = 200 # number of batches; each batch simulates sample_size / n_batch samples. default: 200, test: 20
assert sample_size % n_batch == 0
batches = [x+1 for x in range(n_batch)]
input: for_each = ['batches']
output: f"{cwd:a}/{cnv_type}_simu/{_input[0]:bn}.sample.{_batches}.gz"
python: expand = '${ }'
    import pandas as pd, numpy as np
    import random, itertools, ast
    data = pd.read_csv(${_input:r}, compression = "gzip", header = None, sep = "\t")
    size = ${sample_size} // ${n_batch}
    # the batch number doubles as the seed, so every batch is reproducible and distinct
    random.seed(${_batches})
    # rows: real samples, columns: blocks; each cell is the text of a Python list
    cells = data.values
    n_block = cells.shape[1]
    block_ids = np.arange(n_block)
    # read_csv without index_col yields a 0..n-1 index, so row labels are also row positions
    real_samples = data.index.tolist()
    samples_genome = list()
    for i in range(size):
        # one distinct real sample per block (requires at least as many samples as blocks)
        order = random.sample(real_samples, n_block)
        # cell (order[b], b) for every block b, without building an n_block x n_block frame
        picked = cells[order, block_ids]
        s = list(itertools.chain.from_iterable(ast.literal_eval(n) for n in picked))
        samples_genome.append(s)
    samples_genome_df = pd.DataFrame(samples_genome) # row: sample, column: gene
    samples_genome_df.to_csv(${_output:r}, compression = "gzip", sep = "\t", header = False, index = False)

In [None]:
[simulate_4]
# Concatenate the decompressed content of all batch files into one gzip file.
input: group_by = 'all'
output: f'{_input[0]:nn}.combined.gz'
bash: expand = "${ }"
    gunzip -c ${_input} | gzip -c > ${_output}

In [None]:
[simulate_5]
# Simulate case/control status from the combined genotypes with a logistic model and write a
# case/control data set: genotypes (cases first, then controls) and the matching 1/0 labels.
# shape = 3; scale = 1 for gamma
# shape = 2.191013; scale = 0.2682398 for normal
parameter: shape = 3.0 # mean for normal (1), shape for gamma (3)
parameter: scale = 1.0 # se for normal (0.5), scale for gamma (1)
# 'gamma' or 'normal'
parameter: beta_method = 'normal'
# baseline probability of being a case; enters the model as the intercept log(p / (1 - p))
parameter: penetrance = 0.05
parameter: seed = 999999
# number of controls kept = number of cases * ctrl_case_ratio
parameter: ctrl_case_ratio = 1.0
# each gene has probability pi0 of having no effect
parameter: pi0 = 0.95
output: f'{_input:nn}.X.gz', f'{_input:nn}.y'
python: expand = "${ }"
    import pandas as pd, numpy as np
    np.random.seed(${seed})
    # For normal distribution the -3*sigma to 3*sigma on x-axis should correspond to
    # log(4) and log(20). The shape and scale parameters are thus:
    # mu = (log(20) + log(4))/2 = 2.191013; sigma = (log(20) - mu) / 3 = 0.2682398
    def logor_gamma(shape, scale, n):
        return np.log(np.random.gamma(shape, scale, n))

    def logor_normal(mean, se, n):
        return np.random.normal(mean, se, n)

    genotypes = pd.read_csv(${_input:r}, compression = "gzip", sep = "\t", header = None)
    intercept = np.log(${penetrance} / (1-${penetrance}))
    # Draw one log odds ratio per gene, then zero it out unless a Bernoulli(1 - pi0) draw succeeds.
    effects = list()
    for log_or in logor_${beta_method}(${shape}, ${scale}, genotypes.shape[1]):
        effects.append(np.random.binomial(1, 1-${pi0}) * log_or)
    logit = np.matmul(genotypes.values, effects) + intercept
    prob = np.exp(logit) / (1+np.exp(logit))
    status = np.random.binomial(1, prob)
    cases = np.ravel(np.where(status == 1))
    ctrl_pool = np.ravel(np.where(status == 0))
    n_ctrl = int(len(cases) * ${ctrl_case_ratio})
    # NOTE(review): np.random.choice samples with replacement by default, so the same control
    # can be selected more than once -- confirm this is intended.
    controls = sorted(np.random.choice(ctrl_pool, n_ctrl))
    selected = genotypes.iloc[cases.tolist() + controls, :]
    selected.to_csv(${_output[0]:r}, compression = "gzip", sep = "\t", header = False, index = False)
    labels = ['1'] * len(cases) + ['0'] * len(controls)
    with open(${_output[1]:r}, 'w') as f:
        f.write('\n'.join(labels))

In [None]:
[analyze_2]
# Fit SuSiE and varbvs separately on every block of genes.
# Inputs (from analyze_1): the cleaned genotype matrix and the block start/end table.
# Outputs: two RDS files, each a list with one fit per block (SuSiE; varbvs).
depends: R_library("data.table"), R_library('susieR'), R_library("reticulate"), R_library("varbvs")
# SuSiE settings, passed as L, scaled_prior_variance and estimate_prior_method
parameter: L = 1
parameter: pve = 0.005
parameter: method = "optim"
# "TRUE": use `phenotype_file`; otherwise use the simulated phenotype given by `simu_pheno`
parameter: real = "TRUE"
parameter: simu_pheno = None
suffix = f"SuSiE.L_{L}.prior_{fmtP(pve)}"
output: f"{_input[0]:n}.{suffix}.susie.rds", f"{_input[0]:n}.varbvs.rds"
R: expand = '${ }', stderr = f'{_input[0]:n}.stderr', stdout = f'{_input[0]:n}.stdout'
    library(susieR)
    library(data.table)
    library(reticulate)
    library(varbvs)
    X <- as.matrix(data.table::fread(${_input[0]:r}, header = T))
    bound <- as.matrix(data.table::fread(${_input[1]:r}, header = T))
    if (${real} == TRUE){
        y <- as.matrix(data.table::fread("${phenotype_file}"))
    } else {
        y <- as.matrix(data.table::fread("${simu_pheno}"))
    }
    storage.mode(X) = 'double'
    storage.mode(y) = 'double'
    res <- list()
    res_varbvs <- list()
    # seq_len() yields an empty sequence when there are no blocks (1:0 would iterate over 1, 0)
    for (row in seq_len(nrow(bound))){
        # columns of X are named by original gene index; select the block's index range by name
        x <- as.matrix(X[, as.character(bound[row,1]:bound[row,2])])
        print (head(x))
        res_x <- susie(x, y, L = ${L}, scaled_prior_variance = ${pve}, estimate_prior_method = '${method}')
        res[[row]] <- res_x
        logodds <- seq(-log10(ncol(x)), 1, length.out = 40)
        fit <- varbvs::varbvs(x, NULL, y, logodds = logodds, verbose = FALSE)
        # averaged coefficients: first entry is stored as `mu`, the remaining ones as `beta`
        b <- as.vector(coef(fit)[, "averaged"])
        res1_x <- list(fit = fit, mu = b[1], beta = b[-1])
        res_varbvs[[row]] <- res1_x
    }
    saveRDS(res, ${_output[0]:r})
    saveRDS(res_varbvs, ${_output[1]:r})

In [None]:
[get_hist_2]
# Plot how many blocks exist for each block size, using the block start/end table
# produced by get_hist_1.
output: f"{_input[0]:n}.histogram.pdf"
python: expand = '${ }'
    import pandas as pd, matplotlib.pyplot as plt
    from collections import Counter
    blocks = pd.read_csv(${_input[1]:r}, sep = "\t", header = 0)
    # block size = number of column indices from block_start to block_end, both inclusive
    spans = (blocks["block_end"] - blocks["block_start"] + 1).tolist()
    # single pass count; zero-sized entries are ignored
    counts = Counter(span for span in spans if span != 0)
    sizes = sorted(counts)
    fig, ax = plt.subplots(figsize = (8,6))
    ax.bar(sizes, [counts[size] for size in sizes], width = 0.8)
    ax.set_title("Histogram of number of genes in blocks")
    ax.set_xlabel("Number of genes in block")
    ax.set_ylabel("Number of blocks")
    fig.savefig(${_output:r})
    plt.close(fig)

In [None]:
[default_1, get_data_hist_1]
# Split the gene-by-sample table into blocks of genes delimited by genes that are zero in
# every sample, and also write the transposed (sample-by-gene) matrix.
# Note: the input file name is fixed to "deletion.gz" under `cwd`.
parameter: n_gene_in_block = 20
input: f"{cwd:a}/deletion.gz"
output: f'{_input:n}_geneblock.{n_gene_in_block}.gz', f'{_input:n}.cleaned.histogram.gz'
python: expand = '${ }'
    import pandas as pd
    data = pd.read_csv("${_input}", compression = "gzip", sep = "\t", header = 0)
    data = data.rename(columns = {"Unnamed: 0": "gene"})
    n_gene = data.shape[0]
    # row positions of genes whose values (all columns after the first) sum to zero
    all0 = [r for r in range(n_gene) if sum(data.iloc[r, 1:]) == 0]
    # keep a zero row as a cut point once it lies at least n_gene_in_block rows after the last one kept
    bound = list()
    anchor = 0
    for pos in range(1, len(all0)):
        if all0[pos] - all0[anchor] >= ${n_gene_in_block}:
            bound.append(all0[pos])
            anchor = pos
    # the last cut point is replaced by the end of the table; the first block starts at row 0
    bound = [0] + bound[:-1] + [n_gene]
    n_sample = data.shape[1] - 1
    fill = list()
    for k, (lo, hi) in enumerate(zip(bound[:-1], bound[1:])):
        if lo + 1 > n_gene:
            print (k, lo + 1, n_gene, "break")
            break
        # one list of genotypes per sample for the genes lo..hi-1
        fill.append([data.iloc[lo:hi, s + 1].tolist() for s in range(n_sample)])
    res = pd.DataFrame(fill) # row: gene block, column: sample name
    res.to_csv(${_output[0]:r}, compression = "gzip", sep = "\t", header = False, index = False)
    # sample-by-gene matrix without the gene-name column
    data.iloc[:, 1:].T.to_csv(${_output[1]:r}, compression = "gzip", sep = "\t", header = False, index = False)

In [None]:
[default_2]
# Simulate genomes batch by batch: for every gene block, copy the genotypes of a randomly
# chosen real sample.
parameter: sample_size = 100000 # total number of simulated samples: default 100000, test: 1000
parameter: n_batch = 200 # number of batches; each batch simulates sample_size / n_batch samples. default: 200, test: 20
assert sample_size % n_batch == 0
batches = [x+1 for x in range(n_batch)]
input: for_each = ['batches']
output: f'{cwd:a}/simu_data_20/{_input[0]:bnn}.sample.{_batches}.gz'
python: expand = "${ }"
    import pandas as pd, numpy as np
    import random, itertools, ast
    size = int(${sample_size} / ${n_batch})
    # the batch number doubles as the random seed
    random.seed(${_batches})
    # rows: gene blocks, columns: real samples; each cell is the text of a Python list
    data = pd.read_csv(${_input[0]:r}, compression = "gzip", header = None, sep = "\t")
    real_samples = data.columns.tolist()
    n_block = data.shape[0]
    samples_genome = list()
    for _ in range(size):
        # one distinct real sample per block
        order = random.sample(real_samples, n_block)
        # diagonal: block b taken from sample order[b]
        per_block = np.diag(data.loc[:, order])
        genome = list(itertools.chain.from_iterable(ast.literal_eval(cell) for cell in per_block))
        samples_genome.append(genome)
    samples_genome_df = pd.DataFrame(samples_genome) # row: sample name, column: genes
    samples_genome_df.to_csv(${_output:r}, compression = "gzip", sep = "\t", header = False, index = False)

In [None]:
[default_3]
# Concatenate the decompressed content of all batch files into one gzip file.
input: group_by = 'all'
output: f'{_input[0]:nn}.combined.gz'
bash: expand = "${ }"
    gunzip -c ${_input} | gzip -c > ${_output}

In [None]:
[default_4]
# Simulate case/control status from the combined genotypes with a logistic model and write a
# case/control data set: genotypes (cases first, then controls) and the matching 1/0 labels.
# For shape = 3; scale = 1 for gamma
# shape = 2.191013; scale = 0.2682398 for normal
# Float defaults so that non-integer values can be supplied on the command line.
parameter: shape = 3.0
parameter: scale = 1.0
# 'gamma' or 'normal'
parameter: beta_method = 'normal'
# baseline probability of being a case; enters the model as the intercept log(p / (1 - p))
parameter: penetrance = 0.05
parameter: seed = 999999
# number of controls kept = number of cases * ctrl_case_ratio
parameter: ctrl_case_ratio = 1.0
# each gene has probability pi0 of having no effect
parameter: pi0 = 0.95
output: f'{_input:nn}.X.gz', f'{_input:nn}.y'
python: expand = "${ }"
    import pandas as pd, numpy as np
    np.random.seed(${seed})
    # For normal distribution the -3*sigma to 3*sigma on x-axis should correspond to
    # log(4) and log(20). The shape and scale parameters are thus:
    # mu = (log(20) + log(4))/2 = 2.191013; sigma = (log(20) - mu) / 3 = 0.2682398
    def logor_gamma(shape, scale, n):
        return np.log(np.random.gamma(shape, scale, n))

    def logor_normal(shape, scale, n):
        return np.random.normal(shape, scale, n)

    data = pd.read_csv(${_input:r}, compression = "gzip", sep = "\t", header = None)
    beta0 = np.log(${penetrance}/(1-${penetrance}))
    # one log odds ratio per gene ...
    beta1s = [x for x in logor_${beta_method}(${shape}, ${scale}, data.shape[1])]
    # ... kept only when a Bernoulli(1 - pi0) draw succeeds, otherwise set to zero
    beta1s = [np.random.binomial(1, 1-${pi0}) * i for i in beta1s]
    logit_y = np.matmul(data.values, beta1s) + beta0
    ys_p = np.exp(logit_y) / (1+np.exp(logit_y))
    ys = np.random.binomial(1, ys_p)
    case_index = np.ravel(np.where(ys == 1))
    # NOTE(review): np.random.choice samples with replacement by default, so the same control
    # can be selected more than once -- confirm this is intended.
    ctrl_index = sorted(np.random.choice(np.ravel(np.where(ys == 0)), int(len(case_index) * ${ctrl_case_ratio})))
    # sorted() already returns a plain list, so only the case index array needs converting
    genotype = data.iloc[case_index.tolist() + ctrl_index,:]
    genotype.to_csv(${_output[0]:r}, compression = "gzip", sep = "\t", header = False, index = False)
    with open(${_output[1]:r}, 'w') as f:
        f.write('\n'.join(['1'] * len(case_index) + ['0'] * len(ctrl_index)))

In [None]:
[default_5, get_data_hist_2]
# Plot how many runs of consecutive non-zero genes exist for each run length.
output: f'{_input[0]:nn}.blocks.histogram.pdf'
python: expand = "${ }"
    import pandas as pd, matplotlib.pyplot as plt
    from itertools import groupby
    data = pd.read_csv(${_input[0]:r}, compression = "gzip", header = None, sep = "\t")
    # keep genes (columns) that are non-zero in at least one row
    data = data.loc[:, (data != 0).any(axis = 0)]
    indices = list(data.columns)
    # consecutive column indices share the same (position - index) offset, so each group is one run
    group_len = [len(list(run)) for _, run in groupby(enumerate(indices), lambda x: x[0]-x[1])]
    counts = {size: group_len.count(size) for size in set(group_len) if size != 0}
    fig, ax = plt.subplots(figsize = (8,6))
    ax.bar(list(counts.keys()), list(counts.values()), width = 0.8)
    ax.set_title("Histogram of number of genes in blocks")
    fig.savefig(${_output:r})

In [None]:
[default_6]
# Split the simulated genotype matrix into runs of consecutive non-zero genes and pickle them
# as an ordered mapping "block1", "block2", ... -> DataFrame holding the columns of that run.
# The genotype matrix is the first output of default_4 (the second one is the phenotype).
input: output_from("default_4")[0]
output: f'{_input:nn}.X.removed0.blocks.pkl'
python: expand = "${ }"
    import pandas as pd
    import pickle
    from itertools import groupby
    from collections import OrderedDict
    data = pd.read_csv(${_input:r}, compression = "gzip", sep = "\t", header = None)
    # keep genes (columns) that are non-zero in at least one sample
    data = data.loc[:, (data != 0).any(axis = 0)]
    indices = list(data.columns)
    dfs_dict = OrderedDict()
    # consecutive column indices share the same (position - index) offset, so each group is one run
    runs = groupby(enumerate(indices), lambda x: x[0] - x[1])
    for block_id, (_, run) in enumerate(runs, start = 1):
        dfs_dict[f"block{block_id}"] = data.loc[:, [col for _, col in run]]
    # the context manager closes the file even if pickling fails
    with open(${_output:r}, "wb") as f:
        pickle.dump(dfs_dict, f)

## Simulated

In [13]:
import pandas as pd, matplotlib.pyplot as plt
from operator import itemgetter
# import only what the cells below use instead of the whole itertools namespace
from itertools import groupby
from collections import Counter
# simulated genotype matrix: tab-separated, gzipped, no header row
data = pd.read_csv("~/GIT/cnv-gene-mapping/data/simu_data/deletion_geneblock.sample.X.gz", compression = "gzip", header = None, sep = "\t")

In [14]:
data.shape

(12212, 23343)

In [15]:
data1 = data.loc[:, (data != 0).any(axis = 0)]

In [16]:
data1.shape

(12212, 2259)

In [17]:
indices = list(data1.columns)

In [19]:
# Split the retained column indices into runs of consecutive genes:
# within a run, (position - index) stays constant.
groups = [
    [col for _, col in run]
    for _, run in groupby(enumerate(indices), lambda x: x[0]-x[1])
]

In [20]:
# length of every run, then the number of runs observed for each length
group_len = list(map(len, groups))
counts = {size: group_len.count(size) for size in set(group_len) if size != 0}

In [21]:
Counter(group_len)

Counter({1: 205,
         2: 113,
         3: 56,
         4: 31,
         5: 20,
         6: 14,
         7: 3,
         8: 8,
         9: 12,
         10: 11,
         11: 4,
         12: 2,
         13: 1,
         14: 4,
         15: 1,
         16: 2,
         17: 1,
         18: 7,
         19: 5,
         20: 3,
         21: 3,
         23: 3,
         24: 1,
         27: 2,
         28: 1,
         30: 2,
         33: 1,
         37: 1,
         38: 1,
         40: 1,
         59: 1,
         62: 1})

In [9]:
fig, ax = plt.subplots(figsize = (8,6))
plt.bar(list(counts.keys()), list(counts.values()), width = 0.8)
ax.set_title("Histogram of number of genes in blocks (simulation)")
# plt.savefig("~/GIT/cnv-gene-mapping/data/deletion_geneblock.real.blocks.histogram.pdf")
plt.show()

## Real

In [3]:
dat = pd.read_csv("~/GIT/cnv-gene-mapping/data/deletion.20.cleaned.histogram.gz", compression = "gzip", header = None, sep = "\t")

In [4]:
dat.shape

(3086, 23343)

In [5]:
dat.iloc[:5, 29:55]

Unnamed: 0,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54
0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
dat1 = dat.loc[:, (dat != 0).any(axis = 0)]

In [7]:
dat1.shape

(3086, 2290)

In [9]:
indices1 = list(dat1.columns)

In [10]:
len(indices1)

2290

In [11]:
# Split the retained column indices of the real data into runs of consecutive genes:
# within a run, (position - index) stays constant.
groups1 = [
    [col for _, col in run]
    for _, run in groupby(enumerate(indices1), lambda x: x[0]-x[1])
]

In [12]:
# length of every run in the real data, then the number of runs observed for each length
group_len1 = list(map(len, groups1))
counts1 = {size: group_len1.count(size) for size in set(group_len1) if size != 0}

In [14]:
Counter(group_len1)

Counter({1: 208,
         2: 114,
         3: 56,
         4: 30,
         5: 23,
         6: 14,
         7: 3,
         8: 8,
         9: 12,
         10: 11,
         11: 4,
         12: 2,
         13: 1,
         14: 4,
         15: 2,
         16: 2,
         17: 1,
         18: 7,
         19: 5,
         20: 3,
         21: 3,
         23: 3,
         24: 1,
         27: 2,
         28: 1,
         30: 2,
         33: 1,
         37: 1,
         38: 1,
         40: 1,
         59: 1,
         62: 1})

In [49]:
# Histogram for the real data: bar positions and heights must both come from `counts1`
# (taking the positions from the simulated `counts` pairs heights with the wrong block sizes
# whenever the two dictionaries differ).
fig, ax = plt.subplots(figsize = (8,6))
ax.bar(list(counts1.keys()), list(counts1.values()), width = 0.8)
ax.set_title("Histogram of number of genes in blocks (real)")
plt.savefig("/home/min/GIT/cnv-gene-mapping/data/deletion_geneblock.real.blocks.histogram.pdf")
plt.show()

In [None]:
[default_7]
# Fit SuSiE on a genotype matrix X and phenotype y and save the fit as an RDS file.
# NOTE(review): this step looks unfinished -- `_input[0]` is first read as a pickle (and the loop
# over its content has an empty body), then the same `_input[0]` is read again through `zcat`
# as a text table, and `_input[1]` is used as the phenotype. Confirm the intended inputs.
depends: R_library("data.table"), R_library('susieR'), R_library("reticulate")
# SuSiE settings, passed as L, scaled_prior_variance and estimate_prior_method
parameter: L = 10
parameter: pve = 0.005
parameter: method = "optim"
suffix = f'SuSiE.L_{L}.prior_{fmtP(pve)}'
print (suffix)
output: f'{_input[0]:nn}.{suffix}.susie.rds'
R: expand = '${ }', stderr = f'{_input[0]:nn}.stderr', stdout = f'{_input[0]:nn}.stdout'
    library(susieR)
    library(data.table)
    library(reticulate)
    # pickle_reader.py is expected to define read_pickle_file(); it is resolved relative to the
    # R working directory -- TODO confirm where this helper lives
    source_python("pickle_reader.py")
    dat <- read_pickle_file(${_input[0]:r})
    # placeholder: nothing is done with the individual elements of `dat` yet
    for (x in dat){
        
    }
        
    
    
    X = as.matrix(data.table::fread("zcat ${_input[0]}"))
    y = as.matrix(data.table::fread("zcat ${_input[1]}"))
    storage.mode(X) = 'double'
    storage.mode(y) = 'double'
    res = susie(X, y, L = ${L}, scaled_prior_variance = ${pve}, estimate_prior_method = '${method}')
    saveRDS(res, ${_output:r})

In [9]:
library("reticulate")
source_python("pickle_reader.py")

In [10]:
pickle_data <- read_pickle_file("/home/min/GIT/cnv-gene-mapping/data/simu_data_test/deletion_geneblock.sample.X.removed0.blocks.pkl")

In [11]:
for (i in pickle_data){
    print (dim(i))
}

[1] 12  8
[1] 12  1
[1] 12  1
[1] 12  1
[1] 12  1
[1] 12  2
[1] 12  1
[1] 12  1
[1] 12  5
[1] 12  8
[1] 12  1
[1] 12  6


In [136]:
import pandas as pd, numpy as np
data = pd.read_csv("/home/min/GIT/cnv-gene-mapping/dsc/deletion_geneblock.sample.1.gz", sep = "\t", compression = "gzip", header = None)

In [137]:
data = data.iloc[:, 0:5000]

In [138]:
data.shape

(500, 5000)

In [139]:
data = data.loc[:, (data != 0).any(axis = 0)]

In [140]:
data.head()

Unnamed: 0,30,31,32,33,34,35,36,37,44,45,46,47,48,49,50,51,52,53,194,195,196,197,198,199,200,201,883,1030,1058,1059,1060,1062,1184,1185,1234,1245,1261,1262,1263,1264,...,3268,3269,3270,3271,3272,3273,3274,3309,3319,3320,3321,3718,3719,3720,3721,3833,3949,3961,4300,4301,4344,4345,4410,4411,4433,4505,4507,4508,4509,4510,4633,4634,4635,4778,4806,4824,4859,4942,4943,4957
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [141]:
data.iloc[0:3, 0:5]

Unnamed: 0,30,31,32,33,34
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0


In [142]:
data.loc[:, 30:37].head()

Unnamed: 0,30,31,32,33,34,35,36,37
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0


In [8]:
sum(data.iloc[:, 115].tolist())

14

In [9]:
indices = list(data.columns)

In [10]:
from operator import itemgetter
# import only the name that is used instead of the whole itertools namespace
from itertools import groupby
# Split the retained column indices into runs of consecutive genes:
# within a run, (position - index) stays constant.
groups = []
for k, g in groupby(enumerate(indices), lambda x: x[0] - x[1]):
    groups.append(list(map(itemgetter(1), g)))
print (groups)

[[30, 31, 32, 33, 34, 35, 36, 37], [44, 45, 46, 47, 48, 49, 50, 51, 52, 53], [194, 195, 196, 197, 198, 199, 200, 201], [883], [1030], [1058, 1059, 1060], [1062], [1184, 1185], [1234], [1245], [1261, 1262, 1263, 1264, 1265, 1266, 1267, 1268, 1269, 1270, 1271, 1272, 1273, 1274, 1275, 1276, 1277, 1278, 1279, 1280, 1281, 1282, 1283, 1284, 1285, 1286], [1289, 1290, 1291, 1292, 1293, 1294], [1643, 1644, 1645, 1646, 1647], [1874, 1875, 1876], [2193, 2194], [2275], [2330, 2331, 2332, 2333, 2334, 2335], [2611], [2627], [2632, 2633], [2662, 2663, 2664, 2665], [2708], [2842, 2843], [2899], [2915], [3037, 3038, 3039, 3040, 3041, 3042, 3043, 3044, 3045, 3046, 3047, 3048, 3049, 3050, 3051, 3052], [3056, 3057, 3058, 3059, 3060, 3061, 3062, 3063], [3169, 3170], [3204, 3205], [3221], [3231], [3265, 3266, 3267, 3268, 3269, 3270, 3271, 3272, 3273, 3274], [3309], [3319, 3320, 3321], [3718, 3719, 3720, 3721], [3833], [3949], [3961], [4300, 4301], [4344, 4345], [4410, 4411], [4433], [4505], [4507, 4508, 450

In [11]:
import pickle
from collections import OrderedDict
# one DataFrame per run of consecutive genes, keyed by "block<first column index of the run>"
dfs_dict = OrderedDict(
    (f"block{item[0]}", data.loc[:, item]) for item in groups
)

In [13]:
# the context manager closes (and thereby flushes) the file even if pickling fails
with open("/home/min/GIT/cnv-gene-mapping/dsc/tmp.pkl", "wb") as f:
    pickle.dump(dfs_dict, f)

In [12]:
dfs_dict["block30"].head()

Unnamed: 0,30,31,32,33,34,35,36,37
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0


In [14]:
group_len = [len(item) for item in groups]

In [19]:
import matplotlib.pyplot as plt
# number of runs observed for each run length (zero-length entries ignored)
counts = {size: group_len.count(size) for size in set(group_len) if size != 0}
fig, ax = plt.subplots(figsize = (8,6))
block_sizes = list(counts)
n_blocks = [counts[size] for size in block_sizes]
ax.bar(block_sizes, n_blocks, width = 0.8)
ax.set_title("Histogram of number of genes in blocks")
fig.savefig("/home/min/GIT/cnv-gene-mapping/dsc/tmp.pdf")
# plt.show()

In [1]:
import pandas as pd, numpy as np
data = pd.read_table("/home/min/GIT/cnv-gene-mapping/data/simu_data_test/deletion_geneblock.sample.X.gz", compression = "gzip", sep = "\t", header = None)
data.shape

(100, 23343)

In [2]:
# Seed BEFORE the first random draw; seeding after the gamma draw leaves the effects unseeded
# and the cell irreproducible.
np.random.seed(9999999)
# intercept: log-odds of a 5% baseline probability
beta0 = np.log(0.05/(1-0.05))
# one log odds ratio per gene
beta1s = [np.log(x) for x in np.random.gamma(5, 1, data.shape[1])]
# linear predictor computed row by row with plain Python sums, rounded to 6 decimals
ys = [np.around(sum([x*y for x,y in zip(beta1s, data.iloc[i, :].tolist())]) + beta0, 6) for i in range(data.shape[0])]

In [3]:
y_df = pd.DataFrame(ys)

In [4]:
y_df.tail()

Unnamed: 0,0
95,-2.944439
96,43.804725
97,-2.944439
98,-0.131875
99,-2.944439


In [5]:
ys_1 = np.around(np.matmul(data.values, beta1s) + beta0, 6)
y_df_1 = pd.DataFrame(ys_1)

In [6]:
y_df_1.tail()

Unnamed: 0,0
95,-2.944439
96,43.804725
97,-2.944439
98,-0.131875
99,-2.944439


In [7]:
df = y_df == y_df_1

In [8]:
sum(df.iloc[:, 0].tolist())

100

In [34]:
susie_res = readRDS("/home/min/GIT/cnv-gene-mapping/data/simu_data_test/deletion_geneblock.sample.SuSiE.L_10.prior_0p005.susie.rds")

In [35]:
names(susie_res)

In [36]:
mean(susie_res$pip)

In [37]:
tail(sort(susie_res$pip), 5)

In [38]:
# %put susie_res

In [40]:
susie_res["pip"].index(max(susie_res["pip"]))

In [150]:
# Print every position whose PIP equals the maximum; the maximum is computed once
# instead of on every iteration.
pip = susie_res["pip"]
top_pip = max(pip)
for i,j in enumerate(pip):
    if j == top_pip:
        print (i)

16007
16008
16009
16010
16011
16012


In [8]:
susie_res.keys()

dict_keys(['alpha', 'mu', 'mu2', 'Xr', 'KL', 'lbf', 'sigma2', 'V', 'pi', 'null_index', 'converged', 'elbo', 'niter', 'intercept', 'fitted', 'sets', 'pip', 'X_column_scale_factors'])

In [153]:
sorted(susie_res["pip"])[-10:]

[0.000665602401835996,
 0.000665783181054036,
 0.000665783181054036,
 0.000665783181054036,
 0.000761617759273969,
 0.000761617759273969,
 0.000761617759273969,
 0.000761617759273969,
 0.000761617759273969,
 0.000761617759273969]

In [154]:
susie_res["fitted"][:10]

[0.622747786415506,
 0.62230394639043,
 0.622833298789739,
 0.62230394639043,
 0.626508087810574,
 0.622663406869125,
 0.623389166799308,
 0.623124755851558,
 0.623217382940279,
 0.62230394639043]

In [155]:
susie_res["sets"].keys()

dict_keys(['cs', 'coverage'])

In [156]:
susie_res["sets"]

{'coverage': 0.95, 'cs': None}

In [26]:
import pandas as pd

In [32]:
d = pd.read_table("/home/min/GIT/cnv-gene-mapping/data/simu_data_test/deletion_geneblock.sample.y.gz", compression = "gzip", sep = "\t", header = None)

In [33]:
d.head()

Unnamed: 0,0
0,0.226804
1,0.999957
2,0.05
3,1.0
4,0.999973


In [34]:
d1 = pd.read_table("/home/min/GIT/cnv-gene-mapping/data/simu_data_test/v1/deletion_geneblock.sample.y.gz", compression = "gzip", sep = "\t", header = None)

In [35]:
d1.head()

Unnamed: 0,0
0,0.986416
1,0.05
2,0.999957
3,0.05
4,1.0
