# Solutions for discussion January
## Single effect blocks for pymc3 and logistic
n_gene_in_block = 20, sample_size = 200000

In [1]:
import pandas as pd, numpy as np
import os
import pyreadr
from dsc.dsc_io import load_rds
# Root data directory (the leading "~" is expanded to the user's home).
cwd = os.path.expanduser("~/GIT/cnv-gene-mapping/data")
# Sub-directory of `cwd` that holds the per-block files read below.
folder = "deletion_simu_20"
# File-name stem of the block-1 dataset (the Fisher table is "<name>.fisher.gz").
name = "deletion.genes.block20.for_simu.sample.combined.genes.block1"

In [2]:
# Gene names are 1-based ("gene_<i+1>") while the block indices are 0-based.
# Load the Fisher table of block 1; its file stem is the `name` constant.
fisher = pd.read_csv(f"{cwd}/{folder}/{name}.fisher.gz", header=0, sep="\t")
# Keep the p-value only where d_c >= nd_c; every other row gets p = 1.
# NOTE(review): presumably d_c / nd_c are per-gene counts in two groups -- confirm.
# Vectorised instead of a per-row Python loop; the column is always float.
fisher["p_deleterious"] = fisher["p"].where(fisher["d_c"] >= fisher["nd_c"], 1.0)

In [4]:
# Single-column table of beta values; the file has no header row, so the
# column is labelled "beta" here. Rows keep the default 0..n-1 labels.
beta = pd.read_csv(
    f"{cwd}/deletion.genes.block20.for_simu.shape0.scale1.beta",
    names=["beta"],
    header=None,
)

In [7]:
# Block bounds as two tab-separated columns (no header row), labelled id1/id2.
# Alternative location directly under `cwd`, kept for reference:
# index1 = pd.read_csv(f"{cwd}/deletion.block1.index.csv", header = None, sep = "\t", names = ["id1", "id2"])
index1 = pd.read_csv(
    f"{cwd}/{folder}/deletion.block1.index.csv",
    sep="\t",
    names=["id1", "id2"],
    header=None,
)

In [8]:
# Second set of block bounds (tab-separated, no header row), labelled is1/is2.
# Alternative location directly under `cwd`, kept for reference:
# index2 = pd.read_csv(f"{cwd}/deletion.block1.forsimu.index.csv", header = None, sep = "\t", names = ["is1", "is2"])
index2 = pd.read_csv(
    f"{cwd}/{folder}/deletion.block1.forsimu.index.csv",
    sep="\t",
    names=["is1", "is2"],
    header=None,
)

In [9]:
# Put id1/id2 (index1) and is1/is2 (index2) side by side, aligned on row label.
index = pd.concat((index1, index2), axis="columns")

In [10]:
# Show the last rows of the combined bounds table.
index.tail()

Unnamed: 0,id1,id2,is1,is2
523,2250,2251,21934,21935
524,2252,2257,21966,21971
525,2258,2260,22009,22011
526,2261,2270,22118,22127
527,2271,2289,22367,22385


In [11]:
# Show the first rows of the combined bounds table.
index.head()

Unnamed: 0,id1,id2,is1,is2
0,0,7,30,37
1,8,17,44,53
2,18,22,123,127
3,23,36,194,207
4,37,44,259,266


In [9]:
# Count the non-zero betas inside each block. is1/is2 are used as row labels
# into `beta`; label slicing with .loc includes both end points.
# The bounds are read by column name: integer keys such as item[0] on a
# label-indexed row are positional lookups that recent pandas deprecates.
index["number_effect"] = [
    np.count_nonzero(beta["beta"].loc[start:stop].to_numpy())
    for start, stop in zip(index2["is1"], index2["is2"])
]

In [10]:
# Blocks with exactly one non-zero beta; columns are kept in the order
# is1, is2, id1, id2.
idx_beta = index.loc[index["number_effect"] == 1, ["is1", "is2", "id1", "id2"]]

In [13]:
# For every single-effect block build three parallel lists:
#   beta_in_block   - betas at positions is1..is2 (inclusive) of `beta`,
#   gene_in_block   - 1-based gene names for indices id1..id2 (inclusive),
#   fisher_in_block - p_deleterious of the `fisher` rows whose gene is in the block.
# Bounds are read by column name rather than by position in the row, because
# integer keys on a label-indexed row are deprecated in recent pandas.
beta_in_block = []
fisher_in_block = []
gene_in_block = []
for is1, is2, id1, id2 in zip(
    idx_beta["is1"], idx_beta["is2"], idx_beta["id1"], idx_beta["id2"]
):
    # Build the gene names once and reuse them for the filter and the list.
    genes = [f"gene_{i+1}" for i in range(id1, id2 + 1)]
    beta_in_block.append(beta["beta"].iloc[is1:is2 + 1].tolist())
    # NOTE: rows come back in the order of `fisher`, and genes missing from
    # `fisher` are dropped silently, so this list can be shorter than `genes`.
    fisher_in_block.append(
        fisher.loc[fisher["gene"].isin(genes), "p_deleterious"].tolist()
    )
    gene_in_block.append(genes)

In [14]:
# Show the p_deleterious lists of the first five single-effect blocks.
fisher_in_block[:5]

[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 [0.6249641277247204, 0.6249641277247204, 0.6249641277247204],
 [0.5799946293546432, 0.7886821920435936, 0.7886821920435936, 1.0],
 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 [1.0, 1.0, 1.0]]

In [15]:
# Restrict to single-effect blocks, keep only the id1/id2 bounds and renumber
# the rows 0..n-1. These positions match the *_in_block lists, which were
# built from the same filter in the same order.
index = index.loc[index["number_effect"] == 1, ["id1", "id2"]].reset_index(drop=True)

In [16]:
# (number of single-effect blocks, number of columns)
index.shape

(66, 2)

In [17]:
# For each single-effect block, put the `p1` column of the .logit.rds file,
# the PyMC3 inclusion probability and the SuSiE "pip" next to the betas,
# Fisher p-values and gene names collected earlier.
# Frames are gathered in a list and concatenated once: calling pd.concat
# inside the loop re-copies all earlier rows on every iteration.
frames = []
# `idx` is the position in `index`, which is also the position in the
# *_in_block lists; it no longer depends on the row labels of `index`.
for idx, (id1, id2) in enumerate(zip(index["id1"], index["id2"])):
    # Every result file of a block shares this directory and file-name stem.
    prefix = (
        f"{cwd}/{folder}/block_{id1}_{id2}/"
        f"deletion.genes.block20.for_simu.sample.combined.genes.block_{id1}_{id2}"
    )
    logit = pyreadr.read_r(f"{prefix}.logit.rds")
    pymc3 = pd.read_csv(f"{prefix}.pymc3.gz", sep="\t")
    pymc3["block"] = f"block{idx+1}"
    susie = load_rds(f"{prefix}.SuSiE.L_1.prior_0p005.susie.rds")["pip"]
    # The table is taken from key None of the pyreadr result.
    res = pd.concat(
        [logit[None][["p1"]], pymc3[["inclusion_probability", "block"]]],
        axis=1,
    )
    res["susie"] = susie
    res["beta"] = beta_in_block[idx]
    res["fisher"] = fisher_in_block[idx]
    res["gene"] = gene_in_block[idx]
    res = res.rename(columns={"p1": "logit", "inclusion_probability": "pymc3"})
    frames.append(res)
# pd.concat raises on an empty list, so fall back to an empty frame.
logit_pymc3 = pd.concat(frames) if frames else pd.DataFrame()

  from pandas.core.index import Index as PandasIndex


In [18]:
# Turn negative zero (-0.0) into 0.0; the comparison with 0 matches both
# signs. Vectorised instead of a Python call per row.
logit_pymc3["beta"] = logit_pymc3["beta"].mask(logit_pymc3["beta"] == 0, 0.0)
# Round the three probability columns to 6 decimals. Each assignment is
# column-to-column on the same index, so no realignment takes place even
# though the row labels repeat across blocks.
for col in ("logit", "pymc3", "susie"):
    logit_pymc3[col] = logit_pymc3[col].round(6)

In [22]:
# Reorder the columns for display and export.
logit_pymc3 = logit_pymc3.loc[
    :, ["block", "gene", "beta", "fisher", "logit", "pymc3", "susie"]
]

In [26]:
# Rows whose Fisher p-value is below 0.1.
logit_pymc3.loc[logit_pymc3["fisher"] < 0.1]

Unnamed: 0,block,gene,beta,fisher,logit,pymc3,susie
0,block28,gene_966,-1.49575,0.07026019,0.1,0.093,0.1
1,block28,gene_967,0.0,0.07026019,0.1,0.1055,0.1
2,block28,gene_968,0.0,0.07026019,0.1,0.1235,0.1
3,block28,gene_969,0.0,0.07026019,0.1,0.1025,0.1
4,block28,gene_970,0.0,0.07026019,0.1,0.108,0.1
5,block28,gene_971,0.0,0.07026019,0.1,0.0945,0.1
6,block28,gene_972,0.0,0.07026019,0.1,0.1305,0.1
7,block28,gene_973,0.0,0.07026019,0.1,0.105,0.1
8,block28,gene_974,0.0,0.07026019,0.1,0.1125,0.1
9,block28,gene_975,0.0,0.07026019,0.1,0.1195,0.1


In [24]:
# Write the comparison table without the row labels. The file is
# tab-separated despite its .csv extension.
logit_pymc3.to_csv(
    f"{cwd}/{folder}/logit_pymc3_comparison.csv",
    sep="\t",
    header=True,
    index=False,
)

In [42]:
# Rows that belong to block18.
logit_pymc3.loc[logit_pymc3["block"] == "block18"]

Unnamed: 0,block,gene,beta,fisher,logit,pymc3,susie
0,block18,gene_618,0.0,1.0,0.5,0.0225,0.0
1,block18,gene_619,-0.537504,1.0,0.5,0.035,0.0


In [43]:
# `pymc3` still holds the table read in the last iteration of the loop that
# builds `logit_pymc3`, including the "block" column added there.
pymc3

Unnamed: 0,inclusion_probability,beta,beta_given_inclusion,block
0,0.027,-0.008167,-0.302492,block65
1,0.0285,-0.007865,-0.27596,block65
2,0.0195,-0.005697,-0.292139,block65
3,0.024,-0.003135,-0.130628,block65
4,0.0225,-0.007494,-0.333062,block65
5,0.0195,-0.001529,-0.078386,block65
6,0.0195,-0.004695,-0.240773,block65
7,0.0315,-0.011108,-0.352644,block65
8,0.0275,-0.009185,-0.33401,block65
9,0.0225,-0.009203,-0.409024,block65
