# Solutions for the January discussion
## Single effect blocks for pymc3 and logistic
n_gene_in_block = 20, sample_size = 200000

In [1]:
import pandas as pd, numpy as np
import os
import pyreadr
from dsc.dsc_io import load_rds
# Base data directory; "~" is expanded to the current user's home directory.
# NOTE: despite its name, this is a fixed path, not the process working directory.
cwd = os.path.expanduser("~/GIT/cnv-gene-mapping/data")
# Sub-directory of `cwd` from which the cells below read their input files.
folder = "deletion_simu_30_shape0.777_scale0.843"
# name = "deletion.genes.block20.for_simu.sample.combined.genes.block1"

In [2]:
# Gene names are offset by one from the gene index: index i <-> "gene_{i+1}".
# Load the tab-separated Fisher table; its first row holds the column names.
fisher = pd.read_csv(
    f"{cwd}/{folder}/deletion.genes.block30.for_simu.sample.combined.genes.block1.fisher.gz",
    sep="\t",
    header=0,
)
# Keep the p-value only on rows where d_c >= nd_c; every other row gets 1.
fisher["p_deleterious"] = [
    p_value if d_c_count >= nd_c_count else 1
    for d_c_count, nd_c_count, p_value in zip(fisher["d_c"], fisher["nd_c"], fisher["p"])
]

In [3]:
# Display the (rows, columns) dimensions of the Fisher table.
fisher.shape

(2290, 7)

In [4]:
# Load the beta file: it has no header row, and the column read from it is
# labelled "beta".
beta = pd.read_csv(
    f"{cwd}/{folder}/deletion.genes.block30.for_simu.block30.shape0.777.scale0.843.beta",
    names=["beta"],
    header=None,
)

In [5]:
# Display the last five rows of the beta table.
beta.tail()

Unnamed: 0,beta
22351,0.0
22352,0.0
22353,0.0
22354,0.0
22355,0.0


In [6]:
# index1 = pd.read_csv(f"{cwd}/deletion.block1.index.csv", header = None, sep = "\t", names = ["id1", "id2"])
# Load the block index file (tab-separated, no header row); its two columns
# are labelled "id1" and "id2".
index1 = pd.read_csv(
    f"{cwd}/{folder}/deletion.genes.block30.for_simu.sample.combined.block1.index.csv",
    sep="\t",
    names=["id1", "id2"],
    header=None,
)

In [7]:
# Load the "forsimu" block index file (tab-separated, no header row); its two
# columns are labelled "is1" and "is2".
index2 = pd.read_csv(
    f"{cwd}/{folder}/deletion.genes.block30.for_simu.sample.combined.block1.forsimu.index.csv",
    sep="\t",
    names=["is1", "is2"],
    header=None,
)

In [8]:
# Display the last five rows of the "forsimu" block index table.
index2.tail()

Unnamed: 0,is1,is2
523,21904,21905
524,21936,21941
525,21979,21981
526,22088,22097
527,22337,22355


In [9]:
# Place the two index tables side by side (aligned on their row labels),
# giving the columns id1, id2, is1, is2.
index = pd.concat((index1, index2), axis="columns")

In [10]:
# Display the first five rows of the combined index table.
index.head()

Unnamed: 0,id1,id2,is1,is2
0,0,7,0,7
1,8,17,14,23
2,18,22,93,97
3,23,36,164,177
4,37,44,229,236


In [11]:
# For every block, count the non-zero beta values whose row labels lie in the
# inclusive range [is1, is2] (`.loc` label slicing includes both end points).
# The bounds are read by column name: the former item[0]/item[1] lookups were
# positional accesses on a label-indexed Series, which pandas has deprecated.
index["number_effect"] = [
    np.count_nonzero(beta.loc[first:last, "beta"].to_numpy())
    for first, last in zip(index2["is1"], index2["is2"])
]

In [12]:
from collections import Counter
# Tally how many blocks have each number of non-zero betas
# (key: number of non-zero betas, value: number of blocks).
Counter(index["number_effect"])

Counter({1: 69, 0: 440, 5: 1, 2: 16, 3: 2})

In [13]:
# Restrict to blocks holding at least one non-zero beta and keep the four
# index columns in the order is1, is2, id1, id2.
has_effect = index["number_effect"] >= 1
idx_beta = index.loc[has_effect, ["is1", "is2", "id1", "id2"]]

In [15]:
# Display the (rows, columns) dimensions of the effect-block index table.
idx_beta.shape

(88, 4)

In [16]:
# Collect, for every block with at least one non-zero beta, parallel lists of
# per-gene values: the betas, the Fisher p-values, the four count columns and
# the gene names. Entry k of each list belongs to row k of `idx_beta`.
beta_in_block = []
fisher_in_block = []
d_c = []
d_nc = []
nd_c = []
nd_nc = []
gene_in_block = []

# Index the Fisher table by gene name once instead of re-indexing it for every
# column of every block.
fisher_by_gene = fisher.set_index("gene")

# Columns are selected by name and unpacked from plain tuples; the former
# item[0]..item[3] lookups were positional accesses on a label-indexed Series,
# which pandas has deprecated.
for is1, is2, id1, id2 in idx_beta[["is1", "is2", "id1", "id2"]].itertuples(index=False, name=None):
    # Gene names are offset by one from the gene index (index i -> "gene_{i+1}").
    genes = [f"gene_{i+1}" for i in range(id1, id2 + 1)]
    block_rows = fisher_by_gene.loc[genes]

    # Explicit positional slice: rows is1..is2, both end points included.
    beta_in_block.append(beta["beta"].iloc[is1:is2 + 1].tolist())
    fisher_in_block.append(block_rows["p"].tolist())
    d_c.append(block_rows["d_c"].tolist())
    d_nc.append(block_rows["d_nc"].tolist())
    nd_c.append(block_rows["nd_c"].tolist())
    nd_nc.append(block_rows["nd_nc"].tolist())
    gene_in_block.append(genes)

In [17]:
# Rebind `index1` to the (id1, id2) bounds of the blocks with at least one
# non-zero beta, relabelling the rows 0..n-1 so that a row label can be used
# directly as a list position. reset_index(drop=True) replaces the hand-built
# list of consecutive integers.
index1 = index.loc[index["number_effect"] >= 1, ["id1", "id2"]].reset_index(drop=True)

In [30]:
# Display the last 20 rows of the effect-block index table.
index1.tail(20)

Unnamed: 0,id1,id2
68,1710,1749
69,1750,1772
70,1806,1814
71,1815,1826
72,1895,1904
73,1930,1939
74,1970,1971
75,1979,1981
76,1987,1987
77,1991,2011


In [20]:
# Build one table holding, for every gene of every effect block, the results
# read from the block's logit, pymc3 (four runs) and SuSiE output files,
# together with the block's beta, Fisher p-value, gene name and count columns.
#
# Per-block frames are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies all previously gathered rows on
# every iteration.
block_frames = []

# itertuples(name=None) yields plain (row label, id1, id2) tuples, avoiding the
# deprecated positional item[0]/item[1] lookups on a label-indexed Series.
# The row label doubles as the position in the *_in_block lists, so `index1`
# must be labelled 0..n-1.
for idx, id1, id2 in index1[["id1", "id2"]].itertuples(name=None):
    # Path prefix shared by every result file of this block.
    prefix = f"{cwd}/{folder}/block_{id1}_{id2}/deletion.genes.block30.for_simu.sample.combined.genes.block_{id1}_{id2}"

    # NOTE(review): the frame is looked up under the key None below —
    # presumably how pyreadr keys the single object of an .rds file; confirm.
    logit = pyreadr.read_r(f"{prefix}.logit.rds")

    # Only the first column of each pymc3 output is read. The first run keeps
    # the header name from the file; the other three are relabelled via `names`.
    pymc3 = pd.read_csv(f"{prefix}.pymc3.chain_1.uniform_new_bound.gz", usecols=[0],
                        sep="\t", header=0)
    pymc3_1 = pd.read_csv(f"{prefix}.pymc3.chain_1.uniform_new_bound.seed_1.gz", usecols=[0],
                          sep="\t", header=0, names=["pymc3_s1"])
    pymc3_2 = pd.read_csv(f"{prefix}.pymc3.chain_3.uniform_new_bound.gz", usecols=[0],
                          sep="\t", header=0, names=["pymc3_c3"])
    pymc3_3 = pd.read_csv(f"{prefix}.pymc3.chain_3.uniform_new_bound.seed_1.gz", usecols=[0],
                          sep="\t", header=0, names=["pymc3_s1_c3"])
    pymc3["block"] = f"block{idx}"

    susie = load_rds(f"{prefix}.SuSiE.L_10.prior_0p005.susie.rds")["pip"]

    # Line the per-method columns up side by side, one row per gene.
    res = pd.concat(
        [
            logit[None][["p1"]],
            pymc3[["inclusion_probability", "block"]],
            pymc3_1[["pymc3_s1"]],
            pymc3_2[["pymc3_c3"]],
            pymc3_3[["pymc3_s1_c3"]],
        ],
        axis=1,
    )
    res["susie"] = susie
    res["beta"] = beta_in_block[idx]
    res["fisher"] = fisher_in_block[idx]
    res["gene"] = gene_in_block[idx]
    res["d_c"] = d_c[idx]
    res["d_nc"] = d_nc[idx]
    res["nd_c"] = nd_c[idx]
    res["nd_nc"] = nd_nc[idx]
    res = res.rename(columns={"p1": "logit", "inclusion_probability": "pymc3"})
    block_frames.append(res)

# Row labels restart at 0 within each block, as before. pd.concat raises on an
# empty list, so fall back to an empty frame when there are no effect blocks.
logit_pymc3 = pd.concat(block_frames) if block_frames else pd.DataFrame()

In [21]:
# Display the first five rows of the assembled results table.
logit_pymc3.head()

Unnamed: 0,logit,pymc3,block,pymc3_s1,pymc3_c3,pymc3_s1_c3,susie,beta,fisher,gene,d_c,d_nc,nd_c,nd_nc
0,0.125,0.0145,block0,0.0125,0.076,0.084,0.0,0.0,0.790466,gene_1,6,11922,8,11920
1,0.125,0.012,block0,0.015,0.074167,0.079333,0.0,0.0,0.790466,gene_2,6,11922,8,11920
2,0.125,0.017,block0,0.019,0.077833,0.070167,0.0,0.0,0.790466,gene_3,6,11922,8,11920
3,0.125,0.011,block0,0.015,0.071667,0.0745,0.0,0.0,0.790466,gene_4,6,11922,8,11920
4,0.125,0.016,block0,0.017,0.071167,0.075,0.0,0.552649,0.790466,gene_5,6,11922,8,11920


In [22]:
# Normalise negative zero in "beta" to plain 0 so it is not shown as "-0.0"
# (-0.0 != 0 is False, so those entries are replaced; NaN is left untouched).
# Vectorised replacement for the former row-wise apply().
logit_pymc3["beta"] = logit_pymc3["beta"].where(logit_pymc3["beta"] != 0, 0.0)

# Round beta and every result column to 6 decimal places. Columns are handled
# one Series at a time because the table's row labels repeat across blocks.
for column in ("beta", "logit", "pymc3", "pymc3_s1", "pymc3_c3", "pymc3_s1_c3", "susie"):
    logit_pymc3[column] = logit_pymc3[column].round(6)

In [23]:
# Reorder the columns: block/gene identifiers first, then beta, the Fisher
# p-value and the four count columns, then the per-method result columns.
logit_pymc3 = logit_pymc3.loc[
    :,
    [
        "block", "gene", "beta", "fisher",
        "d_c", "d_nc", "nd_c", "nd_nc",
        "logit", "pymc3", "pymc3_s1", "pymc3_c3", "pymc3_s1_c3", "susie",
    ],
]

In [24]:
# Display the first five rows of the table after the column reordering.
logit_pymc3.head()

Unnamed: 0,block,gene,beta,fisher,d_c,d_nc,nd_c,nd_nc,logit,pymc3,pymc3_s1,pymc3_c3,pymc3_s1_c3,susie
0,block0,gene_1,0.0,0.790466,6,11922,8,11920,0.125,0.0145,0.0125,0.076,0.084,0.0
1,block0,gene_2,0.0,0.790466,6,11922,8,11920,0.125,0.012,0.015,0.074167,0.079333,0.0
2,block0,gene_3,0.0,0.790466,6,11922,8,11920,0.125,0.017,0.019,0.077833,0.070167,0.0
3,block0,gene_4,0.0,0.790466,6,11922,8,11920,0.125,0.011,0.015,0.071667,0.0745,0.0
4,block0,gene_5,0.552649,0.790466,6,11922,8,11920,0.125,0.016,0.017,0.071167,0.075,0.0


In [25]:
# Display every row of the table that belongs to block71.
logit_pymc3[logit_pymc3["block"] == "block71"]

Unnamed: 0,block,gene,beta,fisher,d_c,d_nc,nd_c,nd_nc,logit,pymc3,pymc3_s1,pymc3_c3,pymc3_s1_c3,susie
0,block71,gene_1816,1.230252,8.67353e-27,113,11815,6,11922,0.0,0.106,0.1045,0.1165,0.108667,0.0
1,block71,gene_1817,0.0,8.67353e-27,113,11815,6,11922,0.0,0.1,0.1015,0.118667,0.117333,0.0
2,block71,gene_1818,0.0,8.67353e-27,113,11815,6,11922,0.0,0.101,0.105,0.113833,0.115167,0.0
3,block71,gene_1819,0.0,4.424518e-36,163,11765,11,11917,0.0,0.067,0.056,0.221,0.2465,0.0
4,block71,gene_1820,0.0,4.424518e-36,163,11765,11,11917,0.0,0.0625,0.076,0.2325,0.204167,0.0
5,block71,gene_1821,0.0,6.837438e-57,310,11618,35,11893,0.0,0.0265,0.044,0.681667,0.679,0.0
6,block71,gene_1822,0.0,8.076649e-65,329,11599,31,11897,0.5,1.0,0.6115,0.686667,0.724667,0.5
7,block71,gene_1823,1.813771,8.076649e-65,329,11599,31,11897,0.5,0.035,0.426,1.0,0.96,0.5
8,block71,gene_1824,0.0,1.224925e-53,270,11658,25,11903,0.0,0.022,0.023,0.300333,0.232833,0.0
9,block71,gene_1825,0.0,1.224925e-53,270,11658,25,11903,0.0,0.0145,0.018,0.240167,0.272333,0.0


In [26]:
# Write the table to disk with a header row and without the row labels.
# NOTE: the file is tab-separated even though its name ends in ".csv".
logit_pymc3.to_csv(f"{cwd}/{folder}/PIP_comparison.csv", sep = "\t", header = True, index = False)

In [27]:
# Sum of the "pymc3_s1_c3" column over the rows of block56.
sum(logit_pymc3[logit_pymc3["block"] == "block56"]["pymc3_s1_c3"])

3.5780000000000003

In [28]:
# Display the last five rows of the results table.
logit_pymc3.tail()

Unnamed: 0,block,gene,beta,fisher,d_c,d_nc,nd_c,nd_nc,logit,pymc3,pymc3_s1,pymc3_c3,pymc3_s1_c3,susie
14,block87,gene_2286,0.0,0.209819,15,11913,8,11920,0.008967,0.019,0.0225,0.153,0.152333,0.015341
15,block87,gene_2287,0.0,0.0351,12,11916,3,11925,0.059251,0.0785,0.083,0.103833,0.106667,0.057696
16,block87,gene_2288,0.0,0.0351,12,11916,3,11925,0.059251,0.0725,0.075,0.1,0.107167,0.057696
17,block87,gene_2289,0.0,0.0351,12,11916,3,11925,0.059251,0.077,0.074,0.103333,0.104167,0.057696
18,block87,gene_2290,0.0,0.0351,12,11916,3,11925,0.059251,0.0715,0.0745,0.102833,0.097333,0.057696


In [29]:
# Display the row(s) whose Fisher p-value equals the column minimum; ties are
# all shown. Series.min() skips NaN, whereas the builtin min() gives a result
# that depends on where a NaN happens to sit in the column.
logit_pymc3[logit_pymc3["fisher"] == logit_pymc3["fisher"].min()]

Unnamed: 0,block,gene,beta,fisher,d_c,d_nc,nd_c,nd_nc,logit,pymc3,pymc3_s1,pymc3_c3,pymc3_s1_c3,susie
6,block71,gene_1822,0.0,8.076649e-65,329,11599,31,11897,0.5,1.0,0.6115,0.686667,0.724667,0.5
7,block71,gene_1823,1.813771,8.076649e-65,329,11599,31,11897,0.5,0.035,0.426,1.0,0.96,0.5
