# Solutions for discussion January
## Single effect blocks for pymc3 and logistic
n_gene_in_block = 20, sample_size = 200000

In [1]:
import pandas as pd, numpy as np
import os
import pyreadr
from dsc.dsc_io import load_rds
cwd = os.path.expanduser("~/GIT/cnv-gene-mapping/data")
folder = "deletion_simu_30_shape0.777_scale0.843"
# name = "deletion.genes.block20.for_simu.sample.combined.genes.block1"

In [2]:
# Gene names in this table are 1-based: gene index i is named f"gene_{i+1}".
# Load the per-gene Fisher test table for block1 (tab-separated, first row is the header).
fisher = pd.read_csv(
    f"{cwd}/{folder}/deletion.genes.block30.for_simu.sample.combined.genes.block1.fisher.gz",
    header=0,
    sep="\t",
)
# Keep the p-value only where d_c >= nd_c; every other gene gets p = 1.
# NOTE(review): d_c / nd_c look like carrier counts in cases with / without the
# deletion -- confirm against the script that produces the .fisher.gz file.
fisher["p_deleterious"] = fisher["p"].where(fisher["d_c"] >= fisher["nd_c"], 1)

In [3]:
fisher.shape

(2290, 7)

In [4]:
beta = pd.read_csv(f"{cwd}/{folder}/deletion.genes.block30.for_simu.block30.shape0.777.scale0.843.beta", header = None, names = ["beta"])

In [5]:
# index1 = pd.read_csv(f"{cwd}/deletion.block1.index.csv", header = None, sep = "\t", names = ["id1", "id2"])
index1 = pd.read_csv(f"{cwd}/{folder}/deletion.genes.block30.for_simu.sample.combined.block1.index.csv", header = None, sep = "\t", names = ["id1", "id2"])

In [6]:
index2 = pd.read_csv(f"{cwd}/{folder}/deletion.genes.block30.for_simu.sample.combined.block1.forsimu.index.csv", header = None, sep = "\t", names = ["is1", "is2"])

In [7]:
index2.head()

Unnamed: 0,is1,is2
0,0,7
1,14,23
2,93,97
3,164,177
4,229,236


In [8]:
index = pd.concat([index1, index2], axis = 1)

In [9]:
index.head(10)

Unnamed: 0,id1,id2,is1,is2
0,0,7,0,7
1,8,17,14,23
2,18,22,93,97
3,23,36,164,177
4,37,44,229,236
5,45,45,399,399
6,46,49,785,788
7,50,50,841,841
8,51,53,853,855
9,54,54,891,891


In [10]:
index.shape

(528, 4)

In [11]:
# Number of non-zero simulated effects (entries of `beta`) inside each block.
# Block bounds are read from the named columns `is1` / `is2` instead of
# positional `item[0]` / `item[1]` on a string-labelled row Series, which is
# deprecated in pandas 2.x. `.loc[start:stop]` is label-based and therefore
# inclusive on both ends, so no `+ 1` is needed here.
index["number_effect"] = [
    np.count_nonzero(beta.loc[start:stop, "beta"].to_numpy())
    for start, stop in zip(index2["is1"], index2["is2"])
]

In [12]:
from collections import Counter
Counter(index["number_effect"])

Counter({1: 69, 0: 440, 5: 1, 2: 16, 3: 2})

In [13]:
# Blocks that contain at least one non-zero effect, with the simulation-space
# bounds (is1, is2) listed before the gene-index bounds (id1, id2).
has_effect = index["number_effect"] >= 1
idx_beta = index.loc[has_effect, ["is1", "is2", "id1", "id2"]]

In [14]:
idx_beta.head(10)

Unnamed: 0,is1,is2,id1,id2
0,0,7,0,7
3,164,177,23,36
8,853,855,51,53
19,1215,1273,81,139
20,1613,1617,140,144
22,1752,1770,146,164
27,2043,2043,174,174
36,2272,2294,186,208
38,2315,2327,218,230
54,2900,2929,266,295


In [15]:
idx_beta.shape

(88, 4)

In [16]:
# For every block in `idx_beta`, collect parallel per-gene lists:
#   beta_in_block   - simulated effects, sliced from `beta` by (is1, is2)
#   fisher_in_block - Fisher p-values ("p" column of `fisher`)
#   d_c, d_nc, nd_c, nd_nc - the four count columns of `fisher`
#   gene_in_block   - gene names, f"gene_{i+1}" for i in id1..id2 (inclusive)
# Entry k of each list belongs to the k-th row of `idx_beta`.

# Loop-invariant: index the Fisher table by gene name once rather than on every lookup.
fisher_by_gene = fisher.set_index("gene")

beta_in_block = []
fisher_in_block = []
d_c = []
d_nc = []
nd_c = []
nd_nc = []
gene_in_block = []

# Columns are selected by name so the loop does not depend on the column order
# of `idx_beta` or on positional indexing of a labelled Series.
block_bounds = idx_beta[["is1", "is2", "id1", "id2"]].itertuples(index=False, name=None)
for is1, is2, id1, id2 in block_bounds:
    genes = [f"gene_{i + 1}" for i in range(id1, id2 + 1)]
    # One lookup per block; rows come back in the order of `genes`.
    block_stats = fisher_by_gene.loc[genes]

    # Plain integer slice on the Series, hence the explicit `+ 1` for an inclusive end.
    beta_in_block.append(beta["beta"][is1:is2 + 1].tolist())
    fisher_in_block.append(block_stats["p"].tolist())
    d_c.append(block_stats["d_c"].tolist())
    d_nc.append(block_stats["d_nc"].tolist())
    nd_c.append(block_stats["nd_c"].tolist())
    nd_nc.append(block_stats["nd_nc"].tolist())
    gene_in_block.append(genes)

In [17]:
# Rebind `index1` to the gene-index bounds (id1, id2) of the blocks that carry
# at least one non-zero effect; the original row labels of `index` are kept.
index1 = index.loc[index["number_effect"] >= 1, ["id1", "id2"]]
# index1 = index1.set_index([[i for i in range(index1.shape[0])]])

In [28]:
index1.tail(20)

Unnamed: 0,id1,id2
445,1710,1749
446,1750,1772
449,1806,1814
450,1815,1826
464,1895,1904
473,1930,1939
479,1970,1971
484,1979,1981
489,1987,1987
491,1991,2011


In [19]:
index1.shape

(88, 2)

In [20]:
# Build `logit_pymc3`: one row per gene for every block listed in `index1`,
# combining the per-block logit / pymc3 / SuSiE results with the genome-wide
# PIP vectors and the per-block lists collected earlier (beta, Fisher p, counts).
prefix = "deletion.genes.block30.for_simu.sample.combined.genes"

# Genome-wide PIP vectors; each block takes the slice [id1, id2] of these.
logit1 = pd.read_csv(f"{cwd}/{folder}/{prefix}.logit.all.blocks.pip.regional.csv", sep="\t", header=None, names=["logit1"])
pymc31 = pd.read_csv(f"{cwd}/{folder}/{prefix}.pymc3.chain_5.all.blocks.uniform.multi_seeds.pip.gz", sep="\t", header=None, names=["pymc3_pip1"])
# Convert to lists once instead of once per block.
logit1_all = logit1["logit1"].tolist()
pymc3_new_all = pymc31["pymc3_pip1"].tolist()

# Collect per-block frames and concatenate once at the end; concatenating inside
# the loop re-copies all earlier rows on every iteration.
block_frames = []
block_rows = zip(index1.index, index1["id1"], index1["id2"])
for pos, (idx, id1, id2) in enumerate(block_rows):
    block_prefix = f"{cwd}/{folder}/block_{id1}_{id2}/{prefix}.block_{id1}_{id2}"

    # pyreadr returns a dict of data frames; the frame is read from key None here.
    logit = pyreadr.read_r(f"{block_prefix}.logit.rds")[None]
    pymc3_block = pd.read_csv(f"{block_prefix}.pymc3.chain_1.uniform_new_bound.gz", usecols=[0], sep="\t", header=0)
    # Label rows with the block's row label in `index`, not its position in `index1`.
    pymc3_block["block"] = f"block{idx}"
    susie = load_rds(f"{block_prefix}.SuSiE.L_10.prior_0p005.susie.rds")["pip"]

    res = pd.concat([logit[["p1"]], pymc3_block[["inclusion_probability", "block"]]], axis=1)
    res["logit1"] = logit1_all[id1:id2 + 1]
    res["pymc3_new"] = pymc3_new_all[id1:id2 + 1]
    res["susie"] = susie
    # `pos` is the position within `index1`; the *_in_block lists were built
    # from `idx_beta`, which applies the same number_effect >= 1 filter.
    res["beta"] = beta_in_block[pos]
    res["fisher"] = fisher_in_block[pos]
    res["gene"] = gene_in_block[pos]
    res["d_c"] = d_c[pos]
    res["d_nc"] = d_nc[pos]
    res["nd_c"] = nd_c[pos]
    res["nd_nc"] = nd_nc[pos]
    res = res.rename(columns={"p1": "logit", "inclusion_probability": "pymc3"})
    block_frames.append(res)

# pd.concat raises on an empty list, so keep the original empty-frame result
# when `index1` has no rows.
logit_pymc3 = pd.concat(block_frames) if block_frames else pd.DataFrame()

In [21]:
# Round every numeric comparison column to 6 decimals (vectorised per column).
for col in ("beta", "logit", "logit1", "pymc3", "pymc3_new", "susie"):
    logit_pymc3[col] = logit_pymc3[col].round(6)

# Replace negative zero with 0.0 in `beta`. This runs after rounding so that
# small negative values that round to -0.0 are normalised too. `-0.0 == 0` is
# True, so a plain equality test selects both signed zeros. A NumPy array is
# assigned, so the repeated per-block row labels need no index alignment.
logit_pymc3["beta"] = np.where(logit_pymc3["beta"] == 0, 0.0, logit_pymc3["beta"])

In [22]:
logit_pymc3 = logit_pymc3[["block", "gene", "beta", "fisher", "d_c", "d_nc", "nd_c", "nd_nc", "logit", "logit1", "susie", "pymc3_new"]]

In [23]:
logit_pymc3.shape

(985, 12)

In [29]:
logit_pymc3.head()

Unnamed: 0,block,gene,beta,fisher,d_c,d_nc,nd_c,nd_nc,logit,logit1,susie,pymc3_new
0,block0,gene_1,0.0,0.790466,6,11922,8,11920,0.125,0.015036,0.0,0.0174
1,block0,gene_2,0.0,0.790466,6,11922,8,11920,0.125,0.015036,0.0,0.0139
2,block0,gene_3,0.0,0.790466,6,11922,8,11920,0.125,0.015036,0.0,0.0174
3,block0,gene_4,0.0,0.790466,6,11922,8,11920,0.125,0.015036,0.0,0.0168
4,block0,gene_5,0.552649,0.790466,6,11922,8,11920,0.125,0.015036,0.0,0.019


In [30]:
logit_pymc3[logit_pymc3["block"] == "block8"]

Unnamed: 0,block,gene,beta,fisher,d_c,d_nc,nd_c,nd_nc,logit,logit1,susie,pymc3_new
0,block8,gene_52,1.957626,1.957547e-09,56,11872,9,11919,1.0,1.0,1.0,1.0
1,block8,gene_53,0.0,0.823729,9,11919,11,11917,0.0,0.0,0.0,0.0148
2,block8,gene_54,0.0,1.0,5,11923,4,11924,0.0,0.0,0.0,0.0251


In [31]:
# Blocks whose smallest Fisher p-value is below 1e-3, in order of first
# appearance in `logit_pymc3` (sort=False keeps that order). One grouped pass
# replaces the per-block re-filtering and the list.index-based ordered unique.
# NOTE: an alternative criterion was considered and left disabled: also keep
# blocks with np.count_nonzero(beta) >= 2.
min_fisher_by_block = logit_pymc3.groupby("block", sort=False)["fisher"].min()
block1 = min_fisher_by_block[min_fisher_by_block < 1e-3].index.tolist()

In [32]:
logit_pymc3_1 = logit_pymc3[logit_pymc3["block"].isin(block1)]

In [35]:
logit_pymc3_1.tail()

Unnamed: 0,block,gene,beta,fisher,d_c,d_nc,nd_c,nd_nc,logit,logit1,susie,pymc3_new
16,block507,gene_2123,0.0,5.775135e-15,53,11875,1,11927,0.052632,0.023004,0.052632,0.1264
17,block507,gene_2124,0.0,5.775135e-15,53,11875,1,11927,0.052632,0.023004,0.052632,0.1276
18,block507,gene_2125,0.0,5.775135e-15,53,11875,1,11927,0.052632,0.023004,0.052632,0.1356
0,block514,gene_2141,0.888633,1.818127e-05,74,11854,30,11898,0.750675,0.746846,0.760358,0.702
1,block514,gene_2142,0.0,6.821863e-05,58,11870,22,11906,0.249325,0.248053,0.239642,0.3122


In [36]:
logit_pymc3_1[logit_pymc3_1["block"] == "block450"]

Unnamed: 0,block,gene,beta,fisher,d_c,d_nc,nd_c,nd_nc,logit,logit1,susie,pymc3_new
0,block450,gene_1816,1.230252,8.67353e-27,113,11815,6,11922,0.0,0.0,0.0,0.1038
1,block450,gene_1817,0.0,8.67353e-27,113,11815,6,11922,0.0,0.0,0.0,0.1015
2,block450,gene_1818,0.0,8.67353e-27,113,11815,6,11922,0.0,0.0,0.0,0.1041
3,block450,gene_1819,0.0,4.424518e-36,163,11765,11,11917,0.0,0.0,0.0,0.0607
4,block450,gene_1820,0.0,4.424518e-36,163,11765,11,11917,0.0,0.0,0.0,0.0635
5,block450,gene_1821,0.0,6.837438e-57,310,11618,35,11893,0.0,0.0,0.0,0.0423
6,block450,gene_1822,0.0,8.076649e-65,329,11599,31,11897,0.5,0.5,0.5,0.3719
7,block450,gene_1823,1.813771,8.076649e-65,329,11599,31,11897,0.5,0.5,0.5,0.6827
8,block450,gene_1824,0.0,1.224925e-53,270,11658,25,11903,0.0,0.0,0.0,0.0187
9,block450,gene_1825,0.0,1.224925e-53,270,11658,25,11903,0.0,0.0,0.0,0.0211


In [37]:
logit_pymc3.to_csv(f"{cwd}/{folder}/PIP_comparison_0430.csv", sep = "\t", header = True, index = False)

In [39]:
sum(logit_pymc3[logit_pymc3["block"] == "block8"]["logit"])

1.0

In [40]:
# Show every row that attains the smallest Fisher p-value. Series.min() skips
# NaN, whereas the builtin min() gives a result that depends on where a NaN
# sits in the column.
logit_pymc3[logit_pymc3["fisher"] == logit_pymc3["fisher"].min()]

Unnamed: 0,block,gene,beta,fisher,d_c,d_nc,nd_c,nd_nc,logit,logit1,susie,pymc3_new
6,block450,gene_1822,0.0,8.076649e-65,329,11599,31,11897,0.5,0.5,0.5,0.3719
7,block450,gene_1823,1.813771,8.076649e-65,329,11599,31,11897,0.5,0.5,0.5,0.6827
