# Solutions for discussion January
## Single effect blocks for pymc3 and logistic
n_gene_in_block = 20, sample_size = 200000

In [1]:
import pandas as pd, numpy as np
import os
import pyreadr
from dsc.dsc_io import load_rds
# Base data directory; "~" is expanded to the current user's home directory.
cwd = os.path.expanduser("~/GIT/cnv-gene-mapping/data")
# Sub-directory of `cwd` under which the input/output paths in the cells below are built.
folder = "deletion_simu_20"
# File stem of the block-1 gene table.
# NOTE(review): the cells shown here spell this stem out in full instead of using `name` — confirm it is still needed.
name = "deletion.genes.block20.for_simu.sample.combined.genes.block1"

In [2]:
# Per-gene Fisher table. Gene names are 1-based ("gene_<index + 1>"), so a
# 0-based gene index needs +1 to match the `gene` column.
fisher_path = f"{cwd}/{folder}/deletion.genes.block20.for_simu.sample.combined.genes.block1.fisher.gz"
fisher = pd.read_csv(fisher_path, sep="\t", header=0)
# Keep the p-value only where d_c >= nd_c; every other gene is assigned 1.
keep_p = fisher["d_c"] >= fisher["nd_c"]
fisher["p_deleterious"] = np.where(keep_p, fisher["p"], 1)

In [3]:
fisher.shape

(2290, 7)

In [4]:
# Effect sizes: a header-less, single-column file read into a column named "beta".
beta_path = f"{cwd}/{folder}/deletion.genes.block20.for_simu.shape0.scale1.beta"
beta = pd.read_csv(beta_path, header=None, names=["beta"])

In [5]:
beta.tail()

Unnamed: 0,beta
22351,0.0
22352,0.0
22353,0.0
22354,0.0
22355,0.0


In [6]:
# Header-less, tab-separated table of (id1, id2) pairs.
# (An earlier version read f"{cwd}/deletion.block1.index.csv" instead.)
index1_path = f"{cwd}/{folder}/deletion.genes.block20.for_simu.sample.combined.block1.index.csv"
index1 = pd.read_csv(index1_path, sep="\t", header=None, names=["id1", "id2"])

In [7]:
# Header-less, tab-separated table of (is1, is2) pairs.
index2_path = f"{cwd}/{folder}/deletion.genes.block20.for_simu.sample.combined.block1.forsimu.index.csv"
index2 = pd.read_csv(index2_path, sep="\t", header=None, names=["is1", "is2"])

In [8]:
index2.tail()

Unnamed: 0,is1,is2
523,21904,21905
524,21936,21941
525,21979,21981
526,22088,22097
527,22337,22355


In [9]:
# Put the two coordinate tables side by side (rows are matched on the row index).
# id1/id2 are later used to build gene names for the Fisher table; is1/is2 are used to slice `beta`.
index = pd.concat([index1, index2], axis = 1)

In [10]:
index.tail()

Unnamed: 0,id1,id2,is1,is2
523,2250,2251,21904,21905
524,2252,2257,21936,21941
525,2258,2260,21979,21981
526,2261,2270,22088,22097
527,2271,2289,22337,22355


In [11]:
# Count the non-zero effects in each block. `.loc` slicing is label-based and
# inclusive of both ends, so rows is1..is2 of `beta` are all counted.
# Columns are read by name: positional `item[0]` on a string-labelled row
# Series relies on integer-position fallback that pandas has deprecated.
index["number_effect"] = [
    np.count_nonzero(beta.loc[start:stop, "beta"].to_numpy())
    for start, stop in zip(index2["is1"], index2["is2"])
]

In [12]:
# Blocks with exactly one non-zero effect; columns ordered is1, is2, id1, id2.
has_single_effect = index["number_effect"] == 1
idx_beta = index.loc[has_single_effect, ["is1", "is2", "id1", "id2"]]

In [13]:
# Collect, for every single-effect block, the per-gene values that are later
# attached to the per-block result tables. Each list has one entry per block,
# and each entry is a list with one value per gene in that block.
beta_in_block = []
fisher_in_block = []
d_c = []
d_nc = []
nd_c = []
nd_nc = []
gene_in_block = []

# Index the Fisher table by gene name once; the lookup table does not change
# between blocks, so rebuilding it inside the loop is wasted work.
fisher_by_gene = fisher.set_index("gene")

# Columns are unpacked by name rather than by position in the row.
block_bounds = idx_beta[["is1", "is2", "id1", "id2"]].itertuples(index=False, name=None)
for is1, is2, id1, id2 in block_bounds:
    # Gene names are 1-based, hence the +1; id2 is inclusive.
    genes = [f"gene_{i + 1}" for i in range(id1, id2 + 1)]
    rows = fisher_by_gene.loc[genes]

    # Positional slice of the effect sizes, inclusive of is2.
    beta_in_block.append(beta["beta"].iloc[is1:is2 + 1].tolist())
    # NOTE(review): this stores the raw "p" column, not "p_deleterious" — confirm that is intended.
    fisher_in_block.append(rows["p"].tolist())
    d_c.append(rows["d_c"].tolist())
    d_nc.append(rows["d_nc"].tolist())
    nd_c.append(rows["nd_c"].tolist())
    nd_nc.append(rows["nd_nc"].tolist())
    gene_in_block.append(genes)

In [14]:
# Restrict `index` to the single-effect blocks, keep only id1/id2, and relabel
# the rows 0..n-1 so the row label can be used as a position into per-block lists.
single_effect_rows = index["number_effect"] == 1
index = index.loc[single_effect_rows, ["id1", "id2"]]
index = index.set_index([list(range(index.shape[0]))])

In [15]:
index.shape

(69, 2)

In [24]:
index.tail()

Unnamed: 0,id1,id2
64,2062,2067
65,2133,2137
66,2140,2141
67,2261,2270
68,2271,2289


In [17]:
# Build one comparison table per single-effect block (logistic, PyMC3 and SuSiE
# results next to the simulated effect and the Fisher counts), then stack them.
# The per-block frames are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies all previous rows on every iteration and
# starts from an empty DataFrame, which recent pandas versions warn about.
block_frames = []
# The row label `idx` doubles as the position into the per-block lists built earlier.
for idx, id1, id2 in index[["id1", "id2"]].itertuples(index=True, name=None):
    # Shared directory + file stem for the three result files of this block.
    prefix = f"{cwd}/{folder}/block_{id1}_{id2}/deletion.genes.block20.for_simu.sample.combined.genes.block_{id1}_{id2}"

    # The table read from the RDS file is stored under the key None.
    logit = pyreadr.read_r(f"{prefix}.logit.rds")
    pymc3 = pd.read_csv(f"{prefix}.pymc3.gz", sep="\t")
    pymc3["block"] = f"block{idx}"
    susie = load_rds(f"{prefix}.SuSiE.L_1.prior_0p005.susie.rds")["pip"]

    res = pd.concat([logit[None][["p1"]], pymc3[["inclusion_probability", "block"]]], axis=1)
    res["susie"] = susie
    res["beta"] = beta_in_block[idx]
    res["fisher"] = fisher_in_block[idx]
    res["gene"] = gene_in_block[idx]
    res["d_c"] = d_c[idx]
    res["d_nc"] = d_nc[idx]
    res["nd_c"] = nd_c[idx]
    res["nd_nc"] = nd_nc[idx]
    res = res.rename(columns={"p1": "logit", "inclusion_probability": "pymc3"})
    block_frames.append(res)

# pd.concat raises on an empty list, so fall back to an empty frame when there are no blocks.
logit_pymc3 = pd.concat(block_frames) if block_frames else pd.DataFrame()

  from pandas.core.index import Index as PandasIndex


In [19]:
# Round the numeric result columns to 6 decimals, using vectorised NumPy calls
# instead of a row-wise apply and per-element rounding.
for column in ("beta", "logit", "pymc3", "susie"):
    logit_pymc3[column] = np.round(logit_pymc3[column].to_numpy(dtype=float), 6)
# Normalise negative zero AFTER rounding: rounding a tiny negative value
# (e.g. -1e-8) yields -0.0, which a pre-rounding check cannot catch.
# Adding 0.0 turns -0.0 into 0.0 and leaves every other value unchanged.
logit_pymc3["beta"] = logit_pymc3["beta"].to_numpy() + 0.0

In [20]:
# Fix the column order of the comparison table: identifiers, simulated effect,
# Fisher p-value and counts, then the three methods' results.
column_order = [
    "block", "gene", "beta", "fisher",
    "d_c", "d_nc", "nd_c", "nd_nc",
    "logit", "pymc3", "susie",
]
logit_pymc3 = logit_pymc3.loc[:, column_order]

In [21]:
logit_pymc3[logit_pymc3["block"] == "block12"]

Unnamed: 0,block,gene,beta,fisher,d_c,d_nc,nd_c,nd_nc,logit,pymc3,susie
0,block12,gene_379,-0.587536,0.624964,3,10452,1,10454,0.333333,0.065,0.333333
1,block12,gene_380,0.0,0.624964,3,10452,1,10454,0.333333,0.0575,0.333333
2,block12,gene_381,0.0,0.624964,3,10452,1,10454,0.333333,0.0605,0.333333


In [22]:
# Write the comparison table with a header row and without the row index.
# Note: the file is tab-separated even though its name ends in ".csv".
logit_pymc3.to_csv(f"{cwd}/{folder}/logit_pymc3_comparison_1.csv", sep = "\t", header = True, index = False)

In [23]:
logit_pymc3[logit_pymc3["block"] == "block30"]

Unnamed: 0,block,gene,beta,fisher,d_c,d_nc,nd_c,nd_nc,logit,pymc3,susie
0,block30,gene_924,-0.716931,0.048951,4,10451,13,10442,0.139127,0.0265,0.124754
1,block30,gene_925,0.0,0.010581,5,10450,18,10437,0.360873,0.0485,0.375246
2,block30,gene_926,0.0,0.010581,5,10450,18,10437,0.360873,0.0425,0.375246
3,block30,gene_927,0.0,0.048951,4,10451,13,10442,0.139127,0.022,0.124754
