In [2]:
import os, pickle
import numpy as np, pandas as pd
cwd = os.path.expanduser("~/GIT/cnv-gene-mapping/output/archive/deletion_simu_30_shape0.777_scale0.843")
X = pd.read_csv(f'{cwd}/block_1151_1159/deletion.genes.block30.for_simu.sample.combined.genes.block_1151_1159.gz', compression = "gzip", sep = "\t", header = None, dtype = float)
X = X.iloc[:, 1:4]
X = X.astype(int)
y = np.loadtxt(f'{cwd}/deletion.genes.block30.for_simu.sample.combined.y.gz', dtype=int)

In [4]:
X.head()

Unnamed: 0,1,2,3
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


In [7]:
sum(X[3])

79

In [8]:
X.corr()

Unnamed: 0,1,2,3
1,1.0,1.0,0.670527
2,1.0,1.0,0.670527
3,0.670527,0.670527,1.0


In [9]:
import pymc3 as pm
import theano.tensor as tt
import matplotlib.pyplot as plt

In [10]:
from scipy.special import expit

In [11]:
hyperparam_file = f"{cwd}/deletion.genes.block30.for_simu.varbvs.whole.genome.prior.csv"

In [12]:
prevalence = 0.05

In [13]:
priors = np.loadtxt(hyperparam_file)
pi0, mu0, s0 = priors[0], priors[1], priors[2]
X_complete = X
# remove duplicated columns in X but still keep track of the number of occurance and index removed 
# in order to reconstruct results later
X, index_reconstruct, dup_counts = np.unique(X_complete.to_numpy(), axis=1, return_inverse=True, return_counts=True)
case_prop = sum(y) / y.shape[0]
invlogit = lambda x: 1/(1 + tt.exp(-x))
upper = np.log(case_prop / (1-case_prop))
lower = np.log(prevalence / (1-prevalence))

In [17]:
reparameterize = False
n_chain = 2
iteration = 10
tune_prop = 0.25
target_accept = 0.98
mcmc_seed = 999

In [20]:
model = pm.Model()
with model:
    xi = pm.Bernoulli('xi', pi0, shape = X.shape[1]) #inclusion probability for each variable
    if reparameterize:
        beta_offset = pm.Normal('beta_offset', mu = 0, sd = 1, shape = X.shape[1])
        alpha_offset = pm.distributions.continuous.Uniform("alpha_offset", lower = -1, upper = 1)
        beta = pm.Deterministic("beta", mu0 + beta_offset * s0) #Prior for the non-zero coefficients
        alpha = pm.Deterministic("alpha", lower + (alpha_offset+1)/2*(upper - lower))
    else:
        beta = pm.Normal('beta', mu = mu0, sd = s0, shape = X.shape[1])
        alpha = pm.distributions.continuous.Uniform("alpha", lower = lower, upper = upper)
    p = pm.math.dot(X, xi * beta) #Deterministic function to map the stochastics to the output
    y_obs = pm.Bernoulli('y_obs', invlogit(p + alpha), observed = y) #Data likelihood
# Fit model multiple times
results = []
for i in range(n_chain):
    with model:
        trace = pm.sample(draws = iteration, init = 'nuts', chains = 1, tune = int(tune_prop * iteration),
                          nuts = {"target_accept": target_accept},
                          random_seed = mcmc_seed + i, cores = 1, progressbar = True)
    # FIXME: dump trace to pkl here, if needed
    # results
    pip = np.apply_along_axis(np.mean, 0, trace['xi'])
    print("pip before", pip)
    beta = np.apply_along_axis(np.mean, 0, np.multiply(trace["beta"], trace["xi"]))
    beta_given_inclusion = np.apply_along_axis(np.sum, 0, trace['xi'] * trace['beta']) / np.apply_along_axis(np.sum, 0, trace['xi'])
    # reconstruct original results adding back duplicated variables
    pip = pip / dup_counts
    beta = beta / dup_counts
    result = np.vstack((pip, beta, beta_given_inclusion)).T[index_reconstruct,:]
    print("pip after", result[0])
    results.append(pd.DataFrame(result, columns = ['inclusion_probability', 'beta', 'beta_given_inclusion']))
# merge results

results = sum(results)/len(results)
results = results.set_index(X_complete.columns)
results[["inclusion_probability"]].to_csv(f"{cwd}/new_mcmc_test_pip", sep = "\t", header = False, index = True)
results.to_csv(f"{cwd}/new_mcmc_test_res", sep = "\t", header = False, index = True)



Sampling chain 0, 9 divergences: 100%|##########| 12/12 [00:00<00:00, 34.70it/s]


pip before [1. 1.]
pip after [0.5        0.39161545 0.7832309 ]


Sampling chain 0, 9 divergences: 100%|##########| 12/12 [00:00<00:00, 37.65it/s]


pip before [1. 1.]
pip after [0.5        0.39161545 0.7832309 ]


In [16]:
results

Unnamed: 0,inclusion_probability,beta,beta_given_inclusion
0,0.5,0.577528,1.155056
1,0.5,0.577528,1.155056
2,0.021,0.006963,0.322051
