In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats
import logistic_regression as lr
import naive_bayes as nb

In [2]:
train = pd.read_csv("../multi_task_model_optimal_1.5_threshold/train.csv", index_col=(0,1))
test = pd.read_csv("../multi_task_model_optimal_1.5_threshold/n1.csv", index_col=(0,1))

In [3]:
# multi-task parameters
beta_children = np.loadtxt("../multi_task_model_optimal_1.5_threshold/beta_children.txt")
alpha = np.loadtxt("../multi_task_model_optimal_1.5_threshold/beta_parent.txt")
phi = np.loadtxt("../multi_task_model_optimal_1.5_threshold/phi.txt")
# RIVER parameters
beta_river = np.loadtxt("../multi_task_model_optimal_1.5_threshold/beta_RIVER.v6.txt")

lambda_hp_parent = 4.333047702488766
num_g_features = len(beta_river)
lambda_hp_children_dict = {'brain': 0.562229, 'group1': 0.75696656, 'muscle': 1.51665066, 'epithelial': 2.22486734, 'digestive': 5.17309994}
tissues = ['brain', 'digestive', 'epithelial', 'group1', 'muscle']
genomic_features = list(train.columns[0:-9])
phi_river = np.zeros((2,2))
phi_river[0][0] = .8
phi_river[1][0] = .2
phi_river[0][1] = .3
phi_river[1][1] = .7

In [25]:
def _river_likelihood(data, beta_river, phi_river):
    return eStepRIVER(data, beta_river, phi_river)

def eStepRIVER(data, beta, phi):
    '''
       Compute expectation for RIVER
    '''
    # log P(Z = 1 | G)
    log_prob_z_1_given_g = lr.log_prob(data[genomic_features].values, beta)

    # log P(Z = 0 | G)
    log_prob_z_0_given_g = np.log(1.0 - np.exp(log_prob_z_1_given_g))

    # naive bayes
    log_prob_e_given_z_1 = nb.log_prob(data['shared_label'].values, 1, phi)        
    log_prob_e_given_z_0 = nb.log_prob(data['shared_label'].values, 0, phi)

    log_q = log_prob_e_given_z_1 + log_prob_z_1_given_g -  np.log(np.exp(log_prob_e_given_z_0) * np.exp(log_prob_z_0_given_g) + 
        np.exp(log_prob_e_given_z_1) * np.exp(log_prob_z_1_given_g))

    return np.sum(log_q)

In [9]:
def _shared_logistic_regression_likelihood(data):
    beta_shared = np.loadtxt('../src/genome_only_shared_beta.v6.txt')
    p_z_1_given_g = np.exp(lr.log_prob(data[genomic_features].values, beta_shared)) * data['shared_label'].values
    p_z_0_given_g = (1.0 - p_z_1_given_g) * (1.0 - data['shared_label'].values)
    log_prob = np.sum(np.log(p_z_1_given_g + p_z_0_given_g))
    return log_prob

In [10]:
def _tissue_specific_logistic_regression_likelihood(data):
    log_prob = 0.0
    for tissue in tissues:
        data_t = data[data["tissue"] == tissue]
        beta = lr.sgd(data_t[genomic_features].values, data_t["expr_label"].values, np.zeros(len(genomic_features)), np.zeros(len(genomic_features)), 1.0)
        p_z_1_given_g = np.exp(lr.log_prob(data_t[genomic_features].values, beta)) * data_t["expr_label"].values
        p_z_0_given_g = (1.0 - p_z_1_given_g) * (1.0 - data_t["expr_label"].values)
        log_prob += np.sum(np.log(p_z_1_given_g + p_z_0_given_g))
    return log_prob

In [16]:
def _multitask_likelihood(data, alpha, beta_children, lambda_hp_parent, lambda_hp_children_dict, phi):
    log_prob = 0
    log_prob += log_p_alpha(alpha, lambda_hp_parent)
    log_prob += log_p_beta_child_given_beta(beta_children, alpha, lambda_hp_children_dict)
    log_prob += eStepGlobal(data, tissues, beta_children, phi)
    return log_prob

In [17]:
def log_p_alpha(alpha, sigma):
    '''
        Compute P(Beta_j) = normal distribution, mean = 0, cov = sigma
        @param: j - jth component of beta
    '''
    
    return np.sum(np.log(scipy.stats.norm(0, lambda_hp_parent).pdf(alpha)))

def log_p_beta_child_given_beta(beta_children, alpha, lambda_hp_children_dict):
    '''
        Compute P(Beta_child_j | Beta_j) = normal distribution, mean = beta_j, cov = sigma_child
        @param: i - tissue i
        @param: j - component
    '''
    log_prob = 0
    for i in range(len(tissues)):
        beta = beta_children[i]
        log_prob += np.sum(np.log(scipy.stats.norm(alpha,lambda_hp_children_dict[tissues[i]]).pdf(beta)))
    return log_prob

def eStepLocal(data, tissue, beta, phi):
    '''
       Compute expectation for tissue i
    '''
    # log P(Z = 1 | G)
    data_t = data[data["tissue"] == tissue]
    log_prob_z_1_given_g = lr.log_prob(data_t[genomic_features].values, beta)

    # log P(Z = 0 | G)
    log_prob_z_0_given_g = np.log(1.0 - np.exp(log_prob_z_1_given_g))

    # noisy OR
    log_prob_e_given_z_1 = nb.log_prob_noisyor_2_params(data_t['expr_label'], 1, data_t["eqtl"], phi)
    log_prob_e_given_z_0 = nb.log_prob_noisyor_2_params(data_t['expr_label'], 0, data_t["eqtl"], phi)

    # naive bayes
    #log_prob_e_given_z_1 = nb.log_prob(self.train_list[i]['expr_label'].values, 1, self.phi)        
    #log_prob_e_given_z_0 = nb.log_prob(self.train_list[i]['expr_label'].values, 0, self.phi)

    log_q = log_prob_e_given_z_1 + log_prob_z_1_given_g -  np.log(np.exp(log_prob_e_given_z_0) * np.exp(log_prob_z_0_given_g) + 
        np.exp(log_prob_e_given_z_1) * np.exp(log_prob_z_1_given_g))

    return np.sum(log_q)

def eStepGlobal(data, tissues, beta_children, phi):
    log_prob = 0.0
    for i in range(len(tissues)):
        log_prob += eStepLocal(data, tissues[i], beta_children[i], phi)
    return log_prob

In [31]:
print("multi-task test set likelihood: ", 
      _multitask_likelihood(test, alpha, beta_children, lambda_hp_parent, lambda_hp_children_dict, phi))

multi-task model test set likelihood:  -138542.753308


In [32]:
print("RIVER test set likelihood: ", _river_likelihood(test, beta_river, phi_river))

RIVER test set likelihood:  -99088.2641828


In [33]:
print("tissue specific genome only test set likelihood: ", _tissue_specific_logistic_regression_likelihood(test))

tissue specific genome only test set likelihood:  -2909.42119849


In [34]:
print("shared tissue genome only test set likelihood: ", _shared_logistic_regression_likelihood(test))

shared tissue genome only test set likelihood:  -1120.66191614
