# Obtain LFDR from the .RDS file and Fisher's exact test p-values from the simulated dataset (.pkl)

In [1]:
from simulation import *
from power_typeIerror import *
from collections import OrderedDict
import numpy as np
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()
readRDS = robjects.r['readRDS']
from rpy2.robjects.packages import importr
import matplotlib.pyplot as plt
# from numpy import inf

In [2]:
def get_causal_pvalue_from_pkl(input_data):
    '''Collect Fisher's exact test p-values of the causal genes in a simulation.

    The dataset is loaded with `load_data`, converted to a per-gene table with
    `get_gene_table` and tested with `test_contingency_table`. Only genes
    listed under ``dat["debug"]["causal genes"]`` that also appear in the
    test result are kept.

    Parameters
    ----------
    input_data
        Passed unchanged to `load_data` (callers in this file pass the path
        of a ``.pkl`` file).

    Returns
    -------
    overlap_genes : list
        Causal genes present in the Fisher table, in arbitrary (set) order.
    pvalue_df : OrderedDict
        gene -> p-value, ordered from the largest to the smallest p-value.
        P-values are returned untransformed (no -log10 applied here).
    args
        The object stored under ``dat["debug"]["args"]``.
    '''
    dat = load_data(input_data)
    debug_info = dat["debug"]
    causal_genes = set(debug_info["causal genes"])
    args = debug_info["args"]

    fisher_table = test_contingency_table(get_gene_table(dat))
    tested_genes = fisher_table["genes"]
    p_values = fisher_table["p_value"]
    overlap_genes = list(causal_genes.intersection(tested_genes))

    # Build the gene -> position map once instead of scanning the gene list
    # for every overlapping gene. setdefault keeps the FIRST occurrence, which
    # is what list.index() would have returned.
    first_position = {}
    for position, gene in enumerate(tested_genes):
        first_position.setdefault(gene, position)

    overlap_pvalue = [p_values[first_position[gene]] for gene in overlap_genes]
    pvalue_df = OrderedDict(sorted(zip(overlap_genes, overlap_pvalue),
                                   key=lambda item: item[1], reverse=True))
    return overlap_genes, pvalue_df, args

In [3]:
def get_lfdr(rds_data, overlap_genes_from_pkl, inf_sub = None):
    '''Return -log10(lfdr) for the requested genes, read from an .RDS result.

    lfdr is computed as ``1 - pip`` from element 20 of the object returned by
    `readRDS`; the gene names are taken from the first dimension names of
    that element.

    Parameters
    ----------
    rds_data
        Passed unchanged to R's `readRDS` (callers pass a file path).
    overlap_genes_from_pkl : iterable
        Genes to look up. Every gene must be present in the PIP table.
    inf_sub : float, optional
        Value substituted for infinite -log10(lfdr) entries (lfdr == 0).
        When None (default) the largest finite value among the requested
        genes is used instead.

    Returns
    -------
    OrderedDict
        gene -> -log10(lfdr), ordered from the largest to the smallest value.

    Raises
    ------
    ValueError
        If a requested gene is missing from the PIP table, or if `inf_sub` is
        None and every requested value is infinite (no finite maximum exists).
    '''
    df = readRDS(rds_data)
    # NOTE(review): `ri2py` is the rpy2 2.x name of this converter; later rpy2
    # releases appear to expose it as `rpy2py` - confirm the pinned version.
    df = pandas2ri.ri2py(df)
    # "20" is "pip", "18" is "summary"
    pip_rds = df[20]
    pip_genes = list(pip_rds.names[0])
    pip_list = np.array(np.matrix(pip_rds)).reshape(-1,).tolist()

    # One pass to index the genes; setdefault keeps the first occurrence,
    # matching what list.index() returns.
    first_position = {}
    for position, gene in enumerate(pip_genes):
        first_position.setdefault(gene, position)

    log_lfdr_overlap = []
    for gene in overlap_genes_from_pkl:
        if gene not in first_position:
            # Same exception type list.index() raised, with a usable message.
            raise ValueError("gene {!r} is not in the PIP table of {!r}".format(gene, rds_data))
        lfdr_value = 1 - pip_list[first_position[gene]]
        log_lfdr_overlap.append(-np.log10(lfdr_value))

    # Replace infinities (lfdr == 0). The fallback maximum is computed once,
    # and only when it is actually needed.
    replacement = inf_sub
    if replacement is None and any(np.isinf(x) for x in log_lfdr_overlap):
        finite_values = [x for x in log_lfdr_overlap if not np.isinf(x)]
        if not finite_values:
            raise ValueError("all -log10(lfdr) values are infinite; "
                             "pass inf_sub to choose a replacement value")
        replacement = max(finite_values)
    log_lfdr = [replacement if np.isinf(x) else x for x in log_lfdr_overlap]

    lfdr_df = OrderedDict(sorted(zip(overlap_genes_from_pkl, log_lfdr),
                                 key=lambda item: item[1], reverse=True))
    return lfdr_df

In [4]:
def convert_list_to_R(list_dat):
    '''Convert a Python sequence of numbers for use as an R numeric vector.

    The values are wrapped in an rpy2 ``FloatVector`` and passed through R's
    base ``c()``; the result of that call is returned as-is.

    Parameters
    ----------
    list_dat
        Sequence of values accepted by ``robjects.FloatVector``.
    '''
    # Only base R's c() is used, so no additional R package is loaded here;
    # requiring one would make this fail on machines where it is not installed.
    return robjects.r.c(robjects.FloatVector(list_dat))

In [5]:
# Define an R function `f` in the embedded R session. It runs fdrtool on its
# argument with statistic = "pvalue" and returns the `lfdr` component of the
# result. The R parameter is named `lfdr`, but callers in this file pass
# p-values to it.
# NOTE(review): fdrtool's `plot` argument is not set here, so the package
# default applies - confirm whether diagnostic plots are wanted on each call.
robjects.r('''
library("fdrtool")
f <- function(lfdr){
    fdr = fdrtool(lfdr, statistic = "pvalue", verbose = FALSE)
    return (fdr$lfdr)
}
''')
# Python handle on the R function `f` defined above.
p_to_lfdr = robjects.r["f"]

In [6]:
def lfdr_pvalue_plot(pvalue_dat, lfdr_dat):
    '''Scatter-plot two -log10(lfdr) values per causal gene and save a PNG.

    x axis: -log10 of the lfdr that `p_to_lfdr` derives from the Fisher's
    exact test p-values (so lfdr, not the p-value itself, is plotted).
    y axis: -log10(lfdr) returned by `get_lfdr` for the same genes.
    Genes whose x and y values are both greater than 1 are labelled.

    The figure is written to the current working directory as
    ``N{n}_M{m}_shape{shape}_p{prevalence}.png``, with the values taken from
    the simulation `args` returned by `get_causal_pvalue_from_pkl`.

    Parameters
    ----------
    pvalue_dat
        Passed to `get_causal_pvalue_from_pkl`.
    lfdr_dat
        Passed to `get_lfdr`.

    Returns
    -------
    None
    '''
    overlap_genes, pvalue, args = get_causal_pvalue_from_pkl(pvalue_dat)
    lfdr = get_lfdr(lfdr_dat, overlap_genes)
    p_vector = convert_list_to_R(list(pvalue.values()))
    p_to_lfdr_list = [-np.log10(x) for x in p_to_lfdr(p_vector)]

    # Pair each gene with its value (same order as `pvalue`), then order the
    # points by gene name.
    points = sorted(zip(pvalue, p_to_lfdr_list))
    genes = [gene for gene, _ in points]
    x_values = [value for _, value in points]
    y_values = [lfdr[gene] for gene in genes]

    # Looked up before any figure exists, so a missing key cannot leave a
    # half-drawn figure behind.
    summary = (args["n_case"] + args["n_ctrl"], args["n_causal_gene"],
               args["odds_ratio_params"]["shape"], args["prevalence"])

    # Use a dedicated figure instead of whatever pyplot considers current,
    # and always close it, even if drawing or saving fails.
    fig, ax = plt.subplots()
    try:
        ax.scatter(x_values, y_values)
        for label, p1, p2 in zip(genes, x_values, y_values):
            if p1 > 1 and p2 > 1:
                ax.annotate(label, xy=(p1, p2))
        ax.set_xlabel("-log(lfdr) from Fisher's pvalue")
        ax.set_ylabel("-log(lfdr)")
        ax.set_title("N={}, M={}, Gamma~({},1), p={}".format(*summary))
        fig.savefig("N{}_M{}_shape{}_p{}.png".format(*summary))
    finally:
        plt.close(fig)
    return None

In [7]:
# Run the comparison for one simulated dataset (.pkl) and the .RDS result
# that shares its file-name prefix; the plot is saved by the function itself.
lfdr_pvalue_plot(
    pvalue_dat="data/del_sim_shape5_scale1_p0.005_N4000_M200_12.data.pkl",
    lfdr_dat="data/del_sim_shape5_scale1_p0.005_N4000_M200_12.data.feather.RDS",
)

In [8]:
# overlap_genes_from_pkl, pvalue_df_pkl, args = get_causal_pvalue_from_pkl(
#     "data/del_sim_shape9_scale1_p0.01_N4000_M100_12.data.pkl")

In [9]:
# lfdr_df_rds = get_lfdr("data/del_sim_shape9_scale1_p0.01_N4000_M100_12.data.feather.RDS", overlap_genes_from_pkl)
# print (lfdr_df_rds)

In [10]:
# fig = plt.scatter(comb["p1"], comb["p2"])
# for label, p1, p2 in zip(comb["gene"], comb["p1"], comb["p2"]):
#     if (p1>1 and p2>1) or (p1>2) or (p2>2):
#         plt.annotate(label, xy = (p1, p2))
# plt.xlabel("-log(p) fisher test")
# plt.ylabel("-log(lfdr)")
# plt.title("-log(p_value) vs -log(lfdr)")
# # plt.show()
# plt.savefig("fig_4000_200_5_0.005.png")
# plt.close()