In [1]:
from simulation import *
from power_typeIerror import *
from collections import OrderedDict
import numpy as np
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()
readRDS = robjects.r['readRDS']
import matplotlib.pyplot as plt

In [2]:
def get_causal_pvalue_from_pkl(input_data):
    '''Get -log10 p values of fisher's exact test of causal genes from pkl input.

    The data are loaded with load_data, turned into a gene table with
    get_gene_table and tested with test_contingency_table. Only genes listed
    under dat["debug"]["causal genes"] that also appear in the test result
    are kept.

    Parameters
    ----------
    input_data : str
        Path of the pkl file handed to load_data.

    Returns
    -------
    overlap_genes : list
        Causal genes found in the test result, in the order they appear in
        fisher_table["genes"] (deterministic across runs).
    pvalue_df : OrderedDict
        gene -> -log10(p value), sorted from the largest to the smallest score.

    Notes
    -----
    A p value of exactly 0 yields an infinite score; it is not replaced here.
    '''
    dat = load_data(input_data)
    causal_genes = set(dat["debug"]["causal genes"])
    sample_genes = get_gene_table(dat)
    fisher_table = test_contingency_table(sample_genes)
    # Single pass over the test result: "genes" and "p_value" are read at the
    # same position. Keeping only the first occurrence of a gene matches what
    # list.index would return, without rescanning the list for every gene.
    overlap_genes = []
    log_overlap_pvalue = []
    already_taken = set()
    for position, gene in enumerate(fisher_table["genes"]):
        if gene in causal_genes and gene not in already_taken:
            already_taken.add(gene)
            overlap_genes.append(gene)
            log_overlap_pvalue.append(-np.log10(fisher_table["p_value"][position]))
    # sorted() is stable, so genes with equal scores keep the table order.
    pvalue_df = OrderedDict(sorted(zip(overlap_genes, log_overlap_pvalue),
                                   key = lambda pair: pair[1], reverse=True))
    return overlap_genes, pvalue_df

In [3]:
# Fisher-test scores of the causal genes stored in this pkl file.
overlap_genes_from_pkl, pvalue_df_pkl = get_causal_pvalue_from_pkl(
    input_data="data/del_sim_shape5_scale1_p0.005_N4000_M200_12.data.pkl"
)

In [14]:
# print (pvalue_df_pkl)

In [30]:
def get_lfdr(rds_data, overlap_genes_from_pkl, inf_sub = 0):
    '''Get -log10(lfdr) of the given genes from an RDS result file.

    lfdr is computed as 1 - pip, where the pip values are read from
    position 20 of the object stored in the RDS file.

    Parameters
    ----------
    rds_data : str
        Path of the RDS file, read through R's readRDS.
    overlap_genes_from_pkl : list
        Gene names to look up among the names attached to the pip values.
    inf_sub : number, optional
        Score used in place of an infinite -log10(lfdr), which happens when
        lfdr is exactly 0 (pip == 1). With the default of 0 such genes end up
        at the bottom of the ranking; pass a large value to rank them first.

    Returns
    -------
    OrderedDict
        gene -> -log10(lfdr), sorted from the largest to the smallest score.

    Raises
    ------
    ValueError
        If a requested gene is not among the pip gene names.
    '''
    df = readRDS(rds_data)
    # NOTE(review): ri2py looks to have been renamed rpy2py in rpy2 >= 3.0 --
    # confirm against the installed rpy2 version.
    df = pandas2ri.ri2py(df)
    # print (df.names)
    # "20" is "pip", "18" is "summary"; the lookup is positional, so it
    # depends on the layout of the saved R object.
    pip_rds = df[20]
    pip_genes = list(pip_rds.names[0])
    pip_list = np.array(np.matrix(pip_rds)).reshape(-1,).tolist()

    # First position of every gene name: same answer as list.index, but
    # without rescanning the whole list for each requested gene.
    first_position = {}
    for position, gene in enumerate(pip_genes):
        first_position.setdefault(gene, position)

    missing = [gene for gene in overlap_genes_from_pkl if gene not in first_position]
    if missing:
        raise ValueError("genes not found among the pip names of {}: {}".format(
            rds_data, missing))

    log_lfdr_overlap = []
    # lfdr == 0 is expected for pip == 1; the resulting infinity is replaced
    # just below, so the divide-by-zero warning would only be noise.
    with np.errstate(divide="ignore"):
        for gene in overlap_genes_from_pkl:
            lfdr = 1 - pip_list[first_position[gene]]
            score = -np.log10(lfdr)
            # replace infinity with inf_sub
            log_lfdr_overlap.append(inf_sub if np.isinf(score) else score)

    lfdr_df = OrderedDict(sorted(zip(overlap_genes_from_pkl, log_lfdr_overlap),
                                 key = lambda pair: pair[1], reverse=True))
    return lfdr_df

In [38]:
# -log10(lfdr) of the same causal genes, read from the RDS result file.
lfdr_df_rds = get_lfdr(
    rds_data="data/del_sim_shape5_scale1_p0.005_N4000_M200_12.data.feather.RDS",
    overlap_genes_from_pkl=overlap_genes_from_pkl,
)

In [39]:
# Number of genes whose -log10(lfdr) score exceeds 0.3.
print(sum(1 for score in lfdr_df_rds.values() if score > 0.3))

3


In [33]:
# Dump every -log10(lfdr) score, largest first.
print(lfdr_df_rds.values())

odict_values([0.31734223404192258, 0.3060063163444412, 0.30023947646501031, 0.13382771172494831, 0.071298832768123516, 0.029133591665130142, 0.020603568782929089, 0.018498541148676065, 0.015315561639779179, 0.01413972171837116, 0.01409253938696697, 0.013311358877298078, 0.013010592172627162, 0.010871024730746703, 0.010033329005366428, 0.0096803374639285646, 0.0091761293677533051, 0.0091684834126166109, 0.0089377240230562819, 0.0076889946740771528, 0.0072416080907565282, 0.0071972962490682987, 0.0070547860113805254, 0.0063655671335553412, 0.006198223113734214, 0.0058869566116167534, 0.0058261573105750277, 0.0057986931104516502, 0.0055708303481066864, 0.005444440551454972, 0.0052047625198270121, 0.0048981982573831117, 0.0048074037503126289, 0.0047934692383957804, 0.0047407855852935376, 0.0047254428172659882, 0.0046857003022058447, 0.0046533235286056419, 0.0045722525098816601, 0.0044830696078400163, 0.0043558927612178311, 0.004267867028749108, 0.0041886621270019776, 0.0040420155774971763,

In [12]:
# Line up both scores gene by gene, following the order of lfdr_df_rds:
# "p1" is the fisher-test score, "p2" the lfdr score.
comb = {
    "p1": [pvalue_df_pkl[gene] for gene in lfdr_df_rds],
    "p2": list(lfdr_df_rds.values()),
    "gene": list(lfdr_df_rds.keys()),
}
# print (comb)

In [9]:
# Scatter of the fisher-test score (x) against the lfdr score (y), one point
# per gene. An explicit figure/axes pair is used so the cell never draws on,
# or closes, a figure left open by another cell.
fig, ax = plt.subplots()
ax.scatter(comb["p1"], comb["p2"])
for label, p1, p2 in zip(comb["gene"], comb["p1"], comb["p2"]):
    # Only name genes that stand out: moderately in both scores, or strongly
    # in at least one of them.
    if (p1>1 and p2>1) or (p1>2) or (p2>2):
        ax.annotate(label, xy = (p1, p2))
ax.set_xlabel("-log(p) fisher test")
ax.set_ylabel("-log(lfdr)")
ax.set_title("-log(p_value) vs -log(lfdr)")
# plt.show()
fig.savefig("fig_4000_200_5_0.005.png")
# Close this figure explicitly to release its memory.
plt.close(fig)

In [10]:
# from itertools import chain
# from collections import defaultdict
# overall_df = defaultdict(list)
# for k,v in chain(pvalue_df_pkl.items(), lfdr_df_rds.items()):
#     # The first element in value is -log10(pvalue) from fisher test, the second is -log10(lfdr)
#     overall_df[k].append(v)
# print (overall_df)

In [11]:
# from numpy import inf
# test1=np.array([1,3,9,5,6,7,inf])
# print (max(test1))
# test1[np.isinf(test1)]=10