In [None]:
import os
os.chdir(os.path.expanduser("~/speos/"))
from speos.utils.datahandlers import ResultsHandler
from speos.postprocessing.postprocessor import PostProcessor
from speos.utils.config import Config
from speos.visualization.settings import *

import numpy as np
import matplotlib as plt


# Figure geometry: `full_width` is a width in centimetres and `cm` is the
# centimetre-to-inch factor applied when a figsize is passed to matplotlib.
full_width, cm = 18, 1/2.54

# Font sizes shared by the plotting cells.
small_font, medium_font, large_font = 6, 8, 10

# NOTE: in this cell `plt` is the name bound by `import matplotlib as plt`,
# i.e. the top-level matplotlib package, which exposes `rcParams` directly.
plt.rcParams.update({
    'xtick.labelsize': small_font,
    'ytick.labelsize': small_font,
    'axes.linewidth': 0.4,
    'ytick.major.size': 3,
    'ytick.major.width': 0.5,
    'ytick.minor.size': 2,
    'ytick.minor.width': 0.3,
    'xtick.major.size': 2,
    'xtick.major.width': 0.3,
    'xtick.minor.size': 1,
    'xtick.minor.width': 0.1,
})

In [None]:
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
import pandas as pd
import matplotlib.pyplot as plt

def get_mighty_keys(n, results):
    """Return the keys of ``results[0]`` whose value is strictly greater than ``n``.

    The key ``"Total"`` is always skipped. Keys come back in the iteration
    order of the mapping stored at ``results[0]``.
    """
    return [
        name
        for name, score in results[0].items()
        if name != "Total" and score > n
    ]

def get_OR(n, superset, results, setB):
    """Run Fisher's exact test on a contingency table built by the global ``pp``.

    ``results`` is first passed through ``get_mighty_keys(n, results)``. If
    that raises ``TypeError`` (for example when ``results`` is a set, which
    cannot be indexed with ``[0]``), ``results`` itself is used as the
    candidate collection and ``n`` plays no role.

    Returns a tuple ``(odds_ratio, pval, array)`` where ``array`` is whatever
    ``pp.make_contingency_table(superset, candidates, setB)`` returned.
    """
    from scipy.stats import fisher_exact

    try:
        candidates = set(get_mighty_keys(n, results))
    except TypeError:
        candidates = set(results)

    table = pp.make_contingency_table(superset, candidates, setB)
    ratio, p_value = fisher_exact(table)
    return ratio, p_value, table

import numpy as np
from statsmodels.stats.multitest import fdrcorrection as fdr

# Reassigned inside the analysis loop further down in this cell.
background = None

# You will need to adapt this to your (singular) phenotype
phenotypes = ["Ulcerative Colitis"]
methods, pretty_methods = ["film"], ["FiLM"]

# Font sizes for ticks, axis labels and titles.
tick_size = small_font
label_size = title_size = medium_font

# Named colours.
crimson, navy, jungle = "#6f0000", "#02055a", "#1e5631"
tan, petrol = "#c24e00", "#005f6a"
grey, lightgrey = "#bbbbbb", "#dddddd"

width_ratios = [10, 1, 10, 1]
marker_size = 4

# One colour and one marker per phenotype/method (alternative colour: "#01016f").
colors_ = ["#3EB489"]
markers = ["^"]
min_strength = 3000

# Collects one concatenated DataFrame per phenotype.
outer_dfs_list = []

# first, the data is loaded, ORs and pvals are calculated

# Build, for every phenotype, one table of odds ratios (GWAS genes, eleven
# consensus-score thresholds, Mendelian genes) against the mouse-knockout gene
# set, and append it to `outer_dfs_list`.
for frame_idx, (phenotype, color) in enumerate(zip(phenotypes, colors_)):
    if frame_idx == 5:
        break
    dfs = []
    for pretty_method, method, marker in zip(pretty_methods, methods, markers):
        _columns= ["Method", "CS/Mendelian", "N Candidate/Mendelian and MKO", "N Candidate/Mendelian not MKO", "N Not Candidate/Mendelian and MKO", "N Not Candidate/Mendelian Not MKO", "OR", "pval unadjusted", "pval adjusted (FDR)"]
        # 13 rows are filled below: row 0 (GWAS), rows 1-11 (consensus-score
        # thresholds) and row 12 (Mendelian). Allocating all of them up front
        # avoids relying on .loc enlargement for the last row.
        df = pd.DataFrame(data=np.empty((13, len(_columns))), columns=_columns)

        # you will need to adapt your config path

        config_paths = {"Ulcerative Colitis": "config_uc_only_nohetio_{}_newstorage.yaml".format(method)}

        config = Config()
        config.parse_yaml(config_paths[phenotype])

        # get_OR() reads this module-level `pp` to build its contingency tables.
        pp = PostProcessor(config)

        import json

        unknown_genes, all_genes, positive_genes = pp.get_unknown_genes(pp.results_paths[0][0])
        background = set(pp.get_mouse_knockout_background())
        valid_background = background.intersection(all_genes)
        print("Number of Background genes: {}".format(len(valid_background)))
        valid_unlabeled_background = valid_background.intersection(unknown_genes)
        print("Number of unlabeled Background genes: {}".format(len(valid_unlabeled_background)))

        mousegenes = set(pp.get_mouse_knockout_genes())
        valid_mousegenes = pp._return_only_valid(mousegenes, valid_background)
        valid_unlabeled_mousegenes = pp._return_only_valid(mousegenes, valid_unlabeled_background)
        print("Number of unlabeled Mouse Genes genes: {}".format(len(valid_unlabeled_mousegenes)))

        # this has to point to the outer_results file produced by the outer crossval (in the results folder)
        outer_results_paths = {"Ulcerative Colitis": "/mnt/storage/speos/results/uc_{}_nohetioouter_results.json".format(method)}

        with open(outer_results_paths[phenotype], "r") as file:
            results = json.load(file)

        # NOTE(review): the statements below repeat the gene-set computation
        # done above (without the prints); `background` is already set at this
        # point, so the `is None` branch is not taken. Presumably redundant —
        # confirm the PostProcessor calls are side-effect free before removing.
        unknown_genes, all_genes, positive_genes = pp.get_unknown_genes(pp.results_paths[0][0])
        if background is None:
            background = set(pp.get_mouse_knockout_background())
        valid_background = background.intersection(all_genes)
        valid_unlabeled_background = valid_background.intersection(unknown_genes)

        mousegenes = set(pp.get_mouse_knockout_genes())
        valid_mousegenes = pp._return_only_valid(mousegenes, valid_background)
        valid_unlabeled_mousegenes = pp._return_only_valid(mousegenes, valid_unlabeled_background)

        groups = []
        ors = []
        pvals = []
        strengths = []
        n_candidates_and_mko = []
        n_candidates_not_mko = []
        n_not_candidates_and_mko = []
        n_not_candidates_not_mko = []
        print("Mendelian + CS11:")
        cs11 = set(get_mighty_keys(10, results))
        cs11 = cs11.union(positive_genes)
        print(get_OR(0, valid_background, cs11, valid_mousegenes))


        gwas_genes = set(pd.read_csv("/mnt/storage/prs/ldblocks/gene_snps.tsv", names=range(10), usecols=[3], header=None, sep="\t", index_col=None)[3].tolist())
        gwas_genes = gwas_genes.difference(positive_genes)
        valid_gwas_genes = pp._return_only_valid(gwas_genes, valid_background)

        # The first argument of get_OR() is only used when `results` is the
        # loaded JSON; for a set, get_OR() falls back to using the set itself.
        # The original passed the loop variable `i` here, which is not bound
        # yet on a fresh kernel (NameError), so a literal 0 is passed instead.
        g_odds_ratio, g_pval, array = get_OR(0, valid_background, set(valid_gwas_genes), valid_mousegenes)
        m_strength = len(positive_genes)
        df.loc[0, :] = ["-", "GWAS", array[0][0], array[1][0], array[0][1], array[1][1], g_odds_ratio, g_pval, None]

        # Rows 1..11: one row per consensus-score threshold.
        for i in range(11):
            group = ">= {}".format(i + 1)
            groups.append(group)
            odds_ratio, pval, array = get_OR(i, valid_unlabeled_background, results, valid_unlabeled_mousegenes)
            ors.append(odds_ratio)
            pvals.append(pval)
            strength = len(get_mighty_keys(i, results))
            strengths.append(strength)
            df.loc[i + 1, :] = [pretty_method, "CS "+ group, array[0][0], array[1][0], array[0][1], array[1][1], odds_ratio, pval, None]

        min_strength = np.min(strengths) if np.min(strengths) < min_strength else min_strength
        # `i` is bound here (left over from the loop above) and is ignored by
        # get_OR() because a set is passed.
        m_odds_ratio, m_pval, array = get_OR(i, valid_background, set(positive_genes), valid_mousegenes)
        m_strength = len(positive_genes)
        df.loc[12, :] = ["-", "Mendelian", array[0][0], array[1][0], array[0][1], array[1][1], m_odds_ratio, m_pval, None]

        dfs.append(df)

    dfs = pd.concat(dfs)

    # Round the odds-ratio column(s) for reporting.
    for column in dfs.columns:
        if "OR" in column:
            dfs[column] = dfs[column].round(3)
        #if "N " in column:
            #dfs[column] = dfs[column].astype(int)
    outer_dfs_list.append(dfs)

# second, the pvals are adjusted using false discovery rate

# Pool the unadjusted p-values of all phenotype tables, in table order.
pvals = [p for frame in outer_dfs_list for p in frame["pval unadjusted"].tolist()]

# fdr() returns a pair; element 1 holds the adjusted p-values.
adj_pvals = fdr(pvals)[1]

# Hand the adjusted values back to the tables in equal-sized chunks and write
# one TSV per phenotype.
adj_chunks = np.array_split(adj_pvals, len(outer_dfs_list))
for df, adj_pval_one_df, phenotype in zip(outer_dfs_list, adj_chunks, phenotypes):
    n_significant = (adj_pval_one_df < 0.05).sum()
    print(n_significant)
    df["pval adjusted (FDR)"] = adj_pval_one_df
    print((df["pval adjusted (FDR)"] < 0.05).sum())
    df.to_csv("{}_ors_mko_gwas.tsv".format(phenotype), sep="\t", index=False)

# third, the ORs are plotted conditioned on significant pvals

# Lay out one row of four axes. They are unpacked below as (ax0, ax1, ax3,
# invis_ax); invis_ax is hidden at the end of the loop body.
fig, axes = plt.subplots(figsize=(full_width*cm*0.4,6*cm), nrows=1, ncols=4, sharey=False, gridspec_kw={'width_ratios': [1, 11, 1, 3]})
axes = axes.flatten()

# `[axes]` wraps the four axes in a one-element list, so zip() yields a single
# frame (the first phenotype / colour / table).
for frame_idx, (phenotype, color, (ax0, ax1, ax3, invis_ax), df) in enumerate(zip(phenotypes, colors_, [axes], outer_dfs_list)):
    # The string literal below is disabled legend code; as a bare expression
    # statement it has no effect at runtime.
    """
    if frame_idx == len(phenotypes):
        ax0.axis("off")
        ax1.axis("off")
        ax3.axis("off")
        invis_ax.axis("off")
        handles = [Line2D([0], [0], color="black", marker=marker, markersize=marker_size, label=label + " OR") for marker, label in zip(markers, pretty_methods)]
        handles.extend([Line2D([0], [0], color=grey, marker=marker, label=label + " n") for marker, label in zip(markers, pretty_methods)])
        leg = ax1.legend(handles=handles, loc='center', fontsize=label_size, labelspacing=0.5, borderpad=1)
        bb = leg.get_bbox_to_anchor().transformed(ax1.transAxes.inverted())

        # Changes to location of the legend. 
        yOffset = -0.06
        bb.y0 += yOffset
        bb.y1 += yOffset
        leg.set_bbox_to_anchor(bb, transform = ax1.transAxes)
        break
    """
    # Twin y-axes: the primary axes (ax0/ax1/ax3) show gene counts, the twins
    # (axG/ax2/ax4) show odds ratios.
    axG = ax0.twinx()
    ax2 = ax1.twinx()
    ax4 = ax3.twinx()
    handles = []
    min_strength = 1000
    dfs = []

    # Each method occupies `stride` consecutive rows of df: first row GWAS,
    # last row Mendelian, the rows in between are consensus-score thresholds.
    start = 0
    stride = 13

    for method_idx, (pretty_method, method, marker) in enumerate(zip(pretty_methods, methods, markers)):

        method_df = df[start:start + stride]
        start += stride
        # Label-based lookup: the row with index label 1 must belong to this method.
        assert method_df["Method"][1] == pretty_method

        groups = method_df["CS/Mendelian"][1:-1]
        ors = method_df["OR"]
        adj_pvals = method_df["pval adjusted (FDR)"]
        # Set size per row = the two "N Candidate/Mendelian ..." columns added together.
        strengths = np.asarray(method_df["N Candidate/Mendelian and MKO"]) + np.asarray(method_df["N Candidate/Mendelian not MKO"])
        n_candidates_and_mko = method_df["N Candidate/Mendelian and MKO"]
        n_candidates_not_mko = method_df["N Candidate/Mendelian not MKO"]
        n_not_candidates_and_mko = method_df["N Not Candidate/Mendelian and MKO"]
        n_not_candidates_not_mko = method_df["N Not Candidate/Mendelian Not MKO"]

        xind = np.asarray((range(stride)))
        width = 0.8

        # plot set strengths
        # NOTE(review): `strengths[-1]` is a single value (the last row), so
        # `is_positive` is a scalar boolean rather than a per-threshold mask.
        # Indexing with a scalar boolean appears to add a leading axis, which
        # is why `[0]` is taken below. Possibly `strengths[1:-1] > 0` was
        # intended — confirm before changing.
        is_positive = np.asarray(strengths[-1]) > 0
        draw_xind = np.asarray(xind[1:-1])[is_positive]
        draw_strengths = np.asarray(strengths[1:-1])[is_positive]
        ax1.plot(draw_xind[0], draw_strengths[0], color=grey, zorder=1, marker=marker, ms=marker_size-1, linewidth=0.5)
        ax1.set_axisbelow(True)
        # Grey bars: set size of the last row (ax3, tick label "M") and of the
        # first row (ax0, tick label "G").
        ax3.bar(1-(width/2), height=strengths[-1], width=width, color=grey, zorder=1)
        ax3.set_axisbelow(True)
        ax0.bar(1-(width/2), height=strengths[0], width=width, color=grey, zorder=1)
        ax0.set_axisbelow(True)

        # Only thresholds with an adjusted p-value below 0.05 and a non-empty
        # set get an odds-ratio marker.
        is_significant = np.asarray(adj_pvals[1:-1]) < 0.05
        is_enough = np.asarray(strengths[1:-1]) > 0
        use_only = is_significant & is_enough
        draw_xind = np.asarray(xind[1:-1])[use_only]
        draw_ors = np.asarray(ors[1:-1])[use_only]
        handle, = ax2.plot(draw_xind, draw_ors, color=color, zorder=1, marker=marker, markeredgecolor="black",markeredgewidth=0.5, ms=marker_size, label=method)
        handles.append(handle)
        # Coloured bars: odds ratio of the last row (ax4) and of the first row
        # (axG); these are drawn regardless of significance.
        ax4.bar(1+(width/2), height=ors.tolist()[-1], width=width, color=color, zorder=1)
        ax4.set_axisbelow(True)
        axG.bar(1+(width/2), height=ors.tolist()[0], width=width, color=color, zorder=1)
        axG.set_axisbelow(True)
        ax2.set_axisbelow(True)
        ax2.set_xticks(xind)
        ax2.set_xticklabels(xind.astype(np.uint8) + 1, size=small_font)
        ax3.set_xticks([1])
        ax3.set_xticklabels(["M"], size=label_size)
        ax0.set_xticks([1])
        ax0.set_xticklabels(["G"], size=label_size)
        if frame_idx % 3 == 0:
            ax0.set_ylabel("Number of Candidate Genes", size=label_size)
        elif frame_idx % 3 == 2:
            ax4.set_ylabel("Odds Ratio", size=label_size)

        # Set unconditionally, which makes the `elif` branch above redundant.
        ax4.set_ylabel("Odds Ratio", size=label_size)

        ax1.set_xlabel("Consensus Score (>=)", size=label_size)

        ax1.set_title(phenotype, y=1.05, x=0.5, pad=-14, size=title_size)




    # Share one count range across the three count axes, with an upper limit
    # of at least 3500. The name `max_strenth` (sic) is used consistently below.
    _, candidate_max_strength = ax1.get_ylim()
    _, mendelian_max_strength = ax3.get_ylim()
    max_strength = np.max((candidate_max_strength, mendelian_max_strength))
    max_strenth = 3500 if max_strength < 3500 else max_strength

    min_strength = 1

    ax0.set_yscale('log')
    ax1.set_yscale('log')
    ax3.set_yscale('log')
    ax0.set_ylim((min_strength, max_strenth))
    ax1.set_ylim((min_strength, max_strenth))
    ax3.set_ylim((min_strength, max_strenth))

    # Share one odds-ratio range across the three twin axes; the GWAS twin
    # (axG) is not consulted when the maximum is determined.
    candidate_max_or = ax2.get_ylim()[1]
    mendelian_max_or = ax4.get_ylim()[1]
    ax2.set_ylim((0, np.max((candidate_max_or, mendelian_max_or))))
    ax4.set_ylim((0, np.max((candidate_max_or, mendelian_max_or))))
    axG.set_ylim((0, np.max((candidate_max_or, mendelian_max_or))))

    ax1.grid(True, which="major", axis="y", color=lightgrey)
    ax1.grid(True,which="minor", axis="y", linestyle=":", color=lightgrey)
    ax0.grid(True,which="major", axis="y", color=lightgrey)
    ax0.grid(True,which="minor", axis="y", linestyle=":", color=lightgrey)
    ax3.grid(True,which="major", axis="y", color=lightgrey)
    ax3.grid(True,which="minor", axis="y", linestyle=":", color=lightgrey)

    # Hide inner spines, tick marks and tick labels; y tick labels stay on
    # ax0 (left) and ax4 (right).
    ax1.set_yticklabels([])
    ax1.spines['left'].set_visible(False)
    ax2.spines['left'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.set_yticks([])
    for tick in ax1.yaxis.get_major_ticks():
        tick.tick1line.set_visible(False)
        tick.tick2line.set_visible(False)
    for tick in ax1.yaxis.get_minor_ticks():
        tick.tick1line.set_visible(False)
        tick.tick2line.set_visible(False)
    ax0.spines['left'].set_visible(False)
    ax0.spines['right'].set_visible(False)
    ax3.spines['left'].set_visible(False)
    ax3.set_yticklabels([])
    for tick in ax3.yaxis.get_major_ticks():
        tick.tick1line.set_visible(False)
        tick.tick2line.set_visible(False)
    for tick in ax3.yaxis.get_minor_ticks():
        tick.tick1line.set_visible(False)
        tick.tick2line.set_visible(False)
    ax4.spines['left'].set_visible(False)
    axG.set_yticks([])
    axG.spines['right'].set_visible(False)
    ax0.tick_params(axis="y", pad=0.5)
    ax4.tick_params(axis="y", pad=0.5)


    # Final x ticks of the consensus-score panel: thresholds 1..11.
    ax1.set_xticks(range(1,12))
    ax1.set_xticklabels([str(x) for x in range(1,12)])
    ax1.tick_params(axis="x", labelsize=label_size)
    invis_ax.set_visible(False)

fig.tight_layout()
plt.subplots_adjust(wspace=0, hspace=0.1)
plt.savefig("OR_uc.svg", bbox_inches='tight', dpi=400)

In [None]:
import scipy.stats as stats
from scipy.stats import fisher_exact

# Reload the odds-ratio table written earlier; `phenotype` is the loop
# variable left bound by the cell that wrote the file.
results = pd.read_csv("{}_ors_mko_gwas.tsv".format(phenotype), sep="\t", index_col=None, header=0)

# 2x2 table of the GWAS row (taken from the row labelled 0), built from the
# four count columns at positions 2..5.
is_gwas = results["CS/Mendelian"] == "GWAS"
col1 = results[is_gwas].transpose()[0].tolist()
array1 = np.asarray([col1[2:4], col1[4:6]])


# For every other row: two-sided z-test on the difference of the two log odds
# ratios, using the sum of reciprocal cell counts as the variance of each.
pvals = []
for i, group in enumerate(results["CS/Mendelian"].tolist()):
    if group == "GWAS":
        continue
    col2 = results[results["CS/Mendelian"] == group].transpose()[i].tolist()
    array2 = np.asarray([col2[2:4], col2[4:6]])

    odds_typical = fisher_exact(array1)[0]
    odds_atypical = fisher_exact(array2)[0]
    delta = np.log(odds_typical) - np.log(odds_atypical)

    var_typical = np.sum(1 / array1.ravel())
    var_atypical = np.sum(1 / array2.ravel())
    se_delta = np.sqrt(var_typical + var_atypical)

    zval = delta / se_delta
    pval = 2 * stats.norm.sf(abs(zval))
    pvals.append(pval)
pvals

In [None]:
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
import pandas as pd
import matplotlib.pyplot as plt

def get_mighty_keys(n, results):
    """Return the keys of ``results[0]`` whose value is strictly greater than ``n``.

    The key ``"Total"`` is always skipped. Keys come back in the iteration
    order of the mapping stored at ``results[0]``.
    """
    return [
        name
        for name, score in results[0].items()
        if name != "Total" and score > n
    ]

def get_OR(n, superset, results, setB):
    """Run Fisher's exact test on a contingency table built by the global ``pp``.

    ``results`` is first passed through ``get_mighty_keys(n, results)``. If
    that raises ``TypeError`` (for example when ``results`` is a set, which
    cannot be indexed with ``[0]``), ``results`` itself is used as the
    candidate collection and ``n`` plays no role.

    Returns a tuple ``(odds_ratio, pval, array)`` where ``array`` is whatever
    ``pp.make_contingency_table(superset, candidates, setB)`` returned.
    """
    from scipy.stats import fisher_exact

    try:
        candidates = set(get_mighty_keys(n, results))
    except TypeError:
        candidates = set(results)

    table = pp.make_contingency_table(superset, candidates, setB)
    ratio, p_value = fisher_exact(table)
    return ratio, p_value, table

import numpy as np
from statsmodels.stats.multitest import fdrcorrection as fdr

# Reassigned inside the analysis loop further down in this cell.
background = None

# You will need to adapt this to your (singular) phenotype
phenotypes = ["Ulcerative Colitis"]
methods, pretty_methods = ["film"], ["FiLM"]

# Font sizes for ticks, axis labels and titles.
tick_size = small_font
label_size = title_size = medium_font

# Named colours.
crimson, navy, jungle = "#6f0000", "#02055a", "#1e5631"
tan, petrol = "#c24e00", "#005f6a"
grey, lightgrey = "#bbbbbb", "#dddddd"

width_ratios = [10, 1, 10, 1]
marker_size = 4

# One colour and one marker per phenotype/method (alternative colour: "#01016f").
colors_ = ["#3EB489"]
markers = ["^"]
min_strength = 3000

# Collects one concatenated DataFrame per phenotype.
outer_dfs_list = []

# first, the data is loaded, ORs and pvals are calculated

# Build, for every phenotype, one table of odds ratios (GWAS genes that are
# neither keys of the loaded results nor positive genes, eleven
# consensus-score thresholds, Mendelian genes) against the mouse-knockout gene
# set, and append it to `outer_dfs_list`.
for frame_idx, (phenotype, color) in enumerate(zip(phenotypes, colors_)):
    if frame_idx == 5:
        break
    dfs = []
    for pretty_method, method, marker in zip(pretty_methods, methods, markers):
        _columns= ["Method", "CS/Mendelian", "N Candidate/Mendelian and MKO", "N Candidate/Mendelian not MKO", "N Not Candidate/Mendelian and MKO", "N Not Candidate/Mendelian Not MKO", "OR", "pval unadjusted", "pval adjusted (FDR)"]
        # 13 rows are filled below: row 0 (GWAS), rows 1-11 (consensus-score
        # thresholds) and row 12 (Mendelian). Allocating all of them up front
        # avoids relying on .loc enlargement for the last row.
        df = pd.DataFrame(data=np.empty((13, len(_columns))), columns=_columns)

        # you will need to adapt your config path

        config_paths = {"Ulcerative Colitis": "config_uc_only_nohetio_{}_newstorage.yaml".format(method)}

        config = Config()
        config.parse_yaml(config_paths[phenotype])

        # get_OR() reads this module-level `pp` to build its contingency tables.
        pp = PostProcessor(config)

        import json

        unknown_genes, all_genes, positive_genes = pp.get_unknown_genes(pp.results_paths[0][0])
        background = set(pp.get_mouse_knockout_background())
        valid_background = background.intersection(all_genes)
        print("Number of Background genes: {}".format(len(valid_background)))
        valid_unlabeled_background = valid_background.intersection(unknown_genes)
        print("Number of unlabeled Background genes: {}".format(len(valid_unlabeled_background)))

        mousegenes = set(pp.get_mouse_knockout_genes())
        valid_mousegenes = pp._return_only_valid(mousegenes, valid_background)
        valid_unlabeled_mousegenes = pp._return_only_valid(mousegenes, valid_unlabeled_background)
        print("Number of unlabeled Mouse Genes genes: {}".format(len(valid_unlabeled_mousegenes)))

        # this has to point to the outer_results file produced by the outer crossval (in the results folder)
        outer_results_paths = {"Ulcerative Colitis": "/mnt/storage/speos/results/uc_{}_nohetioouter_results.json".format(method)}

        with open(outer_results_paths[phenotype], "r") as file:
            results = json.load(file)

        # NOTE(review): the statements below repeat the gene-set computation
        # done above (without the prints); `background` is already set at this
        # point, so the `is None` branch is not taken. Presumably redundant —
        # confirm the PostProcessor calls are side-effect free before removing.
        unknown_genes, all_genes, positive_genes = pp.get_unknown_genes(pp.results_paths[0][0])
        if background is None:
            background = set(pp.get_mouse_knockout_background())
        valid_background = background.intersection(all_genes)
        valid_unlabeled_background = valid_background.intersection(unknown_genes)

        mousegenes = set(pp.get_mouse_knockout_genes())
        valid_mousegenes = pp._return_only_valid(mousegenes, valid_background)
        valid_unlabeled_mousegenes = pp._return_only_valid(mousegenes, valid_unlabeled_background)

        groups = []
        ors = []
        pvals = []
        strengths = []
        n_candidates_and_mko = []
        n_candidates_not_mko = []
        n_not_candidates_and_mko = []
        n_not_candidates_not_mko = []

        gwas_genes = set(pd.read_csv("/mnt/storage/prs/ldblocks/gene_snps.tsv", names=range(10), usecols=[3], header=None, sep="\t", index_col=None)[3].tolist())
        # "Selective" GWAS set: drop every gene that is a key of results[0]
        # as well as the positive genes.
        gwas_genes = gwas_genes.difference(set(results[0].keys())).difference(positive_genes)

        valid_gwas_genes = pp._return_only_valid(gwas_genes, valid_background)

        # The first argument of get_OR() is only used when `results` is the
        # loaded JSON; for a set, get_OR() falls back to using the set itself.
        # The original passed the loop variable `i` here, which is only bound
        # if an earlier cell or iteration left it behind, so a literal 0 is
        # passed instead.
        g_odds_ratio, g_pval, array = get_OR(0, valid_background, set(valid_gwas_genes), valid_mousegenes)
        m_strength = len(positive_genes)
        df.loc[0, :] = ["-", "GWAS", array[0][0], array[1][0], array[0][1], array[1][1], g_odds_ratio, g_pval, None]

        # Rows 1..11: one row per consensus-score threshold.
        for i in range(11):
            group = ">= {}".format(i + 1)
            groups.append(group)
            odds_ratio, pval, array = get_OR(i, valid_unlabeled_background, results, valid_unlabeled_mousegenes)
            ors.append(odds_ratio)
            pvals.append(pval)
            strength = len(get_mighty_keys(i, results))
            strengths.append(strength)
            df.loc[i + 1, :] = [pretty_method, "CS "+ group, array[0][0], array[1][0], array[0][1], array[1][1], odds_ratio, pval, None]

        min_strength = np.min(strengths) if np.min(strengths) < min_strength else min_strength
        # `i` is bound here (left over from the loop above) and is ignored by
        # get_OR() because a set is passed.
        m_odds_ratio, m_pval, array = get_OR(i, valid_background, set(positive_genes), valid_mousegenes)
        m_strength = len(positive_genes)
        df.loc[12, :] = ["-", "Mendelian", array[0][0], array[1][0], array[0][1], array[1][1], m_odds_ratio, m_pval, None]

        dfs.append(df)

    dfs = pd.concat(dfs)

    # Round the odds-ratio column(s) for reporting.
    for column in dfs.columns:
        if "OR" in column:
            dfs[column] = dfs[column].round(3)
        #if "N " in column:
            #dfs[column] = dfs[column].astype(int)
    outer_dfs_list.append(dfs)

# second, the pvals are adjusted using false discovery rate

# Pool the unadjusted p-values of all phenotype tables, in table order.
pvals = [p for frame in outer_dfs_list for p in frame["pval unadjusted"].tolist()]

# fdr() returns a pair; element 1 holds the adjusted p-values.
adj_pvals = fdr(pvals)[1]

# Hand the adjusted values back to the tables in equal-sized chunks and write
# one TSV per phenotype.
adj_chunks = np.array_split(adj_pvals, len(outer_dfs_list))
for df, adj_pval_one_df, phenotype in zip(outer_dfs_list, adj_chunks, phenotypes):
    n_significant = (adj_pval_one_df < 0.05).sum()
    print(n_significant)
    df["pval adjusted (FDR)"] = adj_pval_one_df
    print((df["pval adjusted (FDR)"] < 0.05).sum())
    df.to_csv("{}_ors_mko_gwas_selective.tsv".format(phenotype), sep="\t", index=False)

# third, the ORs are plotted conditioned on significant pvals

# Lay out one row of four axes. They are unpacked below as (ax0, ax1, ax3,
# invis_ax); invis_ax is hidden at the end of the loop body.
fig, axes = plt.subplots(figsize=(full_width*cm*0.4,6*cm), nrows=1, ncols=4, sharey=False, gridspec_kw={'width_ratios': [1, 11, 1, 3]})
axes = axes.flatten()

# `[axes]` wraps the four axes in a one-element list, so zip() yields a single
# frame (the first phenotype / colour / table).
for frame_idx, (phenotype, color, (ax0, ax1, ax3, invis_ax), df) in enumerate(zip(phenotypes, colors_, [axes], outer_dfs_list)):
    # The string literal below is disabled legend code; as a bare expression
    # statement it has no effect at runtime.
    """
    if frame_idx == len(phenotypes):
        ax0.axis("off")
        ax1.axis("off")
        ax3.axis("off")
        invis_ax.axis("off")
        handles = [Line2D([0], [0], color="black", marker=marker, markersize=marker_size, label=label + " OR") for marker, label in zip(markers, pretty_methods)]
        handles.extend([Line2D([0], [0], color=grey, marker=marker, label=label + " n") for marker, label in zip(markers, pretty_methods)])
        leg = ax1.legend(handles=handles, loc='center', fontsize=label_size, labelspacing=0.5, borderpad=1)
        bb = leg.get_bbox_to_anchor().transformed(ax1.transAxes.inverted())

        # Changes to location of the legend. 
        yOffset = -0.06
        bb.y0 += yOffset
        bb.y1 += yOffset
        leg.set_bbox_to_anchor(bb, transform = ax1.transAxes)
        break
    """
    # Twin y-axes: the primary axes (ax0/ax1/ax3) show gene counts, the twins
    # (axG/ax2/ax4) show odds ratios.
    axG = ax0.twinx()
    ax2 = ax1.twinx()
    ax4 = ax3.twinx()
    handles = []
    min_strength = 1000
    dfs = []

    # Each method occupies `stride` consecutive rows of df: first row GWAS,
    # last row Mendelian, the rows in between are consensus-score thresholds.
    start = 0
    stride = 13

    for method_idx, (pretty_method, method, marker) in enumerate(zip(pretty_methods, methods, markers)):

        method_df = df[start:start + stride]
        start += stride
        # Label-based lookup: the row with index label 1 must belong to this method.
        assert method_df["Method"][1] == pretty_method

        groups = method_df["CS/Mendelian"][1:-1]
        ors = method_df["OR"]
        adj_pvals = method_df["pval adjusted (FDR)"]
        # Set size per row = the two "N Candidate/Mendelian ..." columns added together.
        strengths = np.asarray(method_df["N Candidate/Mendelian and MKO"]) + np.asarray(method_df["N Candidate/Mendelian not MKO"])
        n_candidates_and_mko = method_df["N Candidate/Mendelian and MKO"]
        n_candidates_not_mko = method_df["N Candidate/Mendelian not MKO"]
        n_not_candidates_and_mko = method_df["N Not Candidate/Mendelian and MKO"]
        n_not_candidates_not_mko = method_df["N Not Candidate/Mendelian Not MKO"]

        xind = np.asarray((range(stride)))
        width = 0.8

        # plot set strengths
        # NOTE(review): `strengths[-1]` is a single value (the last row), so
        # `is_positive` is a scalar boolean rather than a per-threshold mask.
        # Indexing with a scalar boolean appears to add a leading axis, which
        # is why `[0]` is taken below. Possibly `strengths[1:-1] > 0` was
        # intended — confirm before changing.
        is_positive = np.asarray(strengths[-1]) > 0
        draw_xind = np.asarray(xind[1:-1])[is_positive]
        draw_strengths = np.asarray(strengths[1:-1])[is_positive]
        ax1.plot(draw_xind[0], draw_strengths[0], color=grey, zorder=1, marker=marker, ms=marker_size-1, linewidth=0.5)
        ax1.set_axisbelow(True)
        # Grey bars: set size of the last row (ax3, tick label "M") and of the
        # first row (ax0, tick label "G").
        ax3.bar(1-(width/2), height=strengths[-1], width=width, color=grey, zorder=1)
        ax3.set_axisbelow(True)
        ax0.bar(1-(width/2), height=strengths[0], width=width, color=grey, zorder=1)
        ax0.set_axisbelow(True)

        # Only thresholds with an adjusted p-value below 0.05 and a non-empty
        # set get an odds-ratio marker.
        is_significant = np.asarray(adj_pvals[1:-1]) < 0.05
        is_enough = np.asarray(strengths[1:-1]) > 0
        use_only = is_significant & is_enough
        draw_xind = np.asarray(xind[1:-1])[use_only]
        draw_ors = np.asarray(ors[1:-1])[use_only]
        handle, = ax2.plot(draw_xind, draw_ors, color=color, zorder=1, marker=marker, markeredgecolor="black",markeredgewidth=0.5, ms=marker_size, label=method)
        handles.append(handle)
        # Coloured bars: odds ratio of the last row (ax4) and of the first row
        # (axG); these are drawn regardless of significance.
        ax4.bar(1+(width/2), height=ors.tolist()[-1], width=width, color=color, zorder=1)
        ax4.set_axisbelow(True)
        axG.bar(1+(width/2), height=ors.tolist()[0], width=width, color=color, zorder=1)
        axG.set_axisbelow(True)
        ax2.set_axisbelow(True)
        ax2.set_xticks(xind)
        ax2.set_xticklabels(xind.astype(np.uint8) + 1, size=small_font)
        ax3.set_xticks([1])
        ax3.set_xticklabels(["M"], size=label_size)
        ax0.set_xticks([1])
        ax0.set_xticklabels(["G"], size=label_size)
        if frame_idx % 3 == 0:
            ax0.set_ylabel("Number of Candidate Genes", size=label_size)
        elif frame_idx % 3 == 2:
            ax4.set_ylabel("Odds Ratio", size=label_size)

        # Set unconditionally, which makes the `elif` branch above redundant.
        ax4.set_ylabel("Odds Ratio", size=label_size)

        ax1.set_xlabel("Consensus Score (>=)", size=label_size)

        ax1.set_title(phenotype, y=1.05, x=0.5, pad=-14, size=title_size)




    # Share one count range across the three count axes, with an upper limit
    # of at least 3500. The name `max_strenth` (sic) is used consistently below.
    _, candidate_max_strength = ax1.get_ylim()
    _, mendelian_max_strength = ax3.get_ylim()
    max_strength = np.max((candidate_max_strength, mendelian_max_strength))
    max_strenth = 3500 if max_strength < 3500 else max_strength

    min_strength = 1

    ax0.set_yscale('log')
    ax1.set_yscale('log')
    ax3.set_yscale('log')
    ax0.set_ylim((min_strength, max_strenth))
    ax1.set_ylim((min_strength, max_strenth))
    ax3.set_ylim((min_strength, max_strenth))

    # Share one odds-ratio range across the three twin axes; the GWAS twin
    # (axG) is not consulted when the maximum is determined.
    candidate_max_or = ax2.get_ylim()[1]
    mendelian_max_or = ax4.get_ylim()[1]
    ax2.set_ylim((0, np.max((candidate_max_or, mendelian_max_or))))
    ax4.set_ylim((0, np.max((candidate_max_or, mendelian_max_or))))
    axG.set_ylim((0, np.max((candidate_max_or, mendelian_max_or))))

    ax1.grid(True, which="major", axis="y", color=lightgrey)
    ax1.grid(True,which="minor", axis="y", linestyle=":", color=lightgrey)
    ax0.grid(True,which="major", axis="y", color=lightgrey)
    ax0.grid(True,which="minor", axis="y", linestyle=":", color=lightgrey)
    ax3.grid(True,which="major", axis="y", color=lightgrey)
    ax3.grid(True,which="minor", axis="y", linestyle=":", color=lightgrey)

    # Hide inner spines, tick marks and tick labels; y tick labels stay on
    # ax0 (left) and ax4 (right).
    ax1.set_yticklabels([])
    ax1.spines['left'].set_visible(False)
    ax2.spines['left'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.set_yticks([])
    for tick in ax1.yaxis.get_major_ticks():
        tick.tick1line.set_visible(False)
        tick.tick2line.set_visible(False)
    for tick in ax1.yaxis.get_minor_ticks():
        tick.tick1line.set_visible(False)
        tick.tick2line.set_visible(False)
    ax0.spines['left'].set_visible(False)
    ax0.spines['right'].set_visible(False)
    ax3.spines['left'].set_visible(False)
    ax3.set_yticklabels([])
    for tick in ax3.yaxis.get_major_ticks():
        tick.tick1line.set_visible(False)
        tick.tick2line.set_visible(False)
    for tick in ax3.yaxis.get_minor_ticks():
        tick.tick1line.set_visible(False)
        tick.tick2line.set_visible(False)
    ax4.spines['left'].set_visible(False)
    axG.set_yticks([])
    axG.spines['right'].set_visible(False)
    ax0.tick_params(axis="y", pad=0.5)
    ax4.tick_params(axis="y", pad=0.5)


    # Final x ticks of the consensus-score panel: thresholds 1..11.
    ax1.set_xticks(range(1,12))
    ax1.set_xticklabels([str(x) for x in range(1,12)])
    ax1.tick_params(axis="x", labelsize=label_size)
    invis_ax.set_visible(False)

fig.tight_layout()
plt.subplots_adjust(wspace=0, hspace=0.1)
plt.savefig("OR_uc_gwas_selective.svg", bbox_inches='tight', dpi=400)

In [None]:
import scipy.stats as stats
from scipy.stats import fisher_exact

# Reload the odds-ratio table written earlier; `phenotype` is the loop
# variable left bound by the cell that wrote the file.
results = pd.read_csv("{}_ors_mko_gwas_selective.tsv".format(phenotype), sep="\t", index_col=None, header=0)

# 2x2 table of the GWAS row (taken from the row labelled 0), built from the
# four count columns at positions 2..5.
is_gwas = results["CS/Mendelian"] == "GWAS"
col1 = results[is_gwas].transpose()[0].tolist()
array1 = np.asarray([col1[2:4], col1[4:6]])


# For every row except GWAS and Mendelian: two-sided z-test on the difference
# of the two log odds ratios, using the sum of reciprocal cell counts as the
# variance of each.
pvals = []
for i, group in enumerate(results["CS/Mendelian"].tolist()):
    if group in ("Mendelian", "GWAS"):
        continue
    col2 = results[results["CS/Mendelian"] == group].transpose()[i].tolist()
    array2 = np.asarray([col2[2:4], col2[4:6]])

    odds_typical = fisher_exact(array1)[0]
    odds_atypical = fisher_exact(array2)[0]
    delta = np.log(odds_typical) - np.log(odds_atypical)

    var_typical = np.sum(1 / array1.ravel())
    var_atypical = np.sum(1 / array2.ravel())
    se_delta = np.sqrt(var_typical + var_atypical)

    zval = delta / se_delta
    pval = 2 * stats.norm.sf(abs(zval))
    pvals.append(pval)
pvals

In [None]:
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
import pandas as pd
import matplotlib.pyplot as plt

def get_mighty_keys(n, results):
    """Return the keys of ``results[0]`` whose value equals exactly ``n + 1``.

    The key ``"Total"`` is always skipped. Keys come back in the iteration
    order of the mapping stored at ``results[0]``.
    """
    return [
        name
        for name, score in results[0].items()
        if name != "Total" and score == n + 1
    ]

def get_OR(n, superset, results, setB):
    """Run Fisher's exact test on a contingency table built by the global ``pp``.

    ``results`` is first passed through ``get_mighty_keys(n, results)``. If
    that raises ``TypeError`` (for example when ``results`` is a set, which
    cannot be indexed with ``[0]``), ``results`` itself is used as the
    candidate collection and ``n`` plays no role.

    Returns a tuple ``(odds_ratio, pval, array)`` where ``array`` is whatever
    ``pp.make_contingency_table(superset, candidates, setB)`` returned.
    """
    from scipy.stats import fisher_exact

    try:
        candidates = set(get_mighty_keys(n, results))
    except TypeError:
        candidates = set(results)

    table = pp.make_contingency_table(superset, candidates, setB)
    ratio, p_value = fisher_exact(table)
    return ratio, p_value, table

import numpy as np
from statsmodels.stats.multitest import fdrcorrection as fdr

# Mouse-knockout background; filled in by the loading loop.
background = None

# You will need to adapt this to your (singular) phenotype
phenotypes = ["Ulcerative Colitis"]
methods = ["film"]
pretty_methods = ["FiLM"]

# Font sizes used by the figure
tick_size = small_font
label_size = title_size = medium_font

# Colour palette
crimson = "#6f0000"
navy = "#02055a"
jungle = "#1e5631"
tan = "#c24e00"
petrol = "#005f6a"
grey = "#bbbbbb"
lightgrey = "#dddddd"

# Layout and marker settings
width_ratios = [10, 1, 10, 1]
marker_size = 4

# One colour and one marker per entry of `methods`
colors_ = ["#3EB489"]  # alternative: ["#01016f"]
markers = ["^"]

# Starting value for the running minimum of candidate-set sizes
min_strength = 3000

import json

outer_dfs_list = []

# Column layout of the per-method result table (also the header of the TSV written later).
_columns = ["Method", "CS/Mendelian", "N Candidate/Mendelian and MKO", "N Candidate/Mendelian not MKO", "N Not Candidate/Mendelian and MKO", "N Not Candidate/Mendelian Not MKO", "OR", "pval unadjusted", "pval adjusted (FDR)"]

# first, the data is loaded, ORs and pvals are calculated

for frame_idx, (phenotype, color) in enumerate(zip(phenotypes, colors_)):
    # at most five phenotypes are processed
    if frame_idx == 5:
        break
    dfs = []
    for pretty_method, method, marker in zip(pretty_methods, methods, markers):

        # you will need to adapt your config path
        config_paths = {"Ulcerative Colitis": "config_uc_only_nohetio_{}_newstorage.yaml".format(method)}

        config = Config()
        config.parse_yaml(config_paths[phenotype])

        # `pp` stays a module-level name on purpose: get_OR reads it as a global.
        pp = PostProcessor(config)

        # Gene universes: everything is restricted to the mouse-knockout background.
        unknown_genes, all_genes, positive_genes = pp.get_unknown_genes(pp.results_paths[0][0])
        background = set(pp.get_mouse_knockout_background())
        valid_background = background.intersection(all_genes)
        print("Number of Background genes: {}".format(len(valid_background)))
        valid_unlabeled_background = valid_background.intersection(unknown_genes)
        print("Number of unlabeled Background genes: {}".format(len(valid_unlabeled_background)))

        mousegenes = set(pp.get_mouse_knockout_genes())
        valid_mousegenes = pp._return_only_valid(mousegenes, valid_background)
        valid_unlabeled_mousegenes = pp._return_only_valid(mousegenes, valid_unlabeled_background)
        print("Number of unlabeled Mouse Genes genes: {}".format(len(valid_unlabeled_mousegenes)))

        # this has to point to the outer_results file produced by the outer crossval (in the results folder)
        outer_results_paths = {"Ulcerative Colitis": "/mnt/storage/speos/results/uc_{}_nohetioouter_results.json".format(method)}

        with open(outer_results_paths[phenotype], "r") as file:
            results = json.load(file)

        # GWAS genes that are neither keys of the crossval result nor known positives
        gwas_genes = set(pd.read_csv("/mnt/storage/prs/ldblocks/gene_snps.tsv", names=range(10), usecols=[3], header=None, sep="\t", index_col=None)[3].tolist())
        gwas_genes = gwas_genes.difference(set(results[0].keys())).difference(positive_genes)
        valid_gwas_genes = pp._return_only_valid(gwas_genes, valid_background)

        # Collect one record per table row and build the frame once at the end:
        # row 0 = GWAS, rows 1-11 = consensus score bins, row 12 = Mendelian.
        rows = []

        g_odds_ratio, g_pval, array = get_OR(0, valid_background, set(valid_gwas_genes), valid_mousegenes)
        rows.append(["-", "GWAS", array[0][0], array[1][0], array[0][1], array[1][1], g_odds_ratio, g_pval, np.nan])

        strengths = []
        for i in range(11):
            group = "= {}".format(i + 1)
            odds_ratio, pval, array = get_OR(i, valid_unlabeled_background, results, valid_unlabeled_mousegenes)
            strengths.append(len(get_mighty_keys(i, results)))
            rows.append([pretty_method, "CS " + group, array[0][0], array[1][0], array[0][1], array[1][1], odds_ratio, pval, np.nan])

        # running minimum of the candidate-set sizes across methods and phenotypes
        min_strength = np.min(strengths) if np.min(strengths) < min_strength else min_strength

        # a plain gene set is passed here, so the first argument is ignored by get_OR
        m_odds_ratio, m_pval, array = get_OR(0, valid_background, set(positive_genes), valid_mousegenes)
        rows.append(["-", "Mendelian", array[0][0], array[1][0], array[0][1], array[1][1], m_odds_ratio, m_pval, np.nan])

        df = pd.DataFrame(rows, columns=_columns)
        dfs.append(df)

    # The per-method index (0-12) is kept on purpose; it is not reset.
    dfs = pd.concat(dfs)

    for column in dfs.columns:
        if "OR" in column:
            dfs[column] = dfs[column].round(3)
    outer_dfs_list.append(dfs)

# second, the pvals are adjusted using false discovery rate

# Pool the unadjusted p-values of all phenotypes so the correction is applied jointly.
pvals = []
for df in outer_dfs_list:
    pvals.extend(df["pval unadjusted"].tolist())

adj_pvals = fdr(pvals)[1]

# Hand the adjusted values back in the same order, cutting at each frame's actual
# length so frames with different row counts still receive their own p-values.
split_points = np.cumsum([len(df) for df in outer_dfs_list])[:-1]

for df, adj_pval_one_df, phenotype in zip(outer_dfs_list, np.split(adj_pvals, split_points), phenotypes):
    print((adj_pval_one_df < 0.05).sum())
    df["pval adjusted (FDR)"] = adj_pval_one_df
    print((df["pval adjusted (FDR)"] < 0.05).sum())
    df.to_csv("{}_ors_mko_gwas_selective.tsv".format(phenotype), sep="\t", index=False)

# third, the ORs are plotted conditioned on significant pvals

# Panels from left to right: GWAS bar, consensus-score bins, Mendelian bar, hidden spacer.
fig, axes = plt.subplots(figsize=(full_width*cm*0.4, 6*cm), nrows=1, ncols=4, sharey=False, gridspec_kw={'width_ratios': [1, 11, 1, 3]})
axes = axes.flatten()

for frame_idx, (phenotype, color, (ax0, ax1, ax3, invis_ax), df) in enumerate(zip(phenotypes, colors_, [axes], outer_dfs_list)):
    # The base axes show set sizes (grey); their twins show odds ratios (coloured).
    axG = ax0.twinx()
    ax2 = ax1.twinx()
    ax4 = ax3.twinx()

    # Each method contributes 13 consecutive rows: GWAS, 11 score bins, Mendelian.
    start = 0
    stride = 13

    for method_idx, (pretty_method, method, marker) in enumerate(zip(pretty_methods, methods, markers)):

        method_df = df[start:start + stride]
        start += stride
        assert method_df["Method"].iloc[1] == pretty_method

        ors = method_df["OR"].to_numpy(dtype=float)
        method_adj_pvals = method_df["pval adjusted (FDR)"].to_numpy(dtype=float)
        # set size = genes in the set with MKO phenotype + genes in the set without
        strengths = (method_df["N Candidate/Mendelian and MKO"].to_numpy(dtype=float)
                     + method_df["N Candidate/Mendelian not MKO"].to_numpy(dtype=float))

        xind = np.asarray(range(stride))
        width = 0.8

        # Interior rows (1..11) are the consensus-score bins.
        cs_xind = xind[1:-1]
        cs_strengths = strengths[1:-1]
        cs_ors = ors[1:-1]

        # plot set strengths; empty bins are masked per bin because they cannot
        # be drawn on the log axis
        has_genes = cs_strengths > 0
        ax1.plot(cs_xind[has_genes], cs_strengths[has_genes], color=grey, zorder=1, marker=marker, ms=marker_size-1, linewidth=0.5)
        ax1.set_axisbelow(True)
        ax3.bar(1-(width/2), height=strengths[-1], width=width, color=grey, zorder=1)
        ax3.set_axisbelow(True)
        ax0.bar(1-(width/2), height=strengths[0], width=width, color=grey, zorder=1)
        ax0.set_axisbelow(True)

        # plot odds ratios, but only for non-empty bins that survive FDR correction
        is_significant = method_adj_pvals[1:-1] < 0.05
        use_only = is_significant & has_genes
        ax2.plot(cs_xind[use_only], cs_ors[use_only], color=color, zorder=1, marker=marker, markeredgecolor="black", markeredgewidth=0.5, ms=marker_size, label=method)
        ax4.bar(1+(width/2), height=ors[-1], width=width, color=color, zorder=1)
        ax4.set_axisbelow(True)
        axG.bar(1+(width/2), height=ors[0], width=width, color=color, zorder=1)
        axG.set_axisbelow(True)
        ax2.set_axisbelow(True)
        # NOTE(review): ax2 is a twin of ax1, and ax1's ticks are set to 1..11 further
        # down, so these labels are presumably replaced; the call is kept because
        # it may also widen the x-range to 0..12 — confirm before removing.
        ax2.set_xticks(xind)
        ax2.set_xticklabels(xind.astype(np.uint8) + 1, size=small_font)
        ax3.set_xticks([1])
        ax3.set_xticklabels(["M"], size=label_size)
        ax0.set_xticks([1])
        ax0.set_xticklabels(["G"], size=label_size)
        if frame_idx % 3 == 0:
            ax0.set_ylabel("Number of Candidate Genes", size=label_size)
        ax4.set_ylabel("Odds Ratio", size=label_size)

        # The bins in this cell hold genes with exactly that score ("CS = k"), not a threshold.
        ax1.set_xlabel("Consensus Score (=)", size=label_size)

        ax1.set_title(phenotype, y=1.05, x=0.5, pad=-14, size=title_size)

    # Shared log scale for the three set-size axes, with an upper limit of at least 3500.
    # NOTE(review): only the bin and Mendelian panels feed the limits; a larger
    # GWAS bar would be clipped — confirm this is intended.
    _, candidate_max_strength = ax1.get_ylim()
    _, mendelian_max_strength = ax3.get_ylim()
    max_strength = np.max((candidate_max_strength, mendelian_max_strength))
    max_strength = 3500 if max_strength < 3500 else max_strength

    min_strength = 1

    ax0.set_yscale('log')
    ax1.set_yscale('log')
    ax3.set_yscale('log')
    ax0.set_ylim((min_strength, max_strength))
    ax1.set_ylim((min_strength, max_strength))
    ax3.set_ylim((min_strength, max_strength))

    # Shared linear scale for the three odds-ratio axes.
    candidate_max_or = ax2.get_ylim()[1]
    mendelian_max_or = ax4.get_ylim()[1]
    max_or = np.max((candidate_max_or, mendelian_max_or))
    ax2.set_ylim((0, max_or))
    ax4.set_ylim((0, max_or))
    axG.set_ylim((0, max_or))

    for size_ax in (ax1, ax0, ax3):
        size_ax.grid(True, which="major", axis="y", color=lightgrey)
        size_ax.grid(True, which="minor", axis="y", linestyle=":", color=lightgrey)

    # Remove the inner spines and tick marks so the three panels read as one plot:
    # only ax0 keeps the size scale (left) and only ax4 the OR scale (right).
    ax1.set_yticklabels([])
    ax1.spines['left'].set_visible(False)
    ax2.spines['left'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.set_yticks([])
    for tick in ax1.yaxis.get_major_ticks() + ax1.yaxis.get_minor_ticks():
        tick.tick1line.set_visible(False)
        tick.tick2line.set_visible(False)
    ax0.spines['left'].set_visible(False)
    ax0.spines['right'].set_visible(False)
    ax3.spines['left'].set_visible(False)
    ax3.set_yticklabels([])
    for tick in ax3.yaxis.get_major_ticks() + ax3.yaxis.get_minor_ticks():
        tick.tick1line.set_visible(False)
        tick.tick2line.set_visible(False)
    ax4.spines['left'].set_visible(False)
    axG.set_yticks([])
    axG.spines['right'].set_visible(False)
    ax0.tick_params(axis="y", pad=0.5)
    ax4.tick_params(axis="y", pad=0.5)

    ax1.set_xticks(range(1, 12))
    ax1.set_xticklabels([str(x) for x in range(1, 12)])
    ax1.tick_params(axis="x", labelsize=label_size)
    invis_ax.set_visible(False)

fig.tight_layout()
plt.subplots_adjust(wspace=0, hspace=0.1)
plt.savefig("OR_uc_gwas_selective.svg", bbox_inches='tight', dpi=400)

In [None]:
import scipy.stats as stats
from scipy.stats import fisher_exact

# Test whether the odds ratio of each consensus-score bin differs from the GWAS
# odds ratio: two-sided z-test on the difference of the log odds ratios, where
# each variance is the sum of the reciprocal cell counts of its 2x2 table.
# NOTE(review): `phenotype` is not assigned in this cell; it is whatever value
# the loops of the preceding cell left behind.
results = pd.read_csv("{}_ors_mko_gwas_selective.tsv".format(phenotype), sep="\t", index_col=None, header=0)

# Columns 2-5 of every row hold the four contingency-table counts.
count_columns = results.columns[2:6]

# Select rows by position instead of relying on index labels matching positions.
gwas_row = results[results["CS/Mendelian"] == "GWAS"].iloc[0]
array1 = np.asarray(gwas_row[count_columns].tolist()).reshape(2, 2)

# The GWAS reference is identical for every comparison, so compute it once.
# Only the odds ratio (first return value of fisher_exact) is used.
odds_typical = fisher_exact(array1)[0]
log_odds_typical = np.log(odds_typical)
var_typical = np.sum(1 / array1.flatten())

pvals = []
for _, row in results.iterrows():
    if row["CS/Mendelian"] in ["Mendelian", "GWAS"]:
        continue
    array2 = np.asarray(row[count_columns].tolist()).reshape(2, 2)

    odds_atypical = fisher_exact(array2)[0]
    log_odds_atypical = np.log(odds_atypical)
    delta = log_odds_typical - log_odds_atypical

    # An empty cell makes this variance infinite and the resulting p-value nan.
    var_atypical = np.sum(1 / array2.flatten())

    se_delta = np.sqrt(var_typical + var_atypical)
    zval = delta / se_delta
    pval = stats.norm.sf(np.abs(zval)) * 2
    pvals.append(pval)

In [None]:
# Display the p-values collected by the z-test loop of the previous cell
# (one per row that is neither "GWAS" nor "Mendelian").
pvals