In [None]:
import os

# The working directory is switched to the repository checkout BEFORE the
# project imports below. Presumably this is what makes the `speos` package and
# the relative config/output paths used in later cells resolve -- keep this order.
os.chdir(os.path.expanduser("~/speos/"))
from speos.utils.datahandlers import ResultsHandler
from speos.postprocessing.postprocessor import PostProcessor
from speos.utils.config import Config
from speos.visualization.settings import *

import numpy as np
# `plt` has to be the pyplot module (the plotting cells call plt.subplots,
# plt.tight_layout and plt.savefig); binding it to the top-level `matplotlib`
# package would make those calls fail with AttributeError.
import matplotlib.pyplot as plt

# To plot either LoF or missense mutation intolerance, set the 'lof_or_missense' flag (0 = LoF, 1 = missense) at the top of each plotting cell below

In [None]:
import pandas as pd

def get_pli_table(path_to_table="data/forweb_cleaned_exac_r03_march16_z_data_pLI.txt",
                  base_dir="/mnt/storage/speos/") -> pd.DataFrame:
    """Load the gene constraint (pLI / Z-score) table as a DataFrame.

    The file is read as tab-separated text whose first row is the header.
    Callers in this notebook rely on the columns ``gene``, ``lof_z`` and
    ``mis_z`` being present.

    Args:
        path_to_table: Path of the table relative to ``base_dir``. An absolute
            path is used as-is (``os.path.join`` discards ``base_dir`` then).
        base_dir: Directory that ``path_to_table`` is resolved against.
            Defaults to the storage root that was previously hard-coded.

    Returns:
        pandas.DataFrame with one row per line of the table.
    """
    return pd.read_csv(os.path.join(base_dir, path_to_table), header=0, sep="\t")

def lof_intolerance(all_genes, core_genes, ambivalent_genes):
    """Compare constraint Z-scores between core, ambivalent and peripheral genes.

    For each of the ``lof_z`` and ``mis_z`` columns of the table returned by
    ``get_pli_table()`` this prints a one-way ANOVA across the three groups and
    the summary of a Tukey HSD post-hoc test (alpha = 0.05).

    Note: a function with the same name but an additional ``mendelians``
    parameter is defined in the following cell and shadows this one once that
    cell has been executed.

    Args:
        all_genes: Set of HGNC symbols that forms the gene universe. Genes of
            the table that are not in this set are ignored.
        core_genes: Set of HGNC symbols reported as "Core Genes".
        ambivalent_genes: Set of HGNC symbols reported as "Ambivalent Genes".
            Every gene of ``all_genes`` that is in neither of the two sets is
            reported as "Peripheral Genes".

    Returns:
        List with the two Tukey HSD result objects, in the order
        ``[lof_z, mis_z]``.
    """
    from scipy.stats import f_oneway
    from statsmodels.stats.multicomp import pairwise_tukeyhsd

    pli_table = get_pli_table()
    all_pli_genes = set(pli_table["gene"].tolist())
    # hgnc2value (below) only holds genes that are in the table AND in all_genes.
    # Restricting every group to this set avoids a KeyError for a listed gene
    # that is in the table but outside of the universe.
    scored_genes = all_pli_genes.intersection(all_genes)

    tukeys = []

    for column, description in zip(["lof_z", "mis_z"], ["LoF Z Value", "Missense Z Value"]):
        hgnc2value = {hgnc: value for hgnc, value in zip(pli_table["gene"].tolist(), pli_table[column].tolist()) if hgnc in all_genes}

        core = [hgnc2value[hgnc] for hgnc in core_genes.intersection(scored_genes)]
        ambivalent = [hgnc2value[hgnc] for hgnc in ambivalent_genes.intersection(scored_genes)]
        peripheral = [hgnc2value[hgnc] for hgnc in all_genes.difference(core_genes).difference(ambivalent_genes).intersection(scored_genes)]

        anova = f_oneway(core, ambivalent, peripheral)

        print("ANOVA for {} in Predicted Genes vs Non-Predicted Genes (Unknowns). p: {:.2e}, F: {}".format(description, anova[1], round(anova[0], 3)))

        # Long format: one row per gene score, labelled with its group.
        df = pd.DataFrame({'score': core + ambivalent + peripheral,
                           'group': np.repeat(['Core Genes', 'Ambivalent Genes', 'Peripheral Genes'], repeats=[len(core), len(ambivalent), len(peripheral)])})

        tukey = pairwise_tukeyhsd(endog=df['score'],
                                  groups=df['group'],
                                  alpha=0.05)

        print(tukey.summary())

        tukeys.append(tukey)

    return tukeys

In [None]:
def get_pli_table(path_to_table="data/forweb_cleaned_exac_r03_march16_z_data_pLI.txt",
                  base_dir="/mnt/storage/speos/") -> pd.DataFrame:
    """Load the gene constraint (pLI / Z-score) table as a DataFrame.

    The file is read as tab-separated text whose first row is the header.
    Callers in this notebook rely on the columns ``gene``, ``lof_z`` and
    ``mis_z`` being present.

    Args:
        path_to_table: Path of the table relative to ``base_dir``. An absolute
            path is used as-is (``os.path.join`` discards ``base_dir`` then).
        base_dir: Directory that ``path_to_table`` is resolved against.
            Defaults to the storage root that was previously hard-coded.

    Returns:
        pandas.DataFrame with one row per line of the table.
    """
    return pd.read_csv(os.path.join(base_dir, path_to_table), header=0, sep="\t")

def lof_intolerance(all_genes, mendelians, core_genes, ambivalent_genes):
    """Compare constraint Z-scores between Mendelian, core, ambivalent and peripheral genes.

    For each of the ``lof_z`` and ``mis_z`` columns of the table returned by
    ``get_pli_table()`` this prints a one-way ANOVA across the four groups and
    the summary of a Tukey HSD post-hoc test (alpha = 0.05).

    Note: this definition shares its name with the three-group variant of the
    previous cell and shadows it once this cell has been executed.

    Args:
        all_genes: Set of HGNC symbols that forms the gene universe. Genes of
            the table that are not in this set are ignored.
        mendelians: Set of HGNC symbols reported as "Mendelians".
        core_genes: Set of HGNC symbols reported as "Core Genes".
        ambivalent_genes: Set of HGNC symbols reported as "Ambivalent Genes".
            Every gene of ``all_genes`` that is in none of the three sets is
            reported as "Peripheral Genes".

    Returns:
        List with the two Tukey HSD result objects, in the order
        ``[lof_z, mis_z]``.
    """
    from scipy.stats import f_oneway
    from statsmodels.stats.multicomp import pairwise_tukeyhsd

    pli_table = get_pli_table()
    all_pli_genes = set(pli_table["gene"].tolist())
    # hgnc2value (below) only holds genes that are in the table AND in all_genes.
    # Restricting every group to this set avoids a KeyError for a listed gene
    # that is in the table but outside of the universe.
    scored_genes = all_pli_genes.intersection(all_genes)

    tukeys = []

    for column, description in zip(["lof_z", "mis_z"], ["LoF Z Value", "Missense Z Value"]):
        hgnc2value = {hgnc: value for hgnc, value in zip(pli_table["gene"].tolist(), pli_table[column].tolist()) if hgnc in all_genes}

        core = [hgnc2value[hgnc] for hgnc in core_genes.intersection(scored_genes)]
        mendelian = [hgnc2value[hgnc] for hgnc in mendelians.intersection(scored_genes)]
        ambivalent = [hgnc2value[hgnc] for hgnc in ambivalent_genes.intersection(scored_genes)]
        peripheral = [hgnc2value[hgnc] for hgnc in all_genes.difference(core_genes).difference(ambivalent_genes).difference(mendelians).intersection(scored_genes)]

        anova = f_oneway(core, mendelian, ambivalent, peripheral)

        print("ANOVA for {} in Predicted Genes vs Non-Predicted Genes (Unknowns). p: {:.2e}, F: {}".format(description, anova[1], round(anova[0], 3)))

        # Long format: one row per gene score, labelled with its group.
        df = pd.DataFrame({'score': mendelian + core + ambivalent + peripheral,
                           'group': np.repeat(['Mendelians', 'Core Genes', 'Ambivalent Genes', 'Peripheral Genes'], repeats=[len(mendelian), len(core), len(ambivalent), len(peripheral)])})

        tukey = pairwise_tukeyhsd(endog=df['score'],
                                  groups=df['group'],
                                  alpha=0.05)

        print(tukey.summary())

        tukeys.append(tukey)

    return tukeys

In [None]:
import pandas as pd
from speos.visualization.settings import *
import matplotlib.pyplot as plt
from matplotlib import ticker
from matplotlib.lines import Line2D
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

import json
import os
from io import StringIO

# Selects the constraint score that is analysed and plotted:
# 0 -> "lof_z" (LoF intolerance), 1 -> "mis_z" (missense intolerance).
lof_or_missense = 1

phenotypes = ["Ulcerative Colitis"]
methods = ["film"]
pretty_methods = ["FiLM"]
# Plot rows from bottom (y = 1) to top (y = 4).
groups = ["Peripheral Genes", "Ambivalent Genes", "Core Genes", "Mendelians"]
tick_size = small_font
label_size = medium_font + 2

crimson = "#6f0000"
navy = "#02055a"
jungle = "#1e5631"
tan = "#c24e00"
petrol = "#005f6a"
grey = "#bbbbbb"
lightgrey = "#dddddd"
pink = "#EC5E71"
purple = "#600F5C"
mint = "#3EB489"
width_ratios= [10, 1]*2
marker_size=10
whiskers_length=0.1

fig, axes = plt.subplots(figsize=((full_width*cm)*0.33, 3*cm), ncols=len(phenotypes), sharey=True)
# With a single column plt.subplots returns one bare Axes; wrap it so it can be
# zipped with `phenotypes` and indexed as axes[0] below.
axes = [axes]
colors_ = [mint]
markers = ["^"]

for phenotype, color_, ax in zip(phenotypes, colors_, axes):
    plot_df_means = pd.DataFrame(columns=pretty_methods, index=groups)
    plot_df_ci = pd.DataFrame(columns=pretty_methods, index=groups)
    dfs = []
    for method, pretty_method in zip(methods, pretty_methods):

        config_paths = {"Ulcerative Colitis": "config_uc_only_nohetio_{}_newstorage.yaml".format(method)}

        # NOTE(review): `config` is not read again in this cell -- confirm
        # whether parsing it is still required.
        config = Config()
        config.parse_yaml(config_paths[phenotype])

        outer_results_paths = {"Ulcerative Colitis": "/mnt/storage/speos/results/uc_{}_nohetioouter_results.json".format(method)}

        # The former "Coronary Artery Disease" entry pointed at the UC file
        # (copy-paste leftover) and was removed so that a wrong phenotype fails
        # loudly instead of silently loading UC results.
        results_files = {"Ulcerative Colitis": "/mnt/storage/speos/results/uc_{}_nohetio_outer_0_fold_1.tsv".format(method)}

        with open(outer_results_paths[phenotype], "r") as file:
            results = json.load(file)[0]

        # Genes with a value of 11 or more form the core set, all other listed
        # genes the ambivalent set (labelled "CS 11" / "CS 1 - 10" on the y axis).
        core_genes = set([gene for gene, value in results.items() if value >= 11])
        ambivalent_genes = set([gene for gene, value in results.items() if value < 11])

        gene_df = pd.read_csv(results_files[phenotype], sep="\t", header=0)
        all_genes = set(gene_df["hgnc"].tolist())
        mendelians = set(gene_df["hgnc"][gene_df["truth"] == 1].tolist())

        pli_table = get_pli_table()
        all_pli_genes = set(pli_table["gene"].tolist())
        # hgnc2value only holds table genes that are also in all_genes; restrict
        # every group to that set so a listed gene outside of the universe
        # cannot raise a KeyError.
        scored_genes = all_pli_genes.intersection(all_genes)

        column = ["lof_z", "mis_z"][lof_or_missense]
        description = ["LoF Z Value", "Missense Z Value"][lof_or_missense]

        hgnc2value = {hgnc: value for hgnc, value in zip(pli_table["gene"].tolist(), pli_table[column].tolist()) if hgnc in all_genes}

        core = [hgnc2value[hgnc] for hgnc in core_genes.intersection(scored_genes)]
        mendelian = [hgnc2value[hgnc] for hgnc in mendelians.intersection(scored_genes)]
        predicted = [hgnc2value[hgnc] for hgnc in ambivalent_genes.intersection(scored_genes)]
        not_predicted = [hgnc2value[hgnc] for hgnc in all_genes.difference(core_genes).difference(ambivalent_genes).difference(mendelians).intersection(scored_genes)]

        result_predicted = f_oneway(core,
                                    mendelian,
                                    predicted,
                                    not_predicted)

        print("ANOVA for {} in Predicted Genes vs Non-Predicted Genes (Unknowns). p: {:.2e}, F: {}".format(description, result_predicted[1], round(result_predicted[0], 3)))

        # Long format: one row per gene score, labelled with its group.
        score_df = pd.DataFrame({'score': mendelian + core + predicted + not_predicted,
                                 'group': np.repeat(['Mendelians', 'Core Genes', 'Ambivalent Genes', 'Peripheral Genes'], repeats=[len(mendelian), len(core), len(predicted), len(not_predicted)])})

        tukey = pairwise_tukeyhsd(endog=score_df['score'],
                                  groups=score_df['group'],
                                  alpha=0.05)

        print(tukey.summary())

        # Round-trip the summary through HTML to obtain it as a DataFrame.
        # Literal HTML strings are deprecated as read_html input, hence StringIO.
        df = pd.read_html(StringIO(tukey.summary().as_html()), header=0, index_col=0)[0]
        # Replace the rounded p-values of the printed table by the exact ones.
        df["p-adj"] = tukey.pvalues
        if getattr(tukey, 'halfwidths', None) is None:
            tukey._simultaneous_ci()

        group_names = tukey.groupsunique
        means = tukey._multicomp.groupstats.groupmean
        cis = tukey.halfwidths

        for group_name, group_mean, halfwidth in zip(group_names, means, cis):
            plot_df_means.loc[group_name, pretty_method] = group_mean
            plot_df_ci.loc[group_name, pretty_method] = halfwidth

        df.reset_index(inplace=True)
        # Per-group summary. NOTE(review): df2 is built but not written to disk
        # or appended anywhere in this cell.
        df2 = pd.DataFrame()
        df2["Group Name"] = group_names
        df2["Group N"] = [(np.asarray(tukey.groups) == group_name).sum() for group_name in group_names]
        df2["Group Mean"] = means
        # Use the per-group half-widths `cis`; the previous code used the scalar
        # left over from the loop above, i.e. the last group's half-width for all rows.
        df2["Lower 95% CI"] = np.asarray(means) - np.asarray(cis)
        df2["Upper 95% CI"] = np.asarray(means) + np.asarray(cis)
        df2[" "] = [" "] * len(df2.index)
        dfs.append(df)

    dfs_joined = pd.concat(dfs, keys=pretty_methods)
    dfs_joined.reset_index(inplace=True)
    dfs_joined.columns =  ["Method"] + dfs_joined.columns[1:].tolist()
    dfs_joined.drop("level_1", axis="columns", inplace=True)
    # to_csv does not create missing directories.
    os.makedirs("statistical_dump", exist_ok=True)
    dfs_joined.to_csv("statistical_dump/{}_tukey_{}.tsv".format(phenotype, "lof" if lof_or_missense == 0 else "missense"), sep="\t", index=False)

    # y position of the top row; decremented once per group below.
    max_comparisons = ((len(groups) - 1) * len(methods)) + 1

    # Dashed reference line at the mean over all scores of the last Tukey run.
    ax.vlines(x=np.mean(tukey.data), ymin=0.5, ymax = max_comparisons + 0.5, linestyles=["--"], color=lightgrey, zorder=-10)
    ax.set_ylim((0.5, max_comparisons + 0.5))

    for group in groups[::-1]:
        for method, marker, df in zip(pretty_methods, markers, dfs):

            if group == "Peripheral Genes":
                color = grey
            else:
                # Highlight a group only if it differs significantly from the
                # peripheral genes. The lookup relies on "Peripheral Genes"
                # appearing as group2 in the Tukey table for every other group.
                if df.loc[(df["group1"] == group) & (df["group2"] == "Peripheral Genes"), "p-adj"].item() < 0.05:
                    color = color_
                else:
                    color = grey

            center = plot_df_means.loc[group, method]
            half = plot_df_ci.loc[group, method]
            lower, upper = center - half, center + half

            # draw Confidence Interval
            ax.plot((lower, upper),
                    (max_comparisons, max_comparisons),
                    color=color,
                    linewidth=0.5)

            # draw Whiskers
            ax.plot((lower, lower),
                    (max_comparisons + whiskers_length, max_comparisons - whiskers_length),
                    color=color,
                    linewidth=0.5)
            ax.plot((upper, upper),
                    (max_comparisons + whiskers_length, max_comparisons - whiskers_length),
                    color=color,
                    linewidth=0.5)

            # NOTE(review): the group is called "Mendelians" in `groups`, so the
            # two "Mendelian" comparisons below never match: the marker is drawn
            # for every group and the loop never breaks. Behaviour is kept
            # as-is; confirm whether "Mendelians" was intended.
            if group != "Mendelian":
                ax.scatter(center, max_comparisons, color=color, marker=marker, s=marker_size, linewidth=0.5, edgecolors='black', zorder=5)

            if group == "Mendelian":
                break

        max_comparisons -= 1

    label = ["LoF Z-Score", "Missense Z-Score"][lof_or_missense]

    ax.set_xlabel(label, size=label_size- 2)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(0.2))
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter("{x:.1f}"))

# One tick per group row; labels follow the bottom-to-top order of `groups`.
ticks = [1,2,3,4]
names = (pretty_methods[::-1] * 2) + ["Mendelian"]

axes[0].set_yticks(ticks)
axes[0].set_yticklabels(["Peripheral Genes", "CS 1 - 10", "CS 11", "Mendelian"], size=label_size-2)

plt.tight_layout()
plt.subplots_adjust(wspace=0.05)
if lof_or_missense == 0:
    plt.savefig("lof_uc_new4.svg", dpi=450)
else:
    plt.savefig("missense_uc_new4.svg", dpi=450)

In [None]:
import pandas as pd
from speos.visualization.settings import *
import matplotlib.pyplot as plt
from matplotlib import ticker
from matplotlib.lines import Line2D
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

import json
import os
from io import StringIO

# Selects the constraint score that is analysed and plotted:
# 0 -> "lof_z" (LoF intolerance), 1 -> "mis_z" (missense intolerance).
lof_or_missense = 1

phenotypes = ["Coronary Artery Disease"]
methods = ["film"]
pretty_methods = ["FiLM"]
# Plot rows from bottom (y = 1) to top (y = 4).
groups = ["Peripheral Genes", "Ambivalent Genes", "Core Genes", "Mendelians"]
tick_size = small_font
label_size = medium_font + 2

crimson = "#6f0000"
navy = "#02055a"
jungle = "#1e5631"
tan = "#c24e00"
petrol = "#005f6a"
grey = "#bbbbbb"
lightgrey = "#dddddd"
pink = "#EC5E71"
purple = "#600F5C"
mint = "#3EB489"
width_ratios= [10, 1]*2
marker_size=10
whiskers_length=0.1

fig, axes = plt.subplots(figsize=((full_width*cm)*0.33, 3*cm), ncols=len(phenotypes), sharey=True)
# With a single column plt.subplots returns one bare Axes; wrap it so it can be
# zipped with `phenotypes` and indexed as axes[0] below.
axes = [axes]
colors_ = [mint]
markers = ["^"]

for phenotype, color_, ax in zip(phenotypes, colors_, axes):
    plot_df_means = pd.DataFrame(columns=pretty_methods, index=groups)
    plot_df_ci = pd.DataFrame(columns=pretty_methods, index=groups)
    dfs = []
    for method, pretty_method in zip(methods, pretty_methods):

        config_paths = {"Coronary Artery Disease": "config_cad_really_only_nohetio_{}_newstorage.yaml".format(method)}

        # NOTE(review): `config` is not read again in this cell -- confirm
        # whether parsing it is still required.
        config = Config()
        config.parse_yaml(config_paths[phenotype])

        outer_results_paths = {"Coronary Artery Disease": "/mnt/storage/speos/results/cad_really_{}_nohetioouter_results.json".format(method)}

        results_files = {"Coronary Artery Disease": "/mnt/storage/speos/results/cad_really_{}_nohetio_outer_0_fold_1.tsv".format(method)}

        with open(outer_results_paths[phenotype], "r") as file:
            results = json.load(file)[0]

        # Genes with a value of 11 or more form the core set, all other listed
        # genes the ambivalent set (labelled "CS 11" / "CS 1 - 10" on the y axis).
        core_genes = set([gene for gene, value in results.items() if value >= 11])
        ambivalent_genes = set([gene for gene, value in results.items() if value < 11])

        gene_df = pd.read_csv(results_files[phenotype], sep="\t", header=0)
        all_genes = set(gene_df["hgnc"].tolist())
        mendelians = set(gene_df["hgnc"][gene_df["truth"] == 1].tolist())

        pli_table = get_pli_table()
        all_pli_genes = set(pli_table["gene"].tolist())
        # hgnc2value only holds table genes that are also in all_genes; restrict
        # every group to that set so a listed gene outside of the universe
        # cannot raise a KeyError.
        scored_genes = all_pli_genes.intersection(all_genes)

        column = ["lof_z", "mis_z"][lof_or_missense]
        description = ["LoF Z Value", "Missense Z Value"][lof_or_missense]

        hgnc2value = {hgnc: value for hgnc, value in zip(pli_table["gene"].tolist(), pli_table[column].tolist()) if hgnc in all_genes}

        core = [hgnc2value[hgnc] for hgnc in core_genes.intersection(scored_genes)]
        mendelian = [hgnc2value[hgnc] for hgnc in mendelians.intersection(scored_genes)]
        predicted = [hgnc2value[hgnc] for hgnc in ambivalent_genes.intersection(scored_genes)]
        not_predicted = [hgnc2value[hgnc] for hgnc in all_genes.difference(core_genes).difference(ambivalent_genes).difference(mendelians).intersection(scored_genes)]

        result_predicted = f_oneway(core,
                                    mendelian,
                                    predicted,
                                    not_predicted)

        print("ANOVA for {} in Predicted Genes vs Non-Predicted Genes (Unknowns). p: {:.2e}, F: {}".format(description, result_predicted[1], round(result_predicted[0], 3)))

        # Long format: one row per gene score, labelled with its group.
        score_df = pd.DataFrame({'score': mendelian + core + predicted + not_predicted,
                                 'group': np.repeat(['Mendelians', 'Core Genes', 'Ambivalent Genes', 'Peripheral Genes'], repeats=[len(mendelian), len(core), len(predicted), len(not_predicted)])})

        tukey = pairwise_tukeyhsd(endog=score_df['score'],
                                  groups=score_df['group'],
                                  alpha=0.05)

        print(tukey.summary())

        # Round-trip the summary through HTML to obtain it as a DataFrame.
        # Literal HTML strings are deprecated as read_html input, hence StringIO.
        df = pd.read_html(StringIO(tukey.summary().as_html()), header=0, index_col=0)[0]
        # Replace the rounded p-values of the printed table by the exact ones.
        df["p-adj"] = tukey.pvalues
        if getattr(tukey, 'halfwidths', None) is None:
            tukey._simultaneous_ci()

        group_names = tukey.groupsunique
        means = tukey._multicomp.groupstats.groupmean
        cis = tukey.halfwidths

        for group_name, group_mean, halfwidth in zip(group_names, means, cis):
            plot_df_means.loc[group_name, pretty_method] = group_mean
            plot_df_ci.loc[group_name, pretty_method] = halfwidth

        df.reset_index(inplace=True)
        # Per-group summary. NOTE(review): df2 is built but not written to disk
        # or appended anywhere in this cell.
        df2 = pd.DataFrame()
        df2["Group Name"] = group_names
        df2["Group N"] = [(np.asarray(tukey.groups) == group_name).sum() for group_name in group_names]
        df2["Group Mean"] = means
        # Use the per-group half-widths `cis`; the previous code used the scalar
        # left over from the loop above, i.e. the last group's half-width for all rows.
        df2["Lower 95% CI"] = np.asarray(means) - np.asarray(cis)
        df2["Upper 95% CI"] = np.asarray(means) + np.asarray(cis)
        df2[" "] = [" "] * len(df2.index)
        dfs.append(df)

    dfs_joined = pd.concat(dfs, keys=pretty_methods)
    dfs_joined.reset_index(inplace=True)
    dfs_joined.columns =  ["Method"] + dfs_joined.columns[1:].tolist()
    dfs_joined.drop("level_1", axis="columns", inplace=True)
    # to_csv does not create missing directories.
    os.makedirs("statistical_dump", exist_ok=True)
    dfs_joined.to_csv("statistical_dump/{}_tukey_{}.tsv".format(phenotype, "lof" if lof_or_missense == 0 else "missense"), sep="\t", index=False)

    # y position of the top row; decremented once per group below.
    max_comparisons = ((len(groups) - 1) * len(methods)) + 1

    # Dashed reference line at the mean over all scores of the last Tukey run.
    ax.vlines(x=np.mean(tukey.data), ymin=0.5, ymax = max_comparisons + 0.5, linestyles=["--"], color=lightgrey, zorder=-10)
    ax.set_ylim((0.5, max_comparisons + 0.5))

    for group in groups[::-1]:
        for method, marker, df in zip(pretty_methods, markers, dfs):

            if group == "Peripheral Genes":
                color = grey
            else:
                # Highlight a group only if it differs significantly from the
                # peripheral genes. The lookup relies on "Peripheral Genes"
                # appearing as group2 in the Tukey table for every other group.
                if df.loc[(df["group1"] == group) & (df["group2"] == "Peripheral Genes"), "p-adj"].item() < 0.05:
                    color = color_
                else:
                    color = grey

            center = plot_df_means.loc[group, method]
            half = plot_df_ci.loc[group, method]
            lower, upper = center - half, center + half

            # draw Confidence Interval
            ax.plot((lower, upper),
                    (max_comparisons, max_comparisons),
                    color=color,
                    linewidth=0.5)

            # draw Whiskers
            ax.plot((lower, lower),
                    (max_comparisons + whiskers_length, max_comparisons - whiskers_length),
                    color=color,
                    linewidth=0.5)
            ax.plot((upper, upper),
                    (max_comparisons + whiskers_length, max_comparisons - whiskers_length),
                    color=color,
                    linewidth=0.5)

            # NOTE(review): the group is called "Mendelians" in `groups`, so the
            # two "Mendelian" comparisons below never match: the marker is drawn
            # for every group and the loop never breaks. Behaviour is kept
            # as-is; confirm whether "Mendelians" was intended.
            if group != "Mendelian":
                ax.scatter(center, max_comparisons, color=color, marker=marker, s=marker_size, linewidth=0.5, edgecolors='black', zorder=5)

            if group == "Mendelian":
                break

        max_comparisons -= 1

    label = ["LoF Z-Score", "Missense Z-Score"][lof_or_missense]

    ax.set_xlabel(label, size=label_size- 2)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(0.2))
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter("{x:.1f}"))

# One tick per group row; labels follow the bottom-to-top order of `groups`.
ticks = [1,2,3,4]
names = (pretty_methods[::-1] * 2) + ["Mendelian"]

axes[0].set_yticks(ticks)
axes[0].set_yticklabels(["Peripheral Genes", "CS 1 - 10", "CS 11", "Mendelian"], size=label_size-2)

plt.tight_layout()
plt.subplots_adjust(wspace=0.05)
if lof_or_missense == 0:
    plt.savefig("lof_cad_new4.svg", dpi=450)
else:
    plt.savefig("missense_cad_new4.svg", dpi=450)

In [None]:
import pandas as pd
from speos.visualization.settings import *
import matplotlib.pyplot as plt
from matplotlib import ticker
from matplotlib.lines import Line2D
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

import json
import os
from io import StringIO

# Selects the constraint score that is analysed and plotted:
# 0 -> "lof_z" (LoF intolerance), 1 -> "mis_z" (missense intolerance).
lof_or_missense = 1

phenotypes = ["Schizophrenia"]
methods = ["film"]
pretty_methods = ["FiLM"]
# Plot rows from bottom (y = 1) to top (y = 4).
groups = ["Peripheral Genes", "Ambivalent Genes", "Core Genes", "Mendelians"]
tick_size = small_font
label_size = medium_font + 2

crimson = "#6f0000"
navy = "#02055a"
jungle = "#1e5631"
tan = "#c24e00"
petrol = "#005f6a"
grey = "#bbbbbb"
lightgrey = "#dddddd"
pink = "#EC5E71"
purple = "#600F5C"
mint = "#3EB489"
width_ratios= [10, 1]*2
marker_size=10
whiskers_length=0.1

fig, axes = plt.subplots(figsize=((full_width*cm)*0.33, 3*cm), ncols=len(phenotypes), sharey=True)
# With a single column plt.subplots returns one bare Axes; wrap it so it can be
# zipped with `phenotypes` and indexed as axes[0] below.
axes = [axes]
colors_ = [mint]
markers = ["^"]

for phenotype, color_, ax in zip(phenotypes, colors_, axes):
    plot_df_means = pd.DataFrame(columns=pretty_methods, index=groups)
    plot_df_ci = pd.DataFrame(columns=pretty_methods, index=groups)
    dfs = []
    for method, pretty_method in zip(methods, pretty_methods):

        config_paths = {"Schizophrenia": "config_scz_only_nohetio_{}_newstorage.yaml".format(method)}

        # NOTE(review): `config` is not read again in this cell -- confirm
        # whether parsing it is still required.
        config = Config()
        config.parse_yaml(config_paths[phenotype])

        outer_results_paths = {"Schizophrenia": "/mnt/storage/speos/results/scz_{}_nohetioouter_results.json".format(method)}

        results_files = {"Schizophrenia": "/mnt/storage/speos/results/scz_{}_nohetio_outer_0_fold_1.tsv".format(method)}

        with open(outer_results_paths[phenotype], "r") as file:
            results = json.load(file)[0]

        # Genes with a value of 11 or more form the core set, all other listed
        # genes the ambivalent set (labelled "CS 11" / "CS 1 - 10" on the y axis).
        core_genes = set([gene for gene, value in results.items() if value >= 11])
        ambivalent_genes = set([gene for gene, value in results.items() if value < 11])

        gene_df = pd.read_csv(results_files[phenotype], sep="\t", header=0)
        all_genes = set(gene_df["hgnc"].tolist())
        mendelians = set(gene_df["hgnc"][gene_df["truth"] == 1].tolist())

        pli_table = get_pli_table()
        all_pli_genes = set(pli_table["gene"].tolist())
        # hgnc2value only holds table genes that are also in all_genes; restrict
        # every group to that set so a listed gene outside of the universe
        # cannot raise a KeyError.
        scored_genes = all_pli_genes.intersection(all_genes)

        column = ["lof_z", "mis_z"][lof_or_missense]
        description = ["LoF Z Value", "Missense Z Value"][lof_or_missense]

        hgnc2value = {hgnc: value for hgnc, value in zip(pli_table["gene"].tolist(), pli_table[column].tolist()) if hgnc in all_genes}

        core = [hgnc2value[hgnc] for hgnc in core_genes.intersection(scored_genes)]
        mendelian = [hgnc2value[hgnc] for hgnc in mendelians.intersection(scored_genes)]
        predicted = [hgnc2value[hgnc] for hgnc in ambivalent_genes.intersection(scored_genes)]
        not_predicted = [hgnc2value[hgnc] for hgnc in all_genes.difference(core_genes).difference(ambivalent_genes).difference(mendelians).intersection(scored_genes)]

        result_predicted = f_oneway(core,
                                    mendelian,
                                    predicted,
                                    not_predicted)

        print("ANOVA for {} in Predicted Genes vs Non-Predicted Genes (Unknowns). p: {:.2e}, F: {}".format(description, result_predicted[1], round(result_predicted[0], 3)))

        # Long format: one row per gene score, labelled with its group.
        score_df = pd.DataFrame({'score': mendelian + core + predicted + not_predicted,
                                 'group': np.repeat(['Mendelians', 'Core Genes', 'Ambivalent Genes', 'Peripheral Genes'], repeats=[len(mendelian), len(core), len(predicted), len(not_predicted)])})

        tukey = pairwise_tukeyhsd(endog=score_df['score'],
                                  groups=score_df['group'],
                                  alpha=0.05)

        print(tukey.summary())

        # Round-trip the summary through HTML to obtain it as a DataFrame.
        # Literal HTML strings are deprecated as read_html input, hence StringIO.
        df = pd.read_html(StringIO(tukey.summary().as_html()), header=0, index_col=0)[0]
        # Replace the rounded p-values of the printed table by the exact ones.
        df["p-adj"] = tukey.pvalues
        if getattr(tukey, 'halfwidths', None) is None:
            tukey._simultaneous_ci()

        group_names = tukey.groupsunique
        means = tukey._multicomp.groupstats.groupmean
        cis = tukey.halfwidths

        for group_name, group_mean, halfwidth in zip(group_names, means, cis):
            plot_df_means.loc[group_name, pretty_method] = group_mean
            plot_df_ci.loc[group_name, pretty_method] = halfwidth

        df.reset_index(inplace=True)
        # Per-group summary. NOTE(review): df2 is built but not written to disk
        # or appended anywhere in this cell.
        df2 = pd.DataFrame()
        df2["Group Name"] = group_names
        df2["Group N"] = [(np.asarray(tukey.groups) == group_name).sum() for group_name in group_names]
        df2["Group Mean"] = means
        # Use the per-group half-widths `cis`; the previous code used the scalar
        # left over from the loop above, i.e. the last group's half-width for all rows.
        df2["Lower 95% CI"] = np.asarray(means) - np.asarray(cis)
        df2["Upper 95% CI"] = np.asarray(means) + np.asarray(cis)
        df2[" "] = [" "] * len(df2.index)
        dfs.append(df)

    dfs_joined = pd.concat(dfs, keys=pretty_methods)
    dfs_joined.reset_index(inplace=True)
    dfs_joined.columns =  ["Method"] + dfs_joined.columns[1:].tolist()
    dfs_joined.drop("level_1", axis="columns", inplace=True)
    # to_csv does not create missing directories.
    os.makedirs("statistical_dump", exist_ok=True)
    dfs_joined.to_csv("statistical_dump/{}_tukey_{}.tsv".format(phenotype, "lof" if lof_or_missense == 0 else "missense"), sep="\t", index=False)

    # y position of the top row; decremented once per group below.
    max_comparisons = ((len(groups) - 1) * len(methods)) + 1

    # Dashed reference line at the mean over all scores of the last Tukey run.
    ax.vlines(x=np.mean(tukey.data), ymin=0.5, ymax = max_comparisons + 0.5, linestyles=["--"], color=lightgrey, zorder=-10)
    ax.set_ylim((0.5, max_comparisons + 0.5))

    for group in groups[::-1]:
        for method, marker, df in zip(pretty_methods, markers, dfs):

            if group == "Peripheral Genes":
                color = grey
            else:
                # Highlight a group only if it differs significantly from the
                # peripheral genes. The lookup relies on "Peripheral Genes"
                # appearing as group2 in the Tukey table for every other group.
                if df.loc[(df["group1"] == group) & (df["group2"] == "Peripheral Genes"), "p-adj"].item() < 0.05:
                    color = color_
                else:
                    color = grey

            center = plot_df_means.loc[group, method]
            half = plot_df_ci.loc[group, method]
            lower, upper = center - half, center + half

            # draw Confidence Interval
            ax.plot((lower, upper),
                    (max_comparisons, max_comparisons),
                    color=color,
                    linewidth=0.5)

            # draw Whiskers
            ax.plot((lower, lower),
                    (max_comparisons + whiskers_length, max_comparisons - whiskers_length),
                    color=color,
                    linewidth=0.5)
            ax.plot((upper, upper),
                    (max_comparisons + whiskers_length, max_comparisons - whiskers_length),
                    color=color,
                    linewidth=0.5)

            # NOTE(review): the group is called "Mendelians" in `groups`, so the
            # two "Mendelian" comparisons below never match: the marker is drawn
            # for every group and the loop never breaks. Behaviour is kept
            # as-is; confirm whether "Mendelians" was intended.
            if group != "Mendelian":
                ax.scatter(center, max_comparisons, color=color, marker=marker, s=marker_size, linewidth=0.5, edgecolors='black', zorder=5)

            if group == "Mendelian":
                break

        max_comparisons -= 1

    label = ["LoF Z-Score", "Missense Z-Score"][lof_or_missense]

    ax.set_xlabel(label, size=label_size- 2)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(0.2))
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter("{x:.1f}"))

# One tick per group row; labels follow the bottom-to-top order of `groups`.
ticks = [1,2,3,4]
names = (pretty_methods[::-1] * 2) + ["Mendelian"]

axes[0].set_yticks(ticks)
axes[0].set_yticklabels(["Peripheral Genes", "CS 1 - 10", "CS 11", "Mendelian"], size=label_size-2)

plt.tight_layout()
plt.subplots_adjust(wspace=0.05)
if lof_or_missense == 0:
    plt.savefig("lof_scz_new4.svg", dpi=450)
else:
    plt.savefig("missense_scz_new4.svg", dpi=450)

In [None]:
import pandas as pd
from speos.visualization.settings import *
import matplotlib.pyplot as plt
from matplotlib import ticker
from matplotlib.lines import Line2D
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Score selector: 0 -> LoF Z-score ("lof_z"), 1 -> missense Z-score ("mis_z").
lof_or_missense = 0

# What is plotted: one phenotype, one method, four gene groups (one row each).
phenotypes = ["Rheumatoid Arthritis"]
methods, pretty_methods = ["film"], ["FiLM"]
groups = ["Peripheral Genes", "Ambivalent Genes", "Core Genes", "Mendelians"]

# Font sizes; small_font / medium_font are presumably provided by the settings star import.
tick_size, label_size = small_font, medium_font + 2

# Colour palette (hex codes).
crimson   = "#6f0000"
navy      = "#02055a"
jungle    = "#1e5631"
tan       = "#c24e00"
petrol    = "#005f6a"
grey      = "#bbbbbb"
lightgrey = "#dddddd"
pink      = "#EC5E71"
purple    = "#600F5C"
mint      = "#3EB489"

# Geometry of the forest plot.
width_ratios = [10, 1, 10, 1]
marker_size = 10
whiskers_length = 0.1

# One column per phenotype; with a single phenotype plt.subplots returns a bare Axes,
# so it is wrapped in a list to keep the zip()/axes[0] code further down working.
fig, axes = plt.subplots(
    ncols=len(phenotypes),
    sharey=True,
    figsize=((full_width * cm) * 0.33, 3 * cm),
)
#axes = axes.flatten()
axes = [axes]

# Per-phenotype highlight colour and per-method marker shape.
colors_ = [mint]
markers = ["^"]

# Forest plot of mutational-constraint Z-scores per gene group.
#
# For every phenotype/method pair this block
#   1. loads the predicted genes (outer-results JSON) and the gene table (TSV),
#   2. splits the genes into Mendelian / Core / Ambivalent / Peripheral groups,
#   3. runs a one-way ANOVA and a Tukey HSD test on the selected Z-score column,
#   4. dumps the Tukey table to statistical_dump/ and
#   5. draws group mean +/- simultaneous-CI half-width as one row per group.
import json
import os
from io import StringIO

import numpy as np

# The constraint table and the selected score column are the same for every
# phenotype and method, so they are loaded/derived once instead of per iteration.
pli_table = get_pli_table()
column = ["lof_z", "mis_z"][lof_or_missense]
description = ["LoF Z Value", "Missense Z Value"][lof_or_missense]

for phenotype, color_, ax in zip(phenotypes, colors_, axes):
    plot_df_means = pd.DataFrame(columns=pretty_methods, index=groups)
    plot_df_ci = pd.DataFrame(columns=pretty_methods, index=groups)
    dfs = []
    for method, pretty_method in zip(methods, pretty_methods):

        config_paths = {"Rheumatoid Arthritis": "config_ra_only_nohetio_{}_newstorage.yaml".format(method)}

        # NOTE(review): the config is parsed but nothing below reads it; kept in case
        # parse_yaml is relied on as a sanity check that the run exists -- confirm.
        config = Config()
        config.parse_yaml(config_paths[phenotype])

        outer_results_paths = {"Rheumatoid Arthritis": "/mnt/storage/speos/results/ra_{}_nohetioouter_results.json".format(method)}

        results_files = {"Rheumatoid Arthritis": "/mnt/storage/speos/results/ra_{}_nohetio_outer_0_fold_1.tsv".format(method)}

        with open(outer_results_paths[phenotype], "r") as file:
            results = json.load(file)[0]

        # Genes with a value of 11 or more are "core", every other predicted gene is "ambivalent".
        core_genes = {gene for gene, value in results.items() if value >= 11}
        ambivalent_genes = {gene for gene, value in results.items() if value < 11}

        gene_df = pd.read_csv(results_files[phenotype], sep="\t", header=0)
        all_genes = set(gene_df["hgnc"].tolist())
        mendelians = set(gene_df["hgnc"][gene_df["truth"] == 1].tolist())
        peripheral_genes = all_genes.difference(core_genes).difference(ambivalent_genes).difference(mendelians)

        # Score lookup restricted to genes that are both in the constraint table and in all_genes.
        hgnc2value = {hgnc: value for hgnc, value in zip(pli_table["gene"].tolist(), pli_table[column].tolist()) if hgnc in all_genes}

        # Intersect with the lookup's own keys so a predicted gene that is in the
        # constraint table but missing from all_genes is skipped instead of raising KeyError.
        core = [hgnc2value[hgnc] for hgnc in core_genes.intersection(hgnc2value)]
        mendelian = [hgnc2value[hgnc] for hgnc in mendelians.intersection(hgnc2value)]
        predicted = [hgnc2value[hgnc] for hgnc in ambivalent_genes.intersection(hgnc2value)]
        not_predicted = [hgnc2value[hgnc] for hgnc in peripheral_genes.intersection(hgnc2value)]

        result_predicted = f_oneway(core, mendelian, predicted, not_predicted)

        print("ANOVA for {} in Predicted Genes vs Non-Predicted Genes (Unknowns). p: {:.2e}, F: {}".format(description, result_predicted[1], round(result_predicted[0], 3)))

        df = pd.DataFrame({'score': mendelian + core + predicted + not_predicted,
                           'group': np.repeat(['Mendelians', 'Core Genes', 'Ambivalent Genes', 'Peripheral Genes'],
                                              repeats=[len(mendelian), len(core), len(predicted), len(not_predicted)])})

        tukey = pairwise_tukeyhsd(endog=df['score'],
                                  groups=df['group'],
                                  alpha=0.05)

        print(tukey.summary())

        # Parse the summary table into a DataFrame. The HTML is wrapped in StringIO because
        # passing literal HTML strings to read_html is deprecated in recent pandas releases.
        df = pd.read_html(StringIO(tukey.summary().as_html()), header=0, index_col=0)[0]
        # Replace the rounded p-values of the printed table with the unrounded ones.
        df["p-adj"] = tukey.pvalues
        df = df.reset_index()

        # halfwidths is only populated after the simultaneous CIs have been computed.
        if getattr(tukey, 'halfwidths', None) is None:
            tukey._simultaneous_ci()

        group_names = tukey.groupsunique
        means = tukey._multicomp.groupstats.groupmean
        cis = tukey.halfwidths

        for group, mean, ci in zip(group_names, means, cis):
            plot_df_means.loc[group, pretty_method] = mean
            plot_df_ci.loc[group, pretty_method] = ci

        # Per-group summary. It is not written to disk (only `df` is appended to `dfs`).
        # The bounds use the full `cis` array; the scalar loop variable `ci` would give
        # every group the half-width of the last group.
        df2 = pd.DataFrame({
            "Group Name": group_names,
            "Group N": [(np.asarray(tukey.groups) == group).sum() for group in group_names],
            "Group Mean": means,
            "Lower 95% CI": np.asarray(means) - np.asarray(cis),
            "Upper 95% CI": np.asarray(means) + np.asarray(cis),
            " ": " ",
        })
        dfs.append(df)

    # One table with all methods stacked, keyed by a leading "Method" column.
    dfs_joined = pd.concat(dfs, keys=pretty_methods)
    dfs_joined.reset_index(inplace=True)
    dfs_joined.columns = ["Method"] + dfs_joined.columns[1:].tolist()
    dfs_joined.drop("level_1", axis="columns", inplace=True)
    # Make sure the output directory exists so to_csv does not fail on a fresh checkout.
    os.makedirs("statistical_dump", exist_ok=True)
    dfs_joined.to_csv("statistical_dump/{}_tukey_{}.tsv".format(phenotype, "lof" if lof_or_missense == 0 else "missense"), sep="\t", index=False)

    max_comparisons = ((len(groups) - 1) * len(methods)) + 1

    # Dashed reference line at the mean of all scores (taken from the last Tukey run).
    ax.vlines(x=np.mean(tukey.data), ymin=0.5, ymax=max_comparisons + 0.5, linestyles=["--"], color=lightgrey, zorder=-10)
    ax.set_ylim((0.5, max_comparisons + 0.5))

    # Rows are filled from the top: the last entry of `groups` gets the highest y position.
    y_pos = max_comparisons
    for group in groups[::-1]:
        for pretty_method, marker, tukey_df in zip(pretty_methods, markers, dfs):

            if group == "Peripheral Genes":
                color = grey
            else:
                # Highlight a group only if it differs significantly from the peripheral genes.
                versus_peripheral = (tukey_df["group1"] == group) & (tukey_df["group2"] == "Peripheral Genes")
                significant = tukey_df.loc[versus_peripheral, "p-adj"].item() < 0.05
                color = color_ if significant else grey

            center = plot_df_means.loc[group, pretty_method]
            halfwidth = plot_df_ci.loc[group, pretty_method]
            lower, upper = center - halfwidth, center + halfwidth

            # draw Confidence Interval
            ax.plot((lower, upper), (y_pos, y_pos), color=color, linewidth=0.5)

            # draw Whiskers
            for x_end in (lower, upper):
                ax.plot((x_end, x_end),
                        (y_pos + whiskers_length, y_pos - whiskers_length),
                        color=color,
                        linewidth=0.5)

            # NOTE(review): this cell used to skip the marker (and stop after the first
            # method) when group == "Mendelian", but the group is spelled "Mendelians",
            # so that check never matched. The rendered output is preserved: every row
            # gets a marker. Confirm whether the Mendelian row should really have one.
            ax.scatter(center, y_pos, color=color, marker=marker, s=marker_size, linewidth=0.5, edgecolors='black', zorder=5)

        y_pos -= 1

    label = ["LoF Z-Score", "Missense Z-Score"][lof_or_missense]

    ax.set_xlabel(label, size=label_size - 2)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(0.2))
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter("{x:.1f}"))

# Row positions on the shared y-axis (bottom to top) and their display labels.
ticks = [1, 2, 3, 4]
#ticks = [1, 3, 5]
names = (list(reversed(pretty_methods)) * 2) + ["Mendelian"]

row_labels = ["Peripheral Genes", "CS 1 - 10", "CS 11", "Mendelian"]
left_ax = axes[0]
left_ax.set_yticks(ticks)
left_ax.set_yticklabels(row_labels, size=label_size - 2)

# Tighten the layout, then write the figure under a name that reflects the plotted score.
plt.tight_layout()
plt.subplots_adjust(wspace=0.05)
figure_file = "lof_ra_new4.svg" if lof_or_missense == 0 else "missense_ra_new4.svg"
plt.savefig(figure_file, dpi=450)

In [None]:
import pandas as pd
from speos.visualization.settings import *
import matplotlib.pyplot as plt
from matplotlib import ticker
from matplotlib.lines import Line2D
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Score selector: 0 -> LoF Z-score ("lof_z"), 1 -> missense Z-score ("mis_z").
lof_or_missense = 1

# What is plotted: one phenotype, one method, four gene groups (one row each).
phenotypes = ["Alzheimer's Disease"]
methods, pretty_methods = ["film"], ["FiLM"]
groups = ["Peripheral Genes", "Ambivalent Genes", "Core Genes", "Mendelians"]

# Font sizes; small_font / medium_font are presumably provided by the settings star import.
tick_size, label_size = small_font, medium_font + 2

# Colour palette (hex codes).
crimson   = "#6f0000"
navy      = "#02055a"
jungle    = "#1e5631"
tan       = "#c24e00"
petrol    = "#005f6a"
grey      = "#bbbbbb"
lightgrey = "#dddddd"
pink      = "#EC5E71"
purple    = "#600F5C"
mint      = "#3EB489"

# Geometry of the forest plot.
width_ratios = [10, 1, 10, 1]
marker_size = 10
whiskers_length = 0.1

# One column per phenotype; with a single phenotype plt.subplots returns a bare Axes,
# so it is wrapped in a list to keep the zip()/axes[0] code further down working.
fig, axes = plt.subplots(
    ncols=len(phenotypes),
    sharey=True,
    figsize=((full_width * cm) * 0.33, 3 * cm),
)
#axes = axes.flatten()
axes = [axes]

# Per-phenotype highlight colour and per-method marker shape.
colors_ = [mint]
markers = ["^"]

# Forest plot of mutational-constraint Z-scores per gene group.
#
# For every phenotype/method pair this block
#   1. loads the predicted genes (outer-results JSON) and the gene table (TSV),
#   2. splits the genes into Mendelian / Core / Ambivalent / Peripheral groups,
#   3. runs a one-way ANOVA and a Tukey HSD test on the selected Z-score column,
#   4. dumps the Tukey table to statistical_dump/ and
#   5. draws group mean +/- simultaneous-CI half-width as one row per group.
import json
import os
from io import StringIO

import numpy as np

# The constraint table and the selected score column are the same for every
# phenotype and method, so they are loaded/derived once instead of per iteration.
pli_table = get_pli_table()
column = ["lof_z", "mis_z"][lof_or_missense]
description = ["LoF Z Value", "Missense Z Value"][lof_or_missense]

for phenotype, color_, ax in zip(phenotypes, colors_, axes):
    plot_df_means = pd.DataFrame(columns=pretty_methods, index=groups)
    plot_df_ci = pd.DataFrame(columns=pretty_methods, index=groups)
    dfs = []
    for method, pretty_method in zip(methods, pretty_methods):

        config_paths = {"Alzheimer's Disease": "config_alz_only_nohetio_{}_newstorage.yaml".format(method)}

        # NOTE(review): the config is parsed but nothing below reads it; kept in case
        # parse_yaml is relied on as a sanity check that the run exists -- confirm.
        config = Config()
        config.parse_yaml(config_paths[phenotype])

        outer_results_paths = {"Alzheimer's Disease": "/mnt/storage/speos/results/alz_{}_nohetioouter_results.json".format(method)}

        results_files = {"Alzheimer's Disease": "/mnt/storage/speos/results/alz_{}_nohetio_outer_0_fold_1.tsv".format(method)}

        with open(outer_results_paths[phenotype], "r") as file:
            results = json.load(file)[0]

        # Genes with a value of 11 or more are "core", every other predicted gene is "ambivalent".
        core_genes = {gene for gene, value in results.items() if value >= 11}
        ambivalent_genes = {gene for gene, value in results.items() if value < 11}

        gene_df = pd.read_csv(results_files[phenotype], sep="\t", header=0)
        all_genes = set(gene_df["hgnc"].tolist())
        mendelians = set(gene_df["hgnc"][gene_df["truth"] == 1].tolist())
        peripheral_genes = all_genes.difference(core_genes).difference(ambivalent_genes).difference(mendelians)

        # Score lookup restricted to genes that are both in the constraint table and in all_genes.
        hgnc2value = {hgnc: value for hgnc, value in zip(pli_table["gene"].tolist(), pli_table[column].tolist()) if hgnc in all_genes}

        # Intersect with the lookup's own keys so a predicted gene that is in the
        # constraint table but missing from all_genes is skipped instead of raising KeyError.
        core = [hgnc2value[hgnc] for hgnc in core_genes.intersection(hgnc2value)]
        mendelian = [hgnc2value[hgnc] for hgnc in mendelians.intersection(hgnc2value)]
        predicted = [hgnc2value[hgnc] for hgnc in ambivalent_genes.intersection(hgnc2value)]
        not_predicted = [hgnc2value[hgnc] for hgnc in peripheral_genes.intersection(hgnc2value)]

        result_predicted = f_oneway(core, mendelian, predicted, not_predicted)

        print("ANOVA for {} in Predicted Genes vs Non-Predicted Genes (Unknowns). p: {:.2e}, F: {}".format(description, result_predicted[1], round(result_predicted[0], 3)))

        df = pd.DataFrame({'score': mendelian + core + predicted + not_predicted,
                           'group': np.repeat(['Mendelians', 'Core Genes', 'Ambivalent Genes', 'Peripheral Genes'],
                                              repeats=[len(mendelian), len(core), len(predicted), len(not_predicted)])})

        tukey = pairwise_tukeyhsd(endog=df['score'],
                                  groups=df['group'],
                                  alpha=0.05)

        print(tukey.summary())

        # Parse the summary table into a DataFrame. The HTML is wrapped in StringIO because
        # passing literal HTML strings to read_html is deprecated in recent pandas releases.
        df = pd.read_html(StringIO(tukey.summary().as_html()), header=0, index_col=0)[0]
        # Replace the rounded p-values of the printed table with the unrounded ones.
        df["p-adj"] = tukey.pvalues
        df = df.reset_index()

        # halfwidths is only populated after the simultaneous CIs have been computed.
        if getattr(tukey, 'halfwidths', None) is None:
            tukey._simultaneous_ci()

        group_names = tukey.groupsunique
        means = tukey._multicomp.groupstats.groupmean
        cis = tukey.halfwidths

        for group, mean, ci in zip(group_names, means, cis):
            plot_df_means.loc[group, pretty_method] = mean
            plot_df_ci.loc[group, pretty_method] = ci

        # Per-group summary. It is not written to disk (only `df` is appended to `dfs`).
        # The bounds use the full `cis` array; the scalar loop variable `ci` would give
        # every group the half-width of the last group.
        df2 = pd.DataFrame({
            "Group Name": group_names,
            "Group N": [(np.asarray(tukey.groups) == group).sum() for group in group_names],
            "Group Mean": means,
            "Lower 95% CI": np.asarray(means) - np.asarray(cis),
            "Upper 95% CI": np.asarray(means) + np.asarray(cis),
            " ": " ",
        })
        dfs.append(df)

    # One table with all methods stacked, keyed by a leading "Method" column.
    dfs_joined = pd.concat(dfs, keys=pretty_methods)
    dfs_joined.reset_index(inplace=True)
    dfs_joined.columns = ["Method"] + dfs_joined.columns[1:].tolist()
    dfs_joined.drop("level_1", axis="columns", inplace=True)
    # Make sure the output directory exists so to_csv does not fail on a fresh checkout.
    os.makedirs("statistical_dump", exist_ok=True)
    dfs_joined.to_csv("statistical_dump/{}_tukey_{}.tsv".format(phenotype, "lof" if lof_or_missense == 0 else "missense"), sep="\t", index=False)

    max_comparisons = ((len(groups) - 1) * len(methods)) + 1

    # Dashed reference line at the mean of all scores (taken from the last Tukey run).
    ax.vlines(x=np.mean(tukey.data), ymin=0.5, ymax=max_comparisons + 0.5, linestyles=["--"], color=lightgrey, zorder=-10)
    ax.set_ylim((0.5, max_comparisons + 0.5))

    # Rows are filled from the top: the last entry of `groups` gets the highest y position.
    y_pos = max_comparisons
    for group in groups[::-1]:
        for pretty_method, marker, tukey_df in zip(pretty_methods, markers, dfs):

            if group == "Peripheral Genes":
                color = grey
            else:
                # Highlight a group only if it differs significantly from the peripheral genes.
                versus_peripheral = (tukey_df["group1"] == group) & (tukey_df["group2"] == "Peripheral Genes")
                significant = tukey_df.loc[versus_peripheral, "p-adj"].item() < 0.05
                color = color_ if significant else grey

            center = plot_df_means.loc[group, pretty_method]
            halfwidth = plot_df_ci.loc[group, pretty_method]
            lower, upper = center - halfwidth, center + halfwidth

            # draw Confidence Interval
            ax.plot((lower, upper), (y_pos, y_pos), color=color, linewidth=0.5)

            # draw Whiskers
            for x_end in (lower, upper):
                ax.plot((x_end, x_end),
                        (y_pos + whiskers_length, y_pos - whiskers_length),
                        color=color,
                        linewidth=0.5)

            # NOTE(review): this cell used to skip the marker (and stop after the first
            # method) when group == "Mendelian", but the group is spelled "Mendelians",
            # so that check never matched. The rendered output is preserved: every row
            # gets a marker. Confirm whether the Mendelian row should really have one.
            ax.scatter(center, y_pos, color=color, marker=marker, s=marker_size, linewidth=0.5, edgecolors='black', zorder=5)

        y_pos -= 1

    label = ["LoF Z-Score", "Missense Z-Score"][lof_or_missense]

    ax.set_xlabel(label, size=label_size - 2)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(0.2))
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter("{x:.1f}"))

# Row positions on the shared y-axis (bottom to top) and their display labels.
ticks = [1, 2, 3, 4]
#ticks = [1, 3, 5]
names = (list(reversed(pretty_methods)) * 2) + ["Mendelian"]

row_labels = ["Peripheral Genes", "CS 1 - 10", "CS 11", "Mendelian"]
left_ax = axes[0]
left_ax.set_yticks(ticks)
left_ax.set_yticklabels(row_labels, size=label_size - 2)

# Tighten the layout, then write the figure under a name that reflects the plotted score.
plt.tight_layout()
plt.subplots_adjust(wspace=0.05)
figure_file = "lof_ad_new4.svg" if lof_or_missense == 0 else "missense_ad_new4.svg"
plt.savefig(figure_file, dpi=450)