In [None]:
import pandas as pd
import numpy as np
import torch

from speos.preprocessing.handler import InputHandler
from speos.utils.config import Config
from speos.preprocessing.datasets import DatasetBootstrapper

In [None]:
import os
# Move the working directory up one level so that the relative paths used
# below (config YAML files, "hsps/", "extensions/") resolve from there.
# NOTE(review): this is not idempotent - every re-run of this cell moves up
# one more directory. Run it exactly once per kernel session.
os.chdir("..")

In [None]:
# Load the UC configuration, build its preprocessor and fetch the graph.
# `config` and `uc_prepro` are reused by later cells.
config = Config()
config.parse_yaml("config_uc_only_nohetio_film_newstorage.yaml")
uc_prepro = InputHandler(config).get_preprocessor()
uc_prepro.get_data()
G = uc_prepro.get_graph()

# Disabled set-up for the CAD and SCZ configurations. Kept as comments rather
# than as a bare string literal, which the notebook would echo as cell output.
#
# config = Config()
# config.parse_yaml("config_cad_really_only_nohetio_film_newstorage.yaml")
# cad_prepro = InputHandler(config).get_preprocessor()
# cad_prepro.get_data()
#
# config = Config()
# config.parse_yaml("config_scz_only_nohetio_film_newstorage.yaml")
# scz_prepro = InputHandler(config).get_preprocessor()
# scz_prepro.get_data()

# Get Some Statistics about the Graph

In [None]:
# Number of edges in the graph (counted without materialising a list).
sum(1 for _edge in G.edges)

In [None]:
# Every node that occurs as the first or second endpoint of at least one edge.
connected_nodes = {edge[position] for position in (0, 1) for edge in G.edges}
len(connected_nodes)

In [None]:
X, y, adj = uc_prepro.get_data()
X.shape

In [None]:
len(connected_nodes) - X.shape[0]

In [None]:
 (len(connected_nodes) - X.shape[0]) / X.shape[0]

In [None]:
y.sum()

In [None]:
[uc_prepro.hgnc2id["TNFSF15"]]

# must be 15506

In [None]:
data = uc_prepro.get_data()


# Start Analysis

In [None]:

dataset = DatasetBootstrapper(holdout_size=config.input.holdout_size, name=config.name, config=config).get_dataset()

In [None]:
# Per edge type: how many entries in row 1 of the edge index equal PARK7's id.
park7_id = uc_prepro.hgnc2id["PARK7"]
adjacency = {}
for edge_type, edge_pairs in dataset.data.edge_index_dict.items():
    second_row = edge_pairs[1, :]
    # The dictionary is keyed by the second element of the edge-type key.
    adjacency[edge_type[1]] = (second_row == park7_id).sum()

In [None]:
adjacency

In [None]:
sum([value for value in adjacency.values()])

In [None]:
# Materialise all edges together with their attribute dictionaries.
# (The original used square brackets, which subscripts the `list` type
# instead of calling it and never produced a list of edges.)
edges = list(G.edges(data=True))

In [None]:
# Same count as for the dataset, but on the raw adjacency dictionary returned
# by the preprocessor: entries of row 1 that equal PARK7's id, per key.
park7_id = uc_prepro.hgnc2id["PARK7"]
adjacency = {}
for adjacency_name, edge_pairs in adj.items():
    second_row = edge_pairs[1, :]
    adjacency[adjacency_name] = (second_row == park7_id).sum()

In [None]:
sum([value for value in adjacency.values()])

In [None]:

from speos.utils.nn_utils import typed_edges_to_sparse_tensor

# Collapse the typed edge dictionary into one sparse tensor, then expose it as
# a dense two-row index in both orientations.
edge_index, encoder = typed_edges_to_sparse_tensor(dataset.data.x, dataset.data.edge_index_dict)
sparse_rows = edge_index.storage.row()
sparse_cols = edge_index.storage.col()
edge_index_flat = torch.vstack((sparse_rows, sparse_cols))
edge_index_flat_reversed = torch.vstack((sparse_cols, sparse_rows))
#edge_index_flat = add_remaining_self_loops(edge_index_flat)[0]
#edge_index_new = SparseTensor(row = edge_index_flat[0, :], col= edge_index_flat[1, :])

In [None]:
import json
from extensions.preprocessing import preprocess_labels

def get_coregenes(trait: str, background, results_dir: str = "/mnt/storage/speos/results", consensus_score: int = 11):
    """Collect the gene sets of one trait, restricted to a background gene set.

    Args:
        trait: short trait key; one of "uc", "cad", "scz", "ad", "ra".
            Any other value raises KeyError.
        background: iterable of gene symbols that make up the universe
            (e.g. ``prepro.id2hgnc.values()``).
        results_dir: directory holding ``<name>_film_nohetioouter_results.json``.
            Defaults to the location that used to be hard-coded.
        consensus_score: score a candidate must have to count as a core gene.
            Defaults to the previously hard-coded 11.

    Returns:
        A tuple ``(allcore, other_coregenes, hsps, noncore)``:
        - allcore: set of label genes plus candidates with ``consensus_score``,
          intersected with the background.
        - other_coregenes: list of candidates with any other score, restricted
          to the background so callers can map every entry through ``hgnc2id``.
        - hsps: list read from the first column of ``hsps/<trait>.txt``
          (NOT filtered by the background).
        - noncore: background genes that are in neither of the two sets above.
    """
    trait2name = {"uc": "uc",
                  "cad": "cad_really",
                  "scz": "scz",
                  "ad": "alz",
                  "ra": "ra"}
    name = trait2name[trait]

    # Materialise once: callers usually pass a dict view, which would otherwise
    # be converted to a set several times.
    background_set = set(background)

    mendelians = preprocess_labels("extensions/{}_only_genes.tsv".format(name))

    hsps = pd.read_csv("hsps/{}.txt".format(trait), header=None, index_col=None).iloc[:, 0].tolist()

    with open("{}/{}_film_nohetioouter_results.json".format(results_dir, name), "r") as file:
        # The JSON holds a list; its first element maps candidate -> score.
        candidate2cs = json.load(file)[0]

    coregenes = [gene for gene, score in candidate2cs.items() if score == consensus_score]

    other_coregenes = [gene for gene, score in candidate2cs.items()
                       if score != consensus_score and gene in background_set]

    allcore = (set(coregenes) | set(mendelians)) & background_set

    noncore = background_set.difference(allcore).difference(other_coregenes)

    return allcore, other_coregenes, hsps, noncore

In [None]:
# Gene sets for UC, using all genes known to the UC preprocessor as background.
uc_core, uc_weakcore, uc_hsps, uc_noncore = get_coregenes("uc", uc_prepro.id2hgnc.values())

# Translate gene symbols into node indices.
symbol_to_index = uc_prepro.hgnc2id
uc_core_indices = torch.LongTensor([symbol_to_index[symbol] for symbol in uc_core])
uc_core_indices_weak = torch.LongTensor([symbol_to_index[symbol] for symbol in uc_weakcore])

 

In [None]:
dataset.data.y.long().sum()
# must be 379

In [None]:
# Start from the dataset's own labels and additionally mark the core genes.
# .clone() is required: Tensor.long() returns the very same tensor when the
# dtype is already int64, and the in-place write below would then silently
# modify dataset.data.y as well.
coregenes = dataset.data.y.long().clone()
coregenes[uc_core_indices] = 1

# Separate 0/1 indicator for the weak core genes.
coregenes_weak = torch.zeros_like(coregenes)
coregenes_weak[uc_core_indices_weak] = 1

# Last expression of the cell, so the number of core genes is displayed.
coregenes.sum()

In [None]:
G.in_degree(uc_prepro.hgnc2id["PARK7"])
# must be 120, or 251?

In [None]:
(edge_index_flat[1, :] == uc_prepro.hgnc2id["PARK7"]).sum()

In [None]:
G.out_degree(uc_prepro.hgnc2id["PARK7"])
# must be 0

# See if HSPs are "closer" to core genes

In [None]:
from torch_geometric.nn.models import LabelPropagation
from statsmodels.stats.multitest import fdrcorrection
import matplotlib.pyplot as plt

# 0/1 indicator over all nodes, 1 for every UC HSP.
hsp_y = torch.zeros_like(dataset.data.y)
hsp_indices = [uc_prepro.hgnc2id[symbol] for symbol in uc_hsps]
hsp_y[np.asarray(hsp_indices)] = 1


def plot_labelprop(edge_index, hsp_y, coregenes, coregenes_weak, prepro, weights=None):
    """Propagate HSP labels over the graph and compare core with peripheral genes.

    Label propagation (alpha=0.9) is run with 1, 3 and 5 layers, first along
    ``edge_index`` and then along the edges with both rows swapped. Each of the
    six settings gets one box-plot panel. Genes that are HSPs themselves, weak
    core genes, and genes with a propagated value of 0 are excluded from every
    panel. The two groups of a panel are compared with a Mann-Whitney U test;
    the six p-values are FDR-corrected together and drawn as significance marks.

    Note: matplotlib's global rcParams (font family, tick and axis sizes) are
    changed and stay changed after the call.

    Args:
        edge_index: edge tensor addressed as ``[0, :]`` and ``[1, :]``.
        hsp_y: per-node indicator of the seed (HSP) genes.
        coregenes: per-node indicator of core genes.
        coregenes_weak: per-node indicator of weak core genes.
        prepro: preprocessor; the values of ``prepro.id2hgnc`` label the rows.
        weights: optional edge weights, passed on as ``edge_weight``.

    Returns:
        ``(fig, df, pvals, stats)``: the figure, the unfiltered DataFrame of the
        last setting, the FDR-corrected p-values and the raw U statistics.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    from scipy.stats import mannwhitneyu

    full_width = 18
    cm = 1/2.54
    small_font = 6

    mpl.rcParams.update({
        'font.family': 'Helvetica',
        'xtick.labelsize': small_font,
        'ytick.labelsize': small_font,
        'axes.linewidth': 0.4,
        'ytick.major.size': 3,
        'ytick.major.width': 0.5,
        'ytick.minor.size': 2,
        'ytick.minor.width': 0.3,
        'xtick.major.size': 2,
        'xtick.major.width': 0.3,
        'xtick.minor.size': 1,
        'xtick.minor.width': 0.1,
    })

    reverse_edge_index = torch.vstack((edge_index[1, :], edge_index[0, :]))

    fig, axes = plt.subplots(1, 6, figsize=(full_width*cm, 5*cm), sharey=False)

    # Three propagation depths along the edges, then the same three against them.
    depths = (1, 3, 5, 1, 3, 5)
    directions = [edge_index] * 3 + [reverse_edge_index] * 3

    pvals = []
    stats = []
    ys = []

    for panel, (num_layers, edges, ax) in enumerate(zip(depths, directions, axes)):
        model = LabelPropagation(num_layers=num_layers, alpha=0.9)
        out = model(hsp_y.long(), edges, edge_weight=weights)

        df = pd.DataFrame()
        df["HGNC"] = list(prepro.id2hgnc.values())
        df["coregenes"] = coregenes
        df["weak_coregenes"] = coregenes_weak
        df["total_coregenes"] = coregenes_weak + coregenes
        df["hsp"] = hsp_y
        df["propagated"] = out[:, 1]

        # Keep only genes that are no seed, no weak core gene, and were reached.
        keep = (df["hsp"] == 0) & (df["weak_coregenes"] == 0) & (df["propagated"] > 0)
        new_df = df[keep]

        is_core = new_df["coregenes"] == 1
        is_peripheral = new_df["coregenes"] == 0
        core_values = new_df["propagated"][is_core]
        peripheral_values = new_df["propagated"][is_peripheral]

        ax = sns.boxplot(new_df, x="coregenes", y="propagated", fliersize=0.3, ax=ax, order=[1, 0], palette={0: "darkgray", 1: "#01016f"}, linewidth=1)
        ax.set_ylabel("Propagated z'" if panel == 0 else "")
        ax.set_xlabel("")
        ax.set_xticklabels(["Core\n(n={})".format(is_core.sum()), "Peripheral\n(n={})".format(is_peripheral.sum())])
        # Clip the y axis at the 99th percentile so outliers do not flatten the boxes.
        ax.set_ylim((0, np.quantile(new_df["propagated"], 0.99)))

        statistic, pvalue = mannwhitneyu(core_values, peripheral_values)
        pvals.append(pvalue)
        stats.append(statistic)

        ax.tick_params(axis='x', labelrotation=90)
        # Anchor for the significance mark: the higher of the two upper quartiles.
        ys.append(max(np.quantile(core_values, 0.75), np.quantile(peripheral_values, 0.75)))

    pvals = fdrcorrection(pvals)[1]
    thresholds = ((0.001, "***"), (0.01, "**"), (0.05, "*"))
    for ax, pval, anchor in zip(axes, pvals, ys):
        mark = next((label for cutoff, label in thresholds if pval < cutoff), "n.s.")
        ax.text(0.5, y=anchor * 1.2, s=mark, fontsize=small_font, ha="center")

    return fig, df, pvals, stats

# Draw the six-panel label-propagation figure for the HSP indicator built
# above on the flattened edge index, without edge weights.
fig, df, pvals, stats = plot_labelprop(edge_index_flat, hsp_y, coregenes, coregenes_weak, prepro=uc_prepro)

plt.tight_layout()

# Saved relative to the current working directory.
plt.savefig("label_propagation_pyg.svg", bbox_inches="tight")

In [None]:
hsp_y.sum()

In [None]:
pvals

# Get Connection Statistics

In [None]:
from torch_geometric.utils import degree
from collections import Counter
from speos.visualization.settings import *
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

# Node degrees: row 0 of the edge index gives out-degrees, row 1 in-degrees.
num_nodes = dataset.data.x.shape[0]
out_degrees = degree(edge_index_flat[0, :], num_nodes)
in_degrees = degree(edge_index_flat[1, :], num_nodes)
total_degrees = in_degrees + out_degrees

# Node classes as boolean masks. A node is peripheral only if it is neither
# core, weak core nor HSP. The previous `1 - (a + b + c)` test also selected
# nodes belonging to two classes at once (sum 2 -> -1, which is non-zero).
core_mask = coregenes > 0
hsp_mask = hsp_y > 0
peripheral_mask = (coregenes + coregenes_weak + hsp_y) == 0

out_degree_core = out_degrees[core_mask]
out_degree_hsp = out_degrees[hsp_mask]
out_degree_peripheral = out_degrees[peripheral_mask]

in_degree_core = in_degrees[core_mask]
in_degree_hsp = in_degrees[hsp_mask]
in_degree_peripheral = in_degrees[peripheral_mask]

total_degree_core = total_degrees[core_mask]
total_degree_hsp = total_degrees[hsp_mask]
total_degree_peripheral = total_degrees[peripheral_mask]


def _degree_histogram(degree_values):
    """Map each degree value to the number of nodes having it."""
    # flatten() instead of squeeze(): a single-element group must still give a list.
    return Counter(degree_values.flatten().tolist())


out_core_counter = _degree_histogram(out_degree_core)
out_hsp_counter = _degree_histogram(out_degree_hsp)
out_peripheral_counter = _degree_histogram(out_degree_peripheral)

in_core_counter = _degree_histogram(in_degree_core)
in_hsp_counter = _degree_histogram(in_degree_hsp)
in_peripheral_counter = _degree_histogram(in_degree_peripheral)

total_core_counter = _degree_histogram(total_degree_core)
total_hsp_counter = _degree_histogram(total_degree_hsp)
total_peripheral_counter = _degree_histogram(total_degree_peripheral)

# Order inside each list: peripheral, core, HSP (matches the colours below).
out_counter = [out_peripheral_counter, out_core_counter, out_hsp_counter]
in_counter = [in_peripheral_counter, in_core_counter, in_hsp_counter]
# Fixed: the core counter was missing here and the HSP counter listed twice.
total_counter = [total_peripheral_counter, total_core_counter, total_hsp_counter]

# Class sizes used for the legend and for the "degree 0" percentages; they now
# refer to exactly the node sets that are plotted.
class_sizes = (int(peripheral_mask.sum()), int(core_mask.sum()), int(hsp_mask.sum()))
class_colors = ("#5a5a5a", "#01016f", "#d8031c")

fig, axes = plt.subplots(1, 4, figsize=(full_width*cm*1.3, 5*cm*1.3), sharey=True, width_ratios=(3, 3, 3, 1.2))

panels = zip((out_counter, in_counter, total_counter),
             axes[:3],
             ("Out-Degree", "In-Degree", "Total Degree"),
             (1e5, 1e3 * 1.3, 1e4 * 6.2))

for counters, ax, title, xval in panels:
    # Degree 0 cannot be drawn on a log axis, so it is reported as text instead.
    ax.text(xval, 1e3 * 2, "Degree 0:", color="black", fontsize=8, ha="right")
    for counter, color, yval, totalnum in zip(counters, class_colors, (1e3, 1e3 * 0.5, 1e3 * 0.25), class_sizes):
        if counter:
            degree_values, frequencies = zip(*counter.items())
            ax.scatter(degree_values, frequencies, marker='.', color=color, alpha=0.1)
        zero_share = (counter[0] / totalnum) * 100 if totalnum else 0.0
        ax.text(xval, yval, "{} ({:.1f}%)".format(counter[0], zero_share), color=color, fontsize=8, ha="right")

    # prep axes
    ax.set_xlabel(title)
    ax.set_xscale('log')
    if title == "Out-Degree":
        ax.set_ylabel('Frequency')
    ax.set_yscale('log')

# The fourth axis only carries the legend.
legend_ax = axes[3]
legend_elements = [Patch(facecolor=class_colors[0], edgecolor=class_colors[0],
                         label='Peripheral\nn={}'.format(class_sizes[0])),
                   Patch(facecolor=class_colors[1], edgecolor=class_colors[1],
                         label='Core Gene\nn={}'.format(class_sizes[1])),
                   Patch(facecolor=class_colors[2], edgecolor=class_colors[2],
                         label='HSP\nn={}'.format(class_sizes[2]))]

leg = legend_ax.legend(handles=legend_elements, loc='center', title="Node Class", fontsize=6.8, title_fontsize=8, ncol=1, columnspacing=1.7, handletextpad=-0.5, labelspacing=1.7)

for patch in leg.get_patches():
    patch.set_height(15)
    patch.set_width(5)
    patch.set_y(-5)
legend_ax.set_axis_off()

plt.savefig("degree_distributions.svg", bbox_inches="tight")

In [None]:
os.getcwd()

In [None]:
import matplotlib as mpl

# Stand-alone horizontal colorbar ("Reds" colormap, ticks at 0, 0.5 and 1),
# saved as its own SVG. `full_width` and `cm` are not defined in this cell;
# presumably they come from the speos.visualization.settings star import - TODO confirm.
fig, ax = plt.subplots(figsize=(full_width*cm*0.1, 0.2*cm))
col_map = plt.get_cmap('Reds')
cbar = mpl.colorbar.ColorbarBase(ax, cmap=col_map, orientation = 'horizontal', ticks=[0,  0.5,  1])
cbar.ax.tick_params(labelsize=5)
cbar.set_label(label="Propagated $Z'$",size=6,weight='bold')
# A second, manually placed axes is added to the figure with both of its axis
# objects hidden. Nothing is drawn into it because the call that would use it
# is commented out below.
# NOTE(review): looks like a leftover from a template - confirm it is needed.
c_map_ax = fig.add_axes([0.2, 0.8, 0.6, 0.02])
c_map_ax.axes.get_xaxis().set_visible(False)
c_map_ax.axes.get_yaxis().set_visible(False)
plt.tight_layout()
# Disabled: second colorbar drawn into the manually placed axes.
#mpl.colorbar.ColorbarBase(c_map_ax, cmap=col_map, orientation = 'horizontal', )
plt.savefig("colorbar.svg")

In [None]:
# 2x2 contingency counts: core genes / HSPs with and without outgoing edges.
# ("isolated" here means out-degree 0; in-degree is not considered.)
core_positions = coregenes.nonzero()
hsp_positions = hsp_y.nonzero()
no_outgoing = out_degrees == 0
has_outgoing = out_degrees > 0

core_and_isolated = no_outgoing[core_positions].sum()
hsp_and_isolated = no_outgoing[hsp_positions].sum()
core_not_isolated = has_outgoing[core_positions].sum()
hsp_not_isolated = has_outgoing[hsp_positions].sum()

In [None]:
from scipy.stats import fisher_exact

# Contingency table: rows are HSP / core genes, columns are
# "out-degree 0" / "out-degree > 0". Fisher's exact test on that table.
array = np.asarray([[hsp_and_isolated, hsp_not_isolated],
                    [core_and_isolated, core_not_isolated]])

fisher_exact(array)

In [None]:
from scipy.stats import mannwhitneyu

# Out-degrees of core genes and of HSPs (recomputed here, overwriting the
# variables of the same name from the degree-statistics cell).
out_degree_core = out_degrees[coregenes.nonzero()]
out_degree_hsp = out_degrees[hsp_y.nonzero()]

# Mann-Whitney U test on the two out-degree samples, nodes with degree 0 included.
mannwhitneyu(out_degree_core, out_degree_hsp)

In [None]:
from scipy.stats import mannwhitneyu

# Same samples as in the previous cell.
out_degree_core = out_degrees[coregenes.nonzero()]
out_degree_hsp = out_degrees[hsp_y.nonzero()]

# Mann-Whitney U test restricted to nodes with at least one outgoing edge.
mannwhitneyu(out_degree_core[out_degree_core > 0], out_degree_hsp[out_degree_hsp > 0])

In [None]:
# Degree frequency over all nodes. Fixed: the tensors are called `out_degrees`
# and `in_degrees`; `out_degree` / `in_degree` are not defined anywhere above.
out_degree_counts = Counter(out_degrees.tolist())
in_degree_counts = Counter(in_degrees.tolist())

fig, axes = plt.subplots(2, 1, figsize=(3, 6))

for counter, ax, title, color in zip((out_degree_counts, in_degree_counts), axes, ("Out-Degree", "In-Degree"), ("#03CAF7", "#59D52F")):
    # Named so that the notebook-level label tensor `y` is not overwritten.
    degree_values, frequencies = zip(*counter.items())

    # prep axes (log-log, with 10% head room above the largest value)
    ax.set_xlabel('degree')
    ax.set_xscale('log')
    ax.set_xlim(0.9, max(degree_values) + 0.1 * max(degree_values))

    ax.set_ylabel('frequency')
    ax.set_yscale('log')
    ax.set_ylim(0.9, max(frequencies) + 0.1 * max(frequencies))

    # do plot
    ax.scatter(degree_values, frequencies, marker='.', color=color)
    ax.set_title(title)

plt.tight_layout()
plt.show()

# HSPs from other Phenotypes

In [None]:
# UC HSPs, read from the first column of the tab-separated file, as seeds.
hsps = pd.read_csv("hsps/uc.txt", header=None, index_col=None, sep="\t")
hsp_y = torch.zeros_like(dataset.data.y)
hsp_indices = [uc_prepro.hgnc2id[symbol] for symbol in hsps.iloc[:, 0]]
hsp_y[np.asarray(hsp_indices)] = 1

fig, df, pvals, stats = plot_labelprop(edge_index_flat, hsp_y, coregenes, coregenes_weak, prepro=uc_prepro)

#test_df_list.append(pvals)
plt.tight_layout()
plt.savefig("label_propagation_pyg_uc.svg", bbox_inches="tight")

In [None]:


# Edge weights loaded from disk; presumably one weight per column of
# edge_index_flat, in the same order - TODO confirm.
edge_weights = torch.load("edge_attributions_tensor_UC.pt")

# Same figure as before, but the propagation is weighted. plot_labelprop passes
# the same weights to the runs on the reversed edges.
fig, df, pvals, stats = plot_labelprop(edge_index_flat, hsp_y, coregenes, coregenes_weak, weights=edge_weights, prepro=uc_prepro)

#test_df_list.append(pvals)
plt.tight_layout()

plt.savefig("label_propagation_pyg_uc_weighted.pdf", bbox_inches="tight")

In [None]:
# CAD HSPs as seeds on the UC graph.
hsps = pd.read_csv("hsps/cad.txt", header=None, index_col=None, sep="\t")
# Skip symbols that the UC preprocessor does not know (same guard as the SCZ
# cell below); without it a single missing symbol raises KeyError.
hsp_indices = [uc_prepro.hgnc2id[hgnc] for hgnc in hsps.iloc[:, 0] if hgnc in uc_prepro.hgnc2id]
hsp_y = torch.zeros_like(dataset.data.y)
hsp_y[np.asarray(hsp_indices)] = 1

fig, df, pvals, stats = plot_labelprop(edge_index_flat, hsp_y, coregenes, coregenes_weak, prepro=uc_prepro)

plt.tight_layout()

plt.savefig("label_propagation_pyg_cad.pdf", bbox_inches="tight")

In [None]:

# SCZ HSPs as seeds on the UC graph; symbols unknown to the UC preprocessor
# are skipped.
hsps = pd.read_csv("hsps/scz.txt", header=None, index_col=None, sep="\t")
known_symbols = uc_prepro.hgnc2id
hsp_y = torch.zeros_like(dataset.data.y)
hsp_indices = [known_symbols[symbol] for symbol in hsps.iloc[:, 0] if symbol in known_symbols]
hsp_y[np.asarray(hsp_indices)] = 1

fig, df, pvals, stats = plot_labelprop(edge_index_flat, hsp_y, coregenes, coregenes_weak, prepro=uc_prepro)
plt.tight_layout()
plt.savefig("label_propagation_pyg_scz.pdf", bbox_inches="tight")

# Other Traits' Core Genes

In [None]:
# Rebuild everything for the CAD configuration.
# NOTE: this rebinds the notebook-level names `config`, `prepro`, `dataset`,
# `edge_index`, `edge_index_flat`, `coregenes`, `coregenes_weak` and `hsp_y`,
# which held the UC objects until here. `edge_index_flat_reversed` is NOT
# refreshed by this cell.
config = Config()
config.parse_yaml("config_cad_really_only_nohetio_film_newstorage.yaml")
prepro = InputHandler(config).get_preprocessor()
prepro.build_graph(adjacency=False)

dataset = DatasetBootstrapper(holdout_size=config.input.holdout_size, name=config.name, config=config).get_dataset()

edge_index, encoder = typed_edges_to_sparse_tensor(dataset._data.x, dataset._data.edge_index_dict)
edge_index_flat = torch.vstack((edge_index.storage.row(), edge_index.storage.col()))

core, weakcore, _, _ = get_coregenes("cad", prepro.id2hgnc.values())

core_indices = torch.LongTensor([prepro.hgnc2id[hgnc] for hgnc in core])
core_indices_weak = torch.LongTensor([prepro.hgnc2id[hgnc] for hgnc in weakcore])

# .clone(): Tensor.long() returns the same tensor if the labels are already
# int64, and the in-place write below would then alter the dataset's labels.
coregenes = dataset._data.y.long().clone()
coregenes[core_indices] = 1
print("CAD: {} core genes".format(coregenes.sum()))

coregenes_weak = torch.zeros_like(coregenes)
coregenes_weak[core_indices_weak] = 1

hsps = pd.read_csv("hsps/cad.txt", header=None, index_col=None, sep="\t")
# Skip symbols unknown to this preprocessor (same guard as in the SCZ cell).
hsp_indices = [prepro.hgnc2id[hgnc] for hgnc in hsps.iloc[:, 0] if hgnc in prepro.hgnc2id]
# Use `_data` like the rest of this cell instead of mixing `data` and `_data`.
hsp_y = torch.zeros_like(dataset._data.y)
hsp_y[np.asarray(hsp_indices)] = 1

fig, df, pvals, stats = plot_labelprop(edge_index_flat, hsp_y, coregenes, coregenes_weak, prepro=prepro)

plt.tight_layout()

plt.savefig("label_propagation_pyg_target_cad.svg", bbox_inches="tight")

In [None]:
# Rebuild everything for the SCZ configuration.
# NOTE: this rebinds the notebook-level names `config`, `prepro`, `dataset`,
# `edge_index`, `edge_index_flat`, `coregenes`, `coregenes_weak` and `hsp_y`.
# `edge_index_flat_reversed` is NOT refreshed by this cell.
config = Config()
config.parse_yaml("config_scz_only_nohetio_film_newstorage.yaml")
prepro = InputHandler(config).get_preprocessor()
prepro.build_graph(adjacency=False)

dataset = DatasetBootstrapper(holdout_size=config.input.holdout_size, name=config.name, config=config).get_dataset()

edge_index, encoder = typed_edges_to_sparse_tensor(dataset._data.x, dataset._data.edge_index_dict)
edge_index_flat = torch.vstack((edge_index.storage.row(), edge_index.storage.col()))

core, weakcore, _, _ = get_coregenes("scz", prepro.id2hgnc.values())

core_indices = torch.LongTensor([prepro.hgnc2id[hgnc] for hgnc in core])
core_indices_weak = torch.LongTensor([prepro.hgnc2id[hgnc] for hgnc in weakcore])

# .clone(): Tensor.long() returns the same tensor if the labels are already
# int64, and the in-place write below would then alter the dataset's labels.
coregenes = dataset._data.y.long().clone()
coregenes[core_indices] = 1
print("SCZ: {} core genes".format(coregenes.sum()))

coregenes_weak = torch.zeros_like(coregenes)
coregenes_weak[core_indices_weak] = 1

hsps = pd.read_csv("hsps/scz.txt", header=None, index_col=None, sep="\t")
# Symbols unknown to this preprocessor are skipped.
hsp_indices = [prepro.hgnc2id[hgnc] for hgnc in hsps.iloc[:, 0] if hgnc in prepro.hgnc2id]
# Use `_data` like the rest of this cell instead of mixing `data` and `_data`.
hsp_y = torch.zeros_like(dataset._data.y)
hsp_y[np.asarray(hsp_indices)] = 1

fig, df, pvals, stats = plot_labelprop(edge_index_flat, hsp_y, coregenes, coregenes_weak, prepro=prepro)

plt.tight_layout()

plt.savefig("label_propagation_pyg_target_scz.svg", bbox_inches="tight")

# Get Background Distribution

In [None]:
def get_pvals(_hsp_y, _edge_index, _coregenes, _coregenes_weak, weights=None):
    """Mann-Whitney p-values of core vs. non-core genes after label propagation.

    Propagation (alpha=0.9) is run with 1, 3 and 5 layers along ``_edge_index``
    and then with 1, 3 and 5 layers along the edges with both rows swapped.
    Seed genes, weak core genes and genes with a propagated value of 0 are
    dropped before each test.

    Args:
        _hsp_y: per-node indicator of the seed genes.
        _edge_index: edge tensor addressed as ``[0, :]`` and ``[1, :]``.
        _coregenes: per-node indicator of core genes.
        _coregenes_weak: per-node indicator of weak core genes.
        weights: optional edge weights, passed on as ``edge_weight``.

    Returns:
        List of six uncorrected p-values, ordered 1, 3, 5 layers (forward)
        followed by 1, 3, 5 layers (reversed).
    """
    from scipy.stats import mannwhitneyu

    swapped = torch.vstack((_edge_index[1, :], _edge_index[0, :]))

    def _pvalue_for(depth, edges):
        """Run one propagation setting and test core against non-core genes."""
        propagation = LabelPropagation(num_layers=depth, alpha=0.9)
        scores = propagation(_hsp_y.long(), edges, edge_weight=weights)

        table = pd.DataFrame()
        table["coregenes"] = _coregenes
        table["weak_coregenes"] = _coregenes_weak
        table["total_coregenes"] = _coregenes_weak + _coregenes
        table["hsp"] = _hsp_y
        table["propagated"] = scores[:, 1]

        keep = (table["hsp"] == 0) & (table["weak_coregenes"] == 0) & (table["propagated"] > 0)
        reached = table[keep]

        core_scores = reached["propagated"][reached["coregenes"] == 1]
        other_scores = reached["propagated"][reached["coregenes"] == 0]
        return mannwhitneyu(core_scores, other_scores)[1]

    return [_pvalue_for(depth, edges) for edges in (_edge_index, swapped) for depth in (1, 3, 5)]

def get_random_background(trait, weights=False, nrandom = 100, use_seed=1):
    """Compare real HSP sets against randomly drawn seed sets for one trait.

    Builds the graph and core gene indicators of ``trait``, computes the six
    label-propagation p-values (see ``get_pvals``) for the UC, CAD and SCZ HSP
    lists and for ``nrandom`` random seed sets of the same size as the trait's
    own HSP list, FDR-corrects all p-values together and writes the table to
    ``random_labelprop_target_<trait>_<weighted|unweighted>.tsv``.

    Args:
        trait: "uc", "cad" or "scz" ("cad" is mapped to the "cad_really" files).
        weights: if True, use the element-wise maximum over the per-gene edge
            attribution tensors found on disk as edge weights.
        nrandom: number of successful random draws to collect.
        use_seed: seed for Python's ``random`` module, set before the draws.

    Returns:
        The DataFrame that was written to disk (rows "UC", "CAD", "SCZ",
        "Random0", ...; one column per propagation setting).

    Raises:
        ValueError: if ``weights`` is True but no attribution file could be loaded.
        RuntimeError: if 100 random draws in a row fail with ValueError, which
            indicates a systematic problem rather than an unlucky draw.
    """
    import torch
    from random import choice, seed
    from speos.utils.nn_utils import typed_edges_to_sparse_tensor

    file_tag = trait if trait != "cad" else trait + "_really"

    config = Config()
    config.parse_yaml("config_{}_only_nohetio_film_newstorage.yaml".format(file_tag))
    prepro = InputHandler(config).get_preprocessor()
    prepro.build_graph(adjacency=False)

    dataset = DatasetBootstrapper(holdout_size=config.input.holdout_size, name=config.name, config=config).get_dataset()

    edge_index, encoder = typed_edges_to_sparse_tensor(dataset._data.x, dataset._data.edge_index_dict)
    edge_index_flat = torch.vstack((edge_index.storage.row(), edge_index.storage.col()))

    core, weakcore, _, _ = get_coregenes(trait, prepro.id2hgnc.values())
    core_indices = torch.LongTensor([prepro.hgnc2id[hgnc] for hgnc in core])
    core_indices_weak = torch.LongTensor([prepro.hgnc2id[hgnc] for hgnc in weakcore])

    # .clone(): Tensor.long() returns the same tensor if the labels are already
    # int64, and the in-place write below would then alter the dataset's labels.
    coregenes = dataset._data.y.long().clone()
    coregenes[core_indices] = 1
    print("{}: {} core genes".format(trait, coregenes.sum()))

    coregenes_weak = torch.zeros_like(coregenes)
    coregenes_weak[core_indices_weak] = 1

    if weights:
        edge_attributions = []
        for gene in [prepro.id2hgnc[idx.item()] for idx in coregenes.nonzero()]:
            try:
                edge_attributions.append(torch.load("/mnt/storage/speos/explanations/{}_film_nohetio_ig_attr_edge_total_{}.pt".format(file_tag, gene)).detach().float().cpu().numpy())
            except (FileNotFoundError, RuntimeError):
                # Best effort: genes without a loadable attribution file are skipped.
                continue

        if not edge_attributions:
            raise ValueError("no edge attribution file could be loaded for trait '{}'".format(trait))

        # Per edge: the largest attribution over all core genes.
        edge_weights = torch.Tensor(np.asarray(edge_attributions).max(axis=0))
    else:
        edge_weights = None

    seed(use_seed)
    test_df_list = []

    # Rows 0-2: the real HSP lists of the three traits, evaluated on this graph.
    for anothertrait in ["uc", "cad", "scz"]:
        hsps = pd.read_csv("hsps/{}.txt".format(anothertrait), header=None, index_col=None, sep="\t").iloc[:, 0].tolist()
        hsp_indices = [prepro.hgnc2id[hsp] for hsp in hsps if hsp in prepro.hgnc2id.keys()]
        hsp_y = torch.zeros_like(dataset._data.y)
        hsp_y[np.asarray(hsp_indices)] = 1

        test_df_list.append(get_pvals(hsp_y, edge_index_flat, coregenes, coregenes_weak, edge_weights))

    hsps = pd.read_csv("hsps/{}.txt".format(trait), header=None, index_col=None, sep="\t").iloc[:, 0].tolist()

    # Built once; the random stream is the same as when rebuilding it per draw.
    # NOTE(review): nodes are drawn with replacement, so a random set can hold
    # fewer distinct seeds than len(hsps) - confirm this is intended.
    candidate_ids = list(prepro.hgnc2id.values())
    max_consecutive_failures = 100
    consecutive_failures = 0
    runs = 0
    while runs < nrandom:
        try:
            hsp_indices = [choice(candidate_ids) for _ in range(len(hsps))]
            hsp_y = torch.zeros_like(dataset._data.y)
            hsp_y[np.asarray(hsp_indices)] = 1

            pvals = get_pvals(hsp_y, edge_index_flat, coregenes, coregenes_weak, edge_weights)
        except ValueError as error:
            # A draw whose test cannot be computed is replaced by a new one,
            # but do not loop forever if every draw fails.
            consecutive_failures += 1
            if consecutive_failures >= max_consecutive_failures:
                raise RuntimeError("{} random draws in a row failed for trait '{}'".format(max_consecutive_failures, trait)) from error
            continue

        consecutive_failures = 0
        test_df_list.append(pvals)
        runs += 1

    test_df_list = np.asarray(test_df_list)
    old_shape = test_df_list.shape

    adjusted = fdrcorrection(test_df_list.flatten())[1].reshape(old_shape)
    test_df = pd.DataFrame(adjusted, index=["UC", "CAD", "SCZ"] + ["Random{}".format(i) for i in range(nrandom)], columns=["1","3","5","1_rev", "3_rev", "5_rev"])
    test_df.to_csv("random_labelprop_target_{}_{}.tsv".format(trait, "weighted" if edge_weights is not None else "unweighted"), sep="\t")

    return test_df



In [None]:
# NOTE(review): within the visible notebook `new_df` is only assigned as a
# local variable inside plot_labelprop / get_pvals and, at notebook level, in a
# later cell. On a fresh top-to-bottom run this cell therefore likely raises
# NameError - confirm and remove or move it.
new_df

In [None]:
get_random_background("uc", nrandom=1000, weights=False)

In [None]:
# Remaining trait / weighting combinations, 1000 random draws each.
remaining_runs = (("uc", True), ("cad", False), ("cad", True), ("scz", False), ("scz", True))
for trait_name, use_weights in remaining_runs:
    get_random_background(trait_name, nrandom=1000, weights=use_weights)

In [None]:
from joblib import Parallel, delayed

traits = ["uc", "cad", "scz"]

# Every trait once without and once with edge weights.
combinations = []
for trait in traits:
    combinations.extend([(trait, False), (trait, True)])

# One worker per combination.
jobs = (delayed(get_random_background)(trait, nrandom=1000, weights=weights) for (trait, weights) in combinations)
Parallel(n_jobs=6)(jobs)

In [None]:
from random import choice, seed
from scipy.stats import mannwhitneyu
from speos.visualization.settings import *

# Random background: 500 random seed sets of the same size as the UC HSP list,
# each evaluated with the six propagation settings.
# NOTE(review): `prepro`, `dataset`, `edge_index_flat`, `coregenes`,
# `coregenes_weak` and `edge_weights` are whatever earlier cells bound last -
# confirm they all belong to the same trait before trusting the result.
test_df_list = []

seed(1)

# Loop invariants, hoisted: reading the file and listing the candidate ids
# once gives the same random draws as doing it in every iteration.
hsps = pd.read_csv("hsps/uc.txt", header=None, index_col=None, sep="\t")
candidate_ids = list(prepro.hgnc2id.values())

# Derived from the current edge index so it cannot be stale from another graph.
edge_index_flat_reversed = torch.vstack((edge_index_flat[1, :], edge_index_flat[0, :]))
propagation_settings = list(zip((1, 3, 5, 1, 3, 5), [edge_index_flat] * 3 + [edge_index_flat_reversed] * 3))

for draw in range(500):
    hsp_indices = [choice(candidate_ids) for _ in range(len(hsps))]
    # Fixed: `new_y` used to be created from itself and was undefined on a
    # fresh kernel; the label tensor serves as the shape/dtype template.
    new_y = torch.zeros_like(dataset.data.y)
    new_y[np.asarray(hsp_indices)] = 1

    pvals = []
    for num_layers, edges in propagation_settings:
        model = LabelPropagation(num_layers=num_layers, alpha=0.9)
        out = model(new_y.long(), edges, edge_weight=edge_weights)

        df = pd.DataFrame()
        df["HGNC"] = list(prepro.id2hgnc.values())
        df["coregenes"] = coregenes
        df["weak_coregenes"] = coregenes_weak
        df["total_coregenes"] = coregenes_weak + coregenes
        df["hsp"] = new_y
        df["propagated"] = out[:, 1]

        # Drop seeds, weak core genes and genes the propagation did not reach.
        new_df = df[df["hsp"] == 0]
        new_df = new_df[new_df["weak_coregenes"] == 0]
        new_df = new_df[new_df["propagated"] > 0]

        pval = mannwhitneyu(new_df["propagated"][new_df["coregenes"] == 1], new_df["propagated"][new_df["coregenes"] == 0])[1]
        pvals.append(pval)
    test_df_list.append(pvals)


In [None]:
# Turn the list of p-value rows into an array (this rebinds `test_df_list`
# from a list to an ndarray).
test_df_list = np.asarray(test_df_list)
old_shape = test_df_list.shape

# FDR-correct all p-values together, then restore the row/column layout.
adjusted = fdrcorrection(test_df_list.flatten())[1].reshape(old_shape)

In [None]:
(test_df_list < 0.05).sum(axis=0)

In [None]:
(adjusted < 0.05).sum(axis=0)

In [None]:
# One row labelled "UC" followed by 500 rows labelled Random0..Random499.
# NOTE(review): the random-draw cell above appends exactly 500 rows and no
# observed UC row, so `adjusted` would have 500 rows against 501 labels here -
# confirm where the observed UC p-values are meant to be inserted.
test_df = pd.DataFrame(adjusted, index=["UC"] + ["Random{}".format(i) for i in range(500)], columns=["1","3","5","1_rev", "3_rev", "5_rev"])

In [None]:
(test_df.sort_values("1").index == "UC").nonzero()[0] / len(test_df)

In [None]:
(test_df.sort_values("3").index == "UC").nonzero()[0] / len(test_df)

In [None]:
(test_df.sort_values("5").index == "UC").nonzero()[0] / len(test_df)

In [None]:
(test_df.sort_values("1_rev").index == "UC").nonzero()[0] / len(test_df)

In [None]:
(test_df.sort_values("3_rev").index == "UC").nonzero()[0] / len(test_df)

In [None]:
(test_df.sort_values("5_rev").index == "UC").nonzero()[0] / len(test_df)

In [None]:
# UC: replace row 0 of the background table with the observed UC p-values,
# FDR-correct everything together and report the rank fraction of the UC row.
# NOTE(review): `cad_test_df_list` is assigned in a later cell and `uc_pvals`
# is not assigned in the visible notebook - this cell depends on hidden state.
uc_test_df_list = np.array(cad_test_df_list)  # independent copy
uc_test_df_list[0] = uc_pvals
old_shape = uc_test_df_list.shape

adjusted = fdrcorrection(uc_test_df_list.flatten())[1].reshape(old_shape)
# Fixed: the observed row was labelled "CAD" although it holds the UC values.
test_df = pd.DataFrame(adjusted, index=["UC"] + ["Random{}".format(i) for i in range(old_shape[0] - 1)], columns=["1","3","5","1_rev", "3_rev", "5_rev"])

# Position of the observed row after sorting by each column, relative to the
# number of rows.
for column in test_df.columns:
    print((test_df.sort_values(column).index == "UC").nonzero()[0] / len(test_df))

# For CAD

In [None]:
# CAD: replace row 0 of the background table with the observed CAD p-values.
# np.array() makes an independent copy. `test_df_list[:]` on an ndarray is
# only a view, so the row assignment used to overwrite `test_df_list` itself.
# NOTE(review): `cad_pvals` is not assigned in the visible notebook.
cad_test_df_list = np.array(test_df_list)
cad_test_df_list[0] = cad_pvals
old_shape = cad_test_df_list.shape

adjusted = fdrcorrection(cad_test_df_list.flatten())[1].reshape(old_shape)
test_df = pd.DataFrame(adjusted, index=["CAD"] + ["Random{}".format(i) for i in range(old_shape[0] - 1)], columns=["1","3","5","1_rev", "3_rev", "5_rev"])

# Position of the observed row after sorting by each column, relative to the
# number of rows.
for column in test_df.columns:
    print((test_df.sort_values(column).index == "CAD").nonzero()[0] / len(test_df))

In [None]:
# SCZ: replace row 0 of the background table with the observed SCZ p-values.
# Fixed: the array used to be rebuilt from `cad_test_df_list`, so the SCZ
# values were discarded and the CAD table was evaluated a second time.
# np.array() makes an independent copy so `test_df_list` stays untouched.
# NOTE(review): `scz_pvals` is not assigned in the visible notebook.
scz_test_df_list = np.array(test_df_list)
scz_test_df_list[0] = scz_pvals
old_shape = scz_test_df_list.shape

adjusted = fdrcorrection(scz_test_df_list.flatten())[1].reshape(old_shape)
test_df = pd.DataFrame(adjusted, index=["SCZ"] + ["Random{}".format(i) for i in range(old_shape[0] - 1)], columns=["1","3","5","1_rev", "3_rev", "5_rev"])

# Position of the observed row after sorting by each column, relative to the
# number of rows.
for column in test_df.columns:
    print((test_df.sort_values(column).index == "SCZ").nonzero()[0] / len(test_df))

In [None]:
test_df

In [None]:
import matplotlib.patches as patches
from scipy.stats import rankdata
import matplotlib.ticker as tck
import seaborn as sns
from speos.visualization.settings import *
import matplotlib.pyplot as plt

df = pd.read_csv("random_labelprop_target_uc_unweighted.tsv", index_col=0, header=0, sep="\t")

def _draw_reference_lines(ax, threshold, target_level, other_levels, threshold_xmax, marker_xmax):
    """Draw one panel's horizontal reference lines, each starting at x=0."""
    ax.hlines(threshold, 0, threshold_xmax, color="#5a5a5a", zorder=2)
    ax.hlines(target_level, 0, marker_xmax, color="#d8031c", zorder=1)
    for level in other_levels:
        ax.hlines(level, 0, marker_xmax, color="green", zorder=1)


def plot_random_prop(df, target="UC", others="CAD/SCZ"):
    """Plot the -log10 distribution of each column of ``df`` with reference lines.

    A row of seven panels is created: up to six data panels (one per column of
    ``df``, in column order) followed by a legend-only panel. Each data panel shows
    a KDE of ``-log10(df[column])`` along the y axis, a grey line at the 95th
    percentile of those values, a red line at the ``target`` row's value and a
    green line for every label in ``others``. The panel is annotated with
    ``p=<rank of the target value among all rows> / <number of rows>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index contains ``target`` and every label in ``others``.
        The layout assumes six columns; values are expected to be positive so
        that ``log10`` is finite (presumably p-values - confirm against the
        code that writes the TSV files).
    target : str
        Index label of the row highlighted in red and used for the annotation.
    others : str
        One or more index labels separated by ``"/"``; each is drawn in green.
        The full string is also used verbatim in the legend.

    Returns
    -------
    (fig, ax)
        The figure and the last Axes visited by the loop (the legend panel when
        ``df`` has six columns).
    """
    fig, axes = plt.subplots(nrows=1, ncols=7, figsize=(full_width*cm, 5*cm))
    other_labels = others.split("/")
    for ax, str_ind, ind in zip(axes.tolist(), df.columns.tolist() + ["None"], range(7)):
        if ind < 6:
            neg_log_values = np.log10(df[str_ind]) * -1
            target_level = np.log10(df.loc[target, str_ind]) * -1
            other_levels = [np.log10(df.loc[label, str_ind]) * -1 for label in other_labels]

            # NOTE(review): seaborn documents `fill` as a boolean; the string is only
            # truthy here and presumably does not set the colour - confirm intent.
            ax = sns.kdeplot(y=neg_log_values, cut=0, fill="lightblue", ax=ax)
            value = np.quantile(neg_log_values, 0.95)
            # Rank of the target's value among all rows, as a fraction of the row count.
            pval = (rankdata(df[str_ind])[df[str_ind] == df.loc[target, str_ind]] / len(df))[0]
            text = "p={:.3f}".format(pval.item())

            # First pass with fixed lengths: drawn before the limits are read, so these
            # lines are part of what the axis limits below are computed from.
            _draw_reference_lines(ax, value, target_level, other_levels, 0.05, 0.1)
            ax.set_ylabel("")

            xlim = ax.get_xlim()
            ylim = ax.get_ylim()
            # Second pass with lengths proportional to the panel's x range.
            _draw_reference_lines(ax, value, target_level, other_levels, xlim[1]*0.4, xlim[1]*0.8)
            ax.text(x=np.mean(xlim), y=ylim[1] * 0.9, s=text, fontsize=5, zorder=7, ha="center")

            # Restore the limits read above so the second pass does not rescale the panel.
            ax.set_xlim(xlim)
            ax.set_ylim((-0.05, ylim[1]))
            if ind == 0:
                ax.set_ylabel("-log(p)")
        else:
            # Last panel: legend only, no data.
            legend_elements = [patches.Patch(facecolor='#d8031c', edgecolor='#d8031c',
                                    label='{} HSPs'.format(target)),
                                patches.Patch(facecolor='green', edgecolor='green',
                                        label='{} HSPs'.format(others)),
                                patches.Patch(facecolor='#5a5a5a', edgecolor='#5a5a5a',
                                        label='95th Percentile')]

            leg = ax.legend(handles=legend_elements, loc='center', title="p-Values", fontsize=8, title_fontsize=8, ncol=1, columnspacing=1.7, handletextpad=-0.2, labelspacing=1.7)

            # Resize the legend handles to 10 x 10 and shift them down.
            for patch in leg.get_patches():
                patch.set_height(10)
                patch.set_width(10)
                patch.set_y(-2.5)
            ax.set_axis_off()
    plt.tight_layout()
    plt.subplots_adjust(wspace=0.4)

    return fig, ax

# Draw the panels with the function's defaults (target="UC", others="CAD/SCZ").
fig, ax = plot_random_prop(df)

# Save the current figure as SVG.
plt.savefig("pvals_labelprop_uc_new.svg")


In [None]:
# UC target, weighted table (per file name): load it, draw the panels, export as SVG.
df = pd.read_csv(
    "random_labelprop_target_uc_weighted.tsv",
    sep="\t",
    header=0,
    index_col=0,
)
fig, ax = plot_random_prop(df, target="UC", others="CAD/SCZ")
plt.savefig("pvals_labelprop_uc_weighted.svg")

In [None]:
# Display df as most recently loaded.
df

In [None]:
# CAD target, unweighted table (per file name): load it, draw the panels, export as SVG.
df = pd.read_csv(
    "random_labelprop_target_cad_unweighted.tsv",
    sep="\t",
    header=0,
    index_col=0,
)
fig, ax = plot_random_prop(df, target="CAD", others="UC/SCZ")
plt.savefig("pvals_labelprop_cad_unweighted.svg")

In [None]:
# CAD target, weighted table (per file name).
df = pd.read_csv("random_labelprop_target_cad_weighted.tsv", index_col=0, header=0, sep="\t")
# Fix: pass the CAD target explicitly. The original call used the function defaults
# (target="UC", others="CAD/SCZ"), so the figure saved as "cad_weighted" highlighted
# and annotated the UC row instead of the CAD row.
fig, ax = plot_random_prop(df, target="CAD", others="UC/SCZ")

plt.savefig("pvals_labelprop_cad_weighted.svg")

In [None]:
# SCZ target, unweighted table (per file name): load it, draw the panels, export as SVG.
df = pd.read_csv(
    "random_labelprop_target_scz_unweighted.tsv",
    sep="\t",
    header=0,
    index_col=0,
)
fig, ax = plot_random_prop(df, target="SCZ", others="UC/CAD")
plt.savefig("pvals_labelprop_scz_unweighted.svg")

In [None]:
# SCZ target, weighted table (per file name): load it, draw the panels, export as SVG.
df = pd.read_csv(
    "random_labelprop_target_scz_weighted.tsv",
    sep="\t",
    header=0,
    index_col=0,
)
fig, ax = plot_random_prop(df, target="SCZ", others="UC/CAD")
plt.savefig("pvals_labelprop_scz_weighted.svg")

# Other Centrality measures

In [None]:
# Rebuild the UC configuration and preprocessor from the YAML file.
# NOTE: this rebinds `config` and `uc_prepro`, which are also created near the top of the notebook.
config = Config()
config.parse_yaml("config_uc_only_nohetio_film_newstorage.yaml")
uc_prepro = InputHandler(config).get_preprocessor()
# Earlier way of obtaining the graph, left disabled in favour of build_graph below.
#G = uc_prepro.get_graph()
# NOTE(review): presumably builds the graph and stores it on the preprocessor
# (the next cell reads uc_prepro.G); the effect of adjacency=True is not visible here - confirm.
uc_prepro.build_graph(adjacency=True)

In [None]:
# Take the graph object stored on the preprocessor; this rebinds the notebook-level name G.
G = uc_prepro.G

In [None]:
import networkx as nx 
# PageRank score per node of G with networkx defaults; indexed by node id in later cells.
pagerank = nx.pagerank(G)

In [None]:
# Degree centrality per node of G; indexed by node id in later cells.
degree_centrality = nx.degree_centrality(G)

In [None]:
# Betweenness is estimated from k=100 randomly sampled nodes rather than from all nodes.
# The seed is fixed so the estimate, and the comparison built on it below, is
# reproducible across kernel restarts (the original call drew a new sample each run).
betweenness = nx.betweenness_centrality(G, k=100, seed=42)

In [None]:
# Degree of every node id in 0 .. len(id2hgnc) - 1, keyed by that id.
degree = dict(
    (node_id, G.degree([node_id])[node_id])
    for node_id in range(len(uc_prepro.id2hgnc))
)

In [None]:
# Split the gene symbols known to the preprocessor into four groups for the "uc" setting.
# NOTE(review): the exact meaning of each returned group is defined by get_coregenes, which is
# not visible here; later cells iterate `allcore` and use list(noncore) + hsps as the comparison group.
allcore, other_coregenes, hsps, noncore = get_coregenes("uc", background=uc_prepro.id2hgnc.values())

In [None]:
import seaborn as sns
import numpy as np
from scipy.stats import mannwhitneyu

# Betweenness of the `allcore` genes versus the remaining genes (noncore plus hsps).
core_values = [betweenness[uc_prepro.hgnc2id[gene]] for gene in allcore]
peripheral_genes = list(noncore) + hsps
peri_values = [betweenness[uc_prepro.hgnc2id[gene]] for gene in peripheral_genes]

group_labels = ["Core"] * len(core_values) + ["Peripheral"] * len(peri_values)
sns.boxplot(y=core_values + peri_values, x=group_labels, showfliers=False)
# Last expression of the cell: the test result is shown as the cell output.
mannwhitneyu(core_values, peri_values)

In [None]:
import seaborn as sns
import numpy as np
from scipy.stats import mannwhitneyu

# Node degree of the `allcore` genes versus the remaining genes (noncore plus hsps).
core_values = [degree[uc_prepro.hgnc2id[gene]] for gene in allcore]
peripheral_genes = list(noncore) + hsps
peri_values = [degree[uc_prepro.hgnc2id[gene]] for gene in peripheral_genes]

# The plot uses log10(value + 1); the test below runs on the untransformed values.
log_values = np.log10(np.asarray(core_values + peri_values) + 1)
group_labels = ["Core"] * len(core_values) + ["Peripheral"] * len(peri_values)
sns.boxplot(y=log_values, x=group_labels, showfliers=False)
# Last expression of the cell: the test result is shown as the cell output.
mannwhitneyu(core_values, peri_values)

In [None]:
# Degree centrality of the `allcore` genes versus the remaining genes (noncore plus hsps).
core_values = [degree_centrality[uc_prepro.hgnc2id[gene]] for gene in allcore]
peripheral_genes = list(noncore) + hsps
peri_values = [degree_centrality[uc_prepro.hgnc2id[gene]] for gene in peripheral_genes]

# The plot uses log10(value + 1); the test below runs on the untransformed values.
log_values = np.log10(np.asarray(core_values + peri_values) + 1)
group_labels = ["Core"] * len(core_values) + ["Peripheral"] * len(peri_values)
sns.boxplot(y=log_values, x=group_labels, showfliers=False)
# Last expression of the cell: the test result is shown as the cell output.
mannwhitneyu(core_values, peri_values)

In [None]:
import seaborn as sns
import numpy as np
from scipy.stats import mannwhitneyu

# PageRank of the `allcore` genes versus the remaining genes (noncore plus hsps).
core_values = [pagerank[uc_prepro.hgnc2id[gene]] for gene in allcore]
peripheral_genes = list(noncore) + hsps
peri_values = [pagerank[uc_prepro.hgnc2id[gene]] for gene in peripheral_genes]

group_labels = ["Core"] * len(core_values) + ["Peripheral"] * len(peri_values)
sns.boxplot(y=core_values + peri_values, x=group_labels, showfliers=False)
# Last expression of the cell: the test result is shown as the cell output.
mannwhitneyu(core_values, peri_values)

In [None]:
# Mean of peri_values as most recently assigned (when run top to bottom: the PageRank values from the cell above).
np.mean(peri_values)

In [None]:
# Show the feature names reported by the preprocessor.
uc_prepro.get_feature_names()