In [None]:
from speos.utils.config import Config
from speos.preprocessing.handler import InputHandler
import os
# Switch the working directory to the parent folder; the relative paths used later
# ("./extensions/...", "./hsps/...") are resolved from there.
# NOTE: not idempotent - re-running this cell moves up one more level.
os.chdir("..")


In [None]:
import json
from extensions.preprocessing import preprocess_labels
import pandas as pd
def get_coregenes(trait: str, background):
    """Assemble the gene groups used in this notebook for one trait.

    Args:
        trait: Trait key; one of "uc", "cad", "scz", "ad", "ra".
        background: Iterable of gene identifiers that delimits the gene universe.

    Returns:
        A tuple ``(allcore, other_coregenes, hsps, noncore)`` where

        * ``allcore`` is the set of candidates scored exactly 11 in the results
          JSON plus the labels returned by ``preprocess_labels``, restricted to
          ``background``;
        * ``other_coregenes`` is the list of candidates with any other score
          (not restricted to ``background``);
        * ``hsps`` is the first column of ``./hsps/<trait>.txt`` as a list;
        * ``noncore`` is the set of background genes in neither of the first two.

    Raises:
        KeyError: if ``trait`` is not one of the known keys.
    """
    # File stems differ from the trait key for some traits.
    file_stem = {
        "uc": "uc",
        "cad": "cad_really",
        "scz": "scz",
        "ad": "alz",
        "ra": "ra",
    }[trait]

    mendelians = preprocess_labels("./extensions/{}_only_genes.tsv".format(file_stem))

    hsp_table = pd.read_csv("./hsps/{}.txt".format(trait), header=None, index_col=None)
    hsps = hsp_table.iloc[:, 0].tolist()

    results_path = "/mnt/storage/speos/results/{}_film_nohetioouter_results.json".format(file_stem)
    with open(results_path, "r") as handle:
        # The first element of the JSON document maps candidate -> score.
        candidate2cs = json.load(handle)[0]

    # Split candidates by whether they reach the score 11.
    coregenes = []
    other_coregenes = []
    for candidate, score in candidate2cs.items():
        if score == 11:
            coregenes.append(candidate)
        else:
            other_coregenes.append(candidate)

    allcore = (set(coregenes) | set(mendelians)) & set(background)
    noncore = set(background) - allcore - set(other_coregenes)

    return allcore, other_coregenes, hsps, noncore

In [None]:

# Trait key used throughout the notebook; it also selects the config file below.
trait= "uc"

# Parse the trait-specific YAML config and obtain a preprocessor from the input handler.
config = Config()
config.parse_yaml("/home/ubuntu/speos/config_{}_only_nohetio_film_newstorage.yaml".format(trait))
prepro = InputHandler(config).get_preprocessor()
# NOTE(review): presumably this populates prepro.id2hgnc / prepro.entrez2id, which later cells read - confirm.
prepro.build_graph(adjacency=False)

In [None]:
# Partition genes into groups, using the preprocessor's HGNC symbols as background.
allcore, other_coregenes, hsps,  noncore = get_coregenes(trait, prepro.id2hgnc.values())
# NOTE(review): this path is built from `trait` directly, whereas get_coregenes maps the trait
# through trait2name first. The two agree for "uc" but would differ for "cad" and "ad".
mendelians = preprocess_labels("./extensions/{}_only_genes.tsv".format(trait))

In [None]:
# Read the tab-separated CAGE peak annotation file (first 7 lines skipped, next line used as header)
# and keep only its "short_description" column.
descr = pd.read_csv("/mnt/storage/cage/hg19.cage_peak_phase1and2combined_ann.txt", header=0, skiprows=7, sep="\t")["short_description"]

In [None]:
# Break each description into its comma-separated entries and keep the part after "@";
# entries without an "@" are dropped.
split = [
    peak_name.split("@")[1]
    for description in descr
    for peak_name in description.split(",")
    if "@" in peak_name
]
# Discard targets whose name contains "chr".
split = [target for target in split if "chr" not in target]

In [None]:
from collections import Counter

# Seed every HGNC symbol known to the preprocessor with a zero count,
# then add one count per parsed peak target.
counter = Counter(dict.fromkeys(prepro.id2hgnc.values(), 0))
counter.update(split)
counter

In [None]:
# Look up the TSS count of every gene, group by group (same order as the targets on the left).
coregene_counts, mendelian_counts, peripheral_counts, hsp_counts = (
    [counter[gene] for gene in members]
    for members in (allcore, mendelians, noncore, hsps)
)

In [None]:
# Number of peripheral (non-core) genes that received a count.
len(peripheral_counts)

In [None]:
from scipy.stats import mannwhitneyu

# Mann-Whitney U test: TSS counts of core genes vs. peripheral genes.
mannwhitneyu(coregene_counts, peripheral_counts)

In [None]:
# Mann-Whitney U test: TSS counts of core genes vs. HSP genes.
mannwhitneyu(coregene_counts, hsp_counts)

In [None]:
# Mann-Whitney U test: TSS counts of peripheral genes vs. HSP genes.
mannwhitneyu(peripheral_counts, hsp_counts)

In [None]:
from seaborn import boxplot

# Long-format table: one row per (group, gene) entry holding the gene's TSS count and its group label.
df = pd.DataFrame(
    data={
        "TSS Counts": [*coregene_counts, *mendelian_counts, *hsp_counts, *peripheral_counts],
        "Group": [
            label
            for label, counts in (
                ("Core Gene", coregene_counts),
                ("Mendelian", mendelian_counts),
                ("HSP", hsp_counts),
                ("Peripheral", peripheral_counts),
            )
            for _ in counts
        ],
    }
)

boxplot(data=df, x="Group", y="TSS Counts")

In [None]:

# Rebuild the counter from the peak targets alone (no zero-seeded entries);
# a Counter still yields 0 when asked for a gene it has never seen.
counter = Counter(split)

coregene_counts, peripheral_counts, hsp_counts = (
    [counter[gene] for gene in members]
    for members in (allcore, noncore, hsps)
)

# Pairwise Mann-Whitney U tests between the three groups.
print(mannwhitneyu(coregene_counts, peripheral_counts))
print(mannwhitneyu(coregene_counts, hsp_counts))
print(mannwhitneyu(peripheral_counts, hsp_counts))

# Long-format table for plotting: values and labels are laid out in the same group order.
df = pd.DataFrame(
    data={
        "TSS Counts": [*coregene_counts, *hsp_counts, *peripheral_counts],
        "Group": [
            label
            for label, counts in (
                ("Core Gene", coregene_counts),
                ("HSP", hsp_counts),
                ("Peripheral", peripheral_counts),
            )
            for _ in counts
        ],
    }
)

boxplot(data=df, x="Group", y="TSS Counts")

In [None]:
# Load the space-separated gene-level GWAS table for the (upper-cased) trait; only GENE and ZSTAT are kept.
# NOTE(review): file naming suggests MAGMA gene-analysis output - confirm.
gwas = pd.read_csv("/home/ubuntu/speos/data/gwas/{}.genes.out".format(trait.upper()), sep=" ", header=0, usecols=["GENE", "ZSTAT"])

In [None]:
# Map HGNC symbol -> GWAS ZSTAT. GENE values are translated via entrez2id and id2hgnc;
# rows whose GENE is not a key of prepro.entrez2id are skipped.
hgnc2gwas = {prepro.id2hgnc[prepro.entrez2id[gene]]: zstat for gene, zstat in zip(gwas["GENE"], gwas["ZSTAT"]) if gene in prepro.entrez2id.keys()}


In [None]:
# Number of HGNC symbols that received a GWAS Z-score.
len(hgnc2gwas)

In [None]:
# Two parallel lists over the genes with a GWAS Z-score: TSS count and Z-score, in dict order.
tss = [counter[hgnc] for hgnc in hgnc2gwas]
zscore = list(hgnc2gwas.values())

In [None]:
from scipy.stats import spearmanr

# Spearman rank correlation between per-gene TSS count and GWAS Z-score.
spearmanr(tss, zscore)

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of TSS count (x) against GWAS Z-score (y), one point per gene.
plt.scatter(tss, zscore)

In [None]:
# GWAS Z-score threshold separating "strong" from "weak" genes.
cutoff = 5
# NOTE(review): both comparisons are strict, so a gene whose Z-score equals `cutoff` lands in neither
# list; a gene that is missing from hgnc2gwas raises KeyError.
core_but_strong_gwas = [hgnc for hgnc in allcore if hgnc2gwas[hgnc] > cutoff]
core_but_weak_gwas = [hgnc for hgnc in allcore if hgnc2gwas[hgnc] < cutoff]
peripheral_but_weak_gwas = [hgnc for hgnc in noncore if hgnc2gwas[hgnc] < cutoff]
peripheral_but_strong_gwas = [hgnc for hgnc in noncore if hgnc2gwas[hgnc] > cutoff]

In [None]:
# Sanity check: the weak and strong bins together account for every core gene.
assert(len(core_but_weak_gwas) + len(core_but_strong_gwas) == len(allcore))
len(core_but_strong_gwas)

In [None]:
# Total number of core genes, for comparison with the bin size shown above.
len(allcore)

In [None]:

# TSS counts per group; core genes are split by the GWAS Z-score `cutoff` set in an earlier cell.
core_but_weak_gwas_counts = [counter[gene] for gene in core_but_weak_gwas]
core_but_strong_gwas_counts = [counter[gene] for gene in core_but_strong_gwas]
other_core_counts = [counter[gene] for gene in other_coregenes]
peripheral_counts = [counter[gene] for gene in noncore]
hsp_counts = [counter[gene] for gene in hsps]

# Pairwise Mann-Whitney U tests between the groups.
print(mannwhitneyu(core_but_weak_gwas_counts, core_but_strong_gwas_counts))
print(mannwhitneyu(core_but_weak_gwas_counts, peripheral_counts))
print(mannwhitneyu(core_but_strong_gwas_counts, peripheral_counts))
print(mannwhitneyu(core_but_weak_gwas_counts, hsp_counts))
print(mannwhitneyu(peripheral_counts, hsp_counts))

# The labels are formatted from `cutoff` so they state the threshold that was actually applied
# (they previously hard-coded "2" while the split used cutoff = 5).
df = pd.DataFrame(data= {"TSS Counts": core_but_strong_gwas_counts + core_but_weak_gwas_counts + other_core_counts + hsp_counts + peripheral_counts,
                         "Group": ["Core Genes\nZ-Score > {}\nn={}".format(cutoff, len(core_but_strong_gwas_counts))] * len(core_but_strong_gwas_counts)
                                + ["Core Genes\nZ-Score < {}\nn={}".format(cutoff, len(core_but_weak_gwas_counts))] * len(core_but_weak_gwas_counts)
                                + ["Ambivalent\nn={}".format(len(other_core_counts))] * len(other_core_counts)
                                + ["HSP\nn={}".format(len(hsp_counts))] * len(hsp_counts)
                                + ["Peripheral\nn={}".format(len(peripheral_counts))] * len(peripheral_counts)})

boxplot(data=df, x="Group", y="TSS Counts")

In [None]:
# GWAS Z-score threshold separating "strong" from "weak" genes.
cutoff = 5

# Gene lists per group and Z-score bin (strict comparisons: a Z-score equal to `cutoff` is in neither bin).
core_but_strong_gwas = [hgnc for hgnc in allcore if hgnc2gwas[hgnc] > cutoff]
core_but_weak_gwas = [hgnc for hgnc in allcore if hgnc2gwas[hgnc] < cutoff]
peripheral_but_weak_gwas = [hgnc for hgnc in noncore if hgnc2gwas[hgnc] < cutoff]
peripheral_but_strong_gwas = [hgnc for hgnc in noncore if hgnc2gwas[hgnc] > cutoff]

# TSS counts per group and bin.
ambi_but_weak_gwas_counts = [counter[gene] for gene in other_coregenes if hgnc2gwas[gene]  < cutoff]
ambi_but_strong_gwas_counts = [counter[gene] for gene in other_coregenes if hgnc2gwas[gene] > cutoff]

core_but_weak_gwas_counts = [counter[gene] for gene in core_but_weak_gwas]
core_but_strong_gwas_counts = [counter[gene] for gene in core_but_strong_gwas]
peripheral_weak_gwas_counts = [counter[gene] for gene in peripheral_but_weak_gwas]
peripheral_strong_gwas_counts = [counter[gene] for gene in peripheral_but_strong_gwas]

# Within-group tests: weak vs. strong Z-score.
print(mannwhitneyu(core_but_weak_gwas_counts, core_but_strong_gwas_counts))
print(mannwhitneyu(ambi_but_weak_gwas_counts, ambi_but_strong_gwas_counts))
print(mannwhitneyu(peripheral_weak_gwas_counts, peripheral_strong_gwas_counts))

# Between-group tests on the pooled (weak + strong) counts.
print(mannwhitneyu(core_but_weak_gwas_counts + core_but_strong_gwas_counts, peripheral_strong_gwas_counts + peripheral_weak_gwas_counts))
print(mannwhitneyu(core_but_weak_gwas_counts + core_but_strong_gwas_counts, ambi_but_weak_gwas_counts + ambi_but_strong_gwas_counts))
print(mannwhitneyu(peripheral_weak_gwas_counts + peripheral_strong_gwas_counts, ambi_but_weak_gwas_counts + ambi_but_strong_gwas_counts))


# Both data-frame columns are derived from the same (label, counts) pairs, so a value can never be
# paired with another group's label. Previously the values listed peripheral weak before strong
# while the labels listed strong before weak, which mislabelled the peripheral rows.
labelled_counts = [
    ("Core Genes\nZ-Score > {}\nn={}".format(cutoff, len(core_but_strong_gwas_counts)), core_but_strong_gwas_counts),
    ("Core Genes\nZ-Score < {}\nn={}".format(cutoff, len(core_but_weak_gwas_counts)), core_but_weak_gwas_counts),
    ("Ambiv. Genes\nZ-Score > {}\nn={}".format(cutoff, len(ambi_but_strong_gwas_counts)), ambi_but_strong_gwas_counts),
    ("Ambiv. Genes\nZ-Score < {}\nn={}".format(cutoff, len(ambi_but_weak_gwas_counts)), ambi_but_weak_gwas_counts),
    ("Peripheral\nZ-Score > {}\nn={}".format(cutoff, len(peripheral_strong_gwas_counts)), peripheral_strong_gwas_counts),
    ("Peripheral\nZ-Score < {}\nn={}".format(cutoff, len(peripheral_weak_gwas_counts)), peripheral_weak_gwas_counts),
]

df = pd.DataFrame(data= {"TSS Counts": [count for _, counts in labelled_counts for count in counts],
                         "Group": [label for label, counts in labelled_counts for _ in counts]})

boxplot(data=df, x="Group", y="TSS Counts", showfliers=False)

In [None]:
import seaborn as sns
import numpy as np
from statsmodels.stats.multitest import fdrcorrection
from speos.visualization.settings import *
import matplotlib.pyplot as plt

def pval_to_string(pval):
    """Translate a p-value into a significance marker for plot annotation.

    Args:
        pval: The (possibly multiple-testing corrected) p-value.

    Returns:
        "***" if pval < 0.001, "**" if pval < 0.01, "*" if pval < 0.05,
        otherwise "n.s.".
    """
    # Each tier gets its own marker; previously all three tiers returned "***",
    # making p = 0.04 indistinguishable from p = 0.0001.
    if pval < 1e-3:
        return "***"
    elif pval < 1e-2:
        return "**"
    elif pval < 5e-2:
        return "*"
    else:
        return "n.s."

fig, ax = plt.subplots(figsize=(full_width*0.33*cm, 6*cm))


# Slim box plots at x = 0..5, in the order
# peripheral weak/strong, ambivalent weak/strong, core weak/strong (built from the unfiltered lists).
bp = ax.boxplot(x=(peripheral_weak_gwas_counts, peripheral_strong_gwas_counts, ambi_but_weak_gwas_counts, ambi_but_strong_gwas_counts, core_but_weak_gwas_counts, core_but_strong_gwas_counts),
              positions=range(6), widths=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1], showfliers=False, zorder=5, patch_artist=True)

for feature, color in zip(['boxes', "medians", "whiskers", "caps"], ["darkgray", "black", "darkgray", "darkgray"]):
    plt.setp(bp[feature], color=color)

# (label, counts) pairs in plotting order. The data frame, the violin order and the palette are all
# derived from this single list so values and labels cannot get out of step. Previously the values
# listed peripheral weak before strong while the labels listed strong before weak, which mislabelled
# the peripheral rows feeding the violins and `maxs`.
labelled_counts = [
    ("Peripheral\n< {}\n{}".format(cutoff, len(peripheral_weak_gwas_counts)), peripheral_weak_gwas_counts),
    ("Peripheral\n> {}\n{}".format(cutoff, len(peripheral_strong_gwas_counts)), peripheral_strong_gwas_counts),
    ("Ambiv. Genes\n< {}\n{}".format(cutoff, len(ambi_but_weak_gwas_counts)), ambi_but_weak_gwas_counts),
    ("Ambiv. Genes\n> {}\n{}".format(cutoff, len(ambi_but_strong_gwas_counts)), ambi_but_strong_gwas_counts),
    ("Core Genes\n< {}\n{}".format(cutoff, len(core_but_weak_gwas_counts)), core_but_weak_gwas_counts),
    ("Core Genes\n> {}\n{}".format(cutoff, len(core_but_strong_gwas_counts)), core_but_strong_gwas_counts),
]

groups = [label for label, _ in labelled_counts]

data = pd.DataFrame({"TSS Counts per Gene": [count for _, counts in labelled_counts for count in counts],
                     "Groups": [label for label, counts in labelled_counts for _ in counts]})

# Keep only rows strictly below the 99.5th percentile of all counts (affects the violins only).
data = data[data["TSS Counts per Gene"] < np.quantile(data["TSS Counts per Gene"], 0.995)]

# p-values 0-2: weak vs. strong within peripheral, ambivalent and core genes.
pvals = []
pvals.append(mannwhitneyu(peripheral_weak_gwas_counts, peripheral_strong_gwas_counts)[1])
pvals.append(mannwhitneyu(ambi_but_weak_gwas_counts, ambi_but_strong_gwas_counts)[1])
pvals.append(mannwhitneyu(core_but_weak_gwas_counts, core_but_strong_gwas_counts)[1])

# p-values 3-5: pooled peripheral vs. ambivalent, core vs. ambivalent, core vs. peripheral.
pvals.append(mannwhitneyu(peripheral_weak_gwas_counts + peripheral_strong_gwas_counts, ambi_but_weak_gwas_counts + ambi_but_strong_gwas_counts)[1])
pvals.append(mannwhitneyu(core_but_weak_gwas_counts + core_but_strong_gwas_counts, ambi_but_weak_gwas_counts + ambi_but_strong_gwas_counts)[1])
pvals.append(mannwhitneyu(core_but_weak_gwas_counts + core_but_strong_gwas_counts, peripheral_strong_gwas_counts + peripheral_weak_gwas_counts)[1])

# Replace the raw p-values by their FDR-corrected counterparts (second element of the result).
pvals = fdrcorrection(pvals)[1]

# One colour per gene class, aligned with `groups`.
group_colors = ["#5a5a5a", "#5a5a5a", "lightblue", "lightblue", "#01016f", "#01016f"]

ax = sns.violinplot(data=data,
                    x="Groups",
                    y="TSS Counts per Gene",
                    order = groups,
                    palette=dict(zip(groups, group_colors)),
                    inner=None,
                    cut=0,
                    linewidth=0.5,
                    ax=ax)

# Highest (post-filter) count per group; the bracket stems start just above it.
maxs = data.groupby("Groups").max()

for i, group in enumerate(groups):
    ax.vlines(i, maxs.loc[group].item() + 3, 35, color="black", linewidth=0.5)

# First bracket tier (y = 35): weak vs. strong inside each gene class.
ax.hlines((35, 35, 35), (0, 2, 4), (1, 3, 5), color="black", linewidth=0.5)

for i, pval in enumerate(pvals[:3]):
    ax.text(x=(i * 2)+0.5, y=34, s=pval_to_string(pval), va="top", ha="center", fontsize=6)

# Second tier (y = 40): neighbouring classes; third tier (y = 45): peripheral vs. core.
ax.vlines((0.6, 2.4, 2.6, 4.4), (35, 35, 35, 35), (40, 40, 40, 40), color="black", linewidth=0.5)
ax.vlines((0.4, 4.6), (35, 35), (45, 45, ), color="black", linewidth=0.5)

ax.hlines((40, 40, 45), (0.6, 2.6, 0.4), (2.4, 4.4, 4.6), color="black", linewidth=0.5)

for i, pval in enumerate(pvals[3:5]):
    ax.text(x=(i * 2)+1.5, y=40.5, s=pval_to_string(pval), va="bottom", ha="center", fontsize=6)

ax.text(x=2.5, y=45.5, s=pval_to_string(pvals[-1]), va="bottom", ha="center", fontsize=6)
ax.set_ylim(-2,50)
ax.set_ylabel("TSS Counts per Gene", fontsize=8)
ax.set_xlabel("", fontsize=8)
plt.savefig("tss_counts.svg", bbox_inches="tight")



In [None]:
# FDR-corrected p-values behind the significance markers in the figure.
pvals

In [None]:
# Differentially expressed genes, taken from https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9201719/
# The file is read without a header and transposed, so the first line of the file
# (split on spaces) becomes the gene set.

degs = set(pd.read_csv("deg_uc.tsv", sep=" ", header=None).transpose()[0].tolist())

In [None]:
from scipy.stats import fisher_exact

# Gene universe for the contingency tables: every gene assigned to any group.
all_genes = set(allcore) | set(other_coregenes) | set(noncore)

# One Fisher exact test per group: DEG membership inside the group vs. in the rest of the universe.
for group in (
    set(mendelians),
    set(allcore).difference(mendelians),
    set(other_coregenes),
    set(noncore),
):
    rest = all_genes - group
    array = [
        [len(group & degs), len(rest & degs)],
        [len(group - degs), len(rest - degs)],
    ]
    print(fisher_exact(array))

In [None]:
import scipy.stats as stats


# Reload the candidate -> score mapping (first element of the results JSON).
# NOTE(review): the trait is hard-coded as "uc" here rather than taken from `trait`.
with open("/mnt/storage/speos/results/{}_film_nohetioouter_results.json".format("uc"), "r") as file:
        candidate2cs = json.load(file)[0]

# Per-cutoff results, appended in the order cutoff = 11, 10, ..., 2.
# NOTE: this cell rebinds the notebook-level names `pvals`, `cutoff`, `coregenes` and
# `other_coregenes`; after it has run, `cutoff` is 2.
pvals = []
zvals = []
odds_coregenes = []


for cutoff in range(11, 1, -1):
    # Candidates scoring at least `cutoff` vs. all remaining candidates.
    coregenes = [key for key, value in candidate2cs.items() if value >= cutoff]
    other_coregenes = [key for key, value in candidate2cs.items() if value < cutoff]

    # 2x2 tables (rows: DEG / not DEG, columns: in group / rest of all_genes) for
    # non-Mendelian core genes (arrays[0]) and the remaining candidates (arrays[1]).
    arrays = []
    for group in [set(coregenes).difference(mendelians), set(other_coregenes)]:
        a = len(group.intersection(degs))
        b = len(all_genes.difference(group).intersection(degs))
        c = len(group.difference(degs))
        d = len(all_genes.difference(group).difference(degs))
        arrays.append(np.asarray([[a, b], [c, d]]))

    # First element of the fisher_exact result is the odds ratio.
    odds_typical = fisher_exact(arrays[0])[0]
    odds_atypical = fisher_exact(arrays[1])[0]

    odds_coregenes.append(odds_typical)

    # Difference of the two log odds ratios.
    log_odds_typical = np.log(odds_typical)
    log_odds_atypical = np.log(odds_atypical)
    delta = log_odds_typical - log_odds_atypical


    # Variance of each log odds ratio as the sum of reciprocal cell counts.
    # NOTE(review): a zero cell makes 1 / value infinite - confirm no table has an empty cell.
    var_typical = np.sum([1 / value for value in arrays[0].flatten()])
    var_atypical = np.sum([1 / value for value in arrays[1].flatten()])

    # z statistic of the difference and its two-sided p-value under a standard normal.
    # NOTE(review): adding the variances presumes the two estimates are independent - confirm.
    se_delta = np.sqrt(var_typical + var_atypical)
    zval = delta / se_delta
    zvals.append(zval)
    pval = stats.norm.sf(np.abs(zval)) * 2
    pvals.append(pval)

In [None]:
# Uncorrected p-values of the cutoff sweep, in the order cutoff = 11 ... 2.
pvals

In [None]:
# z statistics of the cutoff sweep, in the order cutoff = 11 ... 2.
zvals

In [None]:
# Odds ratios of the first group (non-Mendelian core genes) for each cutoff.
odds_coregenes

In [None]:
# FDR correction across the p-values currently held in `pvals`.
fdrcorrection(pvals)

In [None]:
# 2x2 tables (rows: DEG / not DEG, columns: in group / rest of all_genes) for
# non-Mendelian core genes (arrays[0]) and Mendelian genes (arrays[1]).
arrays = []
for group in [set(allcore).difference(mendelians), set(mendelians)]:
    a = len(group.intersection(degs))
    b = len(all_genes.difference(group).intersection(degs))
    c = len(group.difference(degs))
    d = len(all_genes.difference(group).difference(degs))
    arrays.append(np.asarray([[a, b], [c, d]]))

# First element of the fisher_exact result is the odds ratio.
odds_typical = fisher_exact(arrays[0])[0]
odds_atypical = fisher_exact(arrays[1])[0]

# NOTE(review): this appends to the list filled by the cutoff sweep, so the cell is not
# idempotent (each re-run adds another entry) - confirm this is intended.
odds_coregenes.append(odds_typical)

# Difference of the two log odds ratios.
log_odds_typical = np.log(odds_typical)
log_odds_atypical = np.log(odds_atypical)
delta = log_odds_typical - log_odds_atypical


# Variance of each log odds ratio as the sum of reciprocal cell counts.
var_typical = np.sum([1 / value for value in arrays[0].flatten()])
var_atypical = np.sum([1 / value for value in arrays[1].flatten()])

# z statistic of the difference and its two-sided p-value under a standard normal.
se_delta = np.sqrt(var_typical + var_atypical)
zval = delta / se_delta
# NOTE(review): also appended to the shared `zvals` list (see note on odds_coregenes above).
zvals.append(zval)
pval = stats.norm.sf(np.abs(zval)) * 2
pval

In [None]:
# Re-split the candidates at score 11 (rebinds the notebook-level `coregenes` / `other_coregenes`).
coregenes = [key for key, value in candidate2cs.items() if value >= 11]
other_coregenes = [key for key, value in candidate2cs.items() if value < 11]

# 2x2 tables (rows: DEG / not DEG, columns: in group / rest of all_genes) for
# the remaining candidates (arrays[0]) and Mendelian plus core genes (arrays[1]).
# NOTE: the "typical"/"atypical" variable names below simply refer to arrays[0] / arrays[1].
arrays = []
for group in [set(other_coregenes), set(mendelians).union(coregenes)]:
    a = len(group.intersection(degs))
    b = len(all_genes.difference(group).intersection(degs))
    c = len(group.difference(degs))
    d = len(all_genes.difference(group).difference(degs))
    arrays.append(np.asarray([[a, b], [c, d]]))

# First element of the fisher_exact result is the odds ratio.
odds_typical = fisher_exact(arrays[0])[0]
odds_atypical = fisher_exact(arrays[1])[0]

# NOTE(review): appends to the shared `odds_coregenes` list although arrays[0] is not a
# core-gene group here; the cell is also not idempotent - confirm this is intended.
odds_coregenes.append(odds_typical)

# Difference of the two log odds ratios.
log_odds_typical = np.log(odds_typical)
log_odds_atypical = np.log(odds_atypical)
delta = log_odds_typical - log_odds_atypical


# Variance of each log odds ratio as the sum of reciprocal cell counts.
var_typical = np.sum([1 / value for value in arrays[0].flatten()])
var_atypical = np.sum([1 / value for value in arrays[1].flatten()])

# z statistic of the difference and its two-sided p-value under a standard normal.
se_delta = np.sqrt(var_typical + var_atypical)
zval = delta / se_delta
# NOTE(review): also appended to the shared `zvals` list.
zvals.append(zval)
pval = stats.norm.sf(np.abs(zval)) * 2
pval