In [None]:
import pandas as pd

trait = "UC"

# Gene-level GWAS statistics for the trait; only the "GENE" identifier column
# and the "ZSTAT" column are read.
df = pd.read_csv("/mnt/storage/speos/data/gwas/{}.genes.out".format(trait), sep=" ", header=0,
                 usecols=["ZSTAT", "GENE"],
                 dtype = {'ZSTAT': float, 'GENE': int})

# Lookup from "entrez_id" to "symbol" built from the HGNC table.
translator = pd.read_csv("/mnt/storage/speos/data/hgnc_official_list.tsv", sep="\t", header=0,
                 usecols=["entrez_id", "symbol"], dtype = {'entrez_id': int, 'symbol': str}).set_index('entrez_id')["symbol"].to_dict()

# symbol -> largest ZSTAT observed for that symbol.
gene2weight = {}

# Columns are addressed by label instead of position: `usecols` does not fix
# the column order of the frame, and integer keys on a label-indexed row
# (`row[0]`) are deprecated in recent pandas releases.
for entrez_id, zstat in zip(df["GENE"], df["ZSTAT"]):
    entrez_id = int(entrez_id)
    if entrez_id not in translator:
        # No symbol known for this identifier: skip the gene.
        continue
    name = translator[entrez_id]
    # Several identifiers can translate to the same symbol; keep the maximum.
    if name not in gene2weight or gene2weight[name] < zstat:
        gene2weight[name] = zstat

In [None]:
# Number of entries in gene2weight.
print(len(gene2weight))

In [None]:
# Name of the result set whose table is loaded below.
disorder = "uc_film_nohetio"

pp_table_path = "/mnt/storage/speos/results/{}_pp_table.tsv".format(disorder)
table = pd.read_csv(pp_table_path, sep="\t", header=0)

# Keep only the rows flagged in the "Is Included" column.
is_included = table["Is Included"]
table = table[is_included]

In [None]:
# Preview the first rows of the filtered table.
table.head()

In [None]:
# Genes whose "CS" value equals 11, together with their GWAS Z-scores.
# Series.map yields NaN for symbols that are missing from gene2weight instead
# of raising KeyError; sort_values places those rows last.
cs11_genes = table.loc[table["CS"] == 11, "Unnamed: 0"]
hgnc_score_df = pd.DataFrame({"HGNC": cs11_genes,
                              "ZScore": cs11_genes.map(gene2weight)})

# The 20 rows with the lowest Z-scores.
hgnc_score_df.sort_values("ZScore").head(20)

In [None]:
# Genes flagged in the "Mendelian" column, together with their GWAS Z-scores.
# Series.map yields NaN for symbols that are missing from gene2weight instead
# of raising KeyError; sort_values places those rows last.
mendelian_genes = table.loc[table["Mendelian"], "Unnamed: 0"]
hgnc_score_df = pd.DataFrame({"HGNC": mendelian_genes,
                              "ZScore": mendelian_genes.map(gene2weight)})

# The 20 rows with the lowest Z-scores.
hgnc_score_df.sort_values("ZScore").head(20)

In [None]:
# Genes flagged in the "Mendelian" column, together with their GWAS Z-scores.
# Series.map yields NaN for symbols that are missing from gene2weight instead
# of raising KeyError; sort_values places those rows last.
mendelian_genes = table.loc[table["Mendelian"], "Unnamed: 0"]
hgnc_score_df = pd.DataFrame({"HGNC": mendelian_genes,
                              "ZScore": mendelian_genes.map(gene2weight)})

# The 20 rows whose Z-scores are smallest in absolute value.
hgnc_score_df.sort_values("ZScore", key=abs).head(20)

In [None]:
# Show the value stored in gene2weight for NFKB2.
gene2weight["NFKB2"]

In [None]:
# Restrict the GWAS scores to genes listed in the table. The symbols are put
# into a set once; previously the column was converted to a list again for
# every entry of gene2weight and scanned linearly.
table_genes = set(table["Unnamed: 0"])
gene2weight_subset = {key: value for key, value in gene2weight.items() if key in table_genes}

# Symbols in descending score order (ascending sort, then reversed).
sorted_dict = sorted(gene2weight_subset, key=gene2weight_subset.get)[::-1]
# Slices instead of fixed indices, so fewer than ten genes do not raise IndexError.
print(sorted_dict[:10])
print([gene2weight_subset[gene] for gene in sorted_dict[:10]])

# symbol -> integer "CS" value; genes flagged "Mendelian" are overwritten with 12.
gene2score = dict(zip(table["Unnamed: 0"], table["CS"].astype(int)))

for mendelian in table["Unnamed: 0"][table["Mendelian"]]:
    gene2score[mendelian] = 12

mendelians = set(table["Unnamed: 0"][table["Mendelian"]].tolist())

# candidates[k], k = 1..11: genes with CS > k - 1.
candidates = {}
for i in range(11):
    candidates[i + 1] = set(table["Unnamed: 0"][table["CS"] > i].tolist())

# noncandidates[k], k = 1..11: non-Mendelian genes with CS <= k - 1.
noncandidates = {}
for i in range(11):
    noncandidates[i + 1] = set(table["Unnamed: 0"][(table["CS"] <= i) & (~table["Mendelian"])].tolist())

In [None]:
# Scores of the Mendelian genes that have an entry in gene2weight.
mendelians2weight = [gene2weight[gene] for gene in mendelians if gene in gene2weight]

# Same lookup for every key of the candidate / non-candidate dictionaries.
candidates2weight = {
    threshold: [gene2weight[gene] for gene in genes if gene in gene2weight]
    for threshold, genes in candidates.items()
}
noncandidates2weight = {
    threshold: [gene2weight[gene] for gene in genes if gene in gene2weight]
    for threshold, genes in noncandidates.items()
}

In [None]:
# Mendelian scores followed by the scores stored under key 1 of candidates2weight.
coregenes2weight = [*mendelians2weight, *candidates2weight[1]]

# Mendelian scores followed by the scores stored under key 11 of candidates2weight.
highcoregenes2weight = [*mendelians2weight, *candidates2weight[11]]

In [None]:
# Mendelian genes that have a GWAS score, as symbol -> score.
mendelians2weight_dict = {symbol: gene2weight[symbol] for symbol in mendelians if symbol in gene2weight}
# Symbols in descending score order (ascending sort, then reversed).
sorted_dict = sorted(mendelians2weight_dict, key=mendelians2weight_dict.get)[::-1]
# Slices instead of fixed indices, so fewer than six genes do not raise IndexError.
print(sorted_dict[:6])
print([gene2weight[symbol] for symbol in sorted_dict[:6]])

In [None]:
# Genes in noncandidates[1] that have a GWAS score, as symbol -> score.
noncandidates2weight_dict = {symbol: gene2weight[symbol] for symbol in noncandidates[1] if symbol in gene2weight}
# Symbols in descending score order (ascending sort, then reversed).
sorted_dict = sorted(noncandidates2weight_dict, key=noncandidates2weight_dict.get)[::-1]
# Slices instead of fixed indices, so fewer than six genes do not raise IndexError.
print(sorted_dict[:6])
print([gene2weight[symbol] for symbol in sorted_dict[:6]])

In [None]:
# Genes in candidates[1] that have a GWAS score, as symbol -> score.
candidates2weight_dict = {symbol: gene2weight[symbol] for symbol in candidates[1] if symbol in gene2weight}
# Symbols in descending score order (ascending sort, then reversed).
sorted_dict = sorted(candidates2weight_dict, key=candidates2weight_dict.get)[::-1]
# Slices instead of fixed indices, so fewer than ten genes do not raise IndexError.
print(sorted_dict[:10])
print([gene2weight[symbol] for symbol in sorted_dict[:10]])

In [None]:
# Union of candidates[1] and the Mendelian genes, restricted to genes that
# have a GWAS score, as symbol -> score.
coregenes2weight_dict = {symbol: gene2weight[symbol] for symbol in list(candidates[1]) + list(mendelians) if symbol in gene2weight}
# Symbols in descending score order (ascending sort, then reversed).
sorted_dict = sorted(coregenes2weight_dict, key=coregenes2weight_dict.get)[::-1]
# Slices instead of fixed indices, so fewer than ten genes do not raise IndexError.
print(sorted_dict[:10])
print([gene2weight[symbol] for symbol in sorted_dict[:10]])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches
from adjustText import adjust_text
import matplotlib
from scipy.stats import ttest_ind

def pval_sign(pval):
    """Return the significance marker for a p-value.

    Returns "(n.s.)" for pval > 0.05, "*" for 0.01 < pval <= 0.05,
    "**" for 0.001 < pval <= 0.01 and "***" otherwise.
    """
    if pval > 0.05:
        return "(n.s.)"
    elif pval > 0.01:
        return "*"
    # Was a second `pval > 0.01` test, which made the "**" branch unreachable.
    elif pval > 0.001:
        return "**"
    else:
        return "***"
    
    

# One viridis colour per integer score 0..12, used as the swarm plot palette.
norm = matplotlib.colors.Normalize(vmin=0.0, vmax=12.0)
cmap = matplotlib.colormaps['viridis']
pallette = {int(value): cmap(norm(value)) for value in range(13)}

fig, axes  = plt.subplots(1,2, figsize=(10,5))

# The name lists apply the same `in gene2weight` filter, over the same sets in
# the same order, that produced the weight lists. Without it a gene lacking a
# GWAS score makes names and weights differ in length and the boolean masks
# below fail.
nonc = noncandidates2weight[1]
nonc_names = [symbol for symbol in noncandidates[1] if symbol in gene2weight]

cand = coregenes2weight
cand_names = [symbol for symbol in list(mendelians) + list(candidates[1]) if symbol in gene2weight]

# Left panel: full score distributions of both groups.
axes[0] = sns.violinplot(y=nonc + cand, 
                         x=(["Peripheral Genes"] * len(nonc)) + (["Core Genes"] * len(cand)),
                         ax=axes[0],
                         cut=0)

# Dashed frame around the score range above 5, which the right panel shows.
x = -0.3 ; y = 5 ; width = 1.6; height = 8; 
rect = mpatches.Rectangle((x, y), width, height, linewidth=1,linestyle="--", edgecolor='grey', facecolor='#AAAAAA00', zorder=-5)
patch = axes[0].add_patch(rect)
axes[0].set_ylim(top=13.5)

nonc_vs_cand = ttest_ind(nonc, cand)

# Bracket with the significance marker of the t-test.
axes[0].axhline(y=8, xmin=0.28, xmax=0.72, color="black", linewidth=1)
axes[0].text(0.45, 8.2, pval_sign(nonc_vs_cand[1]))

# Right panel shows only genes with a score above 5.
nonc_mask = np.asarray(nonc) > 5
nonc_names = np.asarray(nonc_names)[nonc_mask].tolist()
nonc = np.asarray(nonc)[nonc_mask].tolist()

cand_mask = np.asarray(cand) > 5
cand_names = np.asarray(cand_names)[cand_mask].tolist()
cand = np.asarray(cand)[cand_mask].tolist()

axes[1] = sns.swarmplot(y=nonc + cand, 
                         x=(["Peripheral Genes"] * len(nonc)) + (["Core Genes"] * len(cand)),
                         ax=axes[1],
                         palette=pallette,
                         hue=[gene2score[gene] for gene in nonc_names + cand_names])
                       
axes[1].set_ylim(top=13)

# Label the highest-scoring peripheral genes.
coregenes2weight_dict = {symbol: gene2weight[symbol] for symbol in noncandidates[1] if symbol in gene2weight.keys()}
sorted_dict = sorted(coregenes2weight_dict, key=coregenes2weight_dict.get)[::-1]

texts = []

# Point positions of the first swarm, ordered by descending y.
x, y = np.asarray(axes[1].get_children()[0].get_offsets()).T
y, x = np.asarray(sorted(zip(y,x))[::-1]).T
# Bounded by the available points and names so short groups do not raise IndexError.
for i in range(min(8, len(x), len(sorted_dict))):
    texts.append(axes[1].text(x[i], gene2weight[sorted_dict[i]], sorted_dict[i], size=6))


adjust_text(texts, force_points=1, ax=axes[1], arrowprops=dict(arrowstyle='-', 
color='black', lw=0.5))


# Label the highest-scoring genes among candidates[11] and the Mendelian genes.
# NOTE(review): x positions come from the i-th highest point of the whole
# second swarm, while names and y values come from this narrower ranking;
# confirm that pairing is intended.
coregenes2weight_dict = {symbol: gene2weight[symbol] for symbol in list(candidates[11]) + list(mendelians) if symbol in gene2weight.keys()}
sorted_dict = sorted(coregenes2weight_dict, key=coregenes2weight_dict.get)[::-1]

texts = []

x, y = np.asarray(axes[1].get_children()[1].get_offsets()).T
y, x = np.asarray(sorted(zip(y,x))[::-1]).T
for i in range(min(30, len(x), len(sorted_dict))):
    gene = sorted_dict[i]
    texts.append(axes[1].text(x[i], gene2weight[gene], gene, size=6))

axes[1].legend(loc='upper left', title="Speos CS")
adjust_text(texts, force_points=3, ax=axes[1], arrowprops=dict(arrowstyle='-', 
color='black', lw=0.5))

for ax in axes:
    ax.set_ylabel("UC GWAS Z-Score")

axes[1].yaxis.tick_right()
axes[1].yaxis.set_label_position("right")
plt.tight_layout()
#plt.savefig("UC_corevsnpnoncore.pdf", bbox_inches="tight")

In [None]:
# Genes in candidates[11] plus the Mendelian genes, as symbol -> score.
highcoregenes2weight_dict = {symbol: gene2weight[symbol] for symbol in list(candidates[11]) + list(mendelians) if symbol in gene2weight}
# Symbols in descending score order (ascending sort, then reversed).
highcore_sorted_dict = sorted(highcoregenes2weight_dict, key=highcoregenes2weight_dict.get)[::-1]
print(highcore_sorted_dict[:10])
# Previously indexed `sorted_dict`, a leftover from another cell whose content
# depends on which cell ran last; the scores now match the names printed above.
print([gene2weight[symbol] for symbol in highcore_sorted_dict[:10]])

# Genes in candidates[1] but not in candidates[11], as symbol -> score.
lowcoregenes2weight_dict = {symbol: gene2weight[symbol] for symbol in list(candidates[1].difference(candidates[11])) if symbol in gene2weight}
lowcore_sorted_dict = sorted(lowcoregenes2weight_dict, key=lowcoregenes2weight_dict.get)[::-1]
# Slices instead of fixed indices, so fewer than ten genes do not raise IndexError.
print(lowcore_sorted_dict[:10])
print([gene2weight[symbol] for symbol in lowcore_sorted_dict[:10]])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches
from adjustText import adjust_text
import matplotlib
from scipy.stats import ttest_ind, mannwhitneyu
from speos.visualization.settings import *
import matplotlib.pyplot as plt

def pval_sign(pval):
    """Return the significance marker for a p-value.

    Returns "(n.s.)" for pval > 0.05, "*" for 0.01 < pval <= 0.05,
    "**" for 0.001 < pval <= 0.01 and "***" otherwise.
    """
    if pval > 0.05:
        return "(n.s.)"
    elif pval > 0.01:
        return "*"
    # Was a second `pval > 0.01` test, which made the "**" branch unreachable.
    elif pval > 0.001:
        return "**"
    else:
        return "***"



# One viridis colour per integer score 0..12, used as the swarm plot palette.
norm = matplotlib.colors.Normalize(vmin=0.0, vmax=12.0)
cmap = matplotlib.colormaps['viridis']
pallette = {int(value): cmap(norm(value)) for value in range(13)}

fig, axes  = plt.subplots(1,2, figsize=(full_width*cm*0.66,6*cm))

# The name list applies the same `in gene2weight` filter, over the same set,
# that produced the weight list. Without it a gene lacking a GWAS score makes
# names and weights differ in length and the boolean mask below fails.
nonc = noncandidates2weight[1]
nonc_names = [symbol for symbol in noncandidates[1] if symbol in gene2weight]

# NOTE(review): the prefix filter is applied to the two core groups only, not
# to the peripheral genes — confirm that is intended.
banstring = "XXX" # replace with "HLA" to filter out HLA region genes

lowc  = [value for key, value in lowcoregenes2weight_dict.items() if not key.startswith(banstring)]
lowc_names =  [key for key, value in lowcoregenes2weight_dict.items() if not key.startswith(banstring)]

cand = [value for key, value in highcoregenes2weight_dict.items() if not key.startswith(banstring)]
cand_names =  [key for key, value in highcoregenes2weight_dict.items() if not key.startswith(banstring)]

# Label rankings get the same prefix filter as the plotted data, so labels
# cannot refer to genes that were removed from the plot.
lowc_labels = [gene for gene in lowcore_sorted_dict if not gene.startswith(banstring)]
cand_labels = [gene for gene in highcore_sorted_dict if not gene.startswith(banstring)]

# Left panel: narrow box plots drawn together with the violins of the three groups.
bp = axes[0].boxplot(x=(nonc, lowc, cand), 
              positions=[0,1,2], widths=[0.1, 0.1, 0.1], showfliers=False, zorder=5, patch_artist=True)

for feature, color in zip(['boxes', "medians", "whiskers", "caps"], ["darkgray", "black", "darkgray", "darkgray"]):
    plt.setp(bp[feature], color=color)

axes[0] = sns.violinplot(y=nonc + lowc + cand, 
                         x=(["Peripheral\nGenes"] * len(nonc)) + (["Intermediate\nGenes"] * len(lowc)) + (["High Core\nGenes"] * len(cand)),
                         ax=axes[0],
                         palette={"Peripheral\nGenes": "#5a5a5a", "Intermediate\nGenes": "lightblue", "High Core\nGenes": "#01016f"},
                         cut=0,
                         inner=None,
                         linewidth=0.5)

# Dashed frame around the score range above 5, which the right panel shows.
x = -0.3 ; y = 5 ; width = 2.6; height = 8; 
rect = mpatches.Rectangle((x, y), width, height, linewidth=1,linestyle="--", edgecolor='grey', facecolor='#AAAAAA00', zorder=-5)
patch = axes[0].add_patch(rect)
axes[0].set_ylim(top=13.5)

# Pairwise t-tests between the three groups.
nonc_vs_lowc = ttest_ind(nonc, lowc)
nonc_vs_cand = ttest_ind(nonc, cand)
lowc_vs_cand = ttest_ind(lowc, cand)

# Brackets with the significance marker of each comparison.
axes[0].axhline(y=7.35, xmin=0.18, xmax=0.48, color="black", linewidth=1)
axes[0].text(0.5, 7.45, pval_sign(nonc_vs_lowc[1]), va="center", ha="center")

axes[0].axhline(y=7.35, xmin=0.52, xmax=0.82, color="black", linewidth=1)
axes[0].text(1.5, 7.45, pval_sign(lowc_vs_cand[1]), va="center", ha="center")

axes[0].axhline(y=8.35, xmin=0.18, xmax=0.82, color="black", linewidth=1)
axes[0].text(1, 8.45, pval_sign(nonc_vs_cand[1]), va="center", ha="center")


# Right panel shows only genes with a score above 5.
nonc_mask = np.asarray(nonc) > 5
nonc_names = np.asarray(nonc_names)[nonc_mask].tolist()
nonc = np.asarray(nonc)[nonc_mask].tolist()

lowc_mask = np.asarray(lowc) > 5
lowc_names = np.asarray(lowc_names)[lowc_mask].tolist()
lowc = np.asarray(lowc)[lowc_mask].tolist()

cand_mask = np.asarray(cand) > 5
cand_names = np.asarray(cand_names)[cand_mask].tolist()
cand = np.asarray(cand)[cand_mask].tolist()

axes[1] = sns.swarmplot(y=nonc + lowc + cand, 
                         x=(["Peripheral\nGenes"] * len(nonc)) + (["Intermediate\nGenes"] * len(lowc)) + (["Core\nGenes"] * len(cand)),
                         ax=axes[1],
                         palette=pallette,
                         hue=[gene2score[gene] for gene in nonc_names + lowc_names + cand_names],
                         size=3)
                       
axes[1].set_ylim(top=13)

# Label the highest-scoring peripheral genes.
coregenes2weight_dict = {symbol: gene2weight[symbol] for symbol in noncandidates[1] if symbol in gene2weight.keys()}
sorted_dict = sorted(coregenes2weight_dict, key=coregenes2weight_dict.get)[::-1]

texts = []

# Point positions of the first swarm, ordered by descending y.
x, y = np.asarray(axes[1].get_children()[0].get_offsets()).T
y, x = np.asarray(sorted(zip(y,x))[::-1]).T
# Bounded by the available points and names so short groups do not raise IndexError.
for i in range(min(8, len(x), len(sorted_dict))):
    texts.append(axes[1].text(x[i], gene2weight[sorted_dict[i]], sorted_dict[i], size=4))


adjust_text(texts, force_points=1, ax=axes[1], arrowprops=dict(arrowstyle='-', 
color='black', lw=0.5))

# Label the highest-scoring genes of the second swarm.
texts = []

x, y = np.asarray(axes[1].get_children()[1].get_offsets()).T
y, x = np.asarray(sorted(zip(y,x))[::-1]).T
for i in range(min(8, len(x), len(lowc_labels))):
    texts.append(axes[1].text(x[i], gene2weight[lowc_labels[i]], lowc_labels[i], size=4))


adjust_text(texts, force_points=1, ax=axes[1], arrowprops=dict(arrowstyle='-', 
color='black', lw=0.5))



# Label up to 30 of the highest-scoring genes of the third swarm.
texts = []

x, y = np.asarray(axes[1].get_children()[2].get_offsets()).T
y, x = np.asarray(sorted(zip(y,x))[::-1]).T
# The explicit bound replaces the former try/except IndexError around the lookup.
for i in range(min(30, len(x), len(cand_labels))):
    texts.append(axes[1].text(x[i], gene2weight[cand_labels[i]], cand_labels[i], size=4))

axes[1].legend(loc='upper left', title="Speos CS", ncol=3, fontsize=5, markerscale=0.5, title_fontsize=7, columnspacing=0.8, labelspacing =0.3, handletextpad=-0.2)
adjust_text(texts, force_points=3, ax=axes[1], arrowprops=dict(arrowstyle='-', 
color='black', lw=0.5))

for ax in axes:
    ax.set_ylabel("UC GWAS Z-Score", fontsize=8)

axes[1].yaxis.tick_right()
axes[1].yaxis.set_label_position("right")
plt.tight_layout()
#plt.savefig("UC_highcorevsnoncore.pdf", bbox_inches="tight")

In [None]:
# Display the ttest_ind result for nonc versus cand.
nonc_vs_cand

In [None]:
# Display the ttest_ind result for nonc versus lowc.
nonc_vs_lowc

In [None]:
# Display the ttest_ind result for lowc versus cand.
lowc_vs_cand