In [None]:
import numpy as np
import pandas as pd
from textwrap import wrap
from matplotlib import pyplot as plt
from sklearn import metrics
import matplotlib.ticker as mticker
import sys, os
from hsbmpy import plot_topic_size, get_max_available_L
from hypergeom import parameters_for_hypergeometric, build_map, plot_map

In [None]:
# Dataset folder; after the chdir, relative paths used below (e.g. "mainTable.csv") resolve against it.
directory="/home/jovyan/work/phd/datasets/gtex/10"
os.chdir(directory)
# Extend the import path with the project folder (presumably where hsbmpy / hypergeom / geneontology live — TODO confirm).
# NOTE(review): the first cell already imports hsbmpy and hypergeom before this line runs; verify those
# imports succeed on a fresh kernel, otherwise this append has to happen before them.
sys.path.append('/home/jovyan/work/phd/')

In [None]:
# Upper bound of the per-level loops below (`range(0, L+1)`); the value comes from
# hsbmpy.get_max_available_L — presumably the deepest level with results in `directory`.
L = get_max_available_L(directory)

In [None]:
# Load the main table from the working directory, using its first column as the row index.
df = pd.read_csv("mainTable.csv", index_col=[0])

# Topic size

In [None]:
# One topic-size plot per level, from level 0 up to and including L.
for l in range(L + 1):
    plot_topic_size(directory, l)

## Topic O

In [None]:
# Re-read the main table: first column as row index, first row as column header.
df = pd.read_csv("mainTable.csv", index_col=0,header=0)

In [None]:
# Per-row summary of the main table:
#   average - mean over the columns
#   var     - variance over the columns
#   O       - fraction of columns holding a strictly positive value
# For O the positive entries themselves must be counted: `(df > 0)` is a boolean
# table and its row-wise mean is exactly (#positive entries) / (#columns).
# Wrapping the selection in a list before calling len() would always give 1.
df_mv = pd.DataFrame({
    'average': df.mean(axis=1),
    'var': df.var(axis=1),
    'O': (df > 0).mean(axis=1),
})
df_mv.head()

In [None]:
# For every level draw a two-panel figure: candles on the left, and on the right a
# size-weighted histogram of the candle midpoints ((open + close) / 2).
# NOTE(review): get_candles and candlestick2_ohlc are not imported in any visible
# cell — confirm where they come from before running this on a fresh kernel.
for l in range(0,L+1):
    fig = plt.figure(figsize=(15,8))
    ax = fig.subplots(1,2)
    candles = get_candles(directory,l,df_mv,ax[0])
    candlestick2_ohlc(ax[0], candles['open'],candles['high'],candles['low'],candles['close'],width=0.6,colordown='b')
    midpoints = (np.array(candles['open'])+np.array(candles['close']))/2
    ax[1].hist(midpoints, weights=candles['size'], range=(-0.05,1.05), bins=10, histtype='step')
    # Both dollar signs are required for matplotlib to typeset the label as math.
    ax[1].set_xlabel("$O_i$", fontsize=18)
    # Save before showing so the written file never depends on what the backend
    # does to the figure in show(); then release it so figures do not pile up.
    fig.savefig("%s/topic_Ocandles_level_%d.pdf"%(directory,l))
    plt.show()
    plt.close(fig)

# Gene Ontology

In [None]:
from geneontology import get_ontology_df, ensg_to_symbol
from tableanalyser import get_symbol
import gseapy as gs

In [None]:
# Development helper: reload the two project modules so that edits made to them
# are picked up without restarting the kernel, then re-import the names so they
# point at the reloaded definitions.
import importlib, geneontology,tableanalyser
importlib.reload(geneontology)
importlib.reload(tableanalyser)
from geneontology import get_ontology_df, ensg_to_symbol
from tableanalyser import get_symbol

In [None]:
# Level (one below the maximum) and algorithm whose topics are analysed below.
l = L - 1
algorithm = "topsbm"

# Both input tables share the same "<directory>/<algorithm>/<algorithm>_level_<l>" prefix.
level_prefix = f"{directory}/{algorithm}/{algorithm}_level_{l}"
df_topics = pd.read_csv(f"{level_prefix}_topics.csv")
df_topics_smooth = pd.read_csv(f"{level_prefix}_word-dist.csv", index_col=0)

# Keep only the first 15 characters of every row label
# (presumably drops a version suffix from the gene identifiers — TODO confirm).
df_topics_smooth.index = [label[:15] for label in df_topics_smooth.index]

print(f"level {l} with {df_topics.shape[1]} topics")

In [None]:
def get_topic_over_thr(topic_name, q=0.05):
    """Return the entries of one topic column that lie above a quantile cut.

    Reads the column `topic_name` of the global `df_topics_smooth`, discards
    the non-positive entries, and keeps those strictly greater than the
    `q`-quantile of the remaining values.

    Parameters
    ----------
    topic_name : column label in `df_topics_smooth`.
    q : quantile (0..1) computed on the positive entries only.

    Returns
    -------
    pandas.Series sorted from the largest to the smallest value.
    """
    weights = df_topics_smooth[topic_name]
    positive = weights.loc[weights > 0]
    quantile_value = positive.quantile(q=q)
    return positive.loc[positive > quantile_value].sort_values(ascending=False)

In [None]:
# Download a tab-separated custom export from genenames.org (needs network access);
# the requested columns are the `col=` fields of the query string and the first one is used as index.
# NOTE(review): df_symbols is not referenced by any other visible cell.
df_symbols= pd.read_csv("https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_pub_ensembl_id&col=md_ensembl_id&col=md_eg_id&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit", index_col=[0], sep='\t')

In [None]:
def get_sea():
    """Yield `get_symbol(g)` for every non-missing entry of `df_topics`.

    The topic table is flattened row by row; entries whose string form is
    'nan' (the padding of shorter topic columns) are skipped.
    """
    flat_entries = df_topics.values.ravel()
    present = [str(entry) != 'nan' for entry in flat_entries]
    for gene in flat_entries[present]:
        yield get_symbol(gene)

In [None]:
# Background list as symbols: one line per value yielded by get_sea(),
# skipping values of length 0 or 1.
with open("gback.txt", 'w') as symbols_file:
    for symbol in get_sea():
        if len(symbol) > 1:
            symbols_file.writelines(symbol + '\n')

# Background list as identifiers: every non-missing entry of df_topics,
# cut to its first 15 characters, with the same length filter on the full entry.
with open("gback_ensg.txt", 'w') as ensg_file:
    flat_entries = df_topics.values.ravel()
    for gene_id in flat_entries[[str(s) != 'nan' for s in flat_entries]]:
        if len(gene_id) > 1:
            ensg_file.writelines(gene_id[:15] + '\n')

In [None]:
# Show what gseapy's get_library_name() returns (the value is the cell output);
# presumably the library names accepted in `gene_sets` below — TODO confirm.
gs.get_library_name()

In [None]:
#https://www.gsea-msigdb.org/gsea/downloads.jsp
gene_sets = ['GO_Molecular_Function_2018',
             'GO_Biological_Process_2018',
             'GO_Cellular_Component_2018',
             'Human_Phenotype_Ontology',
             'WikiPathways_2019_Human',
             '/home/jovyan/work/phd/MSigDB/c1.all.v7.1.symbols.gmt',
             '/home/jovyan/work/phd/MSigDB/c2.all.v7.1.symbols.gmt',
             '/home/jovyan/work/phd/MSigDB/c3.all.v7.1.symbols.gmt',
             '/home/jovyan/work/phd/MSigDB/c4.all.v7.1.symbols.gmt',
             '/home/jovyan/work/phd/MSigDB/c5.all.v7.1.symbols.gmt',
             '/home/jovyan/work/phd/MSigDB/c6.all.v7.1.symbols.gmt',
             '/home/jovyan/work/phd/MSigDB/c7.all.v7.1.symbols.gmt',
            ]

In [None]:
# Settings forwarded unchanged to get_ontology_df.
threshhold = 5e-1
cutoff = 5e-1
# Background size = number of symbols yielded by get_sea().
background = sum(1 for _ in get_sea())
# Create gsea/<algorithm>/ (parents included) without spawning a shell.
os.makedirs(os.path.join("gsea", algorithm), exist_ok=True)

# For every topic: reuse the cached enrichment table when it can be read,
# otherwise compute it, keep the 20 rows with the smallest adjusted p-value
# and cache them. Topics whose computation fails are reported and skipped.
for itopic,topic in enumerate(df_topics.columns):
    result_path = "gsea/%s/gsea_level_%d_topic_%d.csv"%(algorithm,l,itopic+1)
    try:
        enriched_topic = pd.read_csv(result_path, index_col=[0])
        print(topic)
    except Exception:
        # No usable cache. `Exception` (not a bare except) keeps the best-effort
        # behaviour while letting KeyboardInterrupt / SystemExit stop the cell.
        try:
            gene_list = ensg_to_symbol(df_topics.loc[:,topic].dropna().values)
            print(topic)
            enriched_topic = get_ontology_df(gene_list, cutoff=cutoff, threshhold = threshhold, gene_sets = gene_sets, background=background)
            enriched_topic = enriched_topic.sort_values(by=['Adjusted P-value'], ascending=True)[:20]
            enriched_topic.to_csv(result_path)
        except Exception:
            print(*sys.exc_info())
            continue
    print(enriched_topic)

In [None]:
# Gather, from the cached per-topic enrichment tables:
#   topic_pvalues - -log10 of the smallest adjusted p-value of each non-empty table
#   topic_gos     - the distinct 'Gene_set' values among the first 10 rows of each table
topic_pvalues = []
topic_gos = []
for itopic,topic in enumerate(df_topics.columns):
    result_path = "gsea/%s/gsea_level_%d_topic_%d.csv"%(algorithm,l,itopic+1)
    try:
        enriched_topic = pd.read_csv(result_path)
        if len(enriched_topic.index) >0:
            p_val = enriched_topic['Adjusted P-value'].min()
            topic_pvalues.append(-np.log10(p_val))
            topic_gos.extend(enriched_topic['Gene_set'][:10].unique())
        print(topic)
    except Exception:
        # Report which topic failed; `Exception` rather than a bare except so the
        # cell can still be interrupted from the keyboard.
        print("error", topic, sys.exc_info()[0])

In [None]:
# Histogram of the best -log10(adjusted p-value) per topic, with the 0.05
# significance level marked by a dashed red line; saved as PDF in `directory`.
fig, ax_pvalues = plt.subplots(figsize=(18,15))
x = np.arange(1,1+len(topic_pvalues))
c, _, _ = ax_pvalues.hist(topic_pvalues, histtype='step', lw=20, bins=45, color="gray")
ax_pvalues.vlines(-np.log10(0.05), 0, np.max(c)*1.1, color="red", ls='--', lw=10, label="$\\alpha=0.05$")
ax_pvalues.set_xlabel('-log(P-value)', fontsize=35)
ax_pvalues.set_ylabel("number of topics", fontsize=35)
ax_pvalues.legend(fontsize=35)
ax_pvalues.tick_params(which="both", labelsize=35)
fig.savefig("%s/pvalues_acrosstopic_%s_(%d).pdf"%(directory,algorithm,l))

In [None]:
# Horizontal bar chart: how often each gene-set collection appears in topic_gos.
fig = plt.figure(figsize=(18,20))
gos, goscounts = np.unique(topic_gos, return_counts=True)
# Tick labels: underscores become spaces and the text is wrapped at 20 characters.
bar_labels = []
for category in gos:
    readable = str(category).replace('_',' ')
    bar_labels.append("\n".join(wrap(readable,20)))
plt.barh(bar_labels, goscounts)
plt.yticks(fontsize=15)
plt.tick_params(which="both",labelsize=35)
plt.tight_layout()
plt.show()
fig.savefig("%s/pvalue_categories_%s_(%d).pdf"%(directory,algorithm,l))

# Hypergeometric overlaps

In [None]:
# NOTE(review): this binds the name `hypergeom` to scipy.stats' object, but the
# `import importlib, hypergeom` statement in the following cell rebinds it to the
# module named hypergeom (the one build_map / plot_map are imported from in the first cell).
from scipy.stats import hypergeom
from sklearn.metrics import v_measure_score
import seaborn as sns
# Switch seaborn to its 'paper' plotting context for the figures below.
sns.set_context('paper')

In [None]:
# Development helper: reload the hypergeom module so edits to it are picked up
# without restarting the kernel, then pull its public names into the notebook namespace.
# NOTE(review): the star import hides which names come from hypergeom; `run` below uses
# parameters_for_hypergeometric, build_map and plot_map.
import importlib, hypergeom
importlib.reload(hypergeom)
from hypergeom import *

In [None]:
# Gene -> topic labels for topsbm (level 3).
hsbm_list_topics = pd.read_csv("topsbm/topsbm_level_3_topics.csv")
# All non-missing entries of the topic table, cut to their first 15 characters.
gene_list = [g[:15] for g in hsbm_list_topics.values.ravel().astype(str) if g != "nan"]
hsbm_list = pd.Series(index=gene_list, dtype=str)
# Label every gene with the name of the column (topic) it is listed under.
for topic in hsbm_list_topics.columns:
    topic_genes = [g[:15] for g in hsbm_list_topics[topic].dropna()]
    hsbm_list[topic_genes] = topic

In [None]:
# Gene -> topic labels for topsbm-log (level 3).
# Note: the table is loaded into the same `hsbm_list_topics` name as the previous cell.
hsbm_list_topics = pd.read_csv("topsbm-log/topsbm-log_level_3_topics.csv")
# All non-missing entries of the topic table, cut to their first 15 characters.
gene_list = [g[:15] for g in hsbm_list_topics.values.ravel().astype(str) if g != "nan"]
hsbm_log_list = pd.Series(index=gene_list, dtype=str)
# Label every gene with the name of the column (topic) it is listed under.
for topic in hsbm_list_topics.columns:
    topic_genes = [g[:15] for g in hsbm_list_topics[topic].dropna()]
    hsbm_log_list[topic_genes] = topic

In [None]:
# Gene -> module labels for wgcna (level 0); the index holds each gene once (sorted by np.unique).
wgcna_list_topics = pd.read_csv("wgcna/wgcna_level_0_topics.csv")
# All non-missing entries of the table, cut to their first 15 characters.
gene_list = [g[:15] for g in wgcna_list_topics.values.ravel().astype(str) if g != "nan"]
wgcna_list = pd.Series(index=np.unique(gene_list), dtype=str)
# A gene listed under several columns ends up with the last column that contains it.
for topic in wgcna_list_topics.columns:
    topic_genes = np.unique([g[:15] for g in wgcna_list_topics[topic].dropna()])
    wgcna_list[topic_genes] = topic

In [None]:
# Gene -> topic labels for tm (level 0); the index holds each gene once (sorted by np.unique).
tm_list_topics = pd.read_csv("tm/tm_level_0_topics.csv")
# All non-missing entries of the table, cut to their first 15 characters.
gene_list = [g[:15] for g in tm_list_topics.values.ravel().astype(str) if g != "nan"]
tm_list = pd.Series(index=np.unique(gene_list), dtype=str)
# A gene listed under several columns ends up with the last column that contains it.
for topic in tm_list_topics.columns:
    topic_genes = np.unique([g[:15] for g in tm_list_topics[topic].dropna()])
    tm_list[topic_genes] = topic

In [None]:
# Gene -> topic labels for lda (level 1).
lda_list_topics = pd.read_csv("lda/lda_level_1_topics.csv")
# All non-missing entries of the table, cut to their first 15 characters (repeats kept for now).
gene_list = [g[:15] for g in lda_list_topics.values.ravel().astype(str) if g != "nan"]
lda_list = pd.Series(index=gene_list, dtype=str)
# Mask-based assignment: every index entry whose gene is listed under the column
# receives that column's name, so a gene listed under several columns keeps the last one.
for topic in lda_list_topics.columns:
    in_topic = lda_list.index.isin([g[:15] for g in lda_list_topics[topic].dropna()])
    lda_list[in_topic] = topic
# Collapse repeated genes to a single row (first occurrence) and rebuild a flat Series.
lda_unique = lda_list.reset_index().drop_duplicates("index").set_index("index")
lda_list = pd.Series(index=lda_unique.index, data=lda_unique.values.ravel())

In [None]:
def run(first_name, last_name, threshold=3):
    """Build and plot the overlap map between two gene labelings.

    The labelings are looked up by name among the notebook globals:
    ``run("hsbm", "tm")`` uses ``hsbm_list`` and ``tm_list``. Both Series are
    first restricted to the genes they have in common, then passed through
    ``parameters_for_hypergeometric`` -> ``build_map`` -> ``plot_map``.

    Parameters
    ----------
    first_name, last_name : str
        Prefixes of the global ``<name>_list`` Series to compare; also
        forwarded to ``plot_map``.
    threshold : number, default 3
        Entries of the map strictly below this value are set to 0 before
        sorting and plotting.

    Raises
    ------
    KeyError
        If ``<first_name>_list`` or ``<last_name>_list`` is not defined.
    """
    namespace = globals()
    try:
        list_1 = namespace[f"{first_name}_list"]
        list_2 = namespace[f"{last_name}_list"]
    except KeyError as missing:
        raise KeyError(
            f"{missing.args[0]} is not defined; run the cell that builds it first"
        ) from None

    # Restrict both labelings to the shared genes.
    list_1 = list_1[list_1.index.isin(list_2.index)]
    list_2 = list_2[list_2.index.isin(list_1.index)]

    hyper_params = parameters_for_hypergeometric(list_1, list_2)
    df_cmap = build_map(*hyper_params)
    # Suppress weak entries so they do not clutter the map.
    df_cmap[df_cmap < threshold] = 0
    # Sort rows by all columns (left to right), largest values first.
    df_cmap = df_cmap.sort_values(by=list(df_cmap.columns), ascending=False)
    plot_map(df_cmap, first_name=first_name, last_name=last_name)

In [None]:
# Overlap map for every unordered pair of the four labelings, in a fixed order.
for first_name, last_name in [
    ("hsbm", "tm"),
    ("hsbm", "lda"),
    ("hsbm", "wgcna"),
    ("tm", "lda"),
    ("tm", "wgcna"),
    ("lda", "wgcna"),
]:
    run(first_name, last_name)

In [None]:
def _aligned_labels(labels_a, labels_b):
    """Return the two labelings restricted to their common genes, in the same order.

    v_measure_score compares its two inputs position by position, so both
    Series must list the same genes in the same order. Missing labels and
    repeated index entries (first occurrence kept) are dropped first.
    """
    labels_a = labels_a.dropna()
    labels_b = labels_b.dropna()
    labels_a = labels_a[~labels_a.index.duplicated()]
    labels_b = labels_b[~labels_b.index.duplicated()]
    common = labels_a.index.intersection(labels_b.index)
    return labels_a.loc[common], labels_b.loc[common]


# Print one LaTeX table row ("a & b score \\ \hline") per pair of labelings.
for name_a, name_b, labels_a, labels_b in [
    ("hsbm", "tm", hsbm_list, tm_list),
    ("hsbm", "lda", hsbm_list, lda_list),
    ("hsbm", "wgcna", hsbm_list, wgcna_list),
    ("tm", "lda", tm_list, lda_list),
    ("tm", "wgcna", tm_list, wgcna_list),
    ("lda", "wgcna", lda_list, wgcna_list),
]:
    score = v_measure_score(*_aligned_labels(labels_a, labels_b))
    # Backslashes are escaped explicitly; the printed text is unchanged.
    print("%s & %s %.3f \\\\ \\hline" % (name_a, name_b, score))

In [None]:
# List, one per line, the genes labelled "Topic 1" in hsbm_list.
is_topic_1 = hsbm_list == "Topic 1"
for g in hsbm_list[is_topic_1].index:
    print(g)