In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from topicpy.hsbmpy import fraction_bar_plot, get_max_available_L
from topicpy.geneontology import get_symbol
from tableanalyser import get_ensg, get_symbol
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

In [None]:
# Dataset folder; the topsbm result files read further down live under <directory>/topsbm/.
directory = r'datasets/tcga/BRCA/mirna/'
# Level number that is substituted into the topsbm file names below.
# Presumably the deepest hierarchy level with results on disk — TODO confirm against topicpy.hsbmpy.
l=get_max_available_L(directory, "topsbm")

In [None]:
import multiprocessing as mp

# Membership matrix filled by join_series(): index = genes, columns = gene-set names,
# values are 1 where the gene is listed in the set and 0 otherwise.
df = pd.DataFrame()


def get_series(line):
    """Parse one line of a GMT file into a membership column.

    A GMT line is tab separated: set name, description, then the member genes.

    Parameters
    ----------
    line : str
        One raw line of the file (a trailing newline / carriage return is tolerated).

    Returns
    -------
    pandas.Series or None
        Integer Series of ones, indexed by the (de-duplicated) genes and named
        after the set. ``None`` for blank lines or lines without a set name,
        so that they never turn into an unnamed column.
    """
    data = line.rstrip("\r\n").split("\t")
    if len(data) < 2 or not data[0]:
        return None
    # dict.fromkeys drops repeated genes while keeping their order; a duplicated
    # index label would break the index-aligned outer join done in join_series().
    members = list(dict.fromkeys(gene for gene in data[2:] if gene))
    return pd.Series(1, index=members, name=data[0], dtype=int)


def join_series(series):
    """Outer-join parsed gene-set columns into the global ``df``.

    Parameters
    ----------
    series : pandas.Series, iterable of (pandas.Series or None), or None
        A single column, or the full list of results that ``Pool.map`` /
        ``Pool.map_async`` hands over in one go. ``None`` entries are skipped.
    """
    global df
    if series is None:
        return
    if isinstance(series, pd.Series):
        series = [series]
    columns = [column for column in series if column is not None]
    if not columns:
        return
    # Align every column on the union of genes once, instead of joining one by one.
    block = pd.concat(columns, axis=1)
    # Genes missing from a set come out of the outer join as NaN -> 0.
    df = df.join(block, how="outer").fillna(0).astype(int)

# Read the GMT file first, so that a missing file fails before any worker is started.
# Iterating the file yields no artificial empty last line.
with open("MSigDB/c3.all.v7.1.symbols.gmt", "r") as file:
    gmt_lines = [line.rstrip("\n") for line in file]

# The context manager shuts the workers down even when parsing raises.
with mp.Pool() as pool:
    work = pool.map_async(get_series, gmt_lines)
    # .get() blocks until every line is parsed and re-raises a worker error here,
    # instead of printing it and leaving df silently empty. The join then runs in
    # the main thread rather than inside the pool's callback thread.
    join_series(work.get())

In [None]:
# Inspect the membership matrix parsed from the GMT file
# (index: entries listed in the sets, columns: set names).
df

In [None]:
# Column sums of the 0/1 matrix, i.e. how many index entries are flagged in each column.
df.sum()

In [None]:
# Table read from gene_symbol.txt, indexed by its first column.
# NOTE(review): `genes` is not referenced by any other cell visible here — confirm it is still needed.
genes = pd.read_csv("gene_symbol.txt", index_col=0)
genes

In [None]:
# topsbm output for level `l`, read from <directory>/topsbm/.
# Later cells only use the column labels of df_topics (one column per topic).
df_topics = pd.read_csv("%s/topsbm/topsbm_level_%d_topics.csv"%(directory,l))
# Indexed by the first CSV column; later cells read one column per topic and treat
# the values as P(gene|topic).
df_topic_distr = pd.read_csv("%s/topsbm/topsbm_level_%d_word-dist.csv"%(directory,l), index_col=0)

In [None]:
# One row per column of df, one column per topic: for every topic, the number of
# the topic's genes that are flagged in each column of df.
topic_tf = pd.DataFrame(index=df.columns)
for topic in df_topics.columns:
    print(topic)
    # entries with non-zero P(gene|topic) for this topic
    subdf = df_topic_distr[topic][df_topic_distr[topic] > 0]
    # translate the identifiers to symbols once per topic
    topic_symbols = [get_symbol(gene) for gene in subdf.index]
    # rows of df that belong to the current topic
    cdf = df[df.index.isin(topic_symbols)].astype(float)
    # NOTE(review): these are raw, unweighted counts. The earlier version called
    # `subdf.reindex(...)` and `cdf.multiply(subdf, axis=0).fillna(0.)` but threw
    # both results away, so no P(gene|topic) weighting was ever applied. The dead
    # calls are removed here and the numbers are unchanged. Applying the weights
    # for real would also require re-indexing subdf by symbol and revisiting the
    # count threshold used on topic_tf afterwards.
    c_series = cdf.sum(axis=0)
    # NOTE(review): insert(0, ...) prepends, so the columns end up in reverse
    # topic order — confirm this is what the plot expects.
    topic_tf.insert(0, topic, c_series)

In [None]:
# Keep only the rows of topic_tf whose total over all topics exceeds the cut-off.
MIN_TOTAL_COUNT = 180
mask = topic_tf.sum(axis=1) > MIN_TOTAL_COUNT

In [None]:
# Selected rows of topic_tf with NaN -> 0, transposed so that topics become the rows.
selected_tf = topic_tf.loc[mask].fillna(0).T
# Per-topic normalisation, currently disabled:
#selected_tf = selected_tf.divide(selected_tf.sum(axis=1), axis=0).fillna(0)
# Final shape: {column label: [one value per topic]}.
fraction_tf = selected_tf.to_dict('list')

In [None]:
# Draw fraction_tf with fraction_bar_plot, using one x position (1..n) per topic column.
fig, ax = plt.subplots(figsize=(15, 10))
n_topics = len(df_topics.columns)
x = np.arange(1, n_topics + 1)
fraction_bar_plot(x, fraction_tf, ax)
ax.set_xlabel("Topics", fontsize=35)
ax.tick_params(labelsize=25)
# Optional tweaks, currently switched off:
#plt.yscale('log')
#plt.legend(ncol=20)
plt.show()