In [None]:
import os
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import tensorflow as tf
from hsbmpy import get_max_available_L, get_file
import tensorflow as tf

In [None]:
# Bring the local `geneontology` helper module and its public names into the
# notebook namespace.
import importlib, geneontology
from geneontology import *
# Re-execute the module so edits made to it since the kernel started take
# effect, then star-import again so the notebook-level names are rebound to the
# reloaded objects (the first star-import would otherwise keep the stale ones).
importlib.reload(geneontology)
from geneontology import *

In [None]:
# Run configuration: model name and dataset location used by the cells below.
algorithm = 'topsbm-log'
directory = "/home/jovyan/work/phd/topics/datasets/gtex10/"
# Hierarchy level to analyse: one less than the value reported by hsbmpy
# (presumably the top level is too coarse to be informative - TODO confirm).
L=get_max_available_L(directory, algorithm)-1
# The file paths in the cells below are relative, so the dataset directory is
# made the current working directory.
os.chdir(directory)

In [None]:
# Word-distribution table for the selected level; the first CSV column is used
# as the row index and the remaining columns are iterated as topics below.
df_Pwt = pd.read_csv(
    "%s/%s_level_%d_word-dist.csv" % (algorithm, algorithm, L),
    index_col=0,
)

In [None]:
# Rank-ordered P(word | topic) for the first five topics, on log-log axes,
# together with a power-law guide line.
fig = go.Figure()
for topic in df_Pwt.columns[:5]:
    dist = df_Pwt[topic].astype(float).to_numpy()
    # Discard numerically-zero (and NaN) entries before renormalising.
    dist = dist[dist > 1e-50]
    if len(dist) < 2:
        # Fewer than two points cannot form a curve. The original guard used
        # `pass`, which did nothing; skipping the topic is the assumed intent.
        continue
    # Normalise to unit sum and sort once, in descending order (rank plot).
    dist = np.sort(dist / dist.sum())[::-1]
    fig.add_trace(go.Scatter(y=dist, mode="lines+markers", name=topic))

# Guide for the eye: y = 0.1 * x^-0.8 over the full rank range.
x = np.linspace(1, df_Pwt.shape[0])
fig.add_trace(go.Scatter(x=x, y=0.1*x**-0.8, mode="lines", line_width=10, line_color="gray", line_dash="dash", name=""))

# `title_font` replaces the deprecated `titlefont` attribute.
layout = dict(
    xaxis_title="words",
    xaxis_title_font_size=35,
    xaxis_exponentformat="e",
    xaxis_type="log",
    yaxis_title="$P(word | topic)$",
    yaxis_title_font_size=45,
    yaxis_type="log",
    yaxis_exponentformat="e",
)
fig.update_layout(layout)
fig.show()
fig.write_image("p_w_tw_%s.pdf"%algorithm)

In [None]:
# Topic-distribution table for the selected level, indexed by the second CSV
# column; the `i_doc` column is discarded.
# `drop(columns=...)` replaces the positional `axis` argument (`drop('i_doc', 1)`),
# which pandas >= 2.0 rejects with a TypeError.
df_Ptd = pd.read_csv(
    "%s/%s_level_%d_topic-dist.csv" % (algorithm, algorithm, L),
    index_col=1,
).drop(columns="i_doc")

In [None]:
# P(topic | sample) profiles for the first ten samples, one trace per sample.
fig = go.Figure()
for sample in df_Ptd.index.values[:10]:
    fig.add_trace(go.Scatter(y=df_Ptd.loc[sample, :].astype(float).values, mode='markers+lines'))

# `title_font` replaces the deprecated `titlefont` attribute.
layout = dict(
    showlegend=False,
    xaxis_title="topics",
    xaxis_title_font_size=35,
    xaxis_exponentformat="e",
    yaxis_title="$P(topic | sample)$",
    yaxis_title_font_size=45,
    yaxis_exponentformat="e",
)
fig.update_layout(layout)
fig.show()
fig.write_image("p_t_s_%s.pdf"%algorithm)

In [None]:
# Load the main data table and align it with the model outputs: rows follow the
# word index of df_Pwt, columns follow the samples of df_Ptd.
df_mt = pd.read_csv("mainTable.csv", sep=",", index_col=0)
# Keep only the first 15 characters of every row label
# (presumably strips a version suffix from the gene id - TODO confirm).
df_mt.index = [label[:15] for label in df_mt.index]
df_files = pd.read_csv("files.dat", index_col=0)
df_mt = df_mt.reindex(
    index=df_Pwt.index.values.ravel(),
    columns=[
        get_file(sample, df_files).name
        for sample in df_Ptd.index.values.ravel()
    ],
)
# Rows that are entirely missing after the reindex carry no data: drop them.
df_mt = df_mt.dropna(how="all", axis=0)
#df_mt=df_mt.applymap(lambda tpm: np.log2(tpm+1))
#df_mt=df_mt.where(df_mt<1e6,1e6)
df_mt.head()

In [None]:
# P(word | topic) as a tensor (rows: words, columns: topics).
Pgt = tf.convert_to_tensor(df_Pwt.values)
# P(topic | sample), transposed so that topics run along the rows.
Pts = tf.transpose(tf.convert_to_tensor(df_Ptd.values))
# P(word | sample) = sum over topics of P(word | topic) * P(topic | sample).
Pgs = tf.matmul(Pgt, Pts)
# Sample weights: per-sample column totals of the data table, normalised to 1.
Ps = tf.convert_to_tensor(df_mt.sum(axis=0).values)
Ps = Ps / tf.reduce_sum(Ps)
# Marginal over samples: multiply by Ps as a column vector, then flatten.
Pg = tf.squeeze(tf.matmul(Pgs, tf.expand_dims(Ps, axis=1)))

In [None]:
# Sanity check: shapes of all tensors built above, printed on one line.
print(*(tensor.shape for tensor in (Pgt, Pts, Pgs, Pg, Ps)))

In [None]:
# Materialise the tensors as NumPy arrays for pandas / plotting.
Pgs_data, Pg_data = Pgs.numpy(), Pg.numpy()
Pgs_data.shape

In [None]:
# P(word | sample) as a labelled table: rows are words, columns are samples.
# df_Pwt is read with index_col=0, so the word labels live in df_Pwt.index;
# the previous `df_Pwt.values.T[0]` used the first topic's probabilities as
# row labels instead.
df_Pgs = pd.DataFrame(data=Pgs_data, index=df_Pwt.index, columns=df_Ptd.index)

In [None]:
# Per-row totals of the data table, with missing entries counted as zero and
# values cast to integers before summing.
A = df_mt.fillna(0).astype(int).sum(axis=1).to_numpy()

In [None]:
# Rank plot comparing the empirical word frequencies (A) with the marginal
# P(word) reconstructed from the model (Pg_data), both normalised to unit sum.
fig = go.Figure()
fig.add_traces([
    go.Scatter(y=np.sort(A)[::-1]/A.sum(), line_width=15, name='data'),
    go.Scatter(y=np.sort(Pg_data)[::-1]/np.sum(Pg_data), line_width=15, line_dash='dot', name='P(w)')
])

# `title_font` replaces the deprecated `titlefont` attribute.
layout = dict(
    xaxis_title="words",
    xaxis_title_font_size=35,
    xaxis_exponentformat="e",
    xaxis_type="log",
    yaxis_title="$P(word)$",
    yaxis_title_font_size=45,
    yaxis_exponentformat="e",
    yaxis_type="log",
)
fig.update_layout(layout)
fig.show()
fig.write_image("p_w_%s.pdf"%algorithm)

In [None]:
# For every topic, pair the mean frequency of its words with the topic's
# overall weight, to be compared in the scatter plot that follows.
df_topics = pd.read_csv("%s/%s_level_%d_topics.csv" % (algorithm, algorithm, L))
# Relative frequency of every row of the data table, most frequent first.
f = df_mt.sum(axis=1) / df_mt.sum().sum()
f = f.sort_values(ascending=False)

# NOTE: from here on df_Ptd is rebound to the TRANSPOSED table (topics as
# rows); earlier cells expect samples as rows and must not be re-run after this
# cell without reloading df_Ptd first.
# `drop(columns=...)` replaces the positional `axis` argument, which
# pandas >= 2.0 rejects with a TypeError.
df_Ptd = pd.read_csv(
    "%s/%s_level_%d_topic-dist.csv" % (algorithm, algorithm, L),
    index_col=1,
).drop(columns="i_doc")
df_Ptd = df_Ptd.transpose()
# Topic weight: P(topic | sample) summed over samples, normalised to 1.
f_t = df_Ptd.sum(axis=1)
f_t = f_t / f_t.sum()

correlation = []

for topic in df_topics.columns:
    # Word labels are truncated to 15 characters to match the index of df_mt.
    genes = [g[:15] for g in df_topics[topic].dropna()]
    # reindex() yields NaN for words absent from the data table (rows dropped
    # as all-missing) and mean() skips them; plain `f[genes]` raises KeyError
    # on pandas >= 1.0 in that case.
    f_g_value = f.reindex(genes).mean()
    f_t_value = f_t[topic]
    correlation.append((f_g_value, f_t_value))

In [None]:
# Scatter of mean word frequency vs. topic weight (one marker per topic), with
# the y = x diagonal as reference and the Pearson correlation in the title.
fig = go.Figure()
x, y = np.array(correlation).T
fig.add_traces([
    go.Scatter(x=x, y=y, line_color="gray", name="topic", mode="markers", marker_size=20),
    go.Scatter(x=[x.min(), x.max()], y=[x.min(), x.max()], mode="lines", line_color="black", line_width=18, line_dash="dash", name="y=x")
])

# `title_font` replaces the deprecated `titlefont` attribute.
layout = dict(
    title=f"{algorithm} - Correlation: {np.round(np.corrcoef(x, y)[0,1], 2)}",
    xaxis_title="<Frequency>, $<f_i>_t$",
    xaxis_title_font_size=35,
    xaxis_exponentformat="e",
    xaxis_type="log",
    xaxis_tickfont_size=20,
    yaxis_title="$<P(topic)>$",
    yaxis_title_font_size=45,
    yaxis_exponentformat="e",
    yaxis_type="log",
    yaxis_tickfont_size=20,
)
fig.update_layout(layout)
fig.show()
fig.write_image("corr_f_pt_%s_%d.pdf"%(algorithm,L))