In [None]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from topicpy.hsbmpy import get_max_available_L
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Run configuration shared by every cell below.
# NOTE(review): `directory` is an absolute, machine-specific path; the notebook
# only runs where this exact location exists.
directory = "/home/jovyan/work/phd/topics/datasets/gtexall/"
# Name of the run: used as sub-folder and file prefix when reading the
# topic-distribution CSV, and in the names of the figures that are written.
algorithm = "wgcna"
# Level number inserted into the topic-distribution file name and figure names.
L=0
# Every relative path used later (CSV inputs, PDF outputs) resolves against `directory`.
os.chdir(directory)

In [None]:
# Topic distribution written by the `algorithm` run at level `L`.
# The second CSV column becomes the index and the "i_doc" column is discarded.
# `columns=` is used because the positional form `.drop("i_doc", 1)` is no longer
# accepted by pandas 2.x (the axis argument became keyword-only).
df_topics = pd.read_csv("%s/%s_level_%d_topic-dist.csv"%(algorithm,algorithm,L), index_col=1).drop(columns="i_doc")
print("Working with", df_topics.shape[1],"topics")
# Metadata table, re-ordered to follow the rows of df_topics; index labels that are
# absent from files.dat come back as all-NaN rows.
df_files = pd.read_csv("files.dat", index_col=0).reindex(index=df_topics.index)
df_files.head(2)

In [None]:
# (n_rows, n_columns) of the topic table, displayed as a quick sanity check.
df_topics.shape

In [None]:
# Attach the SMTS value of every row as a "tissue" column; pandas aligns the
# metadata with the topic table on their shared index.
df_topics["tissue"] = df_files.loc[:, "SMTS"]

# Mean of every topic column within each tissue, flipped so that tissues are the columns.
df_tissues = df_topics.groupby(by="tissue").mean().T

# Pairwise correlation between the tissue columns, and the tissue labels in column order.
correlations = df_tissues.corr()
classes = df_tissues.columns

In [None]:
# Clustered heat-map of the tissue-by-tissue correlation matrix built from the
# topic table. Only the rows are clustered; the columns keep their input order.
cm = sns.clustermap(correlations,
                    vmax=1,
                    row_cluster=True,
                    col_cluster=False,
                    xticklabels=classes,
                    yticklabels=classes,
                    annot=False,
                    cbar_pos=(0.99,0.06,0.05,0.15))
ax = cm.ax_heatmap
fig = ax.get_figure()

ax.set_ylabel("Tissue", fontsize=35, rotation=90)
# Rows were re-ordered by the clustering, so the labels are listed in that same order.
ax.set_yticklabels(labels=classes[cm.dendrogram_row.reordered_ind], rotation=0)
ax.set_xlabel("Tissue",fontsize=35)
ax.tick_params(labelsize=15)

# Colour-bar styling.
cax = cm.ax_cbar
cax.tick_params(labelsize=30)
cax.set_title("P()", fontsize=30)
plt.tight_layout()

# Both files are written *before* plt.show(): depending on the matplotlib backend,
# show() may close the figure, and a later savefig can then produce an empty page.
cm.savefig(f"topic_correlation_{algorithm}_level{L}.pdf")
fig.savefig("heatmap_SMTS.pdf")

plt.show()

In [None]:
# Stand-alone drawing of the row dendrogram held by the clustermap object `cm`.
fig,ax = plt.subplots(figsize=(10,10))
# Switch the plotter to its un-rotated layout before drawing on the new axes.
cm.dendrogram_row.rotate=False
cm.dendrogram_row.plot(ax, {"linewidths":10})
ax.set_ylabel("Distance", fontsize=35, rotation=90)

# One tick every 10 units starting at 5 (5, 15, 25, ...), one per class.
# presumably these are the positions at which the dendrogram draws its leaves — confirm
n_classes = len(classes)
ax.set_xticks(np.linspace(5, n_classes*10-5, n_classes))
ax.set_xticklabels(labels=classes[cm.dendrogram_row.reordered_ind], rotation=90)
ax.set_xlabel("Tissue",fontsize=35)
ax.tick_params(labelsize=15)
plt.tight_layout()
# Written to its own file: this cell used to save to
# topic_correlation_{algorithm}_level{L}.pdf, silently replacing the clustermap
# PDF that an earlier cell stores under that exact name.
fig.savefig(f"topic_dendrogram_{algorithm}_level{L}.pdf")

In [None]:
# Read the main table with its first column as the index, then flip it so that
# the CSV's columns become the rows of `df`.
df = pd.read_csv("mainTable.csv", index_col=0).T

In [None]:
# SMTS value for every row of `df`, looked up in the metadata by index label.
# The Series is named "tissue" so the grouped axis carries that name.
sample_tissue = df_files.reindex(index=df.index)["SMTS"].rename("tissue")

# Mean of every column within each tissue, flipped so that tissues are the columns.
# The grouping key is passed as a separate Series instead of being written into
# `df`, and the result gets its own name instead of rebinding `df`: the raw table
# stays intact and this cell can be re-run without re-reading mainTable.csv.
df_data_tissues = df.groupby(sample_tissue).mean().transpose()

# Pairwise correlation between the tissue columns, and the tissue labels in column order.
data_correlations = df_data_tissues.corr()
data_classes = df_data_tissues.columns

In [None]:
# Clustered heat-map of the tissue-by-tissue correlation matrix built from the
# main data table. Only the rows are clustered; the columns keep their input order.
cm = sns.clustermap(data_correlations,
                    vmax=1,
                    row_cluster=True,
                    col_cluster=False,
                    xticklabels=data_classes,
                    yticklabels=data_classes,
                    annot=False,
                    cbar_pos=(0.99,0.06,0.05,0.15))
ax = cm.ax_heatmap
fig = ax.get_figure()

ax.set_ylabel("Tissue", fontsize=35, rotation=90)
# Labels must come from `data_classes`, the labels of the matrix plotted here
# (the topic-space `classes` were used before), listed in the clustered row order.
ax.set_yticklabels(labels=data_classes[cm.dendrogram_row.reordered_ind], rotation=0)
ax.set_xlabel("Tissue",fontsize=35)
ax.tick_params(labelsize=15)

# Colour-bar styling.
cax = cm.ax_cbar
cax.tick_params(labelsize=30)
cax.set_title("P()", fontsize=30)
plt.tight_layout()

# This figure is stored only as heatmap_data_SMTS.pdf. It is deliberately NOT saved as
# topic_correlation_{algorithm}_level{L}.pdf any more: that name is used for the
# topic-space clustermap, which this data-space figure used to overwrite.
# Saving happens before plt.show() because, depending on the backend, show() may
# close the figure and leave a later savefig with nothing to draw.
fig.savefig("heatmap_data_SMTS.pdf")

plt.show()

In [None]:
# The per-tissue comparisons pair rows of the two correlation matrices by position,
# so both must list the same tissues in the same order. Index.equals returns False
# on a length mismatch, where an element-wise `==` would raise ValueError instead
# of a readable assertion failure.
assert classes.equals(data_classes), "topic-space and data-space tissue labels differ"

In [None]:
from scipy.stats import spearmanr
import plotly.graph_objects as go

In [None]:
# For every tissue: Spearman rank correlation between its row in the topic-space
# correlation matrix and its row in the data-space one. Rows are paired by position.
differences = {
    tissue: spearmanr(topic_row, data_row)[0]
    for tissue, topic_row, data_row in zip(classes, correlations.values, data_correlations.values)
}

In [None]:
# Bar chart of the per-tissue scores stored in `differences`.
fig = go.Figure()
fig.add_trace(go.Bar(x=list(differences.keys()), y=list(differences.values())))

# The axis title is given as title.text / title.font: the flat `titlefont`
# attribute is deprecated and rejected by recent plotly releases.
layout = {
    "title": algorithm,
    "xaxis": {
        "tickangle": 290
    },
    "yaxis": {
        "title": {
            "text": "Spearman between <br> data and topics' space",
            "font": {
                "size": 24
            }
        },
        # NOTE(review): bars for negative correlations fall outside this range.
        "range": [0, 1]
    }
}

fig.update_layout(layout)

fig.show()
fig.write_image(f"correlation_spearman_topic_data_{algorithm}.pdf")

In [None]:
# Load a fresh copy of the main table (first column as index, then flipped), so the
# next steps start from the table as stored on disk whatever earlier cells did to `df`.
df = pd.read_csv("mainTable.csv", index_col=0).T

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the rows of `df` onto 21 principal components.
# NOTE(review): 21 is hard-coded; the reason for this value is not visible here.
# random_state is fixed so that repeated runs give the same projection should
# scikit-learn pick a randomized solver for this input — confirm against the PCA docs.
pca = PCA(n_components=21, random_state=0)
df_pca = pd.DataFrame(data=pca.fit_transform(df.values), index=df.index)

# Same aggregation as for the other spaces: label rows with SMTS, average the
# components per tissue, flip so tissues are columns, then correlate the columns.
df_pca["tissue"]=df_files.reindex(index=df_pca.index)["SMTS"]
df_pca = df_pca.groupby("tissue").mean().transpose()
correlations_pca = df_pca.corr()
# Labels are taken from the PCA table itself; they used to be read from
# `df_tissues` (the topic table), which is only correct when both happen to list
# the same tissues in the same order.
classes_pca = df_pca.columns

In [None]:
from scipy.stats import pearsonr

In [None]:
# For every tissue: Pearson correlation between its row in the PCA-space correlation
# matrix and its row in the data-space one. Rows are paired by position.
# NOTE(review): `differences` is computed with spearmanr while this uses pearsonr,
# and the two are later plotted side by side — confirm the mix is intended.
differences_pca = {
    tissue: pearsonr(pca_row, data_row)[0]
    for tissue, pca_row, data_row in zip(classes_pca, correlations_pca.values, data_correlations.values)
}

In [None]:
# Grouped bars: per-tissue score of the PCA space next to that of the topic space.
fig = go.Figure()
fig.add_traces([
    go.Bar(x=list(differences_pca.keys()), y=list(differences_pca.values()), name="pca"),
    go.Bar(x=list(differences.keys()), y=list(differences.values()), name="topic modeling")
])

# The axis title is given as title.text / title.font: the flat `titlefont`
# attribute is deprecated and rejected by recent plotly releases.
# NOTE(review): the axis says "Pearson", but the "topic modeling" bars come from
# `differences`, which is filled with spearmanr — confirm which label is meant.
layout = {
    "barmode": "group",
    "title": algorithm,
    "xaxis": {
        "tickangle": 290
    },
    "yaxis": {
        "title": {
            "text": "Pearson between <br> data and topics' space",
            "font": {
                "size": 24
            }
        },
        # NOTE(review): bars for negative correlations fall outside this range.
        "range": [0, 1]
    }
}

fig.update_layout(layout)

fig.show()

In [None]:
import plotly.express as px

In [None]:
# Histograms of the absolute gap between a method's per-tissue score and the PCA
# score; values are paired by the dictionaries' iteration order.
# NOTE(review): `differences_lda` is not created by any cell visible above; on a
# fresh kernel this cell raises NameError unless it is defined elsewhere.
fig = go.Figure()

fig.add_traces([
    go.Histogram(
        x=[abs(pca_score - topic_score)
           for topic_score, pca_score in zip(differences.values(), differences_pca.values())],
        name="sbm",
    ),
    go.Histogram(
        x=[abs(pca_score - lda_score)
           for lda_score, pca_score in zip(differences_lda.values(), differences_pca.values())],
        name="lda",
    ),
])