In [None]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf

In [None]:
# Analysis configuration: dataset folder, model name and the level used in the
# "<alg>/<alg>_level_<L>_*.csv" file names read by the following cells.
working_dir = "datasets/tcga/COAD/mirna_allsamples"
alg = "trisbm"
L = 1
# Every later path is relative to working_dir. Only change directory when the
# kernel is not already inside it, so re-running this cell does not fail with
# FileNotFoundError on the (relative) path.
if not os.getcwd().replace(os.sep, "/").endswith(working_dir):
    os.chdir(working_dir)

In [None]:
# Word distribution table for the selected level; the first CSV column becomes
# the row index (used later as the gene/row labels of df_Pgm and df_Pgk).
df_Pgt = pd.read_csv(
    f"{alg}/{alg}_level_{L}_word-dist.csv",
    index_col=0,
)
df_Pgt.head(n=2)

In [None]:
# Topic distribution table: the second CSV column is used as the row index and
# the "i_doc" column is discarded.
# `columns=` replaces the positional axis argument (`drop("i_doc", 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
df_Pts = pd.read_csv(f"{alg}/{alg}_level_{L}_topic-dist.csv", index_col=1).drop(columns="i_doc")
df_Pts.head(2)

In [None]:
# Keyword distribution table for the selected level, indexed by its first column.
df_Pmk = pd.read_csv(
    f"{alg}/{alg}_level_{L}_keyword-dist.csv",
    index_col=0,
)
df_Pmk.head(n=2)

In [None]:
# Load the main table and restrict it to the rows and columns used by the model.
df = pd.read_csv("mainTable_all.csv", index_col=0)
# Keep only rows whose label appears in the keyword or the word distribution.
df = df[df.index.isin(np.concatenate([df_Pmk.index, df_Pgt.index]))]
# Order (and restrict) the columns like the rows of the topic distribution;
# labels missing from the table become all-NaN columns.
df = df.reindex(columns=df_Pts.index)
# log2(x + 1) transform, vectorised: same values as the former element-wise
# `applymap` lambda, without the per-cell Python call and without relying on
# `DataFrame.applymap`, which is deprecated since pandas 2.1.
df = np.log2(df + 1)
print(df.shape)
df.head(2)

In [None]:
# Split the rows by label: those containing "ENSG" (presumably Ensembl gene
# ids -- confirm) versus every other row.
is_ensg = np.array(["ENSG" in label for label in df.index], dtype=bool)
# es: one total per column, summed over the "ENSG" rows.
es = df[is_ensg].sum(axis=0)
# emirna: one total per remaining (non-"ENSG") row, summed over the columns.
emirna = df[~is_ensg].sum(axis=1)

In [None]:
# Metadatum distribution table: second CSV column as row index, "i_doc" dropped.
# `columns=` replaces the positional axis argument (`drop("i_doc", 1)`), which
# pandas 2.0+ rejects with a TypeError.
df_Psm = pd.read_csv(f"{alg}/{alg}_level_{L}_metadatum-dist.csv", index_col=1).drop(columns="i_doc")
# Scale row i by es[i]; equivalent to the former repeat/reshape construction and
# still raises if len(es) differs from the number of rows.
# NOTE(review): the match is positional (es.values), so it assumes the rows of
# df_Psm are in the same order as es (i.e. as df_Pts.index) -- confirm.
df_Psm = df_Psm.mul(es.values, axis=0)
# Normalise every column so that it sums to 1.
df_Psm = df_Psm.divide(df_Psm.sum(axis=0), axis=1)
df_Psm.head(2)

In [None]:
# Preview the first two rows of the keyword distribution currently in memory.
df_Pmk.head(n=2)

In [None]:
# Reload the keyword distribution from disk so this cell always starts from the
# raw table, then weight it and renormalise.
df_Pmk = pd.read_csv(f"{alg}/{alg}_level_{L}_keyword-dist.csv", index_col=0)
# Scale row i by emirna[i] (raises if the lengths differ).
# NOTE(review): the match is positional (emirna.values); emirna follows the row
# order of `df`, so this assumes df_Pmk's rows are in that same order -- confirm.
df_Pmk = df_Pmk.mul(emirna.values, axis=0)
# Normalise every row so that it sums to 1.
df_Pmk = df_Pmk.divide(df_Pmk.sum(axis=1), axis=0)
df_Pmk.head(n=2)

In [None]:
# Chain the distribution tables by matrix products.
# (rows of df_Pgt) x (rows of df_Pts)
Pgs = df_Pgt.values @ df_Pts.values.T
# (rows of df_Pgt) x (columns of df_Psm); missing entries of df_Psm count as 0.
Pgm = Pgs @ df_Psm.fillna(0).values
# Column-normalise Pgm, then project onto the row-normalised df_Pmk:
# (rows of df_Pgt) x (rows of df_Pmk).
Pgm_colnorm = Pgm / Pgm.sum(axis=0)
Pmk_rownorm = df_Pmk.divide(df_Pmk.sum(axis=1), axis=0).values
Pgk = Pgm_colnorm @ Pmk_rownorm.T

df_Pgm = pd.DataFrame(Pgm, index=df_Pgt.index, columns=df_Psm.columns)
# df_Pgk is stored column-normalised (every column sums to 1).
df_Pgk = pd.DataFrame(
    Pgk / Pgk.sum(axis=0, keepdims=True),
    index=df_Pgt.index,
    columns=df_Pmk.index,
)

In [None]:
# Preview the first two rows of the column-normalised df_Pgk table.
df_Pgk.head(n=2)

In [None]:
import seaborn as sns

# Hierarchically clustered heatmap of df_Pgk expressed as the relative deviation
# from each row's mean in df_Pgm: (value - row_mean) / row_mean.
# Columns that end up entirely NaN are removed before clustering.
# NOTE(review): the row mean comes from df_Pgm, not from df_Pgk itself -- confirm
# this is the intended baseline.
cm = sns.clustermap(
    df_Pgk
    .sub(df_Pgm.mean(axis=1), axis=0)
    .div(df_Pgm.mean(axis=1), axis=0)
    .dropna(axis=1, how="all")
)

In [None]:
# Print the first 30 row labels in the order the clustermap arranged its rows.
# The positions must come from the *row* dendrogram: the previous code took
# `dendrogram_col.reordered_ind` (column positions of the clustered matrix) and
# used them to index df_Pgk.index, which selects rows unrelated to the heatmap.
for g in df_Pgk.index[cm.dendrogram_row.reordered_ind[:30]]:
    print(g)

In [None]:
# Pick one row of df_Pgk at random (unseeded, so the choice changes per run) and
# print the labels of its 20 largest columns, largest first.
random_row = np.random.choice(df_Pgk.index)
top_columns = df_Pgk.loc[random_row, :].sort_values(ascending=False).head(20)
for mirna in top_columns.index:
    print(mirna)

## Distinctiveness

In [None]:
@tf.function
def kullbach_liebler(theta_k, theta_l):
    """Element-wise divergence term ``theta_k * log(theta_k / theta_l) + theta_l - theta_k``.

    The formula is the one the original author attributes to the
    "dey-visualizing" paper.

    Args:
        theta_k: tensor of values for the first item.
        theta_l: tensor of values for the second item, broadcastable against
            ``theta_k``.

    Returns:
        A tensor with the element-wise result; no reduction is applied.

    NOTE(review): a zero in ``theta_k`` or ``theta_l`` goes through the
    division/log unguarded and produces NaN or inf -- confirm inputs are
    strictly positive.
    """
    ratio = tf.math.divide(theta_k, theta_l)
    weighted_log = tf.math.multiply(theta_k, tf.math.log(ratio))
    return tf.subtract(tf.add(weighted_log, theta_l), theta_k)

@tf.function
def distinctivness(theta_k):
    """Return, for every row, the minimum after discarding the smallest entry.

    Each row is sorted with ``tf.sort`` (default: last axis, ascending), the
    first column of the sorted result is dropped and the minimum of the
    remaining columns is taken along axis 1.

    Args:
        theta_k: tensor with at least two dimensions (it is sliced as
            ``[:, 1:]``).

    Returns:
        The tensor reduced along axis 1.
    """
    ascending = tf.sort(theta_k)
    # Presumably the dropped entry is the item compared with itself -- confirm.
    without_smallest = ascending[:, 1:]
    return tf.reduce_min(without_smallest, axis=1)

@tf.function
def get_distinctivness(data):
    """Apply ``distinctivness`` to the pairwise ``kullbach_liebler`` terms of ``data``.

    Every row of ``data`` is paired with every row of ``data`` (itself
    included), which yields a tensor of shape
    ``(n_rows, n_rows, n_cols)``. It is transposed to
    ``(n_cols, n_rows, n_rows)`` and each ``(n_rows, n_rows)`` slice is reduced
    with ``distinctivness``.

    Args:
        data: 2-D tensor.

    Returns:
        Tensor of shape ``(n_cols, n_rows)``.
    """
    def _against_every_row(row_k):
        # One row versus all rows -> (n_rows, n_cols)
        return tf.map_fn(fn=lambda row_l: kullbach_liebler(row_k, row_l), elems=data)

    pairwise = tf.map_fn(fn=_against_every_row, elems=data, parallel_iterations=3)
    per_column = tf.transpose(pairwise, perm=[2, 0, 1])
    return tf.map_fn(distinctivness, per_column, parallel_iterations=3)

In [None]:
# Run the distinctiveness computation on df_Pgm and bring the result back to
# NumPy; the shape is (number of df_Pgm columns, number of df_Pgm rows).
pgm_tensor = tf.convert_to_tensor(df_Pgm.values)
out = get_distinctivness(pgm_tensor).numpy()
out.shape

In [None]:
# Label the result: one row per df_Pgm column, one column per df_Pgm row.
df_D = pd.DataFrame(out, index=df_Pgm.columns, columns=df_Pgm.index)
df_D.head(n=2)

In [None]:
# Print every column label of df_D ordered by its value in the "Metadatum 4"
# row, largest first.
ranked = df_D.loc["Metadatum 4", :].sort_values(ascending=False)
for g in ranked.index:
    print(g)