In [None]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
import scanpy as sc
import numpy as np
import multiprocessing as mp
import logging
# Notebook-wide logger that prints bare messages at DEBUG level and above.
# getLogger returns the same object on every call, so unconditionally adding a
# handler would stack one StreamHandler per re-run of this cell and print every
# message several times. Reuse the handler that is already attached, if any.
log = logging.getLogger("slda")
if log.handlers:
    hdl = log.handlers[0]
else:
    hdl = logging.StreamHandler()
    log.addHandler(hdl)
hdl.setLevel(logging.DEBUG)
hdl.setFormatter(logging.Formatter("%(message)s"))
log.setLevel(logging.DEBUG)

# Data

In [None]:
# Sample manifest. Its "level_0" and "index" columns are used below as the
# column labels of the miRNA table and of the FPKM table respectively.
df_allfiles = pd.read_csv("../miRNA/files.dat", index_col=0)
# Preview the first two identifiers of each kind.
df_allfiles["level_0"].head(2), df_allfiles["index"].head(2)

# Process miRNA

In [None]:
# miRNA table, with columns selected and ordered to follow the manifest's
# "level_0" identifiers (identifiers missing from the CSV become NaN columns).
df_mirna = (
    pd.read_csv("../miRNA/mainTable_miRNA.csv", index_col=0)
    .reindex(columns=df_allfiles["level_0"].values)
)

In [None]:
# Transposed miRNA table as a plain array: one row per manifest entry,
# one column per row of df_mirna.
X = df_mirna.T.values

In [None]:
# Fit LDA on the miRNA matrix and keep the per-row topic mixture.
# n_components is spelled out (10 is also the library default) because later
# cells hard-code ten topic columns. random_state pins the otherwise random
# initialisation so that `topics`, and the labels written from it, are
# reproducible across kernel restarts.
lda = LatentDirichletAllocation(n_components=10, n_jobs=12, random_state=0)
topics = lda.fit_transform(X)

In [None]:
# Sanity check on the fitted mixture; expected to be (rows of X, number of topics).
topics.shape

In [None]:
# Write one integer label per row of `topics`: the position of its largest entry.
np.savetxt("train-label.txt", np.argmax(topics, axis=1), fmt="%d")

## Save data

In [None]:
# FPKM table, with columns selected and ordered by the manifest's "index" identifiers.
df = (
    pd.read_csv("../miRNA/mainTable_fpkm.csv", index_col=0)
    .reindex(columns=df_allfiles["index"])
)
# Per-sample metadata, with rows aligned to the columns of `df`.
df_files = (
    pd.read_csv("../miRNA/files_fpkm.dat", index_col=0)
    .reindex(index=df.columns)
)

In [None]:
# Wrap the transposed FPKM table (columns of `df` become observations) together
# with the per-sample metadata, then flag the 3000 most variable features.
# The flag is read back from adata.var["highly_variable"] in the next step.
adata = sc.AnnData(X=df.T, obs=df_files)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=3000)

In [None]:
# Keep only the rows flagged as highly variable in adata.var and apply
# log2(FPKM + 1).
# The transform is vectorised with NumPy rather than DataFrame.applymap, which
# is deprecated in recent pandas and calls a Python lambda once per cell.
# NOTE: `df` is overwritten with its own transform, so running this cell twice
# log-transforms the data twice. Re-run the cell that loads `df` first.
df = np.log2(df.reindex(index=adata.var[adata.var["highly_variable"]].index) + 1)
df.head(2)

In [None]:
# Write one line per column of `df`:
#   "<number of positive entries> <row position>:<rounded value> ... \n"
# Only strictly positive rounded values are listed. Every line, including one
# with no tokens, ends with a single space before the newline.
with open("train-data.txt", "w") as file:
    for sample in df.columns:
        doc = df[sample].round().astype(int)
        tokens = [
            f"{position}:{count}"
            for position, (_, count) in enumerate(doc.items())
            if count > 0
        ]
        header = f"{len(doc[doc>0])}"
        file.write(" ".join([header, *tokens]) + " \n")

# Postprocess

In [None]:
# Load the space-separated final.gamma matrix, one row per manifest entry, and
# label its columns "Topic 1" .. "Topic K".
df_topics = pd.read_csv("mirna/final.gamma", sep=" ", header=None)
df_topics.index = df_allfiles.index
n_topic_columns = len(df_topics.columns)
df_topics.columns = ["Topic %d" % k for k in range(1, n_topic_columns + 1)]
df_topics.head(2)

In [None]:
# Zero-initialised counts table: one row per row of `df`, one column per topic.
# The frame is built directly from the scalar 0 so it has an integer dtype from
# the start. The previous form created an all-NaN object frame and relied on
# .fillna(0) silently downcasting it, which recent pandas deprecates.
# The 10 must match the number of topics of the fitted model.
df_word_dist = pd.DataFrame(0, index=df.index, columns=["Topic %d" % (t + 1) for t in range(10)])
df_word_dist.head(2)

In [None]:
def assign_word(line, sample):
    """Turn one parsed document line into a Series indexed like df_word_dist.

    Parameters
    ----------
    line : iterable of str
        Tokens of the form "<position>:<value>", where <position> is an integer
        offset into df_word_dist.index.
    sample : hashable
        Name given to the returned Series.

    Returns
    -------
    pandas.Series
        Object-dtype Series holding int(<value>) at each referenced position and
        NaN everywhere else.

    Raises
    ------
    ValueError
        If a token does not split into exactly two ":"-separated integer fields.
    """
    labels = df_word_dist.index
    assignments = pd.Series(name=sample, index=labels, dtype=object)
    for position, value in (token.split(":") for token in line):
        assignments.at[labels[int(position)]] = int(value)
    return assignments
    
def assign_doc(sample):
    """Append one named Series as a new column of the global df_word_dist_temp.

    The column is outer-joined on the index, so labels present on only one side
    are kept and padded with NaN. The global is rebound to the joined frame.
    """
    global df_word_dist_temp
    merged = df_word_dist_temp.join(sample, how="outer")
    df_word_dist_temp = merged

In [None]:
# Rebuild a (word x sample) table from word-assignments.dat: every non-blank
# line is one document, its first field is skipped and the remaining
# "<position>:<value>" tokens are parsed by assign_word in worker processes.
# assign_doc then joins each parsed document into df_word_dist_temp.
#
# Documents are paired with the sample names in df.columns, which is the order
# in which train-data.txt was written. They were previously paired with
# df_word_dist.columns (the topic labels), which silently truncated the input to
# the first ten documents and named them after topics.
# NOTE(review): assumes word-assignments.dat has one line per document of
# train-data.txt, in the same order. Confirm against the tool that produced it.
df_word_dist_temp = pd.DataFrame()
with open("mirna/word-assignments.dat") as file:
    # Drop blank lines, e.g. the empty string left after a trailing newline.
    lines = [line for line in file.read().split("\n") if line.strip()]
log.debug(len(lines))
if lines:
    log.debug(lines[0][:50])
if len(lines) != len(df.columns):
    log.warning(
        "word-assignments.dat has %d documents but %d samples are expected",
        len(lines),
        len(df.columns),
    )

# The pool is created only after the file was read, so a missing file cannot
# leak worker processes. The context manager terminates the pool on any error.
with mp.Pool(6) as pool:
    async_results = [
        pool.apply_async(
            assign_word,
            args=(line.split(" ")[1:], sample),
            callback=assign_doc,
            error_callback=log.error,
        )
        for line, sample in zip(lines, df.columns)
    ]
    pool.close()
    # Wait for all workers and for every callback to be delivered.
    pool.join()

# Align rows with the counts table and columns with the sample order.
df_word_dist_temp = df_word_dist_temp.reindex(index=df_word_dist.index, columns=df.columns)

In [None]:
# For every row of df_word_dist_temp, count how often each non-missing value
# occurs and store that count in df_word_dist, using the value as the
# positional index of the target column.
for g, row in df_word_dist_temp.iterrows():
    values, counts = np.unique(row[~row.isna()], return_counts=True)
    for t, c in zip(values, counts):
        df_word_dist.at[g, df_word_dist.columns[t]] = c

In [None]:
# Divide every column by its own total so that each column sums to 1.
column_totals = df_word_dist.sum(axis=0)
df_word_dist = df_word_dist.div(column_totals, axis=1)

In [None]:
# Print the row labels whose weight in the chosen topic lies strictly above
# that topic's 95th percentile.
topic = "Topic 4"
topic_weights = df_word_dist[topic]
cutoff = topic_weights.quantile(0.95)
for g in topic_weights[topic_weights > cutoff].index:
    print(g)

In [None]:
# Transposed LDA component matrix, one row per row of df_mirna and one column
# per "Metadatum", with every column rescaled to sum to 1.
metadatum_labels = ["Metadatum %d" % (m + 1) for m in range(topics.shape[1])]
df_keyword_dist = pd.DataFrame(data=lda.components_.T, index=df_mirna.index, columns=metadatum_labels)
df_keyword_dist = df_keyword_dist.div(df_keyword_dist.sum(axis=0), axis=1)
df_keyword_dist.head(2)

In [None]:
# Topic mixture returned by the LDA fit, labelled with the manifest index
# (rows) and "Metadatum k" (columns).
metadatum_columns = ["Metadatum %d" % (m + 1) for m in range(topics.shape[1])]
df_metadata = pd.DataFrame(data=topics, index=df_allfiles.index, columns=metadatum_columns)
df_metadata.head(2)

## Distinctiveness

In [None]:
import tensorflow as tf
from distinctivness_helper import get_distinctivness

In [None]:
# Pass the transposed keyword distribution to the helper as a tensor and bring
# the result back as a NumPy array.
keyword_tensor = tf.convert_to_tensor(df_keyword_dist.T.values)
out = get_distinctivness(keyword_tensor).numpy()
out.shape

In [None]:
# Label the helper's output with the same rows and columns as df_keyword_dist.
# NOTE(review): this requires `out` to have the same shape as df_keyword_dist;
# confirm against the helper.
row_labels, column_labels = df_keyword_dist.index, df_keyword_dist.columns
df_D = pd.DataFrame(data=out, index=row_labels, columns=column_labels)
df_D.head(2)

In [None]:
# Print the 30 row labels with the largest value in the "Metadatum 10" column.
ranked = df_D["Metadatum 10"].sort_values(ascending=False)
for g in ranked.head(30).index:
    print(g)