# Process data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the two expression tables (FPKM and miRNA, going by the file names);
# the first CSV column becomes the row index of each frame.
df_messanger, df_miRNA = (
    pd.read_csv(table_path, index_col=0)
    for table_path in ("mainTable_fpkm.csv", "mainTable_miRNA.csv")
)

In [None]:
# Manifest table; later cells use its "file_fpkm", "file_miRNA" and
# "cases.0.submitter_id" columns to map file names to case identifiers.
df_files = pd.read_csv(filepath_or_buffer="files_manifest.dat")

In [None]:
# Keep only the first manifest row per FPKM file, then per miRNA file, so
# each file name can later serve as a unique index label.
for key_column in ("file_fpkm", "file_miRNA"):
    df_files = df_files.drop_duplicates(subset=[key_column], keep="first")

In [None]:
# Index the manifest by FPKM file name and order it like the FPKM columns.
df_files = df_files.set_index("file_fpkm")
df_files = df_files.reindex(index=df_messanger.columns)
# Files absent from the manifest become all-NaN rows after reindexing; drop them.
df_files = df_files.dropna(how="all", axis=0)

# Keep only the FPKM columns that have a manifest entry, then relabel the
# columns from file names to case submitter ids.
df_messanger = df_messanger.reindex(columns=df_files.index)
df_messanger.columns = df_files["cases.0.submitter_id"]
df_messanger.head(2)

In [None]:
# Re-key the manifest by miRNA file name and order it like the miRNA columns.
df_files = df_files.reset_index()
df_files = df_files.set_index("file_miRNA")
df_files = df_files.reindex(index=df_miRNA.columns)
# Files absent from the manifest become all-NaN rows after reindexing; drop them.
df_files = df_files.dropna(how="all", axis=0)

# Keep only the miRNA columns that have a manifest entry, then relabel the
# columns from file names to case submitter ids.
df_miRNA = df_miRNA.reindex(columns=df_files.index)
df_miRNA.columns = df_files["cases.0.submitter_id"]
df_miRNA.head(2)

In [None]:
# Stack the miRNA rows under the FPKM rows. The miRNA table is first
# reindexed to the FPKM column labels so both blocks share identical columns
# (labels missing from the miRNA table become NaN columns).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df_messanger, df_miRNA.reindex(columns=df_messanger.columns)])
df.to_csv("mainTable_all.csv", index=True)

In [None]:
# Display the combined table as the cell output.
df

In [None]:
# Move the current index back into a column, then key the manifest by case id.
df_files = df_files.reset_index()
df_files = df_files.set_index("cases.0.submitter_id")

In [None]:
# Key the manifest by the aliquot submitter id column; the join with the
# biolinks table is done on this index.
aliquot_column = "cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id.1"
df_files = df_files.reset_index()
df_files = df_files.set_index(aliquot_column)

In [None]:
# Load the biolinks annotation table, keep only the rows whose "cancer.type"
# is BRCA, and index the result by "pan.samplesID".
df_biolinks = (
    pd.read_csv("TCGA_biolinks.csv", index_col=0)
    .loc[lambda table: table["cancer.type"] == "BRCA"]
    .set_index("pan.samplesID")
)

In [None]:
# Attach the biolinks annotations (index-on-index join) and drop columns
# that carry no value for any row.
annotated = df_files.join(df_biolinks)
annotated = annotated.dropna(how="all", axis=1)

# Back to one row per case id, ordered like the columns of the combined
# table; cases without any manifest information are removed.
annotated = annotated.reset_index().set_index("cases.0.submitter_id")
df_files = annotated.reindex(index=df.columns).dropna(how="all", axis=0)

df_files.to_csv("files.dat")
df_files.head(2)

# Filter genes
## Highly variable genes (HVG)

In [None]:
import scanpy as sc

In [None]:
# Restrict the combined table to the annotated samples and drop samples with
# no values at all; AnnData gets samples as rows, hence the transpose.
expression = df.reindex(columns=df_files.index).dropna(how="all", axis=1)
adata = sc.AnnData(X=expression.T, obs=df_files)

In [None]:
# Log-transform the matrix in place (copy=False), then let scanpy flag the
# highly variable genes; the flag is read back later from
# adata.var['highly_variable'].
sc.pp.log1p(adata, copy=False)

hvg_settings = {"n_top_genes": 3000, "n_bins": 50}
sc.pp.highly_variable_genes(adata, **hvg_settings)

In [None]:
# Plot the highly-variable-gene diagnostics on linear axes (log=False).
# `save='hvg.pdf'` asks scanpy to also write the figure to disk; the exact
# output path presumably depends on scanpy's figure settings -- verify.
sc.pl.highly_variable_genes(adata, log=False, save='hvg.pdf')

In [None]:
# Labels of the genes flagged as highly variable, and of all samples in adata.
is_highly_variable = adata.var['highly_variable'].eq(True)
hvg = adata.var.loc[is_highly_variable].index
samples = adata.obs.index

In [None]:
# Display the index of highly variable genes as the cell output.
hvg

In [None]:
# Sub-table restricted to the highly variable genes (rows) and the samples
# present in adata (columns), written out as CSV.
df_hv = df.reindex(index=hvg, columns=samples)
df_hv.to_csv("mainTable_hv.csv")

# Make Graph

In [None]:
import graph_tool.all as gt

In [None]:
# Undirected graph with three internal property maps:
#   vertex "name"  -> label taken from df (column label or row label)
#   vertex "kind"  -> integer vertex type (0 for columns, 1/2 for rows)
#   edge   "count" -> float weight taken from the table cell
g = gt.Graph(directed=False)

name = g.new_vp("string")
g.vp["name"] = name

kind = g.new_vp("int")
g.vp["kind"] = kind

weight = g.new_ep("float")
g.ep["count"] = weight

In [None]:
# One vertex of kind 0 per column of df, added in column order so that
# vertex i corresponds to column i.
for column_label in df.columns:
    doc_vertex = g.add_vertex()
    name[doc_vertex], kind[doc_vertex] = column_label, 0

In [None]:
# One vertex per row of df, added in row order after the column vertices.
# Rows whose label contains "ENSG" get kind 1, every other row gets kind 2.
for row_label in df.index:
    word_vertex = g.add_vertex()
    name[word_vertex] = row_label
    if "ENSG" in row_label:
        kind[word_vertex] = 1
    else:
        kind[word_vertex] = 2

In [None]:
# Number of column ("document") vertices; row ("word") vertices were added
# after them, so row i sits at vertex index D + i.
D = df.shape[1]

# Connect every column vertex to every row vertex and store the matching
# table cell as the edge weight (zero-valued cells get an edge too).
for doc_vertex, column_label in enumerate(df.columns):
    column_values = df[column_label]
    for word_vertex, row_label in enumerate(df.index, start=D):
        edge = g.add_edge(doc_vertex, word_vertex)
        weight[edge] = column_values[row_label]

In [None]:
# Write the graph to disk; the format is presumably inferred from the
# ".xml.gz" extension -- verify against the graph-tool version in use.
g.save("graph_all.xml.gz")

In [None]:
# The vertex "kind" map is used both as the constraint label (clabel) and as
# the partition constraint label (pclabel); the edge "count" map is passed
# as the edge weight.
# NOTE(review): "count" is a float map filled from table values; confirm that
# the block-state `eweight` argument accepts non-integer weights.
clabel = g.vp['kind']
state_args = {
    'clabel': clabel,
    'pclabel': clabel,
    'eweight': g.ep["count"],
}

In [None]:
# NOTE(review): the deg_corr / overlap / mcmc_* keyword arguments look like
# the signature of older graph-tool releases -- confirm against the
# installed version before re-running.


def _mcmc_options():
    """Return a fresh MCMC option dict (non-sequential sweeps)."""
    return {'sequential': False}


def _equilibrate_options():
    """Return fresh equilibration options wrapping the MCMC options."""
    return {'mcmc_args': _mcmc_options()}


# Every stage receives its own dict instance, exactly as if each one had
# been written out as a separate literal.
state = gt.minimize_nested_blockmodel_dl(
    g,
    deg_corr=True,
    overlap=False,
    state_args=state_args,
    mcmc_args=_mcmc_options(),
    mcmc_equilibrate_args=_equilibrate_options(),
    mcmc_multilevel_args={
        'mcmc_equilibrate_args': _equilibrate_options(),
        'anneal_args': {
            'mcmc_equilibrate_args': _equilibrate_options(),
        },
    },
)

## sbmtm

In [None]:
import sys
# Make the directory importable; the `sbmtm` module is imported from it in
# the next cell.
sbmtm_dir = "../hSBM_Topicmodel/"
sys.path.append(sbmtm_dir)

In [None]:
from sbmtm import sbmtm

In [None]:
# Table restricted to the highly variable genes; samples (columns) with any
# missing value are dropped before the graph is built.
hv_table = df.reindex(index=hvg).dropna(how="any", axis=1)

model = sbmtm()
model.make_graph_from_BoW_df(hv_table)
model.save_graph("graph_hv.xml.gz")

### Log-transformed

In [None]:
# Same highly-variable-gene table as the non-log graph (samples with any
# missing value are dropped), transformed as log(1 + value).
# np.log1p is applied to the whole frame at once: it replaces the
# per-element `applymap(lambda ...)` call, which is deprecated in recent
# pandas and slow on large tables, and it is more accurate for values near 0.
hv_log_table = np.log1p(df.reindex(index=hvg).dropna(how="any", axis=1))

model = sbmtm()
model.make_graph_from_BoW_df(hv_log_table)
model.save_graph("graph_hv_log.xml.gz")

In [None]:
# Fit the sbmtm model currently bound to `model` (the most recently built
# graph), using the library's default settings.
model.fit()