In [1]:
%load_ext watermark
%watermark  -a Filippo_Valle -v -m -g -r -v -p pandas,numpy,graph_tool,cloudpickle,regex,topicpy,scanpy,matplotlib,plotly

Author: Filippo_Valle

Python implementation: CPython
Python version       : 3.8.8
IPython version      : 7.22.0

pandas     : 1.2.3
numpy      : 1.19.0
graph_tool : 2.37 (commit afba9459, )
cloudpickle: 1.6.0
regex      : 2021.4.4
topicpy    : 0.2.1
scanpy     : 1.7.1
matplotlib : 3.4.1
plotly     : 4.14.3

Compiler    : GCC 9.3.0
OS          : Linux
Release     : 5.8.0-50-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 12
Architecture: 64bit

Git hash: aec0cb27d23e921cf53771b154b07fadbbd6854a

Git repo: git@github.com:fvalle1/epj.git



In [2]:
import pandas as pd
import numpy as np
import graph_tool.all as gt
import sys
sys.path.append("../../hSBM_Topicmodel/")
sys.path.append("../../trisbm/")

from sbmtm import sbmtm
from trisbm import trisbm
import scanpy as sc
import cloudpickle as pickle

In [None]:
# FPKM table: the first CSV column becomes the row index; the remaining
# columns are matched against df_files["file_fpkm"] further down.
df = pd.read_csv(
    "mainTable_fpkm.csv",
    index_col=0,
)

In [None]:
# Second feature table, indexed by its first CSV column.
# NOTE(review): the variable is called `df_mirna` but the file is named
# "mainTable_cnv.csv" — confirm which data type this is meant to hold.
df_mirna = pd.read_csv(
    "mainTable_cnv.csv",
    index_col=0,
)

In [None]:
# Per-file metadata. No index column is set here, so "file_fpkm",
# "file_mirna" and "cases.0.submitter_id" remain ordinary columns.
df_files = pd.read_csv(
    "tcga/files.dat",
)

In [None]:
# Keep only the columns (files) that are listed in the metadata table.
fpkm_in_metadata = df.columns.isin(df_files["file_fpkm"])
mirna_in_metadata = df_mirna.columns.isin(df_files["file_mirna"])

df = df.loc[:, fpkm_in_metadata]
df_mirna = df_mirna.loc[:, mirna_in_metadata]

In [None]:
# Lookup tables: file name -> case submitter id, one per data type.
case_by_fpkm_file = df_files.set_index("file_fpkm")["cases.0.submitter_id"]
case_by_mirna_file = df_files.set_index("file_mirna")["cases.0.submitter_id"]

# Relabel the columns of both tables from file names to case ids ...
df.columns = case_by_fpkm_file.reindex(index=df.columns)
df_mirna.columns = case_by_mirna_file.reindex(index=df_mirna.columns)

# ... and put the second table's columns in the same order as the first.
df_mirna = df_mirna.reindex(columns=df.columns)

In [None]:
# Key the metadata by case submitter id from here on.
# NOTE: not re-runnable as-is — the column is consumed by set_index, so a
# second execution raises KeyError.
df_files = df_files.set_index("cases.0.submitter_id")

In [None]:
# Dump the case ids (the metadata index) to a text file, one per line,
# without a trailing newline.
with open("cases.txt", "w") as cases_file:
    case_lines = "\n".join(df_files.index)
    cases_file.write(case_lines)

## Highly variable genes (HVG)

In [None]:
# One observation per case: the table is transposed and its rows are put in
# the same order as the metadata index, which is attached as `obs`.
adata = sc.AnnData(
    obs=df_files,
    X=df.T.reindex(index=df_files.index),
)

In [None]:
N_TOP_GENES = 3000  # how many genes to flag as highly variable

# None of the return values are captured: the calls are relied on to update
# `adata` in place (the "highly_variable" column of adata.var is read next).
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=N_TOP_GENES)
sc.pl.highly_variable_genes(adata)

In [None]:
# Index labels of the rows of adata.var flagged as highly variable.
hvg = adata.var.loc[adata.var["highly_variable"].eq(True)].index

## Highly variable miRNA

In [None]:
# Same construction as for the gene table, now on `df_mirna`.
# Rebinds `adata`: the gene-level object is no longer reachable by that name.
adata = sc.AnnData(
    obs=df_files,
    X=df_mirna.T.reindex(index=df_files.index),
)

In [None]:
N_TOP_MIRNA = 1000  # how many features of the second table to flag as highly variable

# None of the return values are captured: the calls are relied on to update
# `adata` in place (the "highly_variable" column of adata.var is read next).
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=N_TOP_MIRNA)
sc.pl.highly_variable_genes(adata)

In [None]:
# De-duplicated index labels of the rows of adata.var flagged as highly variable.
hvmiRNA = adata.var.loc[adata.var["highly_variable"].eq(True)].index.unique()

# hSBM

In [None]:
# Create the sbmtm model object; its graph is built from the gene table below.
hsbm = sbmtm()

In [None]:
# Restrict the gene table to the highly variable genes and build the graph
# from it; the graph object is then shown as the cell output.
hvg_table = df.reindex(index=hvg)
hsbm.make_graph_from_BoW_df(hvg_table)
hsbm.g

In [None]:
# Write the hSBM graph to "graph_sbmtm.xml.gz" in the working directory.
hsbm.save_graph("graph_sbmtm.xml.gz")

## triSBM

In [None]:
# Stack the selected rows of both tables into a single frame (gene rows
# first). `DataFrame.append` is deprecated and removed in pandas 2.0;
# `pd.concat` with its defaults produces the same result.
df_all = pd.concat(
    [
        df.reindex(index=hvg),
        df_mirna.reindex(index=hvmiRNA),
    ]
)

In [None]:
def _row_kind(gene):
    """Return 1 for row labels containing "ENSG", 2 for every other label."""
    if "ENSG" in gene:
        return 1
    return 2


# Build the graph from the stacked table; `_row_kind` tells make_graph which
# of the two kinds each row belongs to.
trisbm_model = trisbm()
trisbm_model.make_graph(df_all, _row_kind)

In [None]:
# Show the graph object as the cell output for a quick visual check.
trisbm_model.g

In [None]:
# Write the triSBM graph to "graph_trisbm.xml.gz" in the working directory.
trisbm_model.save_graph("graph_trisbm.xml.gz")

In [None]:
# Re-key "files.dat" on the case submitter id, overwriting the file on disk.
# The previous first column (used as index on load) is dropped by set_index.
# NOTE(review): other cells read "tcga/files.dat"; this one uses "files.dat"
# relative to the working directory — confirm the intended path.
files_on_disk = pd.read_csv("files.dat", index_col=0)
if "cases.0.submitter_id" in files_on_disk.columns:
    files_on_disk.set_index("cases.0.submitter_id").to_csv("files.dat")
elif files_on_disk.index.name != "cases.0.submitter_id":
    # Neither a column nor the index: the file is not what this cell expects.
    raise KeyError("cases.0.submitter_id")
# Otherwise the file is already keyed by case id (the cell was run before),
# so re-running is a no-op instead of a KeyError.

## Benchmark

In [None]:
from topicpy.hsbmpy import get_scores, get_scores_shuffled, add_score_lines, normalise_score
import matplotlib.pyplot as plt
import pandas as pd
import os

In [None]:
labels = ["cases.0.project.primary_site"]
site_label = labels[0]

# triSBM scores; the same entry is also exposed under the key "trisbm".
scores = get_scores("tcga_key", labels, algorithm="trisbm", verbose=False)
scores["trisbm"] = scores[site_label]

# hSBM scores computed from the "tcga" run.
hsbm_scores = get_scores("tcga", labels, algorithm="topsbm", verbose=False)
scores["hsbm"] = hsbm_scores[site_label]

# Shuffled baseline; every entry is then normalised against it by division.
files_table = pd.read_csv("tcga/files.dat", sep=",", index_col=0)
scores["shuffle"] = get_scores_shuffled(
    "tcga_key",
    files_table,
    label=site_label,
    algorithm="trisbm",
)
normalise_score(scores, base_algorithm="shuffle", operation=lambda x, y: x / y)

In [None]:
# Normalised score curves for hSBM, triSBM and the shuffled baseline.
fig = plt.figure(figsize=(18, 15))
ax = fig.subplots(1)
add_score_lines(ax, scores, labels=["hsbm", "trisbm", "shuffle"], V="norm_V", alpha=1)
ax.set_xscale("log")

# Leave 10% headroom above the largest value across all score entries.
ax.set_ylim(0, max(max(s["norm_V"]) for s in scores.values()) * 1.1)
# A log-scaled axis cannot start at 0 (matplotlib warns and ignores such a
# limit), so only the upper bound is set and the lower one is left as is.
ax.set_xlim(right=max(max(s["xl"]) for s in scores.values()) * 1.1)

# Save before showing so the file is written regardless of how the active
# backend treats the figure once it has been shown.
fig.savefig("metric_scores.pdf")
plt.show()

In [None]:
import plotly.graph_objects as go

In [None]:
import cloudpickle as pickle 

# SECURITY: unpickling can execute arbitrary code — only load pickle files
# that were produced by a trusted run of this project.
# Rebinds `hsbm` and `trisbm_model` to the objects stored on disk.
with open("sbmtm.pkl", "rb") as sbmtm_pickle:
    hsbm = pickle.load(sbmtm_pickle)

with open("trisbm.pkl", "rb") as trisbm_pickle:
    trisbm_model = pickle.load(trisbm_pickle)

In [None]:
# One bar per model: the value of get_mdl() divided by the number of edges
# of the model's graph.
fig = go.Figure()
fig.add_traces([
    go.Bar(y=[hsbm.get_mdl() / hsbm.g.num_edges()], name="hSBM"),
    go.Bar(y=[trisbm_model.get_mdl() / trisbm_model.g.num_edges()], name="triSBM"),
])

titlefont = {
    "size": 30
}

tickfont = {
    "size": 25
}

# Axis titles use the `title.text` / `title.font` form; the flat `titlefont`
# axis attribute is deprecated in plotly.
layout = {
    "title": "TCGA dataset",
    "xaxis": {
        "title": {"text": "Resolution", "font": titlefont},
        "tickfont": tickfont
    },
    "yaxis": {
        "title": {"text": "∑", "font": titlefont},
        "type": "log",
        #"range": [10e3,20e3],
        "tickfont": tickfont
    },
    "legend": {
        "font": {"size": 35}
    }
}

fig.update_layout(layout)
#fig.write_image("metric_entropies_bar.pdf")

In [None]:
# Bars of the normalised score ("norm_V") for hSBM and triSBM.
fig = go.Figure()
fig.add_traces([
    go.Bar(y=scores["hsbm"]["norm_V"], name="hSBM"),
    go.Bar(y=scores["trisbm"]["norm_V"], name="triSBM")
])

titlefont = {
    "size": 30
}

tickfont = {
    "size": 25
}

# Axis titles use the `title.text` / `title.font` form; the flat `titlefont`
# axis attribute is deprecated in plotly.
layout = {
    "xaxis": {
        "title": {"text": "Resolution", "font": titlefont},
        "tickfont": tickfont
    },
    "yaxis": {
        "title": {"text": "NMI/NMI*", "font": titlefont},
        "tickfont": tickfont
    },
    "legend": {
        "font": {"size": 35}
    }
}

fig.update_layout(layout)
#fig.write_image("metric_scores_bar.pdf")