In [None]:
# Colab-only: mount Google Drive at /content/drive so later cells can read and
# write files stored there.
from google.colab import drive
drive.mount('/content/drive')

# Preprocessing and run hSBM

In [None]:
# import some libraries
import pandas as pd
import numpy as np
import os, sys

In [None]:
#install scanpy
!pip install scanpy > /dev/null $2>&1
import scanpy as sc

In [None]:
#install graph-tool
!echo "deb http://downloads.skewed.de/apt/ bionic main" >> /etc/apt/sources.list
!apt-key adv --keyserver keys.openpgp.org --recv-key 612DEFB798507F25
!apt-get update > /dev/null $2>&1
!apt-get install python3-graph-tool python3-cairo > /dev/null $2>&1
sys.path.append('/content/drive/My Drive/phd/hSBM_Topicmodel/')

In [None]:
# Work inside the dataset folder: every relative path used below
# (CSV tables, files.dat, graphs) is resolved against this directory.
os.chdir('/content/drive/My Drive/phd/topics/datasets/gtexrandom')

# Download data or get the data

## Use already downloaded (example from TCGA) data

In [None]:
# Expression table: rows are genes, columns are samples.
df = pd.read_csv("mainTable_all.csv", index_col=0)
# Keep only the first 15 characters of each gene id (drops any version suffix).
df.index = [g[:15] for g in df.index]
# Truncation can create repeated gene ids: keep the first row for each id.
# (The previous `drop_duplicates(subset=df.index.name)` did not do this: after
# the index is replaced by a plain list its name is None, so rows were compared
# by *values* and genes with identical expression profiles were dropped.)
df = df[~df.index.duplicated(keep="first")]
# Alternative metadata source (GTEx sample attributes):
#df_files=pd.read_csv("https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt", sep='\t', index_col=0)
df_files = pd.read_csv("files.dat", sep=",", index_col=0)
# Keep only samples that are present in both the expression table and the metadata.
df_files = df_files[df_files.index.isin(df.columns)]
df = df.reindex(columns=df.columns[df.columns.isin(df_files.index)])
print(df.shape)
print(df_files["cancer.type"].unique())
df_files.head(2)

## Import from GTEx

This step downloads a large dataset and may take some time.

In [None]:
# Remote resources used by this cell.
tpm_url = 'https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz'
gene_list_url = "http://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt"
annotations_url = "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"

# Gene-level TPM table; the first two lines of the file are skipped.
df = pd.read_csv(tpm_url, skiprows=2, sep='\t', index_col=0)
# Keep only the first 15 characters of each gene id.
df.index = [gene_id[:15] for gene_id in df.index]
# Restrict to the genes listed in the reference gene list.
kept_genes = pd.read_csv(gene_list_url, header=None).values.ravel()
df = df[df.index.isin(kept_genes)]
# Sample annotations, limited to samples that appear as columns of the table.
df_files = pd.read_csv(annotations_url, sep='\t', index_col=0)
df_files = df_files[df_files.index.isin(df.columns)]
df_files.head()

In [None]:
# Cache the sample annotations locally as files.dat.
df_files.to_csv("files.dat")

In [None]:
# Keep only the expression columns that have an annotation row.
annotated_columns = df.columns[df.columns.isin(df_files.index)]
df = df[annotated_columns]
df.head(2)

### Sample columns
We pick samples at random: $100$ samples from each of the 10 most represented tissues.

In [None]:
# Seeded generator shared by all draws, so the selection is reproducible.
rs = np.random.RandomState(seed=314)
# The 10 tissues (SMTS) with the most samples, ranked by their SMTSD count.
top_sites = df_files.groupby('SMTS').count().sort_values('SMTSD',ascending=False).index[:10]
# Draw 100 samples per tissue and concatenate once.
# (`DataFrame.append` was removed in pandas 2.0 and copied the frame on every
# iteration; the draws still consume `rs` in the same tissue order.)
samples = pd.concat(
  [df_files[df_files['SMTS']==site].sample(100, random_state=rs) for site in top_sites]
)

In [None]:
# Check the sampling: count the selected samples per tissue (SMTS),
# most represented first.
samples.groupby('SMTS').count().sort_values('SMTSD',ascending=False)

In [None]:
# Save the expression table restricted to the sampled columns.
df[samples.index].to_csv("mainTable_tpm_all_seed314.csv")

In [None]:
# Write the sample annotations to files.dat.
# NOTE(review): this saves the full `df_files`, not the sampled `samples` subset -- confirm that is intended.
df_files.to_csv("files.dat")

# Gene selections

## Select Highly Variable Genes

We use scanpy to select highly variable genes (HVG).

In [None]:
# Annotation rows for the samples present in the expression table, and all genes.
has_expression = df_files.index.isin(df.columns)
samples = df_files[has_expression]
genes = df.index
print(len(samples), len(genes))

In [None]:
# AnnData expects observations (samples) on rows, so the genes x samples table is transposed.
expression = df.reindex(index=genes, columns=samples.index).transpose()
adata = sc.AnnData(X=expression, obs=samples)

In [None]:
# Log-transform the data stored in `adata` (copy=False: no copy is returned),
# then annotate the 3000 top highly variable genes using 50 bins.
sc.pp.log1p(adata, copy=False)
sc.pp.highly_variable_genes(adata, n_top_genes=3000, n_bins=50)

In [None]:
# Plot the highly-variable-gene selection; `save` also asks scanpy to write the figure to a file.
sc.pl.highly_variable_genes(adata, log=False, save='hvg.pdf')

In [None]:
# Ids of the genes flagged as highly variable, and ids of all samples in `adata`.
is_hvg = adata.var['highly_variable']==True
hvg = adata.var[is_hvg].index
# From here on `samples` holds an Index of sample ids, no longer the metadata frame.
samples = adata.obs.index

Save a new table with only hvg

In [None]:
from google.colab import files

# Write the table restricted to highly variable genes, then download it through the browser.
file_name = "mainTable_fpkm_all_hv.csv"
hvg_table = df.reindex(index=hvg, columns=samples)
hvg_table.to_csv(file_name)
files.download(file_name)

In [None]:
# Save the sample annotations and download them as well.
# (`files` is the google.colab helper imported in the previous cell.)
df_files.to_csv("files.dat")
files.download("files.dat")

## Select HouseKeeping

We download [Human House Keeping genes](https://www.cell.com/trends/genetics/fulltext/S0168-9525(13)00089-9?_returnURL=https%3A%2F%2Flinkinghub.elsevier.com%2Fretrieve%2Fpii%2FS0168952513000899%3Fshowall%3Dtrue) from [https://www.tau.ac.il/~elieis/HKG/](https://www.tau.ac.il/~elieis/HKG/HK_exons.xlsx)

In [None]:
# Unique gene names listed in the housekeeping spreadsheet.
hk_exons = pd.read_excel("HK_exons.xlsx")
hk = hk_exons["Gene Name"].unique()

Here we use [https://www.genenames.org](https://www.genenames.org) to convert gene names to Ensembl IDs.

In [None]:
# Custom download from genenames.org: a tab-separated table whose requested
# columns include the approved symbol and the Ensembl id for each entry.
hgcn_url = "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=md_ensembl_id&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_name&format=text&submit=submit"
df_conversion=pd.read_csv(hgcn_url, sep="\t")

In [None]:
# Ensembl ids of housekeeping genes that are also rows of the expression table.
ensembl_col = "Ensembl ID(supplied by Ensembl)"
is_housekeeping = df_conversion["Approved symbol"].isin(hk)
is_in_table = df_conversion[ensembl_col].isin(df.index)
df_hk_ensg = df_conversion.loc[is_housekeeping & is_in_table, ensembl_col].drop_duplicates().values

In [None]:
# Expression table restricted to the housekeeping genes, saved with gene ids as the index column.
df_hk = df.reindex(index=df_hk_ensg)
df_hk.to_csv("mainTable_hk.csv", index=True)

## Select DEG

In [None]:
# Read one gene id per line, skipping the first (header) line and cutting
# each entry at its newline.
with open("HDE_Lung.csv") as handle:
    hde = [line.split("\n")[0] for line in handle.readlines()[1:]]

In [None]:
# Rows of the expression table whose gene id is in the HDE list (table order is kept).
hde_ids = df.index[df.index.isin(hde)]
df_hde = df.reindex(index=hde_ids)
print(df_hde.shape)
df_hde.to_csv("mainTable_hde.csv", index=True)

## Select Random

In [None]:
# Draw 3000 rows (genes) at random; the fixed seed makes the draw reproducible
# and is reused below in the output file name.
seed = 0
rand_genes = df.sample(3000, axis=0, random_state=seed).index
rand_genes

In [None]:
# `samples` is rebound several times in this notebook: it is a metadata
# DataFrame/Series in some cells and a bare Index of sample ids after the HVG
# section. An Index has no `.index` attribute, so resolve the ids explicitly
# instead of assuming one of the two shapes.
sample_ids = samples.index if isinstance(samples, (pd.DataFrame, pd.Series)) else samples
# Keep the randomly drawn genes (in table order) and the selected samples.
df_rand = df.reindex(index=df.index[df.index.isin(rand_genes)], columns=sample_ids)
print(df_rand.shape)
print(df_rand.head(2))
df_rand.to_csv(f"mainTable_random_{seed}.csv", index=True)

# Run Stochastic Block Model
We run [stochastic block model](https://github.com/martingerlach/hSBM_Topicmodel/tree/develop)

In [None]:
# `sbmtm` is a project-local module, not installed by the cells above;
# the import relies on its folder having been added to sys.path.
from sbmtm import sbmtm

# create the model
model = sbmtm()

In [None]:
# Use the highly variable genes for the model; `samples` is left as it is
# (the former `samples = samples` self-assignment was a no-op and is removed).
genes = hvg
print(len(samples), len(genes))

### make a graph with the data


In [None]:
# Reload a previously saved gene-selection table and take genes/samples from it.
df_read = pd.read_csv("mainTable_random_0.csv", index_col=0)
genes, sample_ids = df_read.index, df_read.columns
# Empty Series indexed by sample id, so that `samples.index` works downstream.
samples = pd.Series(index=sample_ids, dtype=str)
print(len(samples), len(genes))
df_read.head(2)

In [None]:
# Select the chosen genes and samples from `df` and drop rows with missing values.
# Optional transform that can be chained before the graph is built:
#   .applymap(lambda tpm: np.log2(tpm+1))
bow = df.reindex(index=genes, columns=samples.index).dropna()
model.make_graph_from_BoW_df(bow)
model.save_graph("graph_random_00.xml.gz")
model.g

### load data
Use this only if you **already have a *graph.xml.gz*** file

In [None]:
# Load a previously saved graph into the model instead of rebuilding it,
# then display the graph object.
model.load_graph("graph_counts.xml.gz")
model.g

### load pretrained
Use this only if you **already have a *topsbm.pkl*** file with a trained model 

In [None]:
# Load a pretrained model.
# graph_tool (and NestedBlockState) are imported before unpickling;
# presumably the pickled model references these classes -- verify.
import graph_tool as gt
from graph_tool.inference.nested_blockmodel import NestedBlockState
import pickle

# SECURITY: pickle.load can execute arbitrary code; only open pickle files
# that you created yourself or otherwise trust.
with open("topsbm/topsbm.pkl", "rb") as file:
  model = pickle.load(file)

# Show the description length stored on the model. The attribute defined by
# sbmtm is `mdl`; the previous `model.mul` is not an sbmtm attribute and
# raises AttributeError.
model.mdl

## Run

In [None]:
# Output goes to <config>/topsbm, relative to the current working directory.
# NOTE: the directory change is relative, so re-running this cell without
# first returning to the dataset folder nests the output one level deeper.
config = "hv"
out_dir = os.path.join(config, "topsbm")
# Create the folder from Python rather than shelling out to `mkdir -p`:
# a failure now raises instead of being hidden in an ignored exit status.
os.makedirs(out_dir, exist_ok=True)
os.chdir(out_dir)

# Fit the model, then save the results into the output folder.
model.fit(n_init=5, parallel=True, verbose=True, B_min=5, B_max=500)
# Optional refinement of an already fitted model:
#model.multiflip_mcmc_sweep(n_steps=100, verbose=True)
model.save_data()

# check models
You can use these functions to inspect saved models.

Do not use unless needed

In [None]:
# Switch to the folder holding the tables inspected in the cells below.
os.chdir("/content/drive/My Drive/phd/datasets/gtex/gtexall")

In [None]:
# Load the table under inspection (this rebinds `df`) and print its summary.
df = pd.read_csv("mainTable.csv", index_col=0)
df.info()

In [None]:
# Percentage of table entries larger than 1e5.
n_large = (df>1e5).sum().sum()
n_rows, n_cols = df.shape
n_large/n_rows/n_cols*100

In [None]:
# Rebind `df` to the full table.
df = pd.read_csv("mainTable_all.csv", index_col=0)

In [None]:
# Move to the run folder to inspect (alternative location kept commented out)
# and show the resulting working directory.
#os.chdir("/content/drive/My Drive/phd/TOPSBM_TEST")
os.chdir("/content/drive/My Drive/phd/datasets/gtex/10")
os.getcwd()

In [None]:
import graph_tool as gt
import seaborn as sns
from sbmtm import sbmtm

In [None]:
import matplotlib.pyplot as plt
# Fresh, empty model; `load_and_print` below loads graphs into this global object.
model = sbmtm()

def _print_and_plot_counts(sbm, **kwargs):
  """Print a short summary of ``sbm`` and draw its edge-count heatmap.

  Shared implementation for ``load_and_print`` and ``load_trained_and_print``.

  Parameters
  ----------
  sbm : model object exposing ``g`` (a graph with a "count" edge property),
    ``words`` and ``documents``.
  **kwargs : forwarded unchanged to ``sns.heatmap``.

  Returns
  -------
  The dense array that was plotted.
  """
  print(sbm.g)
  print(len(sbm.words),len(sbm.documents))
  # Dense adjacency matrix weighted by the "count" edge property.
  data = gt.spectral.adjacency(sbm.g, weight=sbm.g.edge_properties["count"]).toarray()
  n_doc = len(sbm.documents)
  # Keep rows from n_doc onwards and the first n_doc columns.
  # Assumes document vertices come first in the graph, followed by word
  # vertices -- TODO confirm against how the graph is built.
  data = data[n_doc:,:n_doc]
  ax = sns.heatmap(data, **kwargs)
  ax.set_ylabel("words", fontsize=35, rotation=90)
  ax.yaxis.tick_left()
  ax.yaxis.set_label_position("left")

  ax.set_xlabel("documents",fontsize=35)
  ax.tick_params(labelsize=25)
  return data

def load_and_print(graph="graph.xml.gz", **kwargs):
  """Load a saved graph into the global ``model`` and plot its count heatmap.

  Parameters
  ----------
  graph : path of the graph file passed to ``model.load_graph``.
  **kwargs : forwarded to ``sns.heatmap`` (e.g. ``vmax``).

  Returns
  -------
  (model, data) : the global model, now holding the loaded graph, and the
    plotted array.
  """
  model.load_graph(graph)
  return model, _print_and_plot_counts(model, **kwargs)

def load_trained_and_print(graph="topsbm.pkl", **kwargs):
  """Unpickle a trained model and plot its count heatmap.

  Parameters
  ----------
  graph : path of the pickle file holding the trained model (the parameter
    name is kept for compatibility with ``load_and_print``).
  **kwargs : forwarded to ``sns.heatmap``.

  Returns
  -------
  (model, data) : the unpickled model and the plotted array. The global
    ``model`` is not modified.
  """
  import pickle
  # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
  with open(graph,"rb") as handle:
    trained = pickle.load(handle)
  return trained, _print_and_plot_counts(trained, **kwargs)

In [None]:
# Load and plot the graph saved in the topsbm folder, then show its largest entry.
model, data = load_and_print("topsbm/graph.xml.gz");
data.max()

In [None]:
# Housekeeping graph; `vmax` is forwarded to the heatmap call.
load_and_print("graph_hk.xml.gz", vmax=5e3);

In [None]:
# Graph saved as graph_log10.xml.gz, plotted with default heatmap settings.
load_and_print("graph_log10.xml.gz");

In [None]:
# Alphabetically sorted listing of the current directory.
ld = sorted(os.listdir())
ld

In [None]:
# Load and plot graph_log.xml.gz, then show its largest entry.
model, data = load_and_print("graph_log.xml.gz");
data.max()

In [None]:
# Show the table values for the first 5 words and first 5 documents of the loaded graph.
# NOTE(review): relies on `hvg` and `samples` left over from earlier cells;
# `samples` must hold sample ids here -- confirm before running.
df.reindex(index=hvg, columns=samples).loc[[g[:15] for g in model.words[:5]],model.documents[:5]]

In [None]:
# Save the graph currently held by the model under a new file name.
model.save_graph("graph_tpm_hv.xml.gz")

In [None]:
# Replace the model's graph with the HDE graph stored in the topsbm folder.
model.load_graph("topsbm/graph_hde.xml.gz")

In [None]:
# Run 10 multiflip MCMC sweep steps on the model.
# NOTE(review): presumably requires an already fitted state -- confirm.
model.multiflip_mcmc_sweep(n_steps=10)

In [None]:
# Drop rows with identical values, then keep the genes listed in the
# level-0 word distribution of the "rg" run and save them.
df = df.drop_duplicates()
word_dist = pd.read_csv("topsbm-rg/topsbm-rg_level_0_word-dist.csv", index_col=0)
df.reindex(index=word_dist.index).to_csv("mainTable-rg.csv")

In [None]:
# Keep only the first 15 characters of each gene id (modifies `df` in place).
df.index=[g[:15] for g in df.index]