In [0]:
# Mount Google Drive at /content/drive (Colab-only API); later cells read from and write to paths under it.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import pandas as pd
import numpy as np
!pip install scanpy
import scanpy as sc
import os, sys
!echo "deb http://downloads.skewed.de/apt/ bionic main" >> /etc/apt/sources.list
!apt-key adv --keyserver keys.openpgp.org --recv-key 612DEFB798507F25
!apt-get update
!apt-get install python3-graph-tool
sys.path.append('/content/drive/My Drive/phd/hsbm-occam/')

In [0]:
# Work from the dataset folder on the mounted Drive; the relative file names used by later cells resolve against it.
os.chdir('/content/drive/My Drive/phd/datasets/gtex/10')

# Download the data or load a local copy

In [0]:
# Load the full expression table; the first CSV column becomes the row index.
df = pd.read_csv("mainTable_all.csv", index_col=0)
# Keep only the first 15 characters of every row identifier
# (presumably strips the ".version" suffix of Ensembl gene IDs — verify against the file).
df.index = [g[:15] for g in df.index]
# Truncation can make identifiers collide, so keep the first row per identifier.
# The previous `drop_duplicates(subset=df.index.name)` did not do this: the freshly
# assigned index has no name, so `subset` was None and pandas compared the row
# *values* across all columns instead of the row labels.
df = df[~df.index.duplicated(keep="first")]
# Sample annotation table saved by a later cell of this notebook (files.dat).
df_files = pd.read_csv("files.dat", index_col=0)

In [0]:
# Disabled alternative: read the complete GTEx v8 gene TPM table directly from the public bucket.
#df = pd.read_csv('https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz', skiprows=2, sep='\t', index_col=0)
# Tab-separated sample attribute file; its first column becomes the index.
annotations_url = "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
df_files = pd.read_csv(annotations_url, sep='\t', index_col=0)
# Disabled: restrict the annotations to samples present in the expression table.
#df_files = df_files[df_files.index.isin(df.columns)]

In [0]:
# Save the sample annotation table as files.dat in the current working directory.
df_files.to_csv("files.dat")

In [0]:
# Keep only the expression columns whose sample ID has a row in the annotation table.
is_annotated = df.columns.isin(df_files.index)
annotated_columns = df.columns[is_annotated]
df = df[annotated_columns]
print(df.head())

In [0]:
# Disabled alternative: read the complete GTEx v8 gene TPM table directly from the public bucket.
#df = pd.read_csv('https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz', skiprows=2, sep='\t', index_col=0)
# Tab-separated sample attribute file; its first column becomes the index.
annotations_url = "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
df_files = pd.read_csv(annotations_url, sep='\t', index_col=0)
# Drop annotation rows for samples that have no column in the expression table.
has_expression = df_files.index.isin(df.columns)
df_files = df_files[has_expression]

In [0]:
# Reproducibly draw 100 annotation rows from each of the 10 `SMTS` groups that
# have the most non-null `SMTSD` entries.
rs = np.random.RandomState(seed=42)
top_sites = df_files.groupby('SMTS').count().sort_values('SMTSD', ascending=False).index[:10]
# Collect the per-site draws and concatenate once: DataFrame.append was removed in
# pandas 2.0, and growing a frame inside a loop copies it on every iteration.
# The shared RandomState is consumed in the same site order as before.
site_samples = [
    df_files[df_files['SMTS'] == site].sample(100, random_state=rs)
    for site in top_sites
]
# pd.concat rejects an empty list; fall back to an empty frame with the same columns.
samples = pd.concat(site_samples) if site_samples else df_files.iloc[:0]

In [0]:
# Per-SMTS counts of non-null values in the sampled rows, ordered by the SMTSD count (largest first).
samples.groupby('SMTS').count().sort_values('SMTSD',ascending=False)

In [0]:
# Write the expression columns of the sampled IDs to mainTable_tpm.csv.
df[samples.index].to_csv("mainTable_tpm.csv")

In [0]:
# Overwrite files.dat with the current annotation table.
df_files.to_csv("files.dat")

In [0]:
# Display the current working directory (where the relative CSV paths resolve).
os.getcwd()

# Select HVG

In [0]:
# `samples` is reused unchanged from the sampling cell (the former `samples = samples`
# self-assignment did nothing and was removed); every row of the expression table
# is a candidate gene at this point.
genes = df.index
print(len(samples), len(genes))

In [0]:
# Identifiers listed in the index column of HDE_Lung.csv ...
hde = pd.read_csv("HDE_Lung.csv", index_col=0).index.values
# ... restricted to those that are also rows of the expression table (order of `hde` is kept).
genes = [gene for gene in hde if gene in df.index]

In [0]:
# Genes x samples slice of the expression table, aligned to the chosen genes and sampled IDs.
expression = df.reindex(index=genes, columns=samples.index)
# Transposed so that rows are samples and line up with the annotation rows passed as `obs`.
adata = sc.AnnData(X=expression.transpose(), obs=samples)

In [0]:
# Apply scanpy's log1p to a copy of the data, then ask scanpy to flag the top 3000
# highly variable genes using 50 bins; the flag is read from adata.var['highly_variable'] below.
adata = sc.pp.log1p(adata, copy=True)
sc.pp.highly_variable_genes(adata, n_top_genes=3000, n_bins=50)

In [0]:
# Plot scanpy's highly-variable-genes diagnostic; `save` asks scanpy to also write the figure to a file.
sc.pl.highly_variable_genes(adata, log=False, save='hvg_counts.pdf')

In [0]:
# Gene identifiers flagged as highly variable by scanpy.
is_highly_variable = adata.var['highly_variable'].eq(True)
hvg = adata.var.loc[is_highly_variable].index
# From here on `samples` holds only the sample IDs, not the annotation rows.
samples = adata.obs.index

In [0]:
# Export the highly variable genes for the selected samples; values come from `df`, not from the log-transformed `adata`.
df.reindex(index=hvg, columns=samples).to_csv("mainTable_counts_hv.csv")

# Select HK (housekeeping) genes

In [0]:
# Unique values of the "Gene Name" column of the HK_exons.xlsx spreadsheet.
hk = pd.read_excel("HK_exons.xlsx")["Gene Name"].unique()

In [0]:
# genenames.org custom download (needs network access): the query asks for the HGNC ID,
# approved symbol, approved name and Ensembl ID columns as tab-separated text.
hgcn_url = "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=md_ensembl_id&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_name&format=text&submit=submit"
df_conversion=pd.read_csv(hgcn_url, sep="\t")

In [0]:
# Ensembl IDs of the HK symbols that are also rows of the expression table, without repeats.
ensembl_column = "Ensembl ID(supplied by Ensembl)"
is_hk_symbol = df_conversion["Approved symbol"].isin(hk)
is_in_expression_table = df_conversion[ensembl_column].isin(df.index)
df_hk_ensg = (
    df_conversion.loc[is_hk_symbol & is_in_expression_table, ensembl_column]
    .drop_duplicates()
    .values
)

In [0]:
# Expression rows for the selected Ensembl IDs, written (index included) to mainTable_hk.csv.
df_hk = df.reindex(index=df_hk_ensg)
df_hk.to_csv("mainTable_hk.csv", index=True)

# Use SBM

In [0]:
# Create an empty sbmtm model; a graph is built or loaded into it in the cells below.
from sbmtm import sbmtm
model = sbmtm()

In [0]:
# Point `samples` at the annotation table and `genes` at the HVG index.
# NOTE(review): the "make data" cell below reassigns both names from mainTable_counts_hv.csv.
samples = df_files
genes = hvg
print(len(samples), len(genes))

### make data


In [0]:
# Reload the exported HVG table so this section can run without the cells above.
df = pd.read_csv("mainTable_counts_hv.csv", index_col=0)
genes = df.index
# Placeholder Series whose only purpose is to carry the sample IDs as its index
# (later cells read `samples.index`). The dtype is given explicitly because a
# data-less Series without one emits a deprecation warning in pandas 1.x and
# changes its default dtype in pandas 2.x.
samples = pd.Series(index=df.columns, dtype="float64")
print(len(samples), len(genes))

In [0]:
# Disabled option: log2(tpm + 1) transform of the table before building the graph.
# .applymap(lambda tpm: np.log2(tpm+1))
# Build the graph from the genes x samples table; dropna() removes rows that contain any missing value.
model.make_graph_from_BoW_df(df.reindex(index=genes, columns=samples.index).dropna())
# Save the graph so that it can be reloaded with load_graph instead of being rebuilt.
model.save_graph("graph_counts.xml.gz")
# Display the graph object.
model.g

### load data

In [0]:
# Load the graph previously saved as graph_counts.xml.gz instead of rebuilding it.
model.load_graph("graph_counts.xml.gz")
# Display the loaded graph object.
model.g

### load pretrained

In [0]:
# load pretrained
import graph_tool as gt
from graph_tool.inference.nested_blockmodel import NestedBlockState
import pickle

# SECURITY: pickle.load can execute arbitrary code while unpickling; only open files you created yourself.
# This replaces the in-memory `model` with the object stored in topsbm/topsbm.pkl.
with open("topsbm/topsbm.pkl", "rb") as file:
  model = pickle.load(file)

# NOTE(review): bare attribute access; what `mul` holds is not visible in this notebook — confirm it exists on the unpickled object.
model.mul

## Run

In [0]:
config = "topsbm_counts"
# Create the run directory with the standard library rather than shelling out to
# `mkdir -p`: no subshell, no shell parsing of the name, and a failure raises
# instead of being silently ignored.
os.makedirs(config, exist_ok=True)
# NOTE(review): the path is relative, so running this cell twice in one session nests
# the directory (topsbm_counts/topsbm_counts).
os.chdir(config)
# Fit the model, run 50 further multiflip MCMC sweep steps, then save the results
# (presumably into the current directory — confirm in sbmtm.save_data).
model.fit(n_init=1, parallel=True, verbose=True, B_min=0, B_max=500)
model.multiflip_mcmc_sweep(n_steps=50, verbose=True)
model.save_data()

In [0]:
# Histogram of the per-column maxima of log10(value + 1).
# The transform is applied to the whole frame at once: the former element-wise
# `applymap` called a Python lambda for every cell and is deprecated in recent pandas.
main_table = pd.read_csv("mainTable.csv", index_col=0)
np.log10(main_table + 1).max().hist()

In [0]:
# Histogram of the per-column maxima of the table stored in mainTable_log.csv.
log_table = pd.read_csv("mainTable_log.csv", index_col=0)
column_maxima = log_table.max()
column_maxima.hist()

In [0]:
# Display the current working directory (the "Run" cell above changes it with os.chdir).
os.getcwd()

In [0]:
import graph_tool as gt
import seaborn as sns
from sbmtm import sbmtm

In [0]:
# Start from a fresh, empty sbmtm model; load_and_print below loads graphs into this global.
model = sbmtm()

In [0]:
import matplotlib.pyplot as plt
def load_and_print(graph="graph.xml.gz", **kwargs):
  """Load a saved graph into the global ``model`` and draw its word/document heatmap.

  Prints the graph and the number of words and documents, then plots the
  "count"-weighted adjacency block between words (rows) and documents (columns).

  Args:
    graph: path of the graph file handed to ``model.load_graph``.
    **kwargs: forwarded unchanged to ``seaborn.heatmap`` (e.g. ``vmax``).

  Returns:
    None; the heatmap is drawn on the current matplotlib axes.
  """
  model.load_graph(graph)
  print(model.g)
  print(len(model.words),len(model.documents))
  n_doc = len(model.documents)
  adjacency = gt.spectral.adjacency(model.g, weight=model.g.edge_properties["count"])
  # Slice the sparse matrix first and densify only the block that is plotted,
  # instead of materialising the full (documents + words)^2 dense array.
  # Rows from n_doc onward against the first n_doc columns: assumes document
  # vertices come first and word vertices follow — confirm in sbmtm.
  data = adjacency[n_doc:, :n_doc].toarray()
  ax = sns.heatmap(data, **kwargs)
  ax.set_ylabel("words", fontsize=35, rotation=90)
  ax.yaxis.tick_left()
  ax.yaxis.set_label_position("left")

  ax.set_xlabel("documents",fontsize=35)
  ax.tick_params(labelsize=25)

In [0]:
# Heatmap for the graph stored in graph_log.xml.gz, with seaborn's default colour scale.
load_and_print("graph_log.xml.gz")

In [0]:
# Heatmap for the graph stored in graph_hk.xml.gz; vmax is forwarded to seaborn.heatmap to cap the colour scale at 5000.
load_and_print("graph_hk.xml.gz", vmax=5e3)

In [0]:
# Heatmap for the graph stored in graph_log10.xml.gz, with seaborn's default colour scale.
load_and_print("graph_log10.xml.gz")