In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read and write files under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
# scanpy is installed at runtime and unpinned, so the version may differ between runs.
!pip install scanpy
import scanpy as sc
import os, sys
# Register the skewed.de apt repository and its signing key, then install graph-tool from it.
# NOTE(review): the repository line is appended to sources.list on every run of this cell and
# targets the "bionic" release — confirm it matches the runtime's Ubuntu version.
!echo "deb http://downloads.skewed.de/apt/ bionic main" >> /etc/apt/sources.list
!apt-key adv --keyserver keys.openpgp.org --recv-key 612DEFB798507F25
!apt-get update
!apt-get install python3-graph-tool
# Make modules stored in this Drive folder importable
# (presumably where the `sbmtm` module lives — TODO confirm).
sys.path.append('/content/drive/My Drive/phd/hsbm-occam/')

In [None]:
# Work from the dataset folder on Drive; relative paths in the following cells resolve against it.
os.chdir('/content/drive/My Drive/phd/datasets/gtex/10')

# Download the data or load a local copy

In [None]:
# Load the previously saved expression table from the working directory.
df = pd.read_csv("mainTable_all.csv", index_col=0)
# Keep only the first 15 characters of every row label
# (presumably strips the ".version" suffix of Ensembl gene IDs — TODO confirm).
df.index = [g[:15] for g in df.index]
# Truncation can leave several rows with the same label; keep the first occurrence of each.
# The former `drop_duplicates(subset=df.index.name)` never inspected the index: the index
# name is None after assigning a list, so pandas compared entire rows instead and removed
# rows whose values matched another row (e.g. all-zero rows) while keeping repeated labels.
df = df[~df.index.duplicated(keep="first")]
# Sample annotations saved by an earlier run of this notebook.
df_files = pd.read_csv("files.dat", index_col=0)

In [None]:
# Gene read counts: the first two lines of the .gct file are skipped and the first
# remaining column becomes the row index.
df = pd.read_csv(
    'https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz',
    sep='\t',
    skiprows=2,
    index_col=0,
)
# Sample attribute table, indexed by its first column.
df_files = pd.read_csv(
    "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
    sep='\t',
    index_col=0,
)
# Keep only the annotation rows whose index appears among the count table's columns.
df_files = df_files.loc[df_files.index.isin(df.columns)]

In [None]:
# Show the first rows as a sanity check, then write the full table (with index and header,
# pandas' defaults) to the working directory.
print(df.head())
df.to_csv("mainTable_all_counts.csv")

In [None]:
# Alternative expression input (TPM file instead of read counts), currently disabled:
#df = pd.read_csv('https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz', skiprows=2, sep='\t', index_col=0)
# Fetch the sample attribute table again and keep only the rows whose index appears
# among the columns of `df`.
df_files = pd.read_csv(
    "https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
    sep='\t',
    index_col=0,
)
df_files = df_files.loc[df_files.index.isin(df.columns)]

In [None]:
# Reproducible balanced subset: 100 rows from each of the 10 largest SMTS groups
# (groups are ranked by their number of non-null SMTSD entries).
rs = np.random.RandomState(seed=42)
top_sites = df_files.groupby('SMTS').count().sort_values('SMTSD', ascending=False).index[:10]
# Collect the per-group draws and concatenate once. DataFrame.append was removed in
# pandas 2.0 and copied the accumulated frame on every iteration. The shared RandomState
# is consumed in the same group order as before, so the draws are unchanged.
samples = pd.concat(
    [df_files[df_files['SMTS'] == site].sample(100, random_state=rs) for site in top_sites]
)

In [None]:
# Check the subsample: non-null counts per SMTS group, ordered by the SMTSD count (largest first).
samples.groupby('SMTS').count().sort_values('SMTSD',ascending=False)

In [None]:
# Save the expression table restricted to the columns named by the index of `samples`.
df[samples.index].to_csv("mainTable_counts.csv")

In [None]:
# Persist the sample annotations; the local-load cell reads them back from "files.dat".
df_files.to_csv("files.dat")

In [None]:
# Display the current working directory (where the files above were written).
os.getcwd()

# Select HVG

In [None]:
# Use every annotated sample and every row of the expression table
# (this replaces any earlier value of `samples`).
samples, genes = df_files, df.index

In [None]:
# Row labels of HDE_Lung.csv, narrowed to those that also occur in the expression table
# (file order is preserved).
hde = pd.read_csv("HDE_Lung.csv", index_col=0).index.values
genes = [gene for gene in hde if gene in df.index]

In [None]:
# AnnData with one observation per sample and one variable per selected gene; the table is
# reindexed to `genes` x `samples.index` and transposed so that samples become rows.
adata = sc.AnnData(
    X=df.reindex(index=genes, columns=samples.index).T,
    obs=samples,
)

In [None]:
# log1p is applied to a copy, so `adata` keeps the untransformed values.
adata_log = sc.pp.log1p(adata, copy=True)
# Annotate `adata_log.var` with the highly-variable-gene selection (top 3000, 50 bins).
sc.pp.highly_variable_genes(adata_log, n_bins=50, n_top_genes=3000)

In [None]:
# Plot the highly-variable-gene selection on a linear scale and save the figure through
# scanpy's `save` option.
sc.pl.highly_variable_genes(adata_log, log=False, save='hvg_counts.pdf')

In [None]:
# Labels of the variables flagged as highly variable.
hvg = adata_log.var.loc[adata_log.var['highly_variable'].eq(True)].index
# NOTE: from here on `samples` is an Index of sample labels, no longer the annotation DataFrame.
samples = adata_log.obs.index

In [None]:
# Save the expression values of the highly variable genes for the selected samples.
df.loc[hvg, samples].to_csv("mainTable_hv_counts.csv")

# Select HK genes

In [None]:
# Unique values of the "Gene Name" column of the spreadsheet HK_exons.xlsx.
hk = pd.read_excel("HK_exons.xlsx")["Gene Name"].unique()

In [None]:
# genenames.org custom download: tab-separated text with the HGNC ID, approved symbol,
# approved name and Ensembl ID columns, for approved and withdrawn entries.
hgcn_url = "https://www.genenames.org/cgi-bin/download/custom?col=gd_hgnc_id&col=gd_app_sym&col=gd_app_name&col=md_ensembl_id&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_name&format=text&submit=submit"
# Symbol <-> Ensembl ID conversion table used in the next cell.
df_conversion=pd.read_csv(hgcn_url, sep="\t")

In [None]:
# Distinct Ensembl IDs whose approved symbol is in the HK list and that are present
# in the index of the expression table.
df_hk_ensg = (
    df_conversion.loc[
        df_conversion["Approved symbol"].isin(hk)
        & df_conversion["Ensembl ID(supplied by Ensembl)"].isin(df.index),
        "Ensembl ID(supplied by Ensembl)",
    ]
    .drop_duplicates()
    .values
)

In [None]:
# Expression rows for the HK Ensembl IDs, written to disk with the index included
# (pandas' default).
df_hk = df.reindex(df_hk_ensg, axis="index")
df_hk.to_csv("mainTable_hk.csv")

# Use SBM

In [None]:
# `sbmtm` is a project-local module (found through the sys.path entry added during setup).
from sbmtm import sbmtm
# Fresh, empty model; the graph is built or loaded in the following cells.
model = sbmtm()

In [None]:
# Model inputs: all annotated samples (as a DataFrame again) and the highly variable genes.
samples, genes = df_files, hvg

In [None]:
# Input table: rows are the selected genes, columns the selected samples; rows containing
# any missing value (e.g. labels absent from `df`) are dropped.
bow = df.reindex(index=genes, columns=samples.index).dropna()
# log2(x + 1) is applied to the whole frame at once. It yields the same values as the former
# per-cell `applymap` lambda without a Python call per entry, and avoids `applymap`, which
# recent pandas versions deprecate.
model.make_graph_from_BoW_df(np.log2(bow + 1))
model.save_graph("graph.xml.gz")
# Display the resulting graph object.
model.g

In [None]:
# Alternative to rebuilding: load the graph saved by the previous cell and display it.
model.load_graph("graph.xml.gz")
model.g

In [None]:
config = "counts"
# Create the output folder with the standard library instead of shelling out to `mkdir -p`:
# the name is not parsed by a shell, and a failure raises instead of being silently ignored.
os.makedirs(config, exist_ok=True)
# NOTE: this changes the working directory, so re-running the cell in the same session
# creates and enters a nested folder of the same name.
os.chdir(config)
model.fit(n_init=1, parallel=True, verbose=True, B_min=5, B_max=500)
# Write the fitted model's outputs into the current (config) directory.
model.save_data()

In [None]:
# Histogram of the per-column maximum of log10(x + 1). The transform is applied to the whole
# frame at once (same values as the former per-cell `applymap` lambda, which was slow and
# relies on an API deprecated in recent pandas).
np.log10(pd.read_csv("mainTable.csv", index_col=0) + 1).max().hist()

In [None]:
# Histogram of the per-column maximum of the table stored in mainTable_log.csv.
pd.read_csv("mainTable_log.csv", index_col=0).max().hist()

In [None]:
# Display the current working directory (it was changed by the fitting cell).
os.getcwd()

In [None]:
import graph_tool as gt
import seaborn as sns
from sbmtm import sbmtm

In [None]:
# Start from a fresh model; `load_and_print` below loads graphs into this module-level object.
model = sbmtm()

In [None]:
import matplotlib.pyplot as plt
def load_and_print(graph="graph.xml.gz", *, split=1000, **kwargs):
  """Load a saved graph into the global `model`, print it and draw one adjacency block.

  The weighted adjacency matrix (edge property "count") is converted to a dense array and
  the block `[split:, :split]` is shown as a heatmap.

  Args:
    graph: path of the graph file passed to `model.load_graph`.
    split: index at which the matrix is cut; rows from `split` onwards against the first
      `split` columns are plotted. Defaults to the previously hard-coded 1000
      (presumably the number of sample nodes, which come first — TODO confirm).
    **kwargs: forwarded unchanged to `sns.heatmap` (e.g. `vmax`).
  """
  model.load_graph(graph)
  print(model.g)
  adjacency = gt.spectral.adjacency(model.g, weight=model.g.edge_properties["count"]).toarray()
  sns.heatmap(adjacency[split:, :split], **kwargs)

In [None]:
# Heatmap for graph.xml.gz; `vmax` is forwarded to seaborn and caps the colour scale at 2e4.
load_and_print("graph.xml.gz", vmax=2e4)

In [None]:
# Heatmap for graph_log.xml.gz with seaborn's default colour scale.
load_and_print("graph_log.xml.gz")

In [None]:
# Heatmap for graph_hk.xml.gz; `vmax` is forwarded to seaborn and caps the colour scale at 5e3.
load_and_print("graph_hk.xml.gz", vmax=5e3)

In [None]:
# Heatmap for graph_log10.xml.gz with seaborn's default colour scale.
load_and_print("graph_log10.xml.gz")