In [None]:
import os
import glob
import pickle
import pandas as pd
import numpy as np

from dask.diagnostics import ProgressBar

from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

from pyscenic.export import export2loom, add_scenic_metadata
from ctxcore.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.utils import modules_from_adjacencies, load_motifs
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell

import seaborn as sns

In [2]:
import scanpy as sc
import anndata
import re
from imp import reload
from distributed import Client, LocalCluster
#import umap

#from geosketch import gs
#import pynndescent
import matplotlib.pyplot as pl
#import PyIOH5 as myh5
import matplotlib.pyplot as plt

In [4]:
# Parallelism settings for the run: both the core count and the thread count are 8.
ncores = nthreads = 8

In [None]:
# Working directory for this run. Outputs are either written relative to it
# (after the chdir below) or joined onto it explicitly.
# Directories used by earlier runs:
#   /home/jovyan/SCI_project/result/pyscenic/WT_py/SCTscaledata/
#   /home/jovyan/zxli_SCI/result/pyscenic/WT.merge.replace06/
wd="/home/jovyan/zxli_SCI/result/pyscenic/WT.merge.replace_v2/SCT/"
# Create the directory (and any missing parents) on first use so the chdir cannot
# fail on a fresh machine; exist_ok=True keeps re-running this cell harmless.
os.makedirs(wd, exist_ok=True)
os.chdir(wd)
os.getcwd()

In [4]:
# Reference inputs: ranking databases (matched by glob), motif annotations and the TF list.
RESOURCES_FOLDER = "/home/jovyan/zxli_SCI/data/pyscenic/resources/"
DATABASE_FOLDER = "/home/jovyan/zxli_SCI/data/pyscenic/databases/"
DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "mm10_*.mc9nr.feather")
MOTIF_ANNOTATIONS_FNAME = os.path.join(
    RESOURCES_FOLDER, "motifs-v9-nr.mgi-m0.001-o0.0.tbl"
)
MM_TFS_FNAME = os.path.join(RESOURCES_FOLDER, "mm_mgi_tfs.txt")

# Outputs: every result file lives in the working directory `wd`.
MOTIFS_FNAME = os.path.join(wd, "motifs.csv")
REGULONS_FNAME = os.path.join(wd, "regulons.p")
REGULONS_DF_FNAME = os.path.join(wd, "regulons.csv")
AUCMTX_FNAME = os.path.join(wd, "auc_mtx.csv")

In [5]:
#ANNOTATIONS_FNAME = ""
# Destination of the loom file written by the export2loom call at the end of the notebook.
LOOM_FILE = os.path.join(wd, "WT.SCT.pyscenic.auc.loom")

- Input: WT.merge.replace0.6mm normalized data, after filtering out genes detected in < 0.1% of spots

In [None]:
# Read the SCT-normalised expression table and transpose it, so that the columns are
# what the TF filter in the next cell matches gene names against.
# Presumably the CSV is genes x spots (18447 genes, 22820 spots per the file name) — verify.
ex_matrix = pd.read_csv(
    "/home/jovyan/zxli_SCI/result/Seurat/reg.CC/WT_replace_v2/SCT/WT.merge.replace_v2.express_cells20.18447g.22820allspots.SCTnormalizedData.csv",
    index_col=0,
).transpose()
ex_matrix.shape

In [None]:
tf_names = load_tf_names(MM_TFS_FNAME)
# Keep only the TFs that are also present as columns (genes) of the expression matrix.
# A set gives constant-time membership tests; `in` on the raw column array would
# rescan every column name once per TF.
expressed_genes = set(ex_matrix.columns)
tf_names = [tf for tf in tf_names if tf in expressed_genes]
len(tf_names)

In [None]:
# glob returns matches in arbitrary, filesystem-dependent order; sort for a stable database order.
db_fnames = sorted(glob.glob(DATABASES_GLOB))
if not db_fnames:
    # Fail here with a clear message instead of passing an empty database list downstream.
    raise FileNotFoundError(f"No ranking databases match {DATABASES_GLOB!r}")

def name(fname):
    """Return the base name of `fname` without its directory and final extension."""
    return os.path.splitext(os.path.basename(fname))[0]

# One ranking database object per matched file, named after the file.
dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]
dbs

In [None]:
# GRNBoost2 is a stochastic method; a fixed seed makes the inferred adjacencies
# reproducible from one run to the next.
GRN_SEED = 42
adjacencies = grnboost2(ex_matrix, tf_names=tf_names, seed=GRN_SEED, verbose=True)

# Save the adjacencies matrix (the co-expression module results).
# The index is written as the first CSV column, so reload with index_col=0.
adjacencies.to_csv("adjacencies.csv")
adjacencies.shape

In [None]:

# Reload the cached adjacencies table; the first CSV column holds the saved index.
adjacencies = pd.read_csv(
    "adjacencies.csv",
    header=0,
    index_col=0,
)
adjacencies.shape


In [None]:
# Derive the modules from the adjacencies and the expression matrix.
modules = list(modules_from_adjacencies(adjacencies, ex_matrix))

# Cache the modules as a 1-D object array. The array is filled element by element so its
# layout never depends on how NumPy would infer a shape from the module objects, and
# `modules` itself stays a plain list (the same type the reload cell produces via .tolist()).
modules_arr = np.empty(len(modules), dtype=object)
for pos, module in enumerate(modules):
    modules_arr[pos] = module
np.save("modules.npy", modules_arr)

In [None]:

# Restore the cached modules. The .npy file stores pickled Python objects, hence
# allow_pickle=True (only load files written by this notebook), then back to a list.
modules = np.load("modules.npy", allow_pickle=True).tolist()
modules[0].head()

In [None]:
# Calculate a list of enriched motifs and the corresponding target genes for all modules.
with ProgressBar():
    df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME)

# Write the enriched-motif table to disk straight away, so the pruning result is
# preserved even if building the regulons below fails.
df.to_csv(MOTIFS_FNAME)

# Create regulons from this table of enriched motifs and pickle them.
regulons = df2regulons(df)
with open(REGULONS_FNAME, "wb") as f:
    pickle.dump(regulons, f)

In [6]:
# Reload the pickled regulons from REGULONS_FNAME.
# Unpickling can execute arbitrary code — only open files produced by this notebook.
with open(REGULONS_FNAME, "rb") as regulon_file:
    regulons = pickle.load(regulon_file)

In [14]:
def _motif_from_context(context):
    """Return the motif name taken from a regulon context, or '' if it has none.

    The first context entry containing 'png' is used and everything from its first
    '.' onwards is dropped. All entries are scanned, so contexts with one element
    (or none) no longer raise IndexError.
    """
    for entry in context:
        if 'png' in entry:
            # NOTE(review): split('.')[0] cuts at the FIRST dot, so an entry whose name
            # itself contains dots would be truncated — confirm this is intended.
            return entry.split('.')[0]
    return ''


# One record per regulon; building records avoids parallel lists and leaves the
# `name` helper defined in the database cell untouched.
regulon_records = [
    {
        'name': reg.name,
        'tfs': reg.transcription_factor,
        'score': reg.score,
        'targets': ','.join(reg.genes),
        'motif': _motif_from_context(reg.context),
    }
    for reg in regulons
]

# regulons_df holds the regulon results in DataFrame form.
# `columns` fixes the column order and keeps the header even when there are no regulons.
regulons_df = pd.DataFrame(regulon_records, columns=['name', 'tfs', 'score', 'targets', 'motif'])
regulons_df.to_csv(REGULONS_DF_FNAME, index=False)

In [15]:
#Phase III: cellular regulon enrichment matrix (aka AUCell)
# Runs AUCell on the expression matrix with the regulons; the result is kept in auc_mtx.
# NOTE(review): num_workers is hard-coded to 4 although `ncores = 8` is defined near the
# top of the notebook — confirm which value is intended.
auc_mtx = aucell(ex_matrix, regulons, num_workers=4)
#sns.clustermap(auc_mtx, figsize=(8,8))

In [16]:
# Write the AUCell matrix to AUCMTX_FNAME as CSV (index included).
auc_mtx.to_csv(AUCMTX_FNAME)

In [None]:
## Create loom file
# Check that the expression matrix has the correct format.
def is_valid_exp_matrix(mtx):
    """Tell whether `mtx` has single-level, all-string row and column labels.

    Returns True only when both the index and the columns have exactly one level
    and every label on both axes is a `str`; otherwise returns False.
    """
    if mtx.index.nlevels != 1 or mtx.columns.nlevels != 1:
        return False
    for axis_labels in (mtx.index, mtx.columns):
        for label in axis_labels:
            if not isinstance(label, str):
                return False
    return True
# Display whether the loaded expression matrix passes the format check.
is_valid_exp_matrix(ex_matrix)

In [None]:
# Reload the enriched-motif table previously written to MOTIFS_FNAME and preview it.
motif=load_motifs(MOTIFS_FNAME)
motif.head()

In [None]:
# Relabel each regulon: the '(+)' marker in its name is replaced by ' (<N>g)',
# where N is len() of that regulon.
regulons = [
    reg.rename(reg.name.replace('(+)', f' ({len(reg)}g)'))
    for reg in regulons
]

len(regulons)

In [None]:
# Export the expression matrix and the (renamed) regulons to LOOM_FILE.
# The optional title / nomenclature arguments are left at their defaults.
export2loom(ex_matrix, regulons, LOOM_FILE)