# pySCENIC gene regulatory network analysis of pre-processed data

[GitHub](https://github.com/aertslab/pySCENIC?tab=readme-ov-file#id16)  
[Tutorial](https://pyscenic.readthedocs.io/en/latest/tutorial.html)  
[Workflow publication](https://www.nature.com/articles/s41596-020-0336-2)  
[Database](https://resources.aertslab.org/cistarget/databases/)  

[About group comparisons](https://github.com/aertslab/pySCENIC/issues/292)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from arboreto.algo import grnboost2
from pyscenic.utils import modules_from_adjacencies
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell

from arboreto.utils import load_tf_names
from ctxcore.rnkdb import FeatherRankingDatabase as RankingDatabase

import scanpy as sc
import loompy as lp

import pandas as pd
import numpy as np

import pickle

import glob

import os

In [None]:
os.chdir('/research/peer/fdeckert/FD20200109SPLENO')

In [None]:
# rpy2 
os.environ['R_HOME'] = '/nobackup/peer/fdeckert/miniconda3/envs/r.4.4.1-FD20200109SPLENO/lib/R'

## Plotting 

In [None]:
# Global scanpy plotting settings for this notebook
sc.settings.vector_friendly = False

# 5x5 figures on a white background without frames, font size 12;
# figures written to disk are saved at 1200 dpi
sc.set_figure_params(figsize=(5, 5), dpi_save=1200, fontsize=12, frameon=False, facecolor='white')
# NOTE: this is a relative path
sc.settings.figdir = 'result/figures/'

# pySCENIC workflow 

In [None]:
# SCENIC reference files: TF table, ranking databases (*.feather) and the
# motif annotation table
tf_file = '/research/peer/fdeckert/reference/animaltfdb/Mus_musculus_TF.txt'
# glob returns its matches in arbitrary order; sort them so the ranking
# databases are always listed (and later loaded) in the same order
db_file = sorted(glob.glob('/nobackup/peer/fdeckert/scenic/mm10/*feather'))
motif_file = '/nobackup/peer/fdeckert/scenic/mm10/motifs-v10nr_clust-nr.mgi-m0.001-o0.0.tbl'

# Cache/output file names shared by every analysis below. They are relative,
# so they resolve against the working directory at the time they are used.
adj_file = 'adj.csv'
module_file = 'module.pkl'
reg_file = 'reg.pkl'
auc_file = 'auc_mtx.csv'

# Store animaltfdb in pySCENIC reference 

In [None]:
# Read the tab-separated TF table (first row is the header) and write only its
# "Symbol" column, one value per line without index or header, to a text file
animaltfdb = pd.read_csv(tf_file, sep="\t", header=0)
animaltfdb["Symbol"].to_csv('/nobackup/peer/fdeckert/scenic/mm10/animaltfdb.txt', index=False, header=False)

In [None]:
tf_file = '/nobackup/peer/fdeckert/scenic/mm10/animaltfdb.txt'

# Load reference data 

In [None]:
# Load TF list
tf_names = load_tf_names(tf_file)

In [None]:
# Load motif feather
def name(fname):
    """Return the file name of *fname* without directory and final extension.

    Only the last extension is stripped, so ``'a/b.rankings.feather'``
    yields ``'b.rankings'``.
    """
    base = os.path.basename(fname)
    stem, _ext = os.path.splitext(base)
    return stem
# One RankingDatabase per path in db_file, each named after its file stem
db = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_file]

# pySCENIC analysis for erythroid lineage from BSF samples 

## Subset AnnData

In [None]:
adata = sc.read_h5ad('/research/peer/fdeckert/FD20200109SPLENO/data/scRNAseq/object/pp.h5ad')

In [None]:
adata = adata[(adata.obs.celltype_low.isin(['MegP', 'MEP', 'Proerythroblast', 'Erythroblast'])) & (adata.obs.facility=='BSF')]

In [None]:
# Gene filter: keep a gene if at least one cell has a value >= 5, or if at
# least 20 cells have a value >= 1.
# The per-gene sums are flattened to 1-D boolean masks because summing a
# sparse matrix yields a 2-D np.matrix.
mask_high = np.asarray((adata.X >= 5).sum(axis=0)).ravel() >= 1
mask_broad = np.asarray((adata.X >= 1).sum(axis=0)).ravel() >= 20
genes_1 = adata.var_names[mask_high].to_list()
genes_2 = adata.var_names[mask_broad].to_list()
# Take the union in var_names order. list(set(...)) returns the genes in an
# order that can differ between interpreter runs (string hash randomisation),
# which makes the column order of the expression matrix non-reproducible.
genes = adata.var_names[mask_high | mask_broad].to_list()

In [None]:
adata = adata[:, genes].copy()

## Set out dir

In [None]:
# Result directory   
result_dir = '/research/peer/fdeckert/FD20200109SPLENO/result/scenic/scRNAseq/res_0'

In [None]:
os.chdir(result_dir)

## Expression matrix

In [None]:
# Dense expression table with rows = adata.obs_names and columns =
# adata.var_names. Sparse input is densified with .toarray(): the `.A`
# shorthand was removed from SciPy sparse matrices in newer releases, and the
# old hasattr(..., 'A') guard would then hand the sparse matrix itself to
# pandas instead of a dense array.
mat = pd.DataFrame(
    adata.X.toarray() if hasattr(adata.X, 'toarray') else adata.X,
    index=adata.obs_names,
    columns=adata.var_names,
)

## Compute graph adjacency 

In [None]:
adj_cache = True

In [None]:
if not adj_cache:
    # Infer the adjacencies with GRNBoost2 (fixed seed) and cache them as CSV
    adj = grnboost2(mat, tf_names=tf_names, seed=42, verbose=True)
    adj.to_csv(adj_file, index=False)
else:
    # Reuse the adjacency table written by an earlier run
    adj = pd.read_csv(adj_file)

## Create modules

In [None]:
module_cache = True

In [None]:
if not module_cache:
    # Build the modules from the adjacencies and the expression matrix, then
    # pickle them for later runs
    module = list(modules_from_adjacencies(adj, mat, rho_mask_dropouts=False))
    with open(module_file, 'wb') as f:
        pickle.dump(module, f)
else:
    # Load pre-computed modules (pickle: only load files you created yourself)
    with open(module_file, 'rb') as f:
        module = pickle.load(f)

## Prune graph by binding motifs 

In [None]:
reg_cache = True

In [None]:
if not reg_cache:
    # Prune the modules against the ranking databases and motif annotations
    reg = prune2df(
        db,
        module,
        motif_annotations_fname=motif_file,
        filter_for_annotation=True,
        num_workers=32,
    )
    # Keep a pickle of the result for further processing
    with open(reg_file, 'wb') as f:
        pickle.dump(reg, f)
else:
    # Load pre-computed regulons (pickle: only load files you created yourself)
    with open(reg_file, 'rb') as f:
        reg = pickle.load(f)

In [None]:
# Flatten the enrichment table into unique (TF, TargetGene) rows: one row per
# entry of each TargetGenes list, keeping the first item of every entry
# (presumably (gene, weight) pairs - confirm against the prune2df output).
reg_df = (
    reg[('Enrichment', 'TargetGenes')]
    .reset_index()
    .explode(('Enrichment', 'TargetGenes'))
)
reg_df['TargetGene'] = [target[0] for target in reg_df[('Enrichment', 'TargetGenes')]]
reg_df = reg_df[['TF', 'TargetGene']].drop_duplicates().reset_index(drop=True)

In [None]:
reg_df.to_csv('reg.csv')

## AUC matrix 

In [None]:
reg = df2regulons(reg)

In [None]:
auc_mtx = aucell(mat, reg, num_workers=4)
auc_mtx.to_csv(auc_file)

## AUC embedding UMAP

In [None]:
# Use the AUC matrix as a cell embedding: store it in obsm, build the
# neighbour graph on that representation, then compute a UMAP and plot it
# coloured by the celltype_low and sample_group annotations.
# NOTE(review): auc_mtx is stored exactly as returned by aucell; presumably
# its rows follow adata.obs_names - confirm.
adata.obsm['X_regulon'] = auc_mtx
sc.pp.neighbors(adata, use_rep='X_regulon')
sc.tl.umap(adata)
sc.pl.umap(adata, color=['celltype_low', 'sample_group'], size=20)

# pySCENIC analysis for erythroid lineage from VBC samples 

## Subset AnnData

In [None]:
adata = sc.read_h5ad('/research/peer/fdeckert/FD20200109SPLENO/data/scRNAseq/object/pp.h5ad')

In [None]:
adata = adata[(adata.obs.celltype_low.isin(['MegP', 'MEP', 'Proerythroblast', 'Erythroblast'])) & (adata.obs.facility=='VBC')]

In [None]:
# Gene filter: keep a gene if at least one cell has a value >= 5, or if at
# least 20 cells have a value >= 1.
# The per-gene sums are flattened to 1-D boolean masks because summing a
# sparse matrix yields a 2-D np.matrix.
mask_high = np.asarray((adata.X >= 5).sum(axis=0)).ravel() >= 1
mask_broad = np.asarray((adata.X >= 1).sum(axis=0)).ravel() >= 20
genes_1 = adata.var_names[mask_high].to_list()
genes_2 = adata.var_names[mask_broad].to_list()
# Take the union in var_names order. list(set(...)) returns the genes in an
# order that can differ between interpreter runs (string hash randomisation),
# which makes the column order of the expression matrix non-reproducible.
genes = adata.var_names[mask_high | mask_broad].to_list()

In [None]:
adata = adata[:, genes].copy()

## Set out dir

In [None]:
# Result directory   
result_dir = '/research/peer/fdeckert/FD20200109SPLENO/result/scenic/scRNAseq/res_1'

In [None]:
os.chdir(result_dir)

## Expression matrix

In [None]:
# Dense expression table with rows = adata.obs_names and columns =
# adata.var_names. Sparse input is densified with .toarray(): the `.A`
# shorthand was removed from SciPy sparse matrices in newer releases, and the
# old hasattr(..., 'A') guard would then hand the sparse matrix itself to
# pandas instead of a dense array.
mat = pd.DataFrame(
    adata.X.toarray() if hasattr(adata.X, 'toarray') else adata.X,
    index=adata.obs_names,
    columns=adata.var_names,
)

## Compute graph adjacency 

In [None]:
adj_cache = True

In [None]:
if not adj_cache:
    # Infer the adjacencies with GRNBoost2 (fixed seed) and cache them as CSV
    adj = grnboost2(mat, tf_names=tf_names, seed=42, verbose=True)
    adj.to_csv(adj_file, index=False)
else:
    # Reuse the adjacency table written by an earlier run
    adj = pd.read_csv(adj_file)

## Create modules

In [None]:
module_cache = True

In [None]:
if not module_cache:
    # Build the modules from the adjacencies and the expression matrix, then
    # pickle them for later runs
    module = list(modules_from_adjacencies(adj, mat, rho_mask_dropouts=False))
    with open(module_file, 'wb') as f:
        pickle.dump(module, f)
else:
    # Load pre-computed modules (pickle: only load files you created yourself)
    with open(module_file, 'rb') as f:
        module = pickle.load(f)

## Prune graph by binding motifs 

In [None]:
reg_cache = True

In [None]:
if not reg_cache:
    # Prune the modules against the ranking databases and motif annotations
    reg = prune2df(
        db,
        module,
        motif_annotations_fname=motif_file,
        filter_for_annotation=True,
        num_workers=32,
    )
    # Keep a pickle of the result for further processing
    with open(reg_file, 'wb') as f:
        pickle.dump(reg, f)
else:
    # Load pre-computed regulons (pickle: only load files you created yourself)
    with open(reg_file, 'rb') as f:
        reg = pickle.load(f)

In [None]:
# Flatten the enrichment table into unique (TF, TargetGene) rows: one row per
# entry of each TargetGenes list, keeping the first item of every entry
# (presumably (gene, weight) pairs - confirm against the prune2df output).
reg_df = (
    reg[('Enrichment', 'TargetGenes')]
    .reset_index()
    .explode(('Enrichment', 'TargetGenes'))
)
reg_df['TargetGene'] = [target[0] for target in reg_df[('Enrichment', 'TargetGenes')]]
reg_df = reg_df[['TF', 'TargetGene']].drop_duplicates().reset_index(drop=True)

In [None]:
reg_df.to_csv('reg.csv')

## AUC matrix 

In [None]:
reg = df2regulons(reg)

In [None]:
auc_mtx = aucell(mat, reg, num_workers=4)
auc_mtx.to_csv(auc_file)

## AUC embedding UMAP

In [None]:
# Use the AUC matrix as a cell embedding: store it in obsm, build the
# neighbour graph on that representation, then compute a UMAP and plot it
# coloured by the celltype_low and sample_group annotations.
# NOTE(review): auc_mtx is stored exactly as returned by aucell; presumably
# its rows follow adata.obs_names - confirm.
adata.obsm['X_regulon'] = auc_mtx
sc.pp.neighbors(adata, use_rep='X_regulon')
sc.tl.umap(adata)
sc.pl.umap(adata, color=['celltype_low', 'sample_group'], size=20)

# pySCENIC analysis for myeloid lineage from BSF samples 

## Subset AnnData

In [33]:
adata = sc.read_h5ad('/research/peer/fdeckert/FD20200109SPLENO/data/scRNAseq/object/pp.h5ad')

In [34]:
adata = adata[(adata.obs.celltype_low.isin(['cMo', 'intMo', 'ncMo', 'RPM'])) & (adata.obs.facility=='BSF')]

In [35]:
# Gene filter: keep a gene if at least one cell has a value >= 5, or if at
# least 20 cells have a value >= 1.
# The per-gene sums are flattened to 1-D boolean masks because summing a
# sparse matrix yields a 2-D np.matrix.
mask_high = np.asarray((adata.X >= 5).sum(axis=0)).ravel() >= 1
mask_broad = np.asarray((adata.X >= 1).sum(axis=0)).ravel() >= 20
genes_1 = adata.var_names[mask_high].to_list()
genes_2 = adata.var_names[mask_broad].to_list()
# Take the union in var_names order. list(set(...)) returns the genes in an
# order that can differ between interpreter runs (string hash randomisation),
# which makes the column order of the expression matrix non-reproducible.
genes = adata.var_names[mask_high | mask_broad].to_list()

In [36]:
adata = adata[:, genes].copy()

## Set out dir

In [38]:
# Result directory   
result_dir = '/research/peer/fdeckert/FD20200109SPLENO/result/scenic/scRNAseq/res_2'

In [39]:
os.chdir(result_dir)

## Expression matrix

In [40]:
# Dense expression table with rows = adata.obs_names and columns =
# adata.var_names. Sparse input is densified with .toarray(): the `.A`
# shorthand was removed from SciPy sparse matrices in newer releases, and the
# old hasattr(..., 'A') guard would then hand the sparse matrix itself to
# pandas instead of a dense array.
mat = pd.DataFrame(
    adata.X.toarray() if hasattr(adata.X, 'toarray') else adata.X,
    index=adata.obs_names,
    columns=adata.var_names,
)

## Compute graph adjacency 

In [42]:
adj_cache = True

In [43]:
if not adj_cache:
    # Infer the adjacencies with GRNBoost2 (fixed seed) and cache them as CSV
    adj = grnboost2(mat, tf_names=tf_names, seed=42, verbose=True)
    adj.to_csv(adj_file, index=False)
else:
    # Reuse the adjacency table written by an earlier run
    adj = pd.read_csv(adj_file)

## Create modules

In [53]:
module_cache = True

In [54]:
if not module_cache:
    # Build the modules from the adjacencies and the expression matrix, then
    # pickle them for later runs
    module = list(modules_from_adjacencies(adj, mat, rho_mask_dropouts=False))
    with open(module_file, 'wb') as f:
        pickle.dump(module, f)
else:
    # Load pre-computed modules (pickle: only load files you created yourself)
    with open(module_file, 'rb') as f:
        module = pickle.load(f)

## Prune graph by binding motifs 

In [67]:
reg_cache = True

In [68]:
if not reg_cache:
    # Prune the modules against the ranking databases and motif annotations
    reg = prune2df(
        db,
        module,
        motif_annotations_fname=motif_file,
        filter_for_annotation=True,
        num_workers=32,
    )
    # Keep a pickle of the result for further processing
    with open(reg_file, 'wb') as f:
        pickle.dump(reg, f)
else:
    # Load pre-computed regulons (pickle: only load files you created yourself)
    with open(reg_file, 'rb') as f:
        reg = pickle.load(f)

In [69]:
# Flatten the enrichment table into unique (TF, TargetGene) rows: one row per
# entry of each TargetGenes list, keeping the first item of every entry
# (presumably (gene, weight) pairs - confirm against the prune2df output).
reg_df = (
    reg[('Enrichment', 'TargetGenes')]
    .reset_index()
    .explode(('Enrichment', 'TargetGenes'))
)
reg_df['TargetGene'] = [target[0] for target in reg_df[('Enrichment', 'TargetGenes')]]
reg_df = reg_df[['TF', 'TargetGene']].drop_duplicates().reset_index(drop=True)

In [None]:
reg_df.to_csv('reg.csv')

## AUC matrix 

In [None]:
reg = df2regulons(reg)

In [None]:
auc_mtx = aucell(mat, reg, num_workers=4)
auc_mtx.to_csv(auc_file)

## AUC embedding UMAP

In [None]:
# Use the AUC matrix as a cell embedding: store it in obsm, build the
# neighbour graph on that representation, then compute a UMAP and plot it
# coloured by the celltype_low and sample_group annotations.
# NOTE(review): auc_mtx is stored exactly as returned by aucell; presumably
# its rows follow adata.obs_names - confirm.
adata.obsm['X_regulon'] = auc_mtx
sc.pp.neighbors(adata, use_rep='X_regulon')
sc.tl.umap(adata)
sc.pl.umap(adata, color=['celltype_low', 'sample_group'], size=20)

# pySCENIC analysis for myeloid lineage from VBC samples 

## Subset AnnData

In [None]:
adata = sc.read_h5ad('/research/peer/fdeckert/FD20200109SPLENO/data/scRNAseq/object/pp.h5ad')

In [None]:
adata = adata[(adata.obs.celltype_low.isin(['cMo', 'intMo', 'ncMo', 'RPM'])) & (adata.obs.facility=='VBC')]

In [None]:
# Gene filter: keep a gene if at least one cell has a value >= 5, or if at
# least 20 cells have a value >= 1.
# The per-gene sums are flattened to 1-D boolean masks because summing a
# sparse matrix yields a 2-D np.matrix.
mask_high = np.asarray((adata.X >= 5).sum(axis=0)).ravel() >= 1
mask_broad = np.asarray((adata.X >= 1).sum(axis=0)).ravel() >= 20
genes_1 = adata.var_names[mask_high].to_list()
genes_2 = adata.var_names[mask_broad].to_list()
# Take the union in var_names order. list(set(...)) returns the genes in an
# order that can differ between interpreter runs (string hash randomisation),
# which makes the column order of the expression matrix non-reproducible.
genes = adata.var_names[mask_high | mask_broad].to_list()

In [None]:
adata = adata[:, genes].copy()

## Set out dir

In [None]:
# Result directory   
result_dir = '/research/peer/fdeckert/FD20200109SPLENO/result/scenic/scRNAseq/res_3'

In [None]:
os.chdir(result_dir)

## Expression matrix

In [None]:
# Dense expression table with rows = adata.obs_names and columns =
# adata.var_names. Sparse input is densified with .toarray(): the `.A`
# shorthand was removed from SciPy sparse matrices in newer releases, and the
# old hasattr(..., 'A') guard would then hand the sparse matrix itself to
# pandas instead of a dense array.
mat = pd.DataFrame(
    adata.X.toarray() if hasattr(adata.X, 'toarray') else adata.X,
    index=adata.obs_names,
    columns=adata.var_names,
)

## Compute graph adjacency 

In [None]:
adj_cache = True

In [None]:
if not adj_cache:
    # Infer the adjacencies with GRNBoost2 (fixed seed) and cache them as CSV
    adj = grnboost2(mat, tf_names=tf_names, seed=42, verbose=True)
    adj.to_csv(adj_file, index=False)
else:
    # Reuse the adjacency table written by an earlier run
    adj = pd.read_csv(adj_file)

## Create modules

In [None]:
module_cache = True

In [None]:
if not module_cache:
    # Build the modules from the adjacencies and the expression matrix, then
    # pickle them for later runs
    module = list(modules_from_adjacencies(adj, mat, rho_mask_dropouts=False))
    with open(module_file, 'wb') as f:
        pickle.dump(module, f)
else:
    # Load pre-computed modules (pickle: only load files you created yourself)
    with open(module_file, 'rb') as f:
        module = pickle.load(f)

## Prune graph by binding motifs 

In [None]:
reg_cache = True

In [None]:
if not reg_cache:
    # Prune the modules against the ranking databases and motif annotations
    reg = prune2df(
        db,
        module,
        motif_annotations_fname=motif_file,
        filter_for_annotation=True,
        num_workers=32,
    )
    # Keep a pickle of the result for further processing
    with open(reg_file, 'wb') as f:
        pickle.dump(reg, f)
else:
    # Load pre-computed regulons (pickle: only load files you created yourself)
    with open(reg_file, 'rb') as f:
        reg = pickle.load(f)

In [None]:
# Flatten the enrichment table into unique (TF, TargetGene) rows: one row per
# entry of each TargetGenes list, keeping the first item of every entry
# (presumably (gene, weight) pairs - confirm against the prune2df output).
reg_df = (
    reg[('Enrichment', 'TargetGenes')]
    .reset_index()
    .explode(('Enrichment', 'TargetGenes'))
)
reg_df['TargetGene'] = [target[0] for target in reg_df[('Enrichment', 'TargetGenes')]]
reg_df = reg_df[['TF', 'TargetGene']].drop_duplicates().reset_index(drop=True)

In [None]:
reg_df.to_csv('reg.csv')

## AUC matrix 

In [None]:
reg = df2regulons(reg)

In [None]:
auc_mtx = aucell(mat, reg, num_workers=4)
auc_mtx.to_csv(auc_file)

## AUC embedding UMAP

In [None]:
# Use the AUC matrix as a cell embedding: store it in obsm, build the
# neighbour graph on that representation, then compute a UMAP and plot it
# coloured by the celltype_low and sample_group annotations.
# NOTE(review): auc_mtx is stored exactly as returned by aucell; presumably
# its rows follow adata.obs_names - confirm.
adata.obsm['X_regulon'] = auc_mtx
sc.pp.neighbors(adata, use_rep='X_regulon')
sc.tl.umap(adata)
sc.pl.umap(adata, color=['celltype_low', 'sample_group'], size=20)