In [1]:
#load packages

import os, glob, re, pickle
from functools import partial
from collections import OrderedDict
import operator as op
from cytoolz import compose

import pandas as pd
import seaborn as sns
import numpy as np
import scanpy as sc
import anndata as ad
import matplotlib as mpl
import matplotlib.pyplot as plt

from pyscenic.export import export2loom, add_scenic_metadata
from pyscenic.utils import load_motifs
from pyscenic.transform import df2regulons
from pyscenic.aucell import aucell
from pyscenic.binarization import binarize
from pyscenic.rss import regulon_specificity_scores
from pyscenic.plotting import plot_binarization, plot_rss

from IPython.display import HTML, display

print(sc.__version__)
! pip show dask

  data = yaml.load(f.read()) or {}


1.5.1
Name: dask
Version: 1.0.0
Summary: Parallel PyData with Task Scheduling
Home-page: http://github.com/dask/dask/
Author: None
Author-email: None
License: BSD
Location: /scbio4/home/kevinm/anaconda3/envs/pyscenic_py36/lib/python3.6/site-packages
Requires: 
Required-by: pyscenic, distributed, arboreto


In [2]:
# Set maximum number of jobs for Scanpy.
# The Scanpy setting is spelled `n_jobs`. Assigning to `njobs` does not touch it,
# so the intended job limit was never applied.
sc.settings.n_jobs = 32

In [57]:
#Change per dataset
# Folder holding this dataset's input files (expression matrix and cell metadata).
RESOURCES_FOLDERNAME = "../../MacroMono/Healthy/Samsung_Lee_Colon/"


# Folder with the shared SCENIC inputs (TF list, ranking databases, motif annotations).
AUXILLIARIES_FOLDERNAME = "../../TrailScenic/Auxillaries/"
# Folder that receives the result files whose names are built from DATASET_ID.
RESULTS_FOLDERNAME = "../../MacroMono/Healthy/Samsung_Output/"
# Folder for saved figures (Scanpy's figdir and the default target of savesvg).
FIGURES_FOLDERNAME = "../../TrailScenic/Figures/"

In [58]:
sc.settings.figdir = FIGURES_FOLDERNAME

In [59]:
# URL prefix that display_logos prepends to "<motif id>.png" to link a sequence logo.
BASE_URL = "http://motifcollections.aertslab.org/v9/logos/"
# Names used by display_logos: the logo column it adds, the index level that
# holds the motif ID, and the column holding the target genes.
COLUMN_NAME_LOGO = "MotifLogo"
COLUMN_NAME_MOTIF_ID = "MotifID"
COLUMN_NAME_TARGETS = "TargetGenes"

In [60]:
def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
    """
    Write a figure to `folder`/`fname` in SVG format.

    The figure's layout is tightened before it is written.

    :param fname: File name of the image inside `folder`.
    :param fig: Figure object providing `tight_layout()` and `savefig()`.
    :param folder: Target directory; defaults to the notebook's figures folder.
    """
    target_path = os.path.join(folder, fname)
    fig.tight_layout()
    fig.savefig(target_path, format='svg')

In [61]:
def display_logos(df: pd.DataFrame, top_target_genes: int = 3, base_url: str = BASE_URL):
    """
    Render the first rows of a motif enrichment table as HTML with sequence logos.

    :param df: Enrichment table with a "MotifID" index level and an
        ("Enrichment", "TargetGenes") column. The caller's frame is not modified.
    :param top_target_genes: Number of target-gene entries kept per row.
    :param base_url: URL prefix the logo image links are built from.
    """
    # Work on a copy so the original dataframe is not altered.
    df = df.copy()

    # Add column with an <img> tag pointing at each motif's sequence logo.
    def create_url(motif_id):
        return '<img src="{}{}.png" style="max-height:124px;"></img>'.format(base_url, motif_id)
    df[("Enrichment", COLUMN_NAME_LOGO)] = list(map(create_url, df.index.get_level_values(COLUMN_NAME_MOTIF_ID)))

    # Truncate TargetGenes.
    # NOTE(review): the entries are sorted ascending on element [1] and the first
    # `top_target_genes` are kept. If element [1] is a weight, this keeps the
    # lowest-weighted genes - confirm that this is the intended order.
    def truncate(col_val):
        return sorted(col_val, key=op.itemgetter(1))[:top_target_genes]
    df[("Enrichment", COLUMN_NAME_TARGETS)] = list(map(truncate, df[("Enrichment", COLUMN_NAME_TARGETS)]))

    # Lift the column-width limit only while rendering, and always restore the
    # previous value, even if rendering raises.
    max_col_width = pd.get_option('display.max_colwidth')
    try:
        try:
            # Current spelling of "no limit".
            pd.set_option('display.max_colwidth', None)
        except ValueError:
            # Older pandas releases only accept an integer here.
            pd.set_option('display.max_colwidth', -1)
        display(HTML(df.head().to_html(escape=False)))
    finally:
        pd.set_option('display.max_colwidth', max_col_width)

In [62]:
# Downloaded from the pySCENIC github repo: https://github.com/aertslab/pySCENIC/tree/master/resources
HUMAN_TFS_FNAME = os.path.join(AUXILLIARIES_FOLDERNAME, 'hs_hgnc_curated_tfs.txt')
# Ranking databases. Downloaded from cisTargetDB: https://resources.aertslab.org/cistarget/
RANKING_DBS_FNAMES = [
    os.path.join(AUXILLIARIES_FOLDERNAME, db_fname)
    for db_fname in (
        'hg19-500bp-upstream-10species.mc9nr.feather',
        'hg19-tss-centered-5kb-10species.mc9nr.feather',
        'hg19-tss-centered-10kb-10species.mc9nr.feather',
    )
]
# Motif annotations. Downloaded from cisTargetDB: https://resources.aertslab.org/cistarget/
MOTIF_ANNOTATIONS_FNAME = os.path.join(AUXILLIARIES_FOLDERNAME, 'motifs-v9-nr.hgnc-m0.001-o0.0.tbl')

In [63]:
# Prefix of every result file name built from the *_FNAME templates.
DATASET_ID = "Healthy_Macs_Samsung"
# Extra prefix that is only used in the loom file name.
TCGA_CODE = 'Healthy_Macs_Samsung'

In [64]:
#Adjust


# NOTE(review): the download notes below mention GEO / GSE115978 files, while the
# active paths point at the Samsung colon files - confirm the provenance comments.
# Downloaded from GEO on 28 FEB 2019.
#CELL_ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDERNAME, "GSE115978_cell.annotations.csv")
# Downloaded from Cell Journal website on 1 MAR 2019.
# Per-cell metadata table; it is read tab-separated with the first column as index.
SAMPLE_METADATA_FNAME = os.path.join(RESOURCES_FOLDERNAME, "Meta_Healthy_Macroverse_Samsung_Colon.txt")
# Downloaded from GEO on 1 MAR 2019.
# Expression matrix; it is read tab-separated with the first column as index.
EXP_MTX_TPM_FNAME = os.path.join(RESOURCES_FOLDERNAME, 'TPM_Colon_Samsung_Healthy.txt')
#EXP_MTX_COUNTS_FNAME = os.path.join(RESOURCES_FOLDERNAME, 'GSE115978_counts.csv')

In [65]:
# Result files: every name is prefixed with DATASET_ID and lives in RESULTS_FOLDERNAME.
METADATA_FNAME = os.path.join(RESULTS_FOLDERNAME, f'{DATASET_ID}.metadata.csv')
EXP_MTX_QC_FNAME = os.path.join(RESULTS_FOLDERNAME, f'{DATASET_ID}.qc.tpm.csv')
ADJACENCIES_FNAME = os.path.join(RESULTS_FOLDERNAME, f'{DATASET_ID}.adjacencies.tsv')
MOTIFS_FNAME = os.path.join(RESULTS_FOLDERNAME, f'{DATASET_ID}.motifs.csv')
REGULONS_DAT_FNAME = os.path.join(RESULTS_FOLDERNAME, f'{DATASET_ID}.regulons.dat')
AUCELL_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, f'{DATASET_ID}.auc.csv')
BIN_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, f'{DATASET_ID}.bin.csv')
THR_FNAME = os.path.join(RESULTS_FOLDERNAME, f'{DATASET_ID}.thresholds.csv')
ANNDATA_FNAME = os.path.join(RESULTS_FOLDERNAME, f'{DATASET_ID}.h5ad')
# The loom file additionally carries TCGA_CODE in front of the dataset ID.
LOOM_FNAME = os.path.join(RESULTS_FOLDERNAME, f'{TCGA_CODE}_{DATASET_ID}.loom')

Reading TPM and QC

In [66]:
#Read table
# df_tpm = sc.read_text(EXP_MTX_TPM_FNAME, delimiter='\t', first_column_names=True)
# Tab-separated expression table; the first column becomes the row index.
df_tpm = pd.read_csv(EXP_MTX_TPM_FNAME, sep='\t', index_col=0)
# Preview the first rows, then report the (rows, columns) shape of the table.
display(df_tpm.head())
df_tpm.shape

Unnamed: 0,SMC01-N_AACTCCCGTCCCTACT,SMC01-N_AACTCTTGTGGCAAAC,SMC01-N_ACATACGCATGGAATA,SMC01-N_AGAGCGAAGGGTTCCC,SMC01-N_ATGAGGGAGTTTGCGT,SMC01-N_CATATTCAGTTCGATC,SMC01-N_CCGGGATTCTCGAGTA,SMC01-N_CGGAGCTAGTCGAGTG,SMC01-N_CGTCAGGGTGAGTGAC,SMC01-N_CTACGTCTCTTGTACT,...,SMC10-N_TCGCGTTTCGCTTAGA,SMC10-N_TCGGTAACACTGTCGG,SMC10-N_TGAGCCGGTTCAGTAC,SMC10-N_TGCGTGGGTTGATTCG,SMC10-N_TGGCTGGTCGCCATAA,SMC10-N_TTATGCTTCCACGTGG,SMC10-N_TTCGGTCTCTATCGCC,SMC10-N_TTCTACAAGTCCTCCT,SMC10-N_TTCTCCTTCGACGGAA,SMC10-N_TTGAACGGTCTCCATC
A1BG,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
A1BG-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A1CF,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2M,0,9,1,10,0,0,1,0,2,2,...,0,0,0,2,1,1,0,0,0,2
A2M-AS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


(33694, 226)

In [67]:
# transpose the TPM HERE
df_tpm_T = df_tpm.T




In [None]:
#TRANSFORM MATRIX ONLY
import sklearn.preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler that maps every column onto the range [0, 2].
scaler = MinMaxScaler(feature_range=(0,2)) # default=(0, 1)
numerical = list(df_tpm_T.columns)

# Build the scaled table as a brand-new DataFrame. Wrapping the existing frame
# with pd.DataFrame(data=df_tpm_T) does not copy it, so assigning the scaled
# columns into that wrapper can also overwrite df_tpm_T, which the following
# cells still use as the unscaled matrix.
df_tpm_T_minmax = pd.DataFrame(
    data=scaler.fit_transform(df_tpm_T),
    index=df_tpm_T.index,
    columns=df_tpm_T.columns,
)

# Show an example of a record with scaling applied
display(df_tpm_T_minmax.head(n = 5))

In [68]:
# Load the per-cell metadata with the first column as the index
# (drop `index_col=0` to keep it as a regular column), then give the
# md0.001 / k150 UMAP and Phenograph columns short names.
df_samples = pd.read_csv(SAMPLE_METADATA_FNAME, sep='\t', index_col=0).rename(
    columns={
        '**UMAP1_md0.001_k150': 'UMAP1',
        '**UMAP2_md0.001_k150': 'UMAP2',
        '**Phenograph_md0.001_k150': 'Phenograph',
    }
)

df_samples.head()

Unnamed: 0,Tissue Ascites-1 Blood-2 Breast-3 Colon-4 Stomach-5 Kidney-6 Liver-7 Lung-8 Pancreas-9 Skin-10 Spleen-11 Tonsil-12,Tissue,Study,Patients,Study-No,Global Healthy-1_Cancer-2_Other-3,Condition per tissue,Patient No,CellType MirgDC-1 DC1-2 DC2-3 MacoMono-4,UMAP1_ByTissue,...,Phenograph_md0.001_k100,UMAP1,UMAP2,Phenograph,UMAP1_md0.005_k100,UMAP2_md0.005_k100,Phenograph_md0.005_k100,UMAP1_md0.005_k150,UMAP2_md0.005_k150,Phenograph_md0.005_k150
SMC01-N_AACTCCCGTCCCTACT,4,Colon,Samsung,Samsung_Healthy_1,36,1,1,1,4,728.522894,...,1,7388.413084,3137.684199,5,7896.658689,2917.895745,1,7413.556557,3083.216526,5
SMC01-N_AACTCTTGTGGCAAAC,4,Colon,Samsung,Samsung_Healthy_1,36,1,1,1,4,793.561794,...,13,3581.76492,5278.849458,7,4540.143945,6395.617257,13,3709.731085,5140.562114,7
SMC01-N_ACATACGCATGGAATA,4,Colon,Samsung,Samsung_Healthy_1,36,1,1,1,4,920.405479,...,5,3029.536182,4822.213721,2,4161.494683,7193.372248,5,3303.671401,4661.794719,2
SMC01-N_AGAGCGAAGGGTTCCC,4,Colon,Samsung,Samsung_Healthy_1,36,1,1,1,4,1041.91152,...,7,1388.72516,3853.947521,3,3097.507313,4244.649495,7,1363.265914,3876.192858,3
SMC01-N_ATGAGGGAGTTTGCGT,4,Colon,Samsung,Samsung_Healthy_1,36,1,1,1,4,1027.583236,...,5,2806.556856,4826.3594,2,3855.531861,7108.626219,5,3064.760422,4617.988419,2


In [36]:
df_samples

Unnamed: 0,Tissue Ascites-1 Blood-2 Breast-3 Colon-4 Stomach-5 Kidney-6 Liver-7 Lung-8 Pancreas-9 Skin-10 Spleen-11 Tonsil-12,Tissue,Study,Patients,Study-No,Global Healthy-1_Cancer-2_Other-3,Condition per tissue,Patient No,CellType MirgDC-1 DC1-2 DC2-3 MacoMono-4,UMAP1_ByTissue,...,Phenograph_md0.001_k100,UMAP1,UMAP2,Phenograph,UMAP1_md0.005_k100,UMAP2_md0.005_k100,Phenograph_md0.005_k100,UMAP1_md0.005_k150,UMAP2_md0.005_k150,Phenograph_md0.005_k150
SMC01-N_AACTCCCGTCCCTACT,4,Colon,Samsung,Samsung_Healthy_1,36,1,1,1,4,728.522894,...,1,7388.413084,3137.684199,5,7896.658689,2917.895745,1,7413.556557,3083.216526,5
SMC01-N_AACTCTTGTGGCAAAC,4,Colon,Samsung,Samsung_Healthy_1,36,1,1,1,4,793.561794,...,13,3581.764920,5278.849458,7,4540.143945,6395.617257,13,3709.731085,5140.562114,7
SMC01-N_ACATACGCATGGAATA,4,Colon,Samsung,Samsung_Healthy_1,36,1,1,1,4,920.405479,...,5,3029.536182,4822.213721,2,4161.494683,7193.372248,5,3303.671401,4661.794719,2
SMC01-N_AGAGCGAAGGGTTCCC,4,Colon,Samsung,Samsung_Healthy_1,36,1,1,1,4,1041.911520,...,7,1388.725160,3853.947521,3,3097.507313,4244.649495,7,1363.265914,3876.192858,3
SMC01-N_ATGAGGGAGTTTGCGT,4,Colon,Samsung,Samsung_Healthy_1,36,1,1,1,4,1027.583236,...,5,2806.556856,4826.359400,2,3855.531861,7108.626219,5,3064.760422,4617.988419,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SMC10-N_TTATGCTTCCACGTGG,4,Colon,Samsung,Samsung_Healthy_10,36,1,1,10,4,949.171506,...,5,2791.649470,4915.005621,2,3984.783078,7070.785500,5,3114.993502,4603.969250,2
SMC10-N_TTCGGTCTCTATCGCC,4,Colon,Samsung,Samsung_Healthy_10,36,1,1,10,4,1144.767843,...,17,7206.279865,3432.721042,5,7178.248092,3323.873218,17,7242.094421,3398.233501,5
SMC10-N_TTCTACAAGTCCTCCT,4,Colon,Samsung,Samsung_Healthy_10,36,1,1,10,4,1161.833563,...,6,7881.787548,1102.548340,8,8201.491873,1077.360780,6,7940.109646,1069.567636,8
SMC10-N_TTCTCCTTCGACGGAA,4,Colon,Samsung,Samsung_Healthy_10,36,1,1,10,4,714.915572,...,17,8374.662360,3243.579251,5,8491.108417,3096.566489,17,8325.930197,3169.509477,5


In [69]:
# Sort the cells once and pick the metadata rows for exactly those cell IDs, in
# that order. Sorting the two tables independently only lines them up when both
# contain the same set of cell IDs; otherwise the assignment to `adata.obs`
# fails or attaches metadata to the wrong cells.
obs_columns = ['Tissue','UMAP1', 'UMAP2', 'Phenograph']
df_tpm_sorted = df_tpm_T.sort_index()
cells_without_metadata = df_tpm_sorted.index.difference(df_samples.index)
if len(cells_without_metadata) > 0:
    raise ValueError(
        '{} cells of the expression matrix have no metadata row, e.g. {}'.format(
            len(cells_without_metadata), list(cells_without_metadata[:5])))

adata = sc.AnnData(X=df_tpm_sorted)
df_obs = df_samples.loc[df_tpm_sorted.index, obs_columns]

adata.obs = df_obs
adata.var_names_make_unique()
# Basic QC: drop cells with fewer than 200 detected genes and genes seen in fewer than 3 cells.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# Store non-log transformed data as raw. This data can be used via the use_raw parameters available for many functions.
# In the scanpy tutorials this is used to store all genes in log-transformed counts before retaining only Highly Variable Genes (HVG).
# Here it keeps the values as they are before the log1p transform applied on the next line.
adata.raw = adata
sc.pp.log1p(adata)
adata

AnnData object with n_obs × n_vars = 226 × 10196
    obs: 'Tissue', 'UMAP1', 'UMAP2', 'Phenograph', 'n_genes'
    var: 'n_cells'
    uns: 'log1p'

In [70]:
# Keep only the rows (cells) still listed in adata.obs; the columns of df_tpm_T
# are left as they are. `.copy()` makes the result independent of df_tpm_T.
df_tpm_F = df_tpm_T.loc[adata.obs.index.values].copy()


In [39]:
adata.write_h5ad(ANNDATA_FNAME) # Categorical dtypes are created.

... storing 'Tissue' as categorical


In [71]:
# Export the matrix that the pySCENIC CLI steps read. sc.pp.log1p was applied to
# `adata` when it was built, so the values written here are log-transformed.
adata.to_df().to_csv(EXP_MTX_QC_FNAME)

In [None]:
# adata.to_df()

STEP 1: Network inference based on GRNBoost2 from CLI
For this step the CLI version of SCENIC is used. This step can be deployed on a High Performance Computing system.

Output: List of adjacencies between a TF and its targets stored in ADJACENCIES_FNAME.

Check how much RAM is available on the server and adjust num_workers accordingly.


In [72]:
import sys
!{sys.executable} -m pip install fsspec>=0.3.3
!{sys.executable} -m pip install dask[dataframe] --upgrade
!{sys.executable} -m pip install distributed -U

Collecting dask[dataframe]
  Using cached dask-2.25.0-py3-none-any.whl (834 kB)
[31mERROR: pyscenic 0.10.0 has requirement dask==1.0.0, but you'll have dask 2.25.0 which is incompatible.[0m
Installing collected packages: dask
  Attempting uninstall: dask
    Found existing installation: dask 1.0.0
    Uninstalling dask-1.0.0:
      Successfully uninstalled dask-1.0.0
Successfully installed dask-2.25.0
Collecting distributed
  Using cached distributed-2.25.0-py3-none-any.whl (652 kB)
[31mERROR: pyscenic 0.10.0 has requirement dask==1.0.0, but you'll have dask 2.25.0 which is incompatible.[0m
[31mERROR: pyscenic 0.10.0 has requirement distributed<2.0.0,>=1.21.6, but you'll have distributed 2.25.0 which is incompatible.[0m
Installing collected packages: distributed
  Attempting uninstall: distributed
    Found existing installation: distributed 1.28.1
    Uninstalling distributed-1.28.1:
      Successfully uninstalled distributed-1.28.1
Successfully installed distributed-2.25.0


In [73]:
!pyscenic grn {EXP_MTX_QC_FNAME} {HUMAN_TFS_FNAME} -o {ADJACENCIES_FNAME} --num_workers 15


2020-09-02 12:49:24,916 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.

2020-09-02 12:49:26,049 - pyscenic.cli.pyscenic - INFO - Inferring regulatory networks.
Perhaps you already have a cluster running?
Hosting the HTTP server on port 46465 instead
  http_address["port"], self.http_server.port
preparing dask client
parsing input
  expression_matrix = expression_data.as_matrix()
creating dask graph
15 partitions
computing dask graph
not shutting down client, client was created externally
finished

2020-09-02 12:50:39,558 - pyscenic.cli.pyscenic - INFO - Writing results to file.


In [52]:
# check presence of output path
!ls ../../MacroMono/Healthy/Samsung_Lee_Colon/

Meta_Healthy_Macroverse_Samsung_Colon.txt  TPM_Colon_Samsung_Healthy.txt


In [None]:
# head output results
!head ../../TrailScenic/Results/MigrDCverse_Maier.adjacencies.tsv

In [74]:
import sys

# Reinstall the dask / distributed versions that pyscenic 0.10.0 declares as
# requirements (see the pip messages printed when they were upgraded).
!{sys.executable} -m pip install dask==1.0.0 distributed'>=1.21.6,<2.0.0'


Collecting dask==1.0.0
  Using cached dask-1.0.0-py2.py3-none-any.whl (685 kB)
Collecting distributed<2.0.0,>=1.21.6
  Using cached distributed-1.28.1-py2.py3-none-any.whl (517 kB)
Installing collected packages: dask, distributed
  Attempting uninstall: dask
    Found existing installation: dask 2.25.0
    Uninstalling dask-2.25.0:
      Successfully uninstalled dask-2.25.0
  Attempting uninstall: distributed
    Found existing installation: distributed 2.25.0
    Uninstalling distributed-2.25.0:
      Successfully uninstalled distributed-2.25.0
Successfully installed dask-1.0.0 distributed-1.28.1


STEP 2-3: Regulon prediction aka cisTarget from CLI
For this step the CLI version of SCENIC is used. This step can be deployed on a High Performance Computing system.

Output: Table of enriched motifs with their target genes, stored in MOTIFS_FNAME.

In [75]:
DBS_PARAM = ' '.join(RANKING_DBS_FNAMES)

In [76]:
!pyscenic ctx {ADJACENCIES_FNAME} {DBS_PARAM} \
            --annotations_fname {MOTIF_ANNOTATIONS_FNAME} \
            --expression_mtx_fname {EXP_MTX_QC_FNAME} \
            --output {MOTIFS_FNAME} \
            --num_workers 20

  data = yaml.load(f.read()) or {}

2020-09-02 12:50:51,237 - pyscenic.cli.pyscenic - INFO - Creating modules.

2020-09-02 12:50:51,660 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.

2020-09-02 12:50:52,840 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].

2020-09-02 12:51:05,225 - pyscenic.utils - INFO - Creating modules.

2020-09-02 12:51:55,057 - pyscenic.cli.pyscenic - INFO - Loading databases.

2020-09-02 12:51:55,057 - pyscenic.cli.pyscenic - INFO - Calculating regulons.
[                                        ] | 0% Completed | 21.3s
[                                        ] | 0% Completed | 23.6s
[                                        ] | 0% Completed | 26.7s
[                                        ] | 0% Completed | 26.9s
[                                        ] | 0% Completed | 27.0s
[                                        ] | 0% Completed | 27.4s
[                                        ] | 0% Co

[#                                       ] | 3% Completed | 53.7s
[#                                       ] | 3% Completed | 56.4s
[#                                       ] | 3% Completed | 59.8s
[#                                       ] | 3% Completed |  1min  4.3s
[#                                       ] | 3% Completed |  1min  5.8s
[#                                       ] | 3% Completed |  1min  7.9s
[#                                       ] | 3% Completed |  1min  9.6s
[#                                       ] | 3% Completed |  1min  9.8s
[#                                       ] | 3% Completed |  1min 10.3s
[#                                       ] | 3% Completed |  1min 12.0s
[#                                       ] | 3% Completed |  1min 13.4s

[#                                       ] | 3% Completed |  1min 14.0s
[#                                       ] | 3% Completed |  1min 14.3s
[#                                       ] | 3% Completed |  1min 15.6s
[#       

[##                                      ] | 5% Completed |  1min 39.4s
[##                                      ] | 5% Completed |  1min 41.3s
[##                                      ] | 5% Completed |  1min 42.2s
[##                                      ] | 5% Completed |  1min 43.0s
[##                                      ] | 6% Completed |  1min 47.4s
[##                                      ] | 6% Completed |  1min 49.0s
[##                                      ] | 6% Completed |  1min 49.6s
[##                                      ] | 6% Completed |  1min 51.7s
[##                                      ] | 6% Completed |  1min 56.4s

[###                                     ] | 7% Completed |  1min 59.0s
[###                                     ] | 7% Completed |  2min  0.6s
[###                                     ] | 7% Completed |  2min  1.5s
[###                                     ] | 7% Completed |  2min  8.8s
[###                                     ] | 8% Completed |  2m

[#######                                 ] | 19% Completed |  3min 29.5s
[########                                ] | 20% Completed |  3min 36.1s
[########                                ] | 21% Completed |  3min 37.3s
[##########                              ] | 25% Completed |  3min 48.5s
[############                            ] | 30% Completed |  4min  2.8s
[############                            ] | 30% Completed |  4min  4.8s

[############                            ] | 30% Completed |  4min 13.7s
[############                            ] | 31% Completed |  4min 17.7s
[############                            ] | 31% Completed |  4min 18.2s
[############                            ] | 31% Completed |  4min 19.2s
[############                            ] | 31% Completed |  4min 20.9s
[############                            ] | 31% Completed |  4min 21.8s
[############                            ] | 31% Completed |  4min 24.5s
[############                            ] | 31% C

[#############                           ] | 33% Completed |  4min 59.0s
[#############                           ] | 33% Completed |  5min  1.6s
[#############                           ] | 33% Completed |  5min  4.6s
[#############                           ] | 33% Completed |  5min  4.8s
[#############                           ] | 33% Completed |  5min  4.9s
[#############                           ] | 33% Completed |  5min  5.2s
[#############                           ] | 33% Completed |  5min  6.1s
[#############                           ] | 33% Completed |  5min  6.3s
[#############                           ] | 33% Completed |  5min  7.0s
[#############                           ] | 34% Completed |  5min 14.3s
[#############                           ] | 34% Completed |  5min 14.8s
[##############                          ] | 35% Completed |  5min 17.1s
[##############                          ] | 35% Completed |  5min 18.1s
[##############                          ] | 35% Co

[###############                         ] | 37% Completed |  6min 12.1s
[###############                         ] | 38% Completed |  6min 12.9s
[###############                         ] | 38% Completed |  6min 16.1s
[###############                         ] | 38% Completed |  6min 17.5s
[###############                         ] | 38% Completed |  6min 18.3s
[###############                         ] | 38% Completed |  6min 18.5s
[###############                         ] | 38% Completed |  6min 18.8s
[###############                         ] | 38% Completed |  6min 19.1s
[###############                         ] | 38% Completed |  6min 19.2s
[###############                         ] | 38% Completed |  6min 19.5s
[###############                         ] | 38% Completed |  6min 20.0s
[###############                         ] | 38% Completed |  6min 20.3s
[###############                         ] | 38% Completed |  6min 20.4s
[###############                         ] | 38% Co

[#######################                 ] | 59% Completed |  8min 20.9s
[#######################                 ] | 59% Completed |  8min 21.3s
[########################                ] | 61% Completed |  8min 34.4s
[########################                ] | 62% Completed |  8min 56.9s
[########################                ] | 62% Completed |  8min 57.2s
[########################                ] | 62% Completed |  9min  9.1s
[########################                ] | 62% Completed |  9min 10.5s
[#########################               ] | 63% Completed |  9min 14.5s
[#########################               ] | 63% Completed |  9min 17.0s
[#########################               ] | 63% Completed |  9min 20.4s
[#########################               ] | 63% Completed |  9min 37.3s
[#########################               ] | 63% Completed |  9min 37.8s
[#########################               ] | 63% Completed |  9min 41.1s
[#########################               ] | 63% Co

[###########################             ] | 67% Completed | 10min 40.0s
[###########################             ] | 67% Completed | 10min 41.2s
[###########################             ] | 67% Completed | 10min 54.1s
[###########################             ] | 67% Completed | 10min 59.0s
[###########################             ] | 67% Completed | 10min 59.7s
[###########################             ] | 67% Completed | 10min 59.8s
[###########################             ] | 67% Completed | 11min 12.0s
[###########################             ] | 67% Completed | 11min 15.6s
[###########################             ] | 67% Completed | 11min 15.9s
[###########################             ] | 67% Completed | 11min 17.4s
[###########################             ] | 67% Completed | 11min 21.0s
[###########################             ] | 68% Completed | 11min 28.8s
[###########################             ] | 68% Completed | 11min 38.0s
[###########################             ] | 68% Co

[############################            ] | 71% Completed | 13min  2.3s
[############################            ] | 71% Completed | 13min  2.7s
[############################            ] | 71% Completed | 13min  3.4s
[############################            ] | 72% Completed | 13min  7.8s
[############################            ] | 72% Completed | 13min 10.1s
[############################            ] | 72% Completed | 13min 10.4s
[############################            ] | 72% Completed | 13min 11.5s
[############################            ] | 72% Completed | 13min 11.7s
[############################            ] | 72% Completed | 13min 12.1s
[############################            ] | 72% Completed | 13min 13.0s
[############################            ] | 72% Completed | 13min 13.6s
[############################            ] | 72% Completed | 13min 13.8s
[############################            ] | 72% Completed | 13min 14.0s
[############################            ] | 72% Co

[####################################    ] | 90% Completed | 16min 43.8s
[########################################] | 100% Completed | 22min 41.6s

2020-09-02 13:14:38,048 - pyscenic.cli.pyscenic - INFO - Writing results to file.


In [77]:
df_motifs = load_motifs(MOTIFS_FNAME)

In [None]:
!head {MOTIFS_FNAME}

AUCELL Step

First checking genes 



In [None]:
# df_tpm has genes as rows and cells as columns, so the number of detected genes
# per cell is the count of non-zero entries down each column (axis=0). Summing
# along axis=1 would give the number of cells per gene instead.
nGenesDetectedPerCell = (df_tpm > 0).sum(axis=0)
percentiles = nGenesDetectedPerCell.quantile([.01, .05, .10, .50, 1])
print(percentiles)

In [None]:
# Histogram of detected genes per cell, with a red marker line and label at each percentile.
fig, ax = plt.subplots(1, 1, figsize=(8, 5), dpi=150)
sns.distplot(nGenesDetectedPerCell, norm_hist=False, kde=False, bins='fd', ax=ax)
quantile_levels = percentiles.index.values
for idx, n_genes in enumerate(percentiles):
    marker_label = f'{int(n_genes)} ({quantile_levels[idx]*100}%)'
    ax.axvline(x=n_genes, ymin=0, ymax=1, color='red')
    ax.text(x=n_genes, y=ax.get_ylim()[1], s=marker_label, color='red',
            rotation=30, size='x-small', rotation_mode='anchor')
ax.set_xlabel('# of genes')
ax.set_ylabel('# of cells')
fig.tight_layout()

In [78]:
# Load the table that `pyscenic ctx` wrote to MOTIFS_FNAME and turn it into regulons.
df_motifs = load_motifs(MOTIFS_FNAME)
regulons = df2regulons(df_motifs)
# Pickle these regulons.
with open(REGULONS_DAT_FNAME, 'wb') as f:
    pickle.dump(regulons, f)

Create regulons from a dataframe of enriched features.
Additional columns saved: []


In [79]:
%%time
auc_mtx = aucell(df_tpm_F, regulons, num_workers=32)
auc_mtx.to_csv(AUCELL_MTX_FNAME)

CPU times: user 8 s, sys: 3.4 s, total: 11.4 s
Wall time: 14 s


In [80]:
auc_mtx = pd.read_csv(AUCELL_MTX_FNAME, index_col=0)

In [81]:
auc_mtx

Unnamed: 0_level_0,AHRR(+),ARID3A(+),ARNT(+),ATF1(+),ATF2(+),ATF3(+),ATF4(+),ATF6(+),ATF6B(+),ATF7(+),...,ZNF655(+),ZNF740(+),ZNF76(+),ZNF768(+),ZNF770(+),ZNF84(+),ZNF91(+),ZNF92(+),ZSCAN29(+),ZXDC(+)
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SMC01-N_AACTCCCGTCCCTACT,0.040694,0.050123,0.146800,0.151504,0.026231,0.144559,0.224846,0.105522,0.037143,0.053446,...,0.228420,0.021117,0.033070,0.000000,0.410970,0.062789,0.086668,0.000000,0.055181,0.000000
SMC01-N_AACTCTTGTGGCAAAC,0.049680,0.015278,0.202667,0.074044,0.053344,0.176578,0.200587,0.141568,0.096421,0.110165,...,0.000000,0.098391,0.028013,0.075621,0.018552,0.079800,0.098339,0.503427,0.026651,0.000000
SMC01-N_ACATACGCATGGAATA,0.053243,0.085311,0.143175,0.070039,0.050894,0.158217,0.185201,0.137671,0.058695,0.069919,...,0.000000,0.000000,0.015700,0.095114,0.003867,0.029951,0.092647,0.059542,0.082450,0.160538
SMC01-N_AGAGCGAAGGGTTCCC,0.037011,0.040241,0.107707,0.048116,0.032943,0.180369,0.169854,0.156395,0.054143,0.108752,...,0.000000,0.064210,0.015492,0.017224,0.000000,0.055296,0.026975,0.044766,0.067079,0.000000
SMC01-N_ATGAGGGAGTTTGCGT,0.085284,0.041574,0.223104,0.063679,0.037870,0.185470,0.159960,0.130520,0.063915,0.096927,...,0.000000,0.000000,0.062451,0.106239,0.055549,0.041532,0.072616,0.014230,0.061644,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SMC10-N_TTATGCTTCCACGTGG,0.045889,0.058280,0.137478,0.055634,0.056252,0.133420,0.140491,0.124772,0.053647,0.087470,...,0.000000,0.000000,0.008940,0.000000,0.012811,0.022958,0.044474,0.082644,0.053338,0.375971
SMC10-N_TTCGGTCTCTATCGCC,0.041415,0.134708,0.140662,0.081518,0.050904,0.162420,0.206830,0.116514,0.060733,0.068994,...,0.268994,0.000000,0.023188,0.000000,0.000434,0.036346,0.091634,0.000000,0.024020,0.000000
SMC10-N_TTCTACAAGTCCTCCT,0.048902,0.077763,0.089594,0.088762,0.036153,0.141417,0.181010,0.103265,0.052323,0.128835,...,0.136158,0.042049,0.056191,0.000000,0.037774,0.055295,0.068338,0.000152,0.028413,0.000000
SMC10-N_TTCTCCTTCGACGGAA,0.050780,0.042480,0.138812,0.082621,0.051599,0.145904,0.201179,0.112036,0.078531,0.106011,...,0.217796,0.044729,0.008370,0.000000,0.060160,0.098506,0.088338,0.000000,0.085532,0.000000


Downstream Analysis


Still needs testing.


In [None]:
sc.pp.highly_variable_genes(adata)
# Subsetting an AnnData object returns a view. Take a real copy so the following
# cells can store their results (PCA, tSNE, obsm/obs entries) on a standalone object.
adata = adata[:, adata.var['highly_variable']].copy()

PCA run

In [None]:
sc.tl.pca(adata, svd_solver='arpack')

Run TSNE and UMAP

In [None]:
sc.tl.tsne(adata)

In [None]:
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['Phenograph'], 
           title=['MigrDCverse'], ncols=3, color_map="Set1",
          save=' - MigrDCverse_Phenograph.svg')

In [None]:
# Use one flat list of column names. A nested list ([['_X', '_Y']]) builds
# MultiIndex columns instead of the plain '_X' / '_Y' labels.
embedding_pca_tsne = pd.DataFrame(adata.obsm['X_tsne'], columns=['_X', '_Y'], index=adata.obs_names)

Run tSNE and UMAP on AUCell

In [None]:
add_scenic_metadata(adata, auc_mtx, regulons)
adata.write_h5ad(ANNDATA_FNAME)

In [None]:
print(len(auc_mtx))

In [None]:
auc_mtx

In [None]:
print (adata.n_obs)

We change the tSNE projection so that it relies on AUCell instead of PCA.

In [None]:
sc.tl.tsne(adata, use_rep='X_aucell')

In [None]:
auc_mtx_reindex = auc_mtx.reindex(adata.obs_names)
auc_mtx_reindex.head()

In [None]:
import umap
from MulticoreTSNE import MulticoreTSNE as TSNE

# Both embeddings are computed from auc_mtx_reindex, so its index (not
# auc_mtx.index) is the one that labels the output rows correctly.
embedding_index = auc_mtx_reindex.index

# NOTE(review): no random seed is passed to UMAP or tSNE, so the coordinates
# presumably differ between runs - consider fixing one for reproducibility.

# UMAP
runUmap = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='correlation').fit_transform
dr_umap = runUmap(auc_mtx_reindex)
pd.DataFrame(dr_umap, columns=['X', 'Y'], index=embedding_index).to_csv( "Output/scenic_umap_IDO_Macs.txt", sep='\t')

# tSNE
tsne = TSNE( n_jobs=20 )
dr_tsne = tsne.fit_transform(auc_mtx_reindex)
pd.DataFrame(dr_tsne, columns=['X', 'Y'], index=embedding_index).to_csv( "Output/scenic_tsne_IDO_macs.txt", sep='\t')

In [None]:
adata.obsm

In [None]:
adata.obsm['X_tsne'] = dr_tsne

In [None]:
adata.obsm['X_umap'] = dr_umap

In [None]:
adata.obs['Phenograph'] = adata.obs['Phenograph'].astype(str)
sc.pl.tsne(adata, color=['Phenograph'], palette="Paired", save='pdf');

In [None]:
sc.pl.umap(adata, color=['Tissue'], palette="Paired",save='UMAP_AUCELL_pdf_IDO_Macs_');

In [None]:
# adata.obsm['X_umap_offline']

In [None]:
df_dr_umap = pd.DataFrame(dr_umap, columns=['umap1', 'umap2'])
df_dr_umap.head()

In [None]:
# adata.obs['Phenograph'].values

In [None]:
# Attach each cell's Phenograph label and use the cell ID as the row index.
df_dr_umap = df_dr_umap.assign(
    celltype=adata.obs['Phenograph'].values,
    cellid=adata.obs_names.values,
).set_index('cellid', drop=True)
df_dr_umap.head()

In [None]:
# combine auc with the metadata
df_dr_umap_auc = pd.merge(auc_mtx_reindex, df_dr_umap, how='inner',  left_index=True, right_index=True)
df_dr_umap_auc.head()

In [None]:
df_dr_umap_auc.to_csv('Output/dr_umap_auc_IDO_Macs_transformed.csv', sep=',')

In [None]:
# Read back the table written by the previous cell. The former path
# ('output/dr_umap_auc_Maier.csv') used a different directory case and the file
# name of another dataset.
pd.read_csv('Output/dr_umap_auc_IDO_Macs_transformed.csv', sep=',', index_col=0)

### plotting

In [None]:
df_dr_umap['celltype'] = df_dr_umap['celltype'].astype(str)

In [None]:
df_dr_umap.dtypes

In [None]:
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

ax = sns.scatterplot(x="umap1", y="umap2", hue="celltype", s=5,
                     data=df_dr_umap)
ax

In [None]:
sc.set_figure_params(frameon=False, dpi=600, fontsize=10, dpi_save=600)

# Scanpy plotting functions take the AnnData object, not a bare coordinate
# array. The AUCell UMAP coordinates are stored in adata.obsm['X_umap'], so the
# embedding is plotted from `adata`.
sc.pl.umap(adata,
    color=['Phenograph'],
    title=['HVG - UMAP (UMAP AUcell)'],
    alpha=0.8,
    save='_MigrDC_AUcell_UMAP.pdf'
    )

In [None]:
# `x` / `y` of sc.pl.scatter are annotation keys, not coordinate arrays; the
# embedding stored in adata.obsm['X_umap_offline'] is selected through `basis`.
# NOTE(review): no visible cell assigns adata.obsm['X_umap_offline'] - confirm
# it is set before this cell is run.
sc.pl.scatter(adata,
    basis='umap_offline',
    color=['Phenograph'],
    title=['HVG - UMAP (UMAP AUcell)'],
    alpha=0.8,
    save='_MigrDC_AUcell_UMAP_test.pdf'
    )

In [None]:
# adata.obs.Phenograph
adata.obs['Phenograph']

In [None]:
sns.set()
sns.set(style='whitegrid', font_scale=0.8)

# Regulon specificity scores per Phenograph cluster. `rss` is not assigned in
# any earlier cell, so it is computed here. The AUC matrix is put into the same
# cell order as adata.obs, and the cluster labels are used as strings to match
# the panel labels below.
# NOTE(review): confirm that Phenograph is the grouping intended for these panels.
rss = regulon_specificity_scores(auc_mtx.reindex(adata.obs_names),
                                 adata.obs['Phenograph'].astype(str))

fig, ((ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8)) = plt.subplots(2, 4, figsize=(8, 6), dpi=100)
# One panel per cluster label '1' .. '8', laid out row by row.
for panel_idx, panel_ax in enumerate((ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8)):
    plot_rss(rss, str(panel_idx + 1), ax=panel_ax)
    if panel_idx < 4:
        # Top row: the x label is only kept on the bottom row.
        panel_ax.set_xlabel('')
    if panel_idx % 4 != 0:
        # Only the first column keeps its y label.
        panel_ax.set_ylabel('')
plt.tight_layout()
savesvg('plots - GSE103322 - rss.svg', fig)

Regulon analysis

In [None]:
df_obs = adata.obs
# Numeric obs columns whose name starts with 'Regulon(' are the regulon score columns.
signature_column_names = [
    col for col in df_obs.select_dtypes('number').columns
    if col.startswith('Regulon(')
]
df_scores = df_obs[signature_column_names + ['Phenograph']]

# Z-score of each cluster's mean regulon score against the mean and standard
# deviation over all cells, reshaped to one row per (cluster, regulon) pair.
cluster_means = df_scores.groupby(by='Phenograph').mean()
overall_mean = df_obs[signature_column_names].mean()
overall_std = df_obs[signature_column_names].std()
df_results = (
    ((cluster_means - overall_mean) / overall_std)
    .stack()
    .reset_index()
    .rename(columns={'level_1': 'regulon', 0:'Z'})
)
# Strip the leading 'Regulon(' and the trailing ')' from the column names.
df_results['regulon'] = [name[8:-1] for name in df_results.regulon]
df_results[(df_results.Z >= 3.0)].sort_values('Z', ascending=False).head()

In [None]:
# Cluster x regulon matrix of Z-scores, restricted to pairs with Z >= 2.0.
enriched_pairs = df_results[df_results.Z >= 2.0].sort_values('Z', ascending=False)
df_heatmap = pd.pivot_table(data=enriched_pairs, index='Phenograph', columns='regulon', values='Z')
#df_heatmap.drop(index='Myocyte', inplace=True) # We leave out Myocyte because many TFs are highly enriched (because of small number of cells).
fig, ax1 = plt.subplots(1, 1, figsize=(10, 8))
sns.heatmap(
    df_heatmap,
    ax=ax1,
    cmap="YlGnBu",
    annot=True,
    fmt=".1f",
    annot_kws={"size": 6},
    linewidths=.7,
    linecolor='gray',
    square=True,
    cbar=False,
)
ax1.set_ylabel('')
savesvg('heatmap - MigrDCall - regulons.svg', fig)

Calculation of specific regulons from the SCENIC results
Change per data analysis

In [None]:
# Rows of df_results with Z >= 1.0, highest Z first.
# NOTE(review): `data` is not used by the export below, which writes the full,
# unfiltered df_results - confirm which table is meant to be saved.
data=df_results[df_results.Z >= 1.0].sort_values('Z', ascending=False)
df_results.to_csv('Maier_Regulon_Z_Score_Pheno_1_5_7OUT.csv',sep=',')

In [None]:
sc.pl.umap(adata, color=['Phenograph', 'Regulon(STAT2(+))', 'Regulon(BATF(+))'],
           title=['MigrDCMaier - HNSC - Phenograph', 'STAT2', 'BATF'], ncols=3, use_raw=False,
          save=' - Maier_MigrDC - regulons_STAT2_BATF.svg')