In [34]:
import os
import scipy
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import anndata as ad
import seaborn as sns
# Notebook-wide display settings: wider pandas tables and non-scientific
# numpy printing.
for _option, _value in (('display.max_columns', 100), ('display.max_rows', 200)):
    pd.set_option(_option, _value)
np.set_printoptions(suppress=True)

# scanpy figure defaults and log level.
sc.set_figure_params(dpi=100, color_map = 'viridis_r')
sc.settings.verbosity = 1

# Type 42 (TrueType) fonts in exported PDFs.
plt.rcParams['pdf.fonttype']=42

In [None]:

## 01. QC & Doublets Removal using Scrublet with Default Settings
# Step 1: run Scrublet on every sample's Cell Ranger filtered matrix and
#         collect the per-cell doublet score / call.
# Step 2: apply QC thresholds to the integrated object.
# Step 3: drop every cell that Scrublet called a doublet and save the result.
wdir="/path/to/cellranger/"
files=sorted(os.listdir(wdir)) ## list all sample ids (sorted for a reproducible order)
dub_list,score_list=[],[]
for i in files:
    # Components passed to os.path.join must be relative: a component that
    # starts with "/" makes it discard everything that precedes it.
    filtered=os.path.join(wdir,i,"outs","filtered_feature_bc_matrix.h5")
    if not os.path.isfile(filtered):
        # os.listdir also returns stray files/folders that are not samples.
        print('Skipping {}: {} not found'.format(i, filtered))
        continue
    c_filtered=sc.read_10x_h5(filtered, gex_only=True, backup_url=None)
    c_filtered.obs['dataset']=i
    c_filtered.obs['CB']=c_filtered.obs.index
    # Cell ids become "<sample>_<barcode>".
    # NOTE(review): the doublet removal below only works if the integrated
    # object uses this same id format - confirm.
    c_filtered.obs.index=c_filtered.obs['dataset'].str.cat(c_filtered.obs['CB'],sep='_')
    # Expected doublet rate grows linearly with the number of cells:
    # 0.4% per 1,000 cells (i.e. 4% for a 10K-cell sample).
    exp=c_filtered.n_obs/1000*0.4/100
    print('Sample {} Number of cells in cell list: {}'.format(i, c_filtered.n_obs))
    print('Sample {} Number of genes in gene list: {}'.format(i, c_filtered.n_vars))
    sc.external.pp.scrublet(c_filtered,expected_doublet_rate=exp,log_transform=True,n_prin_comps=30)
    score_list.append(c_filtered.obs['doublet_score'])
    dub_list.append(c_filtered.obs['predicted_doublet'])

score_list=pd.concat(score_list)
dub_list=pd.concat(dub_list)
dubs=dub_list[dub_list==1].index

adata=sc.read("/path/to/h5ad/integrated_gex.h5ad")

## QC
# Keep cells inside the RNA / ATAC count windows that also pass the
# mitochondrial, TSS-enrichment and nucleosome-signal cut-offs.
# NOTE(review): the 0.2 cut-off on "percent.mt" presumably expects that column
# to be stored as a fraction rather than as a 0-100 percentage - confirm.
obs=adata.obs
qc_pass=(
    (obs.nCount_RNA>1000)&(obs.nCount_RNA<50000)&
    (obs.nCount_ATAC>1000)&(obs.nCount_ATAC<100000)&
    (obs.nFeature_RNA>400)&
    (obs["percent.mt"]<0.2)&(obs['TSS.enrichment'].gt(1.0))&
    (obs['nucleosome_signal'].lt(2.0))
)
adata = adata[qc_pass,:]

## removal all predicted doublets
# .copy() materialises the view before it is written to disk.
adata_filtered = adata[~adata.obs.index.isin(dubs),:].copy()
adata_filtered.write("/path/to/h5ad/integrated_gex_filtered.h5ad")

In [None]:

## 02. UMAP plotting
adata=sc.read("/tank/data2/cw/snPlacenta/download/scPlacenta_host.h5ad")

# Hex colour for every cell-type label used in the figures.
# NOTE(review): "M"/"Ery" and "mVEC"/"EC" share a colour - confirm this is intended.
colors = {
    "SCT": "#3F84AA",
    "Epi": "#C9EBFB",
    "DSC": "#65A83E",
    "VCT": "#8870ad",
    "dNK": "#C3C388",
    "M": "#c9a997",
    "EVT": "#F397C0",
    "FB": "#C594BF",
    "HB": "#FBBE92",
    "PV": "#9e6762",
    "T": "#DFCDE4",
    "cDC": "#989898",
    "Ery": "#c9a997",
    "B": "#B51D8D",
    "LEC": "#FACB12",
    "eS": "#8EC792",
    "fVEC": "#eda450",
    "mVEC": "#EF4E22",
    "EC": "#EF4E22",
    "cili": "#139992",
}

# One colour per minor class, listed in sorted label order.
minor_classes = sorted(np.unique(adata.obs['minor_class']))
colPalette = [colors[label] for label in minor_classes]

sc.pl.scatter(
    adata,
    color='minor_class',
    basis="umap",
    legend_loc='on data',
    palette=colPalette,
    size=2,
)

In [None]:
## 03. Cell type composition across gestational ages
import re

## Piechart
def func(pct, allvals):
    """Format a pie-wedge percentage as a one-decimal label, e.g. "12.3%".

    pct     -- wedge share in percent.
    allvals -- the plotted counts; accepted for call compatibility, unused.

    Not used by the pies in this cell (they are drawn without autopct).
    """
    return "{:.1f}%".format(pct)

#adata=sc.read("/tank/data2/cw/snPlacenta/download/scPlacenta_host.h5ad")
stages=adata.obs.gestational_age_group.unique()
pal=sns.color_palette('tab20', 15).as_hex()
# Classes ordered by overall abundance; the same order is used in every pie so
# that a given class keeps the same colour across stages.
class_order=adata.obs.major_class.value_counts().index.astype('str')

# squeeze=False keeps `axs` two-dimensional even when there is a single stage.
fig, axs = plt.subplots(ncols=len(stages), nrows=1, figsize=(25,5), squeeze=False)
for ax,stage in zip(axs[0],stages):
    counts=adata.obs.loc[adata.obs['gestational_age_group']==stage,'major_class'].value_counts()
    counts.index=counts.index.astype('str')
    # reindex (instead of .loc) so a class absent from this stage becomes 0
    # rather than raising KeyError.
    counts=counts.reindex(class_order,fill_value=0)
    ax.pie(counts,labels=counts.index,colors=pal)
    ax.set_title(str(stage))
    ax.axis('off')


## Stacked Area Plot

def _first_int(label):
    """Return the first run of digits in `label` as an int, or NaN if there is none."""
    match = re.search(r'\d+', label)
    return int(match.group()) if match else np.nan

# Map every gestational-week label to a coarser gestational-age subgroup.
gws=pd.DataFrame(index=adata.obs.gestational_week.unique().astype('str'))
gws['gw'] = [_first_int(x) for x in gws.index]
# NOTE(review): pd.cut intervals are right-closed, so week 5 itself and weeks
# above 40 get no subgroup (NaN) and drop out of the plot - confirm intended.
gws['ga_subgroup']=pd.cut(gws['gw'], bins=[5, 9, 13, 20, 25, 40],labels=["early1st", "late1st", "early2nd",'late2nd','term'])
# The lookup keys are strings, so cast the column before mapping.
adata.obs['ga_subgroup']=adata.obs['gestational_week'].astype('str').map(dict(gws['ga_subgroup']))

# Cells per (subgroup, class). The count column is named explicitly because
# its default name differs between pandas versions (0 vs. 'count').
df=adata.obs[['ga_subgroup','major_class']].value_counts().reset_index(name='n_cells')
df['proportion'] = df.groupby('ga_subgroup')['n_cells'].transform(lambda x: x / x.sum())

pivot_df = df.pivot(index='ga_subgroup', columns='major_class', values='proportion')
pivot_df = pivot_df.fillna(0)
stage_order=['early1st','late1st','early2nd','late2nd','term']
# Keep chronological order; skip subgroups that have no cells.
pivot_df=pivot_df.reindex([s for s in stage_order if s in pivot_df.index])

# Look colours up from the plotted columns themselves so every class gets its
# own colour regardless of column order.
colPalette = [colors[c] for c in pivot_df.columns]

ax = pivot_df.plot.area(
    color=colPalette,
    figsize=(10,6),
    linewidth=0)

ax.set_xlabel("Time point")
ax.set_ylabel("Proportion")
ax.set_title("Cluster Composition Over Time")
ax.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(False)
plt.tight_layout()



In [None]:
## 04 Gene expression dotplot for markers
# Genes shown on the dot plot, in display order.
target_gene = [
    'TP63', 'TENM3', 'HLA-G', 'PRG2', 'ERVW-1', 'HOPX',
    'DKK1', 'LUM', 'PGR', 'IGF1', "BMP5", 'PDPN',
    "ACTA2", "AFF2", "POU5F1", "PAEP", "FOXJ1", "TP73",
    "PECAM1", 'VWF', 'MEOX2', 'PROX1', 'CCL21', 'IGHM',
    'BCL11A', "CD3D", "CD3G", "GNLY", "NKG7", "MS4A7",
    "CD14", 'MRC1', 'LYVE1', 'HLA-DRA', 'CD74', "HBA1",
    "HBB",
]
# Order in which the minor classes are laid out on the plot.
categories_order = [
    'VCT', "EVT", "SCT", "DSC", 'eS', "FB", "PV", "Epi", 'cili',
    "mVEC", 'fVEC', 'LEC', "B", "T", "dNK", "M", "HB", "cDC", "Ery",
]
# Expression is rescaled per gene (standard_scale='var') and taken from
# adata.X rather than adata.raw.
sc.pl.dotplot(
    adata,
    var_names=target_gene,
    groupby='minor_class',
    categories_order=categories_order,
    standard_scale='var',
    cmap='coolwarm',
    dot_max=0.6,
    use_raw=False,
)

In [None]:
## 05 piechart of maternal vs fetal origin per cell type 

def func(pct, allvals):
    """Format a pie-wedge percentage as a one-decimal label, e.g. "12.3%".

    pct     -- wedge share in percent, as supplied through `autopct`.
    allvals -- the plotted counts; accepted for call compatibility, unused.
    """
    return "{:.1f}%".format(pct)

# Cells per origin within the "FB" major class.
x=adata.obs.loc[adata.obs.major_class=="FB","origin"].value_counts()
# NOTE(review): the three colours are assigned by position, and value_counts
# orders by frequency, so the colour of a given origin can differ between
# cell types - confirm, or map colours by origin label.
plt.pie(x,labels=x.index,autopct=lambda pct: func(pct, x),colors=['dodgerblue','deeppink','grey'])




In [None]:
## 06 Heatmap plotting on motif activity by chromVar
chromvar=sc.read("/path/to/chromVar.h5ad")
# Restrict to the cells of `adata`, in the same order. .copy() turns the
# view into a real object before its annotations are replaced.
chromvar=chromvar[adata.obs.index,:].copy()
chromvar.obs=adata.obs.copy()
# (The former `scv.set_figure_params('scanpy')` call was dropped: scvelo is
# never imported in this notebook, so it raised NameError.)
sc.tl.rank_genes_groups(chromvar, groupby='minor_class',method='wilcoxon')
# NOTE(review): the ranking above is stored under the original var names;
# the renaming below does not update it.

motif_ids=pd.read_csv("/path/to/chromvar.motif.annotation.txt",sep="\t",header=0,index_col=2)
# NOTE(review): .var is replaced wholesale, which presumably relies on the
# annotation rows being in the same order as the motifs in chromvar - confirm.
chromvar.var=motif_ids
chromvar.var['MA']=chromvar.var.index          # keep the original ids as a column
chromvar.var.index=chromvar.var.jaspar2020     # use the jaspar2020 column as var names
# NOTE(review): these labels and the 'subclass' grouping differ from the
# 'minor_class' labels used elsewhere in the notebook - confirm that
# adata.obs carries a 'subclass' column with exactly these categories.
categories_order=['VCT',"EVT","SCT","STM","FB","PV","Epi","Endo","B","T","dNK","M","HB","cDC","Erythrocyte"]
TFs=['TP63','TEAD3',"TEAD1",'GATA3',"GCM1","TFAP2B","TFAP2A",
     "HAND2","HOXA10","EBF1","HNF1B","HOXB4","ETV1","IRF4","IKZF1","RUNX2","ETS1",'RUNX2',"SPI1","ELF1","ETV6",
     "GATA1::TAL1"]
# 'RUNX2' is listed twice above; drop repeats (order preserved) so no column is
# plotted twice. NOTE(review): the second entry may have been meant to be a
# different factor - confirm.
TFs=list(dict.fromkeys(TFs))
sc.pl.matrixplot(chromvar, TFs, categories_order=categories_order,groupby='subclass',dendrogram=False,
              cmap="bwr",standard_scale='var')

In [None]:
## 07 Heatmap of Enhancers with Differential Accessibility
# (matplotlib / seaborn / pandas / scanpy and the pdf.fonttype setting come
# from the first cell of the notebook.)

enhancer=sc.read("/path/to/F5.enhancer.h5ad")
enhancer_anno=pd.read_table("/path/to/F5.enhancer.anno",sep="\t",header=0,index_col=None)
# Put enhancer.var and the annotation table side by side. Both carry a fresh
# 0..n-1 index here, so rows are paired by position.
# NOTE(review): presumably the annotation file lists enhancers in the same
# order as enhancer.var - confirm.
enhancer_anno=pd.concat((enhancer.var.reset_index(),enhancer_anno),axis=1)
enhancer_anno.index=enhancer_anno.iloc[:,0]
enhancer_anno=enhancer_anno.drop(columns="index")

sc.tl.rank_genes_groups(enhancer, groupby='minor_class',method='wilcoxon')


order = ['VCT',"EVT","SCT",'DSC','eS',"FB",'PV',"Epi","cili",
        "mVEC","fVEC","LEC","B","T","dNK","M",'HB','cDC',"Ery"]

rgg = enhancer.uns['rank_genes_groups']

# Reorder the per-group columns of every result table to follow `order`.
for key in ['names', 'scores', 'logfoldchanges', 'pvals', 'pvals_adj']:
    arr = rgg[key]
    reordered = np.rec.fromarrays([arr[name] for name in order], names=order)
    enhancer.uns['rank_genes_groups'][key] = reordered


# NOTE(review): gene_symbols='name' here, while the lookup further down reads
# enhancer.var['gene_name'] - confirm both columns exist in enhancer.var.
sc.pl.rank_genes_groups_matrixplot(enhancer, n_genes=10, groupby="minor_class", gene_symbols='name',
                                key='rank_genes_groups',use_raw=False,dendrogram=False,standard_scale="var",
                                categories_order=order,swap_axes=True,save="enhancer_heatmap.pdf")

names_array = enhancer.uns['rank_genes_groups']['names']
group_names = names_array.dtype.names
n_genes = names_array.shape[0]

## map enhancer to annotation
# For each group, translate the ranked var names via enhancer.var['gene_name'];
# names without an entry are kept unchanged.
gene_name_lookup = enhancer.var['gene_name']
gene_symbol = [
    [gene_name_lookup.get(gene, gene) for gene in names_array[group]]
    for group in group_names
]

# The ranking was stored under scanpy's default key 'rank_genes_groups' (no
# key_added was given), so that is the key to read back.
enhancer_df=sc.get.rank_genes_groups_df(enhancer,group=None,key="rank_genes_groups")
enhancer_df=enhancer_df.merge(enhancer_anno,left_on="names",right_index=True)