In [None]:
#Load packages
import numpy as np
import pandas as pd
import scanpy as sc
import scrublet as scr
import os
import warnings
import functools
import seaborn as sns
import scipy.stats
import anndata
#import dandelion as ddl
import tables
from rpy2.robjects import r

In [None]:
from collections import defaultdict

In [None]:
#Change working directory
os.chdir("/home/jovyan/data/ClatCov/")


In [None]:
#load data 
adata = sc.read_h5ad('COV_recluster_nd_broad_label_080721.h5ad')

In [None]:
adata

In [None]:
#Visualise
sc.pl.umap(adata, color = 'fine_label')

In [None]:
sc.pl.umap(adata, color = 'broad_label')

In [None]:
#remove doublets
# Drop every cell flagged as DOUBLET at either annotation level in one pass.
# .copy() materialises a standalone AnnData instead of a view: the cells below
# write into this object in place (highly_variable_genes adds columns to .var),
# which on a view forces an implicit copy and an ImplicitModificationWarning.
is_doublet = (adata.obs['broad_label'] == 'DOUBLET') | (adata.obs['fine_label'] == 'DOUBLET')
adata = adata[~is_doublet].copy()
adata

In [None]:
# Identify highly-variable genes
sc.pp.highly_variable_genes(adata, min_mean = 0.0125, max_mean = 3, min_disp = 0.5)
# plot highly_variable_genes
sc.pl.highly_variable_genes(adata)

In [None]:
## Remove TRAV/TRBV and IGHV/IGKV/IGLV genes from the highly variable genes.
# TRGV/TRDV are deliberately kept: they are needed for gdT subset identification.
import re
# Vectorised prefix match over all gene names (same regex as before, applied
# with search semantics); na=False treats a missing gene name as "no match".
is_vdj_gene = adata.var_names.str.contains('^TR[AB]V|^IG[HKL]V', regex=True, na=False)
adata.var.loc[is_vdj_gene, 'highly_variable'] = False
sc.pl.highly_variable_genes(adata)

In [None]:
# filter to only highly variable
# .copy() turns the column-sliced view into a real AnnData; regress_out and
# scale below modify .X in place and must not operate on a view of the parent.
adata = adata[:, adata.var['highly_variable']].copy()
adata

In [None]:
import multiprocessing
# regress and scale for PCA
# Leave four cores free for other users, but never request fewer than one
# worker: cpu_count() - 4 is zero or negative on machines with <= 4 cores.
n_regress_jobs = max(1, multiprocessing.cpu_count() - 4)
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'], n_jobs = n_regress_jobs)


In [None]:
sc.pp.scale(adata, max_value = 10)


In [None]:
# Principal component analysis
sc.tl.pca(adata, svd_solver = 'arpack')
sc.pl.pca_variance_ratio(adata, log = True, n_pcs = 50)

In [None]:
# Computing the neighborhood graph. Seurat uses k = 20 as default
sc.pp.neighbors(adata, n_neighbors = 10, n_pcs = 50)

In [None]:
#What is split between subsets
pd.crosstab(adata.obs['fine_label'],adata.obs['COVID_severity'])

In [None]:
# run UMAP
sc.tl.umap(adata, n_components = 2, min_dist = 0.3)
sc.pl.umap(adata, color=['Sampleid','Patient', 'Sampletype'])

In [None]:
# run harmony for batch correction 
sc.external.pp.harmony_integrate(adata, 'Patient')
'X_pca_harmony' in adata.obsm


In [None]:
adata

In [None]:
# Compute the neighborhood graph on the Harmony batch-corrected embedding
# (use_rep='X_pca_harmony'). This uses k = 10 neighbours; Seurat's default is k = 20.
sc.pp.neighbors(adata, n_neighbors = 10, n_pcs = 50, use_rep='X_pca_harmony')

In [None]:
# UMAP with harmony batch correction
sc.tl.umap(adata, n_components = 2, min_dist = 0.3)
sc.pl.umap(adata, color=['Sampleid','Patient', 'Sampletype'])

In [None]:
# find broad clusters
sc.tl.leiden(adata, resolution =0.5)
sc.pl.umap(adata, color = 'leiden')

In [None]:
sc.pl.umap(adata, color = 'leiden', legend_loc='on data')

In [None]:
#What is split between meta objects
sc.pl.umap(adata, color=['Sampleid','Patient', 'Sampletype', 'Gender', 'leiden', 'Cohort'], ncols = 3)

In [None]:
sc.pl.umap(adata, color=['Sampletype', 'Gender', 'Cohort'], ncols = 3)

In [None]:
sc.pl.umap(adata, color=['leiden'], legend_loc = 'on data', ncols = 3)

In [None]:
# create a palette for umap: viridis with the lowest level replaced by light grey
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
# plt.get_cmap(name, lut) replaces matplotlib.cm.get_cmap, which was deprecated
# in matplotlib 3.7 and removed in 3.9; it returns the same 256-level colormap.
viridis = plt.get_cmap('viridis', 256)
newcolors = viridis(np.linspace(0, 1, 256))
grey = np.array([215/256, 215/256, 215/256, 1])
# Overwrite only the first (lowest-value) colour entry with grey.
newcolors[:1, :] = grey
newcmp = ListedColormap(newcolors)

In [None]:
# run marker gene test
sc.tl.rank_genes_groups(adata, groupby = 'leiden', method = 'wilcoxon', n_genes = 30000)

In [None]:
#Dotplot of top 10 DEG by cluster 
sc.pl.rank_genes_groups_dotplot(adata, n_genes = 10, standard_scale = 'var', color_map = 'viridis')

In [None]:
# Cluster marker genes from Chua et al., Nat Biotech 2020.
# Written as a whitespace-separated block and split into a list; order and the
# repeated entries (e.g. SFTPB/ANK2, IFIT1, FABP5) are preserved as in the source list.
Chuafeat = """
TP63 KRT5 S100A2 FABP5 SERPINB3 TMSB4X IFIT1 IFIT2 IFITM3 ISG15 ISG20 OAS1
SCGB1A1 SCGB3A1 XBP1 VMO1 MUC5AC PIGR FOXN4 CCNO MYCL CDC20B TUBA1B PCM1
FOXJ1 EFHC1 CCDC153 CCDC113 MLF1 LZTFL1 FOXI1 CFTR ASCL3 FOXI2
SFTPB ANK2 SFTPB ANK2 SPRR1A SPRR2A SPRR2D SPRR2E SPRR3 TMPRSS11E
IL1B VCAN CD14 CCL2 FCGR3A CXCL10 IFIT1 CD68 FABP5 FCER1A CD74 HLA-DQB1
HLA-DRA JCHAIN APOE NCAM1 HMGB2 STMN1 FOXP3 CTLA4 TNFRSF18 CD4 CD8B CD8A
PRF1 GZMA GZMB GNLY NKG7 CD3G CD3E KLRB1 IL32 S100A4 CD27 CD19 MS4A1 CD79A
IRF7 TLR7 CLEC4C IL3RA LYN FCGR3B ITGAX HPGD LTC4S CPA3 CD69 ITGA1 KIT HBB PPBP
""".split()
Chuafeat

In [None]:
# Marker genes for cell types from Travaglini et al. 2020 (Nature, human lung atlas).
# Kept in the original order, including repeated genes (the same marker is listed
# more than once, e.g. CD3E and COL1A1); split() turns the block into a list.
Trav_list = """
FOXJ1 KRT5 MUC5B MUC5B PRR4 CFTR CALCA DCLK1
AGER SFTPB ACKR1 CA4 PROX1 CNN1 CNN1 COL1A1
COL1A1 COL1A1 CSPG4 MSLN SNAP25 CD79A CD79A CD3E
CD3E CD3E CD3E KLRD1 CD3E S100A8 MS4A2 MS4A2
SIGLEC8 NRGN MARCO LILRB4 CLEC9A CD14 SCGB3A2 TUBB1
KRT14 MUC5AC LPO FOXI1 CHGA ASCL2 PDPN BMX
PDPN ACTA2 PDGFRA TRPC6 UPK3B CD24 CD27 CD8A
CD4 NKG7 CD8B S100A9 CPA3 PPBP MSR1 IRF8
LAMP3 CD1C S100A8 TP73 TP63 SPDEF LTF ASCL3
ASCL1 CLIC5 TAGLN ELN PLIN2 PDGFRB MS4A1 SLAMF7
GZMK COTL1 CCR7 TYROBP FCER1G IFITM2 TPSAB1 PF4
MRC1 LILRA4 PLD4 FCGR3A CCDC78 DAPL1 MUC1 RGS5
DES ACTA2 APOE CD19 DUSP2 GZMB LDHB LEF1
TYROBP FCGR3B OST4 ETV5 LGR6
""".split()
Trav_list

In [None]:
#Chua marker gene expression by cluster 
sc.pl.dotplot(adata, Chuafeat, groupby = 'leiden')

In [None]:
#Travalini marker genes by cluster
sc.pl.dotplot(adata,Trav_list, groupby = 'leiden', title = 'COV_ctrl_wholeobject_Travaglini')

In [None]:
# ILC marker panel plotted in the dotplots below, laid out several genes per row.
ILC_Bjorkland = [
    'CCL3', 'CXCR3', 'IFNG', 'IL12RB1', 'TBX21',
    'PTGDR2', 'IL17RB', 'IL1RL1', 'IL13', 'GATA3',
    'NCR2', 'IL22', 'RORC', 'AHR', 'IL23R', 'IL1R1',
    'EOMES', 'GZMA', 'GNLY', 'KLRC1',
]

In [None]:
ILC_Bjorkland

In [None]:
#Bjorkland 2016 Nature immunology 
sc.pl.dotplot(adata,ILC_Bjorkland, groupby = 'leiden', title = 'COV_ctrl_wholeobject_BjorklandILC')

In [None]:
#Zhang 2020 nature immunology t cell subsets single cell markers 
# Define the panel here: this cell is the first use of Zhang2020_T, and on a fresh
# kernel (Restart & Run All) the name would otherwise not exist yet. The same list
# is assigned again in the T cell section further down.
Zhang2020_T = ['CD3E', 'CD4', 'CD8A', 'CCR7', 'SELL', 'TCF7', 'LEF1', 'LTB',
               'S100A4', 'GPR183', 'CD69', 'GZMK', 'GZMA', 'GZMB', 'GNLY', 'NKG7',
               'FOXP3', 'IL2RA', 'TIGIT', 'HAVCR2', 'CTLA4', 'LAG3', 'PDCD1', 'TOX',
               'FCGR3A', 'KIR3DL2', 'TYROBP', 'NCAM1', 'CD160']
sc.pl.dotplot(adata, Zhang2020_T, groupby = 'leiden')

In [None]:
# Key marker genes used to recognise each broad cell cluster.
clustermark = [
    'CD3E', 'CD19', 'JCHAIN', 'CD4', 'CD8B', 'NCAM1',
    'TRDC', 'FCGR3B', 'KIT', 'GATA2', 'CD68', 'GZMK',
    'GZMA', 'GZMB', 'CCR7', 'SELL', 'FOXP3', 'EPCAM',
    'MUC5AC', 'HBB', 'PPBP', 'HLA-DRA', 'CLEC10A', 'CLEC4C',
]

In [None]:
#Plotted as matrixplot with scaling 
sc.pl.matrixplot(adata, clustermark, groupby = 'broad_label',dendrogram = True, standard_scale ='var')

In [None]:
sc.pl.umap(adata,color = 'leiden', legend_loc = 'on data')

In [None]:
sc.pl.umap(adata,color = 'broad_label', legend_loc = 'on data')

In [None]:
# give broad annotations based on gene expression profiles 
# Disabled: broad_label is already present in the loaded object. The whole dict
# is commented out (previously only its first line was, which left the remaining
# entries as a syntax error and made the cell unrunnable).
#new_dict = {'0':'B',
#           '1':'CD4 T',
#           '2':'MMP',
#           '3':'Plasma',
#           '4':'NK',
#           '5':'CD8 CTL',
#           '6':'Treg',
#           '7':'CD4 T',
#           '8':'Epithelial',
#           '9':'CD8 Tn',
#           '10':'Plasma',
#           '11':'CD4 T',
#           '12':'gdT',
#           '13':'CD4 T',
#           '14':'Megakaryocyte',
#           '15':'cDC',
#           '16':'CD4 T',
#           '17':'CD4 T',
#           '18':'?DOUBLET',
#           '19':'pDC',
#           '20':'Mast',
#           '21':'Epithelial',
#           '22':'B',
#           '23':'Erythrocyte'}
#adata.obs['broad_label'] = [new_dict[l] for l in adata.obs['leiden']]
#sc.pl.umap(adata, color=['leiden', 'broad_label'], legend_fontoutline=2, legend_loc = 'on data')

In [None]:
sc.pl.umap(adata, color = 'broad_label')

In [None]:
sc.pl.umap(adata, color = 'fine_label')

In [None]:
#What is split of cells across data? 
pd.crosstab(adata.obs['Patient'],adata.obs['broad_label'])

In [None]:
adata.obs

In [None]:
#add Mark's missing metadata
# Single .loc assignment instead of chained indexing (obs['Gender'][mask] = ...),
# which writes through a temporary and is not guaranteed to reach adata.obs.
adata.obs.loc[adata.obs['Patient'] == 'N01-001', 'Gender'] = 'M'

In [None]:
#add missing metadata
# Single .loc assignment instead of chained indexing (obs['Gender'][mask] = ...),
# which writes through a temporary and is not guaranteed to reach adata.obs.
adata.obs.loc[adata.obs['Patient'] == 'N01-002', 'Gender'] = 'F'

In [None]:
#add missing metadata
# Single .loc assignment instead of chained indexing (obs['Gender'][mask] = ...),
# which writes through a temporary and is not guaranteed to reach adata.obs.
adata.obs.loc[adata.obs['Patient'] == 'N01-005', 'Gender'] = 'M'

In [None]:
#add missing metadata
# Single .loc assignment instead of chained indexing (obs['Gender'][mask] = ...),
# which writes through a temporary and is not guaranteed to reach adata.obs.
adata.obs.loc[adata.obs['Patient'] == 'N01-011', 'Gender'] = 'M'

In [None]:
adata.obs['Gender'].value_counts()

In [None]:
adata

In [None]:
pd.crosstab(adata.obs['broad_label'], adata.obs['Diseasetype'])

In [None]:
adata.obs['COVID_severity'].value_counts()

In [None]:
adata.obs['Patient']

In [None]:
# Create new 'moderate-severe covid' variable and add to metadata.
# Every known patient starts as 'No'; the four moderate-severe patients are then
# switched to 'Yes' (updating existing keys keeps the original key order).
new_dict = dict.fromkeys(
    ['IRVAS_39', 'IRVAS_44',
     'N01-001', 'N01-002', 'N01-005', 'N01-011', 'N01-022',
     '1', '2', '3',
     'C20', 'C19', 'C24', 'C21', 'C17', 'C26',
     'CV004_2_4', 'CV004_2_5', 'CV004_2_6', 'CV004_2_7', 'CV004_2_8',
     'CV004_2_9', 'CV004_2_10', 'CV004_2_11', 'CV004_2_12'],
    'No',
)
new_dict.update(dict.fromkeys(['1', '2', 'CV004_2_4', 'CV004_2_5'], 'Yes'))
# Per-cell lookup; a patient id missing from new_dict raises KeyError.
adata.obs['moderate-severe'] = [new_dict[patient] for patient in adata.obs['Patient']]
adata.obs['moderate-severe'].value_counts()

In [None]:
#Check 
pd.crosstab(adata.obs['broad_label'],adata.obs['moderate-severe'])

In [None]:
pd.crosstab(adata.obs['Patient'],adata.obs['moderate-severe'])

In [None]:
pd.crosstab(adata.obs['Sampletype'],adata.obs['moderate-severe'])

In [None]:
#Save
# NOTE(review): the first write targets the same filename that this notebook loads
# at the top, so the input file is overwritten with the object as it stands here
# (after highly-variable-gene subsetting, regression and scaling). Re-running the
# notebook would then start from the processed object — confirm this is intended.
adata.write('COV_recluster_nd_broad_label_080721.h5ad', compression = 'gzip')
# Also store the contents of adata.raw as its own AnnData file.
adata.raw.to_adata().write('COV_recluster_nd_broad_label_080721_raw.h5ad', compression = 'gzip')

In [None]:
adata

In [None]:
sc.pl.umap(adata, color=['broad_label','leiden', 'Sampletype'])

## Sub-clustering for fine cell type annotation


In [None]:
sc.pl.umap(adata, color=['leiden', 'broad_label'], legend_fontoutline=2, legend_loc = 'on data')

### T cell

In [None]:
# subset T cell broad clusters 
rna_ = adata[adata.obs['leiden'].isin(['1','5','6','7','9','11','13','16','17'])]
rna_x = sc.AnnData(X = rna_.raw.X, obs = rna_.obs, var = rna_.raw.var)
rna_x.raw = rna_x
sc.pp.highly_variable_genes(rna_x, min_mean = 0.0125, max_mean = 3, min_disp = 0.5)
sc.pl.highly_variable_genes(rna_x)
rna_x

In [None]:
## remove TRBV/TRAV and IGHV/IGLV/IGKV from the highly variable genes
## allow for TRGV and TRDV so that i can do fine labelling for them
import re
# Vectorised prefix match over all gene names (same regex as before, applied
# with search semantics); na=False treats a missing gene name as "no match".
is_vdj_gene = rna_x.var_names.str.contains('^TR[AB]V|^IG[HKL]V', regex=True, na=False)
rna_x.var.loc[is_vdj_gene, 'highly_variable'] = False
sc.pl.highly_variable_genes(rna_x)

In [None]:
# subset to highly variable
# .copy() makes rna_x a standalone AnnData rather than a view, since the next
# cells (regress_out, scale) modify .X in place.
rna_x = rna_x[:, rna_x.var['highly_variable']].copy()


In [None]:
# regress and scale for PCA
sc.pp.regress_out(rna_x, ['total_counts', 'pct_counts_mt'])


In [None]:
sc.pp.scale(rna_x, max_value = 10)

In [None]:
# Principal component analysis
sc.tl.pca(rna_x, svd_solver = 'arpack')
sc.pl.pca_variance_ratio(rna_x, log = True, n_pcs = 50)

In [None]:
# Computing the neighborhood graph. Seurat uses k = 20 as default
sc.pp.neighbors(rna_x, n_neighbors = 10, n_pcs = 50)

In [None]:
# run UMAP
sc.tl.umap(rna_x, n_components = 2, min_dist = 0.3)
sc.pl.umap(rna_x, color=['Sampleid','Patient', 'Sampletype'])

In [None]:
sc.pl.umap(rna_x, color=['leiden'])

In [None]:
# run harmony
sc.external.pp.harmony_integrate(rna_x, 'Patient')
'X_pca_harmony' in rna_x.obsm


In [None]:
rna_x

In [None]:
# Computing the neighborhood graph. Seurat uses k = 20 as default
sc.pp.neighbors(rna_x, n_neighbors = 10, n_pcs = 50, use_rep='X_pca_harmony')

In [None]:
# UMAP
sc.tl.umap(rna_x, n_components = 2, min_dist = 0.3)
sc.pl.umap(rna_x, color=['Sampleid','Patient', 'Sampletype'])

In [None]:
sc.pl.umap(rna_x, color=['leiden'])

In [None]:
# find clusters
sc.tl.leiden(rna_x, resolution = 0.5, key_added = 'leiden_R')
sc.pl.umap(rna_x, color=['leiden', 'leiden_R'], size=6, legend_loc ='on data', legend_fontoutline=2)

In [None]:
# split specific clusters
sc.tl.leiden(rna_x, resolution = .4, key_added = 'leiden_R2', restrict_to =('leiden_R', ['1']))
sc.pl.umap(rna_x, color=['leiden', 'leiden_R2'], size=10, legend_loc ='on data', legend_fontoutline=2)

In [None]:
# split specific clusters
sc.tl.leiden(rna_x, resolution = .2, key_added = 'leiden_R2.1', restrict_to =('leiden_R2', ['2']))
sc.pl.umap(rna_x, color=['leiden', 'leiden_R2.1'], size=10, legend_loc ='on data', legend_fontoutline=2)

In [None]:
# split specific clusters
sc.tl.leiden(rna_x, resolution = .3, key_added = 'leiden_R2.2', restrict_to =('leiden_R2.1', ['5']))
sc.pl.umap(rna_x, color=['leiden', 'leiden_R2.2'], size=10, legend_loc ='on data', legend_fontoutline=2)

In [None]:
sc.pl.umap(rna_x, color = ['TRDV2', 'TRGV9', 'CD4', 'CD8B', 'FOXP3', 'CXCR5', 'PDCD1', 'CD8A','TRAV1-2', 'CD69', 'KLRB1','ITGAE'], size = 20)

In [None]:
sc.pl.umap(rna_x, color = ['SELL', 'CD44','IFNG', 'PTGDR2','CCR7','CCR8','CXCR3','CCR5','CD27', 'IL5', 'IL4', 'IL17A', 'TRGC2', 'TRDV1', 'CD3E','TRGC1'], size = 20)

In [None]:
sc.tl.rank_genes_groups(rna_x, groupby = 'leiden_R2.2', method = 'wilcoxon')
sc.tl.filter_rank_genes_groups(rna_x, min_fold_change=1)
sc.tl.dendrogram(rna_x, groupby = 'leiden_R2.2')
sc.pl.rank_genes_groups_dotplot(rna_x, n_genes = 10, standard_scale = 'var', color_map = 'viridis', key = 'rank_genes_groups_filtered')

In [None]:
sc.pl.dotplot(rna_x, Chuafeat, groupby = 'leiden_R2.2')

In [None]:
sc.pl.dotplot(rna_x, Trav_list, groupby = 'leiden_R2.2')

In [None]:
sc.pl.dotplot(rna_x, ILC_Bjorkland, groupby = 'leiden_R2.2')

In [None]:
# T cell subset marker panel (Zhang 2020), several genes per row.
Zhang2020_T = [
    'CD3E', 'CD4', 'CD8A',
    'CCR7', 'SELL', 'TCF7', 'LEF1', 'LTB',
    'S100A4', 'GPR183', 'CD69',
    'GZMK', 'GZMA', 'GZMB', 'GNLY', 'NKG7',
    'FOXP3', 'IL2RA', 'TIGIT', 'HAVCR2', 'CTLA4', 'LAG3', 'PDCD1', 'TOX',
    'FCGR3A', 'KIR3DL2', 'TYROBP', 'NCAM1', 'CD160',
]

In [None]:
sc.pl.matrixplot(rna_x, Zhang2020_T, groupby = 'leiden_R2.2', standard_scale = 'var')

In [None]:
# T cell marker panel ("faber"), several genes per row; order unchanged.
faber = [
    'TRAC', 'TRDC', 'CD3E', 'CD4', 'CD8A', 'CD8B',
    'SELL', 'CCR7', 'CD44', 'LEF1', 'KLF2', 'TCF7',
    'ENO1', 'LDHA', 'FOXP3', 'IL2RA', 'TIGIT', 'CD28',
    'PDCD1', 'CXCR5', 'ICOS', 'CCL5', 'GNLY', 'KLRB1',
    'CD69', 'ITGAE', 'CXCR6', 'ITGA1', 'B3GAT1', 'HNRNPLL',
    'CD244', 'PRF1', 'IL17A', 'PTGDR2', 'EOMES', 'TBX21',
    'CXCR3', 'GATA3', 'CCR5', 'IL7R', 'STAT4',
]

In [None]:
# Condensed T cell marker panel used for the matrix/dot plots; order unchanged.
Tcellplot = [
    'CD3E', 'CD4', 'CD8B', 'SELL', 'CCR7', 'CD44',
    'IL17A', 'STAT4', 'PTGDR2', 'GATA3',
    'CD69', 'ITGAE', 'ITGA1',
    'PRF1', 'GZMA', 'GZMB', 'GZMK',
    'TRAV1-2', 'KLRB1', 'TIGIT', 'PDCD1', 'ICOS',
    'FOXP3', 'CTLA4', 'HLA-DRA', 'S100A4',
]

In [None]:
sc.pl.matrixplot(rna_x, faber, groupby = 'leiden_R2.2', dendrogram = True, standard_scale = 'var')

In [None]:
sc.pl.matrixplot(rna_x, Tcellplot, groupby = 'leiden_R2.2', dendrogram = True, standard_scale = 'var')

In [None]:
sc.pl.umap(rna_x, color=['leiden_R2.2', 'Sampletype', 'celltypist'], legend_loc='on data')

In [None]:
sc.pl.umap(rna_x, color=['leiden'], legend_loc='on data')

In [None]:
sc.pl.matrixplot(rna_x, Tcellplot, groupby = 'fine_label', standard_scale = 'var')

In [None]:
sc.pl.dotplot(rna_x, Tcellplot, groupby = 'leiden_R2.1', standard_scale = 'var')

In [None]:
sc.pl.dotplot(rna_x, faber, groupby = 'leiden_R2.1', standard_scale = 'var')

In [None]:
new_dict1 = {'0':'CD4 Tn',
'1,0':'CD4 Tcm',
'1,1':'CD4 Tem',
'1,2':'CD4 Trm',
'2,0':'CD8 Trm',
'2,1':'CD8 CTL',
'2,2':'MAIT',
'3':'CD4 Th2',
'4':'CD8 Tn',
'5,0':'Treg',
'5,1':'Tfh',
'5,2':'CD4 Th17',
'5,3':'CD4 Act',
'6':'DOUBLET'}
rna_x.obs['subset_annotations'] = [new_dict1[l] for l in rna_x.obs['leiden_R2.2']]
sc.pl.umap(rna_x, color=['subset_annotations'], size=20, legend_loc ='on data', legend_fontoutline=2)

In [None]:
sc.pl.umap(adata, color = ['Sampletype'], size =10)

In [None]:
sc.pl.matrixplot(rna_x, Tcellplot, groupby = 'subset_annotations', standard_scale = 'var')

In [None]:
adata

In [None]:
adata.obs['subset_annotations'] = adata.obs['leiden'] 

In [None]:
# update the original object
# Build the merged labels on a standalone Series and assign it back. Calling
# .update() directly on adata.obs['subset_annotations'] mutates a temporary
# (chained in-place modification) and is not guaranteed to reach adata.obs.
merged_annotations = adata.obs['subset_annotations'].astype('object')
# Cells present in rna_x take their fine label; all other cells keep their value.
merged_annotations.update(rna_x.obs['subset_annotations'].astype('object'))
adata.obs['subset_annotations'] = merged_annotations
sc.pl.umap(adata, color = ['leiden', 'subset_annotations'], size = 10, legend_loc = 'on data', legend_fontoutline=2)

In [None]:
#save for now
# Save the object that was just annotated (adata). The previous code referenced
# adata_concat, which is not defined at this point in the notebook.
adata.write('COV_nd_T-fine_label_120721.h5ad', compression = 'gzip')
adata.raw.to_adata().write('COV_nd_T_fine_label_raw_120721.h5ad', compression = 'gzip')

In [None]:
sc.pl.umap(adata, color=['leiden','broad_label'], legend_loc = 'on data')

In [None]:
# also subset T/NK etc together just for graphical purposes
rna_ = adata[adata.obs['leiden'].isin(['1','5','6','7','9','11','13','16','17',
                                      '4', '12','20'])]
rna_x = sc.AnnData(X = rna_.raw.X, obs = rna_.obs, var = rna_.raw.var)
rna_x.raw = rna_x
sc.pp.highly_variable_genes(rna_x, min_mean = 0.0125, max_mean = 3, min_disp = 0.5)
sc.pl.highly_variable_genes(rna_x)
rna_x

In [None]:
Tcellplot

In [None]:
# CD8/NK/ILC marker panel. Defined here because this cell is its first use; on a
# fresh kernel the name would otherwise not exist yet (the same list is assigned
# again in the NK/mast/gdT section below).
CD8NK = ['CD3E', 'CD8B', 'SELL', 'CCR7',
         'TRDV1', 'TRDV2', 'TRDV3',
         'TYROBP', 'KLRC2', 'FCGR3A', 'NCAM1',
         'GZMA', 'GZMB', 'GZMK',
         'CD69', 'ITGAE', 'ITGA1',
         'GATA3', 'KIT', 'RORC', 'GATA2', 'FCER1A', 'IL3RA']
CD8NK

In [None]:
# Combined T/NK marker panel, several genes per row; order unchanged.
TNK = [
    'CD3E', 'CD4', 'CD8B', 'SELL', 'CCR7', 'CD44',
    'IL17A', 'STAT4', 'PTGDR2', 'GATA3',
    'CD69', 'ITGAE', 'ITGA1',
    'PRF1', 'GZMA', 'GZMB', 'GZMK',
    'TRAV1-2', 'KLRB1', 'TIGIT', 'PDCD1', 'ICOS',
    'FOXP3', 'CTLA4', 'HLA-DRA', 'S100A4',
    'TRDV1', 'TRDV2', 'TRDV3',
    'TYROBP', 'KLRC2', 'FCGR3A', 'NCAM1',
    'KIT', 'RORC', 'GATA2', 'FCER1A', 'IL3RA',
]

In [None]:
sc.pl.matrixplot(rna_x, TNK, groupby = 'subset_annotations', dendrogram = True, standard_scale = 'var')

In [None]:
rna_x.obs['subset_annotations'].value_counts()

In [None]:
#need to recluster T cells without doublets, but do this later. 

### NK/mast/gdT cell

In [None]:
# subset
# NOTE(review): the cluster ids selected here ('3', '13', '18') differ from the ids
# added for NK/gdT/mast in the combined T/NK subset earlier ('4', '12', '20'); '13'
# is also part of the T cell subset and '3' of the B cell subset — confirm these
# are the intended leiden clusters for this section.
rna_ = adata[adata.obs['leiden'].isin(['3', '13','18'])]
# Rebuild a fresh AnnData from the raw matrix/var of the selected cells, carrying
# over obs, uns, obsm and obsp from the subset.
rna_x = sc.AnnData(X = rna_.raw.X, obs = rna_.obs, var = rna_.raw.var, uns = rna_.uns, obsm = rna_.obsm, obsp = rna_.obsp)
# Keep this full-gene object as .raw before the highly-variable-gene subsetting below.
rna_x.raw = rna_x
sc.pp.highly_variable_genes(rna_x, min_mean = 0.0125, max_mean = 3, min_disp = 0.5)
sc.pl.highly_variable_genes(rna_x)
rna_x

In [None]:
## remove TRBV/TRAV and IGHV/IGLV/IGKV from the highly variable genes
## keep TCR gammadelta genes
import re
# Vectorised prefix match over all gene names (same regex as before, applied
# with search semantics); na=False treats a missing gene name as "no match".
is_vdj_gene = rna_x.var_names.str.contains('^TR[AB]V|^IG[HKL]V', regex=True, na=False)
rna_x.var.loc[is_vdj_gene, 'highly_variable'] = False
sc.pl.highly_variable_genes(rna_x)

In [None]:
# subset to highly variable
# .copy() makes rna_x a standalone AnnData rather than a view, because
# regress_out and scale below modify .X in place.
rna_x = rna_x[:, rna_x.var['highly_variable']].copy()
# regress and scale for PCA
sc.pp.regress_out(rna_x, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(rna_x, max_value = 10)

In [None]:
# Principal component analysis
sc.tl.pca(rna_x, svd_solver = 'arpack')
sc.pl.pca_variance_ratio(rna_x, log = True, n_pcs = 50)

In [None]:
# Computing the neighborhood graph. Seurat uses k = 20 as default
sc.pp.neighbors(rna_x, n_neighbors = 10, n_pcs = 50)

In [None]:
# run UMAP
sc.tl.umap(rna_x, n_components = 2, min_dist = 0.3)
sc.pl.umap(rna_x, color=['Sampleid','Patient', 'Sampletype'])

In [None]:
sc.pl.umap(rna_x, color=['leiden'])

In [None]:
# run harmony
sc.external.pp.harmony_integrate(rna_x, 'Patient')
'X_pca_harmony' in rna_x.obsm


In [None]:
rna_x

In [None]:
# Computing the neighborhood graph. Seurat uses k = 20 as default
sc.pp.neighbors(rna_x, n_neighbors = 10, n_pcs = 50, use_rep='X_pca_harmony')

In [None]:
# UMAP
sc.tl.umap(rna_x, n_components = 2, min_dist = 0.3)
sc.pl.umap(rna_x, color=['Sampleid','Patient', 'Sampletype'])

In [None]:
sc.pl.umap(rna_x, color=['leiden'])

In [None]:
# find clusters
sc.tl.leiden(rna_x, resolution = 1, key_added = 'leiden_R')
sc.pl.umap(rna_x, color=['leiden', 'leiden_R'], size=6, legend_loc ='on data', legend_fontoutline=2)

In [None]:
# split clusters
sc.tl.leiden(rna_x, resolution = .3, key_added = 'leiden_R2', restrict_to = ('leiden_R', ['6']))
sc.pl.umap(rna_x, color=['leiden', 'leiden_R2'], size=10, legend_loc ='on data', legend_fontoutline=2)

In [None]:
# split clusters
sc.tl.leiden(rna_x, resolution = .3, key_added = 'leiden_R2.1', restrict_to = ('leiden_R2', ['9']))
sc.pl.umap(rna_x, color=['leiden', 'leiden_R2.1'], size=10, legend_loc ='on data', legend_fontoutline=2)

In [None]:
# split clusters
sc.tl.leiden(rna_x, resolution = .2, key_added = 'leiden_R2.2', restrict_to = ('leiden_R2.1', ['10']))
sc.pl.umap(rna_x, color=['leiden', 'leiden_R2.2'], size=10, legend_loc ='on data', legend_fontoutline=2)

In [None]:
sc.pl.umap(rna_x, color = ['TRDC', 'TRGV9', 'CD4', 'CD8B', 'FOXP3', 'CXCR5', 'PDCD1', 'CD8A','TRAV1-2', 'CD69', 'KLRB1','ITGAE'], size = 20)

In [None]:
sc.pl.umap(rna_x, color = ['SELL', 'IFNG', 'PTGDR2','CCR3','CCR8','CXCR3','CCR5','IL13', 'IL5', 'IL4', 'IL17A', 'TRGC2', 'TRDV1', 'CD3E','TRGC1', 'TRDV2'], size = 20)

In [None]:
sc.pl.umap(rna_x, color = ['FCGR3A', 'NKG7','CD8B','CD3E','NCAM1','KLRC2','RORC','KIT', 'SIGLEC8','GATA3','GATA2'], size = 20)

In [None]:
# split specific clusters
#sc.tl.leiden(rna_x, resolution = .3, key_added = 'leiden_R2', restrict_to =('leiden_R', ['1']))
#sc.pl.umap(rna_x, color=['leiden', 'leiden_R2'], size=10, legend_loc ='on data', legend_fontoutline=2)

In [None]:
sc.tl.rank_genes_groups(rna_x, groupby = 'leiden_R2.2', method = 'wilcoxon')
sc.tl.filter_rank_genes_groups(rna_x, min_fold_change=1)
sc.tl.dendrogram(rna_x, groupby = 'leiden_R2.2')
sc.pl.rank_genes_groups_dotplot(rna_x, n_genes = 10, standard_scale = 'var', color_map = 'viridis', key = 'rank_genes_groups_filtered')

In [None]:
sc.pl.dotplot(rna_x, Chuafeat, groupby = 'leiden_R2.2')

In [None]:
sc.pl.dotplot(rna_x, Trav_list, groupby = 'leiden_R2.2')

In [None]:
sc.pl.dotplot(rna_x, ILC_Bjorkland, groupby = 'leiden_R2.2')

In [None]:
sc.pl.dotplot(rna_x, Zhang2020_T, groupby = 'leiden_R2.2')

In [None]:
sc.pl.matrixplot(rna_x,faber,groupby = 'leiden_R2.2',standard_scale ='var')

In [None]:
# CD8 / gdT / NK / ILC / mast marker panel, several genes per row; order unchanged.
CD8NK = [
    'CD3E', 'CD8B', 'SELL', 'CCR7',
    'TRDV1', 'TRDV2', 'TRDV3',
    'TYROBP', 'KLRC2', 'FCGR3A', 'NCAM1',
    'GZMA', 'GZMB', 'GZMK',
    'CD69', 'ITGAE', 'ITGA1',
    'GATA3', 'KIT', 'RORC', 'GATA2', 'FCER1A', 'IL3RA',
]

In [None]:
sc.pl.matrixplot(rna_x,CD8NK,groupby = 'leiden_R2.2',dendrogram = True,standard_scale = 'var')

In [None]:
sc.pl.umap(rna_x, color=[ 'Sampletype'], legend_fontoutline=2)

In [None]:
sc.pl.umap(rna_x, color=['fine_label', 'Sampletype'], legend_loc='on data', legend_fontoutline=2)

In [None]:
new_dict1 = {'0':'NK CD16+',
            '1':'CD8 CTL',
            '2':'NK CD16+',
            '3':'gdT vd2',
            '4':'NK CD56+',
            '5':'aNK KLRC2+',
            '6,0':'DOUBLET',
            '6,1':'CD8 Trm',
            '7':'ILC',
            '8':'gdT vd3',
            '9,0':'Basophil',
            '9,1':'DOUBLET',
            '10,0':'Mast',
            '10,1':'DOUBLET',
            '11':'gdT vd1',
            '12':'DOUBLET',
            '13':'NK CD16+'}
rna_x.obs['subset_annotations'] = [new_dict1[l] for l in rna_x.obs['leiden_R2.2']]
sc.pl.umap(rna_x, color=['subset_annotations'], size=20, legend_loc ='on data', legend_fontoutline=2)

In [None]:
sc.pl.matrixplot(rna_x,CD8NK,groupby = 'subset_annotations',dendrogram = True,standard_scale = 'var')

In [None]:
# update the original object
# Build the merged labels on a standalone Series and assign it back. Calling
# .update() directly on adata.obs['subset_annotations'] mutates a temporary
# (chained in-place modification) and is not guaranteed to reach adata.obs.
merged_annotations = adata.obs['subset_annotations'].astype('object')
# Cells present in rna_x take their fine label; all other cells keep their value.
merged_annotations.update(rna_x.obs['subset_annotations'].astype('object'))
adata.obs['subset_annotations'] = merged_annotations
sc.pl.umap(adata, color = ['leiden', 'subset_annotations'], size = 10, legend_loc = 'on data', legend_fontoutline=2)

In [None]:
#save for now
adata.write('COV_nd_TNK_fine_label_100721.h5ad', compression = 'gzip')
adata.raw.to_adata().write('COV_nd_TNK_fine_label_raw_100721.h5ad', compression = 'gzip')

In [None]:
sc.pl.umap(adata, color=['leiden','broad_label','subset_annotations'], legend_loc = 'on data')

In [None]:
#save for now
adata.write('COV_nd2_TNK_fine_label_110721.h5ad', compression = 'gzip')
adata.raw.to_adata().write('COV_nd2_TNK_fine_label_raw_110721.h5ad', compression = 'gzip')

### B cells

In [None]:
# subset
rna_ = adata[adata.obs['leiden'].isin(['0', '3', '10','22'])]
rna_x = sc.AnnData(X = rna_.raw.X, obs = rna_.obs, var = rna_.raw.var, uns = rna_.uns, obsm = rna_.obsm, obsp = rna_.obsp)
rna_x.raw = rna_x
sc.pp.highly_variable_genes(rna_x, min_mean = 0.0125, max_mean = 3, min_disp = 0.5)
sc.pl.highly_variable_genes(rna_x)
rna_x

In [None]:
rna_x.obs['subset_annotations'].value_counts()

In [None]:
## remove TRBV/TRAV and IGHV/IGLV/IGKV from the highly variable genes
import re
# Vectorised prefix match over all gene names (same regex as before, applied
# with search semantics); na=False treats a missing gene name as "no match".
is_vdj_gene = rna_x.var_names.str.contains('^TR[AB]V|^IG[HKL]V', regex=True, na=False)
rna_x.var.loc[is_vdj_gene, 'highly_variable'] = False
sc.pl.highly_variable_genes(rna_x)

In [None]:
# subset to highly variable
# .copy() makes rna_x a standalone AnnData rather than a view, because
# regress_out and scale below modify .X in place.
rna_x = rna_x[:, rna_x.var['highly_variable']].copy()
# regress and scale for PCA
sc.pp.regress_out(rna_x, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(rna_x, max_value = 10)

In [None]:
# Principal component analysis
sc.tl.pca(rna_x, svd_solver = 'arpack')
sc.pl.pca_variance_ratio(rna_x, log = True, n_pcs = 50)

In [None]:
# Computing the neighborhood graph. Seurat uses k = 20 as default
sc.pp.neighbors(rna_x, n_neighbors = 10, n_pcs = 50)

In [None]:
# run UMAP
sc.tl.umap(rna_x, n_components = 2, min_dist = 0.3)
sc.pl.umap(rna_x, color=['Sampleid','Patient', 'Sampletype'])

In [None]:
sc.pl.umap(rna_x, color=['leiden'])

In [None]:
# run harmony
sc.external.pp.harmony_integrate(rna_x, 'Patient')
'X_pca_harmony' in rna_x.obsm


In [None]:
rna_x

In [None]:
# Computing the neighborhood graph. Seurat uses k = 20 as default
sc.pp.neighbors(rna_x, n_neighbors = 10, n_pcs = 50, use_rep='X_pca_harmony')

In [None]:
# UMAP
sc.tl.umap(rna_x, n_components = 2, min_dist = 0.3)
sc.pl.umap(rna_x, color=['Sampleid','Patient', 'Sampletype'])

In [None]:
sc.pl.umap(rna_x, color=['leiden'])

In [None]:
# find clusters
sc.tl.leiden(rna_x, resolution = 0.3, key_added = 'leiden_R')
sc.pl.umap(rna_x, color=['leiden', 'leiden_R'], size=6, legend_loc ='on data', legend_fontoutline=2)

In [None]:
# split clusters
#sc.tl.leiden(rna_x, resolution = .2, key_added = 'leiden_R2', restrict_to = ('leiden_R', ['3']))
#sc.pl.umap(rna_x, color=['leiden', 'leiden_R2'], size=10, legend_loc ='on data', legend_fontoutline=2)

In [None]:
sc.pl.umap(rna_x, color = ['CD27', 'ITGA2B', 'CD19', 'CD3D', 'GNLY', 'JCHAIN', 'S100A8', 'CD3E'], size = 20)

In [None]:
sc.pl.umap(rna_x, color = ['PPBP', 'HBB','CD38'], size = 20)

In [None]:
sc.tl.rank_genes_groups(rna_x, groupby = 'leiden_R', method = 'wilcoxon')
sc.tl.filter_rank_genes_groups(rna_x, min_fold_change=1)
sc.tl.dendrogram(rna_x, groupby = 'leiden_R')
sc.pl.rank_genes_groups_dotplot(rna_x, n_genes = 10, standard_scale = 'var', color_map = 'viridis', key = 'rank_genes_groups_filtered')

In [None]:
sc.pl.dotplot(rna_x, Chuafeat, groupby = 'leiden_R')

In [None]:
sc.pl.dotplot(rna_x, Trav_list, groupby = 'leiden_R')

In [None]:
sc.pl.dotplot(rna_x, ILC_Bjorkland, groupby = 'leiden_R')

In [None]:
sc.pl.dotplot(rna_x, Zhang2020_T, groupby = 'leiden_R')

In [None]:
# Key B cell subset markers, several genes per row; order unchanged.
Bmark = [
    'CD19', 'MS4A1', 'CD38', 'JCHAIN', 'MME', 'CD27',
    'IGHD', 'IGHM', 'IGHE', 'IGHA1', 'IGHG1',
    'TNFRSF13C', 'TNFRSF13B', 'TNFRSF17',
    'CR2', 'CD5', 'FCER2', 'CD24', 'HLA-DRA', 'CD79A',
    'CXCR4', 'CXCR5', 'CD34', 'CD86', 'CD1D', 'CD74', 'CD40',
]
Bmark

In [None]:
# Shortened B cell marker panel for the compact matrix plots; order unchanged.
Bmarkshort = [
    'CD19', 'MS4A1', 'CD38', 'JCHAIN', 'MME', 'CD27',
    'IGHD', 'IGHM', 'IGHE', 'IGHA1', 'IGHG1',
    'HLA-DRA', 'CXCR4', 'CXCR5', 'CD1D',
]
Bmarkshort

In [None]:
sc.pl.dotplot(rna_x,Bmark,groupby='leiden_R')

In [None]:
sc.pl.umap(rna_x, color=['leiden_R'], size=6, legend_loc ='on data', legend_fontoutline=2)

In [None]:
sc.pl.matrixplot(rna_x,Bmark,groupby='leiden_R', standard_scale='var', dendrogram = True)

In [None]:
sc.pl.matrixplot(rna_x,Bmarkshort,groupby='leiden_R', standard_scale='var', dendrogram = True)

In [None]:
# Fine labels for the B-lineage leiden_R clusters (cluster id -> cell type).
new_dict1 = {
    '0': 'Bmem switched',
    '1': 'DOUBLET',
    '2': 'Bmem switched',
    '3': 'Plasmablast IgM+',
    '4': 'B FO',
    '5': 'Plasma',
    '6': 'Bmem non-switched',
    '7': 'Bmem non-switched',
    '8': 'Bmem switched',
    '9': 'Bmem switched',
    '10': 'B Naive',
    '11': 'Bmem switched',
    '12': 'Bmem switched',
    '13': 'Bmem switched',
    '14': 'DOUBLET',
}
# Per-cell lookup; a cluster id missing from new_dict1 raises KeyError.
rna_x.obs['subset_annotations'] = [new_dict1[cluster_id] for cluster_id in rna_x.obs['leiden_R']]
sc.pl.umap(rna_x, color=['subset_annotations'], size=20, legend_loc ='on data', legend_fontoutline=2)

In [None]:
sc.pl.matrixplot(rna_x,Bmarkshort,groupby='subset_annotations', standard_scale='var', dendrogram = True)

In [None]:
# update the original object
# Build the merged labels on a standalone Series and assign it back. Calling
# .update() directly on adata.obs['subset_annotations'] mutates a temporary
# (chained in-place modification) and is not guaranteed to reach adata.obs.
merged_annotations = adata.obs['subset_annotations'].astype('object')
# Cells present in rna_x take their fine label; all other cells keep their value.
merged_annotations.update(rna_x.obs['subset_annotations'].astype('object'))
adata.obs['subset_annotations'] = merged_annotations
sc.pl.umap(adata, color = ['leiden', 'subset_annotations'], size = 10, legend_loc = 'on data', legend_fontoutline=2)

In [None]:
#save for now
# Save the object that was just annotated (adata). The previous code referenced
# adata_concat, which is only created by the reload cell that follows and is
# therefore undefined here when the notebook is run top to bottom.
adata.write('COV_combined_TNKB-fine_label_060721.h5ad', compression = 'gzip')
adata.raw.to_adata().write('COV_combined_TNKB_fine_label_raw_060721.h5ad', compression = 'gzip')

In [None]:
# Reload the checkpoint after the kernel died
# NOTE(review): this restores into `adata_concat`, while the following cells keep
# operating on `adata` -- confirm which object is meant to be restored.
adata_concat = sc.read_h5ad(
    'COV_combined_TNKB-fine_label_060721.h5ad'
)
adata_concat

In [None]:
# Full-object UMAP: leiden clusters next to the broad labels
sc.pl.umap(
    adata,
    color=['leiden', 'broad_label'],
    size=10,
    legend_loc='on data',
    legend_fontoutline=2,
)

### Monocyte/macrophage/DC/pDC

In [None]:
# Subset to leiden clusters 2, 15 and 19
cluster_mask = adata.obs['leiden'].isin(['2', '15', '19'])
rna_ = adata[cluster_mask]
# Rebuild an AnnData on the .raw matrix/var of the subset, carrying over obs, uns, obsm and obsp
rna_x = sc.AnnData(
    X=rna_.raw.X,
    obs=rna_.obs,
    var=rna_.raw.var,
    uns=rna_.uns,
    obsm=rna_.obsm,
    obsp=rna_.obsp,
)
# Keep the current state as this object's own .raw before gene filtering
rna_x.raw = rna_x
# Flag and plot highly variable genes within the subset
sc.pp.highly_variable_genes(rna_x, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(rna_x)
rna_x

In [None]:
# Count the annotations these cells carry over from adata.obs (before relabelling below)
rna_x.obs['subset_annotations'].value_counts()

In [None]:
# Keep only the genes flagged as highly variable
hvg_mask = rna_x.var['highly_variable']
rna_x = rna_x[:, hvg_mask]
# Regress out total counts and mitochondrial fraction, then scale (values clipped at 10) for PCA
sc.pp.regress_out(rna_x, keys=['total_counts', 'pct_counts_mt'])
sc.pp.scale(rna_x, max_value=10)

In [None]:
# PCA (ARPACK solver), then the log-scaled variance ratio of the first 50 PCs
sc.tl.pca(rna_x, svd_solver='arpack')
sc.pl.pca_variance_ratio(rna_x, log=True, n_pcs=50)

In [None]:
# Neighbourhood graph on 50 PCs with k = 10 (Seurat uses k = 20 as default)
sc.pp.neighbors(
    rna_x,
    n_neighbors=10,
    n_pcs=50,
)

In [None]:
# 2-D UMAP before integration, coloured by sample, patient and sample type
sc.tl.umap(rna_x, n_components=2, min_dist=0.3)
sc.pl.umap(
    rna_x,
    color=['Sampleid', 'Patient', 'Sampletype'],
)

In [None]:
# Same embedding coloured by the original leiden clusters
sc.pl.umap(
    rna_x,
    color=['leiden'],
)

In [None]:
# Harmony integration with 'Patient' as the batch key
sc.external.pp.harmony_integrate(rna_x, key='Patient')
# Check that the corrected embedding was stored in obsm
'X_pca_harmony' in rna_x.obsm


In [None]:
# Display the AnnData summary after integration
rna_x

In [None]:
# Rebuild the neighbourhood graph on the harmony embedding (Seurat uses k = 20 as default)
sc.pp.neighbors(
    rna_x,
    n_neighbors=10,
    n_pcs=50,
    use_rep='X_pca_harmony',
)

In [None]:
# Recompute the UMAP on the new graph and re-check the sample/patient/sample-type colouring
sc.tl.umap(rna_x, n_components=2, min_dist=0.3)
sc.pl.umap(
    rna_x,
    color=['Sampleid', 'Patient', 'Sampletype'],
)

In [None]:
# Integrated embedding coloured by the original leiden clusters
sc.pl.umap(
    rna_x,
    color=['leiden'],
)

In [None]:
# Re-cluster the subset at resolution 0.5; result stored in obs['leiden_R']
sc.tl.leiden(rna_x, resolution=0.5, key_added='leiden_R')
sc.pl.umap(
    rna_x,
    color=['leiden', 'leiden_R'],
    size=6,
    legend_loc='on data',
    legend_fontoutline=2,
)

In [None]:
# Sub-cluster only leiden_R cluster 3 (resolution 0.2); result stored in obs['leiden_R2']
sc.tl.leiden(
    rna_x,
    resolution=0.2,
    key_added='leiden_R2',
    restrict_to=('leiden_R', ['3']),
)
sc.pl.umap(
    rna_x,
    color=['leiden', 'leiden_R2'],
    size=10,
    legend_loc='on data',
    legend_fontoutline=2,
)

In [None]:
# Sub-cluster only leiden_R2 cluster 5 (resolution 0.1); result stored in obs['leiden_R2.1']
sc.tl.leiden(
    rna_x,
    resolution=0.1,
    key_added='leiden_R2.1',
    restrict_to=('leiden_R2', ['5']),
)
sc.pl.umap(
    rna_x,
    color=['leiden', 'leiden_R2.1'],
    size=10,
    legend_loc='on data',
    legend_fontoutline=2,
)

In [None]:
# Expression of selected genes on the UMAP
sc.pl.umap(
    rna_x,
    color=[
        'CD14', 'FCGR3A', 'MRC1', 'CD68',
        'GNLY', 'CD3E', 'CD1C', 'CLEC9A',
        'C1QC', 'CLEC4C', 'PPBP', 'HBB',
    ],
    size=20,
)

In [None]:
# Wilcoxon marker ranking per leiden_R2.1 cluster
sc.tl.rank_genes_groups(rna_x, groupby='leiden_R2.1', method='wilcoxon')
# Filter the ranking (min_fold_change=1); the plot below reads 'rank_genes_groups_filtered'
sc.tl.filter_rank_genes_groups(rna_x, min_fold_change=1)
sc.tl.dendrogram(rna_x, groupby='leiden_R2.1')
# Dot plot of the top 10 filtered genes per cluster
sc.pl.rank_genes_groups_dotplot(
    rna_x,
    n_genes=10,
    standard_scale='var',
    color_map='viridis',
    key='rank_genes_groups_filtered',
)

In [None]:
# Dot plot of the Chuafeat gene list per leiden_R2.1 cluster
sc.pl.dotplot(rna_x, var_names=Chuafeat, groupby='leiden_R2.1')

In [None]:
# Dot plot of the Trav_list gene list per leiden_R2.1 cluster
sc.pl.dotplot(rna_x, var_names=Trav_list, groupby='leiden_R2.1')

In [None]:
# Marker panel plotted per annotation with sc.pl.matrixplot further down
MMPlist = [
    'CD14', 'FCGR3A', 'CD68',
    'C1QC', 'APOE',
    'LILRA4', 'CLEC4C',
    'CLEC9A', 'CLEC10A',
]

In [None]:
# Final sub-clusters next to sample type, labels drawn on the embedding
sc.pl.umap(
    rna_x,
    color=['leiden_R2.1', 'Sampletype'],
    legend_loc='on data',
)

In [None]:
# leiden_R2.1 clusters grouped by the annotation they receive
labels_to_clusters = {
    'Monocyte classical': ['0', '1', '2', '8'],
    'Monocyte non-classical': ['3,0'],
    'Macrophage non-resident': ['3,1'],
    'DOUBLET': ['4'],
    'cDC CD1c+': ['5,0'],
    'cDC CD141+': ['5,1'],
    'pDC': ['6'],
    'Macrophage resident': ['7'],
}
# Flatten into the cluster -> annotation lookup
new_dict1 = {
    cluster: label
    for label, clusters in labels_to_clusters.items()
    for cluster in clusters
}
# A cluster id missing from the lookup raises KeyError here
rna_x.obs['subset_annotations'] = [new_dict1[cluster] for cluster in rna_x.obs['leiden_R2.1']]
sc.pl.umap(
    rna_x,
    color=['subset_annotations'],
    size=20,
    legend_loc='on data',
    legend_fontoutline=2,
)

In [None]:
# Sample type on the subset UMAP
sc.pl.umap(
    rna_x,
    color=['Sampletype'],
    size=20,
    legend_fontoutline=2,
)

In [None]:
# MMPlist panel per assigned annotation (each gene scaled to 0-1, with dendrogram)
sc.pl.matrixplot(
    rna_x,
    var_names=MMPlist,
    groupby='subset_annotations',
    standard_scale='var',
    dendrogram=True,
)

In [None]:
# update the original object
# Merge on a standalone Series and assign it back. Calling `.update()` directly
# on `adata.obs['subset_annotations']` only reaches `adata.obs` when the column
# accessor returns a view, which is not the case under pandas Copy-on-Write.
merged_annotations = adata.obs['subset_annotations'].astype('object')
# Series.update aligns on the index, so only cells present in rna_x are overwritten
merged_annotations.update(rna_x.obs['subset_annotations'].astype('object'))
adata.obs['subset_annotations'] = merged_annotations
sc.pl.umap(adata, color = ['leiden', 'subset_annotations'], size = 10, legend_loc = 'on data', legend_fontoutline=2)

In [None]:
# Full-object UMAP: leiden clusters next to the broad labels
sc.pl.umap(
    adata,
    color=['leiden', 'broad_label'],
    size=10,
    legend_loc='on data',
    legend_fontoutline=2,
)

In [None]:
#save for now
# Checkpoint `adata`, the object whose `subset_annotations` column is updated in
# the cells above; `adata_concat` is not modified by any annotation cell here.
adata.write('COV_combined_TNKBMMP-fine_label_070721.h5ad', compression = 'gzip')
# Also write the .raw matrix as a standalone AnnData file
adata.raw.to_adata().write('COV_combined_TNKBMMP_fine_label_raw_070721.h5ad', compression = 'gzip')

### Epithelial

In [None]:
# Subset to leiden clusters 8 and 21
cluster_mask = adata.obs['leiden'].isin(['8', '21'])
rna_ = adata[cluster_mask]
# Rebuild an AnnData on the .raw matrix/var of the subset, carrying over obs, uns, obsm and obsp
rna_x = sc.AnnData(
    X=rna_.raw.X,
    obs=rna_.obs,
    var=rna_.raw.var,
    uns=rna_.uns,
    obsm=rna_.obsm,
    obsp=rna_.obsp,
)
# Keep the current state as this object's own .raw before gene filtering
rna_x.raw = rna_x
# Flag and plot highly variable genes within the subset
sc.pp.highly_variable_genes(rna_x, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(rna_x)
rna_x

In [None]:
# Keep only the genes flagged as highly variable
hvg_mask = rna_x.var['highly_variable']
rna_x = rna_x[:, hvg_mask]
# Regress out total counts and mitochondrial fraction, then scale (values clipped at 10) for PCA
sc.pp.regress_out(rna_x, keys=['total_counts', 'pct_counts_mt'])
sc.pp.scale(rna_x, max_value=10)

In [None]:
# PCA (ARPACK solver), then the log-scaled variance ratio of the first 50 PCs
sc.tl.pca(rna_x, svd_solver='arpack')
sc.pl.pca_variance_ratio(rna_x, log=True, n_pcs=50)

In [None]:
# Neighbourhood graph on 50 PCs with k = 10 (Seurat uses k = 20 as default)
sc.pp.neighbors(
    rna_x,
    n_neighbors=10,
    n_pcs=50,
)

In [None]:
# 2-D UMAP before integration, coloured by sample, patient and sample type
sc.tl.umap(rna_x, n_components=2, min_dist=0.3)
sc.pl.umap(
    rna_x,
    color=['Sampleid', 'Patient', 'Sampletype'],
)

In [None]:
# Same embedding coloured by the original leiden clusters
sc.pl.umap(
    rna_x,
    color=['leiden'],
)

In [None]:
# Harmony integration with 'Patient' as the batch key
sc.external.pp.harmony_integrate(rna_x, key='Patient')
# Check that the corrected embedding was stored in obsm
'X_pca_harmony' in rna_x.obsm


In [None]:
# Display the AnnData summary after integration
rna_x

In [None]:
# Rebuild the neighbourhood graph on the harmony embedding (Seurat uses k = 20 as default)
sc.pp.neighbors(
    rna_x,
    n_neighbors=10,
    n_pcs=50,
    use_rep='X_pca_harmony',
)

In [None]:
# Recompute the UMAP on the new graph and re-check the sample/patient/sample-type colouring
sc.tl.umap(rna_x, n_components=2, min_dist=0.3)
sc.pl.umap(
    rna_x,
    color=['Sampleid', 'Patient', 'Sampletype'],
)

In [None]:
# Integrated embedding coloured by the original leiden clusters
sc.pl.umap(
    rna_x,
    color=['leiden'],
)

In [None]:
# Re-cluster the subset at resolution 0.5; result stored in obs['leiden_R']
sc.tl.leiden(rna_x, resolution=0.5, key_added='leiden_R')
sc.pl.umap(
    rna_x,
    color=['leiden', 'leiden_R'],
    size=6,
    legend_loc='on data',
    legend_fontoutline=2,
)

In [None]:
# Sub-cluster only leiden_R cluster 6 (resolution 0.2); result stored in obs['leiden_R2']
sc.tl.leiden(
    rna_x,
    resolution=0.2,
    key_added='leiden_R2',
    restrict_to=('leiden_R', ['6']),
)
sc.pl.umap(
    rna_x,
    color=['leiden', 'leiden_R2'],
    size=10,
    legend_loc='on data',
    legend_fontoutline=2,
)

In [None]:
# Expression of selected genes on the UMAP
sc.pl.umap(
    rna_x,
    color=[
        'EPCAM', 'KRT19', 'MUC5AC', 'PRR4',
        'COL1A1', 'ACTA2', 'FOXJ1', 'ACKR1',
        'PPBP', 'HBB', 'CD1A', 'ASCL2',
        'FDCSP',
    ],
    size=20,
)

In [None]:
# Wilcoxon marker ranking per leiden_R2 cluster
sc.tl.rank_genes_groups(rna_x, groupby='leiden_R2', method='wilcoxon')
# Filter the ranking (min_fold_change=1); the plot below reads 'rank_genes_groups_filtered'
sc.tl.filter_rank_genes_groups(rna_x, min_fold_change=1)
sc.tl.dendrogram(rna_x, groupby='leiden_R2')
# Dot plot of the top 10 filtered genes per cluster
sc.pl.rank_genes_groups_dotplot(
    rna_x,
    n_genes=10,
    standard_scale='var',
    color_map='viridis',
    key='rank_genes_groups_filtered',
)

In [None]:
# Dot plot of the Chuafeat gene list per leiden_R2 cluster
sc.pl.dotplot(rna_x, var_names=Chuafeat, groupby='leiden_R2')

In [None]:
# Dot plot of the Trav_list gene list per leiden_R2 cluster
sc.pl.dotplot(rna_x, var_names=Trav_list, groupby='leiden_R2')

In [None]:
# Dot plot of the ILC_Bjorkland gene list per leiden_R2 cluster
sc.pl.dotplot(rna_x, var_names=ILC_Bjorkland, groupby='leiden_R2')

In [None]:
# Marker panel passed to sc.pl.matrixplot in the next cell
epimark = [
    'EPCAM', 'KRT5', 'KRT19', 'MUC5AC', 'FDCSP',
    'FOXJ1', 'FOXI1', 'CFTR', 'ANK2', 'SPRR3',
    'TMPRSS11E', 'FCER1A', 'COL1A1', 'ACTA2', 'ACKR1',
]

In [None]:
# epimark panel per annotation (each gene scaled to 0-1, with dendrogram)
# NOTE(review): 'subset_annotations' is only reassigned for this subset in a later
# cell, so on a top-to-bottom run this groups by the labels inherited from
# adata.obs -- confirm that is intended.
sc.pl.matrixplot(
    rna_x,
    var_names=epimark,
    groupby='subset_annotations',
    dendrogram=True,
    standard_scale='var',
)

In [None]:
# Count the annotations currently stored on these cells (the relabelling cell comes further down)
rna_x.obs['subset_annotations'].value_counts()

In [None]:
# leiden_R2 clusters with labels drawn on the embedding
sc.pl.umap(
    rna_x,
    color='leiden_R2',
    legend_loc='on data',
)

In [None]:
# leiden_R2 clusters grouped by the annotation they receive
labels_to_clusters = {
    'Secretory epithelial': ['0', '5', '6,0', '8', '9'],
    'DOUBLET': ['1', '4'],
    'Ciliated epithelial': ['2', '11'],
    'Basal epithelial': ['3'],
    'Follicular DC': ['6,1'],
    'Ionocyte': ['6,2'],
    'Squamous epithelial': ['7'],
    'Endothelial-Stromal': ['10'],
}
# Flatten into the cluster -> annotation lookup
new_dict1 = {
    cluster: label
    for label, clusters in labels_to_clusters.items()
    for cluster in clusters
}
# A cluster id missing from the lookup raises KeyError here
rna_x.obs['subset_annotations'] = [new_dict1[cluster] for cluster in rna_x.obs['leiden_R2']]
sc.pl.umap(
    rna_x,
    color=['subset_annotations'],
    size=20,
    legend_loc='on data',
    legend_fontoutline=2,
)

In [None]:
# update the original object
# Merge on a standalone Series and assign it back. Calling `.update()` directly
# on `adata.obs['subset_annotations']` only reaches `adata.obs` when the column
# accessor returns a view, which is not the case under pandas Copy-on-Write.
merged_annotations = adata.obs['subset_annotations'].astype('object')
# Series.update aligns on the index, so only cells present in rna_x are overwritten
merged_annotations.update(rna_x.obs['subset_annotations'].astype('object'))
adata.obs['subset_annotations'] = merged_annotations
sc.pl.umap(adata, color = ['leiden', 'subset_annotations'], size = 10, legend_loc = 'on data', legend_fontoutline=2)

In [None]:
# Full-object UMAP: leiden clusters next to the broad labels
sc.pl.umap(
    adata,
    color=['leiden', 'broad_label'],
    size=10,
    legend_loc='on data',
    legend_fontoutline=2,
)

In [None]:
#save for now
# Checkpoint `adata`, the object whose `subset_annotations` column is updated in
# the cells above; `adata_concat` is not modified by any annotation cell here.
adata.write('COV_combined_TNKBMMPepi-fine_label_070721.h5ad', compression = 'gzip')
# Also write the .raw matrix as a standalone AnnData file
adata.raw.to_adata().write('COV_combined_TNKBMMPepi_fine_label_raw_070721.h5ad', compression = 'gzip')

### Megakaryocyte and erythrocyte

In [None]:
# Subset to leiden clusters 14, 18 and 23
cluster_mask = adata.obs['leiden'].isin(['14', '18', '23'])
rna_ = adata[cluster_mask]
# Rebuild an AnnData on the .raw matrix/var of the subset, carrying over obs, uns, obsm and obsp
rna_x = sc.AnnData(
    X=rna_.raw.X,
    obs=rna_.obs,
    var=rna_.raw.var,
    uns=rna_.uns,
    obsm=rna_.obsm,
    obsp=rna_.obsp,
)
# Keep the current state as this object's own .raw before gene filtering
rna_x.raw = rna_x
# Flag and plot highly variable genes within the subset
sc.pp.highly_variable_genes(rna_x, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(rna_x)
rna_x

In [None]:
# Keep only the genes flagged as highly variable
hvg_mask = rna_x.var['highly_variable']
rna_x = rna_x[:, hvg_mask]
# Regress out total counts and mitochondrial fraction, then scale (values clipped at 10) for PCA
sc.pp.regress_out(rna_x, keys=['total_counts', 'pct_counts_mt'])
sc.pp.scale(rna_x, max_value=10)

In [None]:
# PCA (ARPACK solver), then the log-scaled variance ratio of the first 50 PCs
sc.tl.pca(rna_x, svd_solver='arpack')
sc.pl.pca_variance_ratio(rna_x, log=True, n_pcs=50)

In [None]:
# Neighbourhood graph on 50 PCs with k = 10 (Seurat uses k = 20 as default)
sc.pp.neighbors(
    rna_x,
    n_neighbors=10,
    n_pcs=50,
)

In [None]:
# 2-D UMAP before integration, coloured by sample, patient and sample type
sc.tl.umap(rna_x, n_components=2, min_dist=0.3)
sc.pl.umap(
    rna_x,
    color=['Sampleid', 'Patient', 'Sampletype'],
)

In [None]:
# Same embedding coloured by the original leiden clusters
sc.pl.umap(
    rna_x,
    color=['leiden'],
)

In [None]:
# Harmony integration with 'Patient' as the batch key
sc.external.pp.harmony_integrate(rna_x, key='Patient')
# Check that the corrected embedding was stored in obsm
'X_pca_harmony' in rna_x.obsm


In [None]:
# Display the AnnData summary after integration
rna_x

In [None]:
# Rebuild the neighbourhood graph on the harmony embedding (Seurat uses k = 20 as default)
sc.pp.neighbors(
    rna_x,
    n_neighbors=10,
    n_pcs=50,
    use_rep='X_pca_harmony',
)

In [None]:
# Recompute the UMAP on the new graph and re-check the sample/patient/sample-type colouring
sc.tl.umap(rna_x, n_components=2, min_dist=0.3)
sc.pl.umap(
    rna_x,
    color=['Sampleid', 'Patient', 'Sampletype'],
)

In [None]:
# Integrated embedding coloured by the original leiden clusters
sc.pl.umap(
    rna_x,
    color=['leiden'],
)

In [None]:
# Re-cluster the subset at resolution 0.3; result stored in obs['leiden_R']
sc.tl.leiden(rna_x, resolution=0.3, key_added='leiden_R')
sc.pl.umap(
    rna_x,
    color=['leiden', 'leiden_R'],
    size=6,
    legend_loc='on data',
    legend_fontoutline=2,
)

In [None]:
# Expression of selected genes on the UMAP
sc.pl.umap(
    rna_x,
    color=[
        'CD14', 'FCGR3A', 'FCGR3B', 'CD68',
        'GNLY', 'CD3E', 'CD1C', 'CLEC9A',
        'C1QC', 'CD86', 'PPBP', 'HBB',
    ],
    size=20,
)

In [None]:
# Wilcoxon marker ranking per leiden_R cluster
sc.tl.rank_genes_groups(rna_x, groupby='leiden_R', method='wilcoxon')
# Filter the ranking (min_fold_change=1); the plot below reads 'rank_genes_groups_filtered'
sc.tl.filter_rank_genes_groups(rna_x, min_fold_change=1)
sc.tl.dendrogram(rna_x, groupby='leiden_R')
# Dot plot of the top 10 filtered genes per cluster
sc.pl.rank_genes_groups_dotplot(
    rna_x,
    n_genes=10,
    standard_scale='var',
    color_map='viridis',
    key='rank_genes_groups_filtered',
)

In [None]:
# Dot plot of the Chuafeat gene list per leiden_R cluster
sc.pl.dotplot(rna_x, var_names=Chuafeat, groupby='leiden_R')

In [None]:
# leiden_R clusters grouped by the annotation they receive
labels_to_clusters = {
    'Megakaryocyte': ['0', '2'],
    'Erythrocyte': ['3'],
    'DOUBLET': ['1', '4', '5', '6', '7', '8', '9', '10'],
}
# Flatten into the cluster -> annotation lookup
new_dict1 = {
    cluster: label
    for label, clusters in labels_to_clusters.items()
    for cluster in clusters
}
# A cluster id missing from the lookup raises KeyError here
rna_x.obs['subset_annotations'] = [new_dict1[cluster] for cluster in rna_x.obs['leiden_R']]
sc.pl.umap(
    rna_x,
    color=['subset_annotations'],
    size=20,
    legend_loc='on data',
    legend_fontoutline=2,
)

In [None]:
# update the original object
# Merge on a standalone Series and assign it back. Calling `.update()` directly
# on `adata.obs['subset_annotations']` only reaches `adata.obs` when the column
# accessor returns a view, which is not the case under pandas Copy-on-Write.
merged_annotations = adata.obs['subset_annotations'].astype('object')
# Series.update aligns on the index, so only cells present in rna_x are overwritten
merged_annotations.update(rna_x.obs['subset_annotations'].astype('object'))
adata.obs['subset_annotations'] = merged_annotations
sc.pl.umap(adata, color = ['leiden', 'subset_annotations'], size = 10, legend_loc = 'on data', legend_fontoutline=2)

In [None]:
# Checkpoint the annotated object (gzip-compressed h5ad)
adata.write(
    'COV_subsetanno1_120721.h5ad',
    compression='gzip',
)
# Write the .raw matrix as a standalone AnnData file as well
adata.raw.to_adata().write(
    'COV_subsetanno1_raw_120721.h5ad',
    compression='gzip',
)