In [2]:
import os

import numpy as np
import pandas as pd

import anndata
import scanpy as sc

import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
from matplotlib.collections import PatchCollection
from matplotlib import gridspec
import matplotlib as mpl
import seaborn as sns

from collections import defaultdict
import random
import itertools
# NOTE(review): this star import rebinds names already in scope — in particular
# `random` (the stdlib module imported above) now refers to numpy.random, and
# built-ins such as sum/any/all are shadowed by the numpy versions. Prefer the
# `np.` prefix; confirm nothing later relies on the bare names before removing.
from numpy import *    

In [None]:
def cluster(adata, n_top_genes=2000, n_comps=40, n_pcs=15, resolutions=(1, 0.5)):
    """Run the clustering pipeline on an AnnData object.

    Steps, in order: highly-variable-gene selection, PCA, neighbour graph,
    UMAP, one Leiden clustering per resolution, then one
    ``rank_genes_groups`` call per clustering. Nothing is returned; the
    scanpy calls store their results on ``adata``.

    Parameters:
    ----------
    adata: scanpy.adata
        scanpy adata object
    n_top_genes: int
        number of highly variable genes to select ('cell_ranger' flavor)
    n_comps: int
        number of principal components to compute
    n_pcs: int
        number of principal components passed to the neighbour graph
    resolutions: iterable of float
        Leiden resolutions. For each value ``r`` the clustering is stored
        under ``'leiden_r<r>'`` and the marker genes under
        ``'rank_genes_r<r>'`` (defaults give 'leiden_r1' and 'leiden_r0.5').

    Returns:
    -------
    None
    """
    # Materialise once so a generator argument can be iterated twice below.
    resolutions = list(resolutions)

    sc.pp.highly_variable_genes(adata, flavor='cell_ranger', n_top_genes=n_top_genes)
    sc.pp.pca(adata, n_comps=n_comps, use_highly_variable=True, svd_solver='arpack')
    # Only n_pcs of the n_comps computed components are handed to the graph.
    sc.pp.neighbors(adata, n_pcs=n_pcs)
    sc.tl.umap(adata)

    # ':g' drops a trailing '.0', so 1 and 1.0 both map to the key suffix 'r1'.
    suffixes = [f'r{res:g}' for res in resolutions]

    # All clusterings first, then all marker-gene rankings (original call order).
    for res, suffix in zip(resolutions, suffixes):
        sc.tl.leiden(adata, resolution=res, key_added=f'leiden_{suffix}')
    for suffix in suffixes:
        sc.tl.rank_genes_groups(adata, groupby=f'leiden_{suffix}',
                                key_added=f'rank_genes_{suffix}')
    
def removegenes(adata):
    """Remove immunoglobulin (IG*) and T-cell receptor (TRAV/TRBV) genes.

    Genes are selected purely by name: every entry of ``adata.var_names``
    that starts with one of the prefixes IGKV, IGHV, IGLV, IGLC, IGLL, IGKC,
    IGHC, TRAV or TRBV is dropped. The input object is not modified.

    Parameters:
    ----------
    adata: scanpy.adata
        scanpy adata object

    Returns:
    -------
    temp: scanpy.adata
        copy of ``adata`` restricted to the remaining genes, in their
        original order
    """
    # Idea noted by the original author (not applied): also try removing
    # IGHG genes, MZB1 and JCHAIN.
    excluded_prefixes = (
        'IGKV', 'IGHV', 'IGLV', 'IGLC', 'IGLL', 'IGKC', 'IGHC',
        'TRAV', 'TRBV',
    )

    # str.startswith accepts a tuple, so one pass over the gene names suffices.
    gene = [x for x in adata.var_names if not x.startswith(excluded_prefixes)]
    temp = adata[:, gene].copy()
    return temp

In [4]:
# load data
path = '.././mergedata_20211001.h5ad'
adata = sc.read_h5ad(path)

# get children data
# NOTE(review): `quality`, `platform` and `doublets` are not defined in any
# visible cell; they must be set (e.g. in a config cell) before this cell can
# run on a fresh kernel.
# .copy() turns the subset into an independent object instead of a view, so
# the in-place normalisation below does not operate on a view of `adata`.
adata_kid = adata[(adata.obs.cell_quality == quality) & 
                 (adata.obs.platform == platform) & 
                 (adata.obs.doublets == doublets)].copy()

# normalize data: scale each cell to a total of 1e6, then log2(x + 1)
sc.pp.normalize_total(adata_kid, target_sum=1e6)
sc.pp.log1p(adata_kid, base=2)

# remove IG*/TRAV/TRBV genes (matched by name prefix in removegenes), then
# drop cells whose cell_subtype_new label is 'doublets' or 'unknown'
adata_kid = removegenes(adata_kid)
adata_kid = adata_kid[(adata_kid.obs.cell_subtype_new != 'doublets') & 
                        (adata_kid.obs.cell_subtype_new != 'unknown')].copy()

# filter genes before clustering: keep genes detected in at least 3 cells
sc.pp.filter_genes(adata_kid, min_cells=3)

In [5]:
# Run the clustering pipeline (HVG -> PCA -> neighbours -> UMAP -> Leiden) on the filtered cells.
cluster(adata_kid)

  res = method(*args, **kwargs)


In [6]:
# Rename and reorder some cell types.
# NOTE(review): reorg_celltype is not defined in the visible cells — confirm it
# is defined or imported before this cell runs on a fresh kernel.
adata_kid = adata_kid.copy()
reorg_celltype(adata_kid) # presumably updates obs columns 'cell_type_new', 'cell_subtype_new' — confirm

In [7]:
# Load UMAP coordinates from a text file into a numpy array.
# NOTE(review): assumes one row per cell, in the same order as adata_kid — confirm.
X_umap = np.loadtxt('.././X_umap.txt')

In [None]:
# Store the loaded coordinates under obsm['X_umap'], replacing any embedding
# already kept under that key (presumably the one from sc.tl.umap in cluster()).
adata_kid.obsm['X_umap'] = X_umap

In [None]:
# then do UMAP