In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import time
import scipy
import sklearn
import copy
import importlib




In [2]:
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, roc_auc_score, auc, precision_recall_curve, average_precision_score
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.metrics import make_scorer, matthews_corrcoef


In [3]:
# Load the Hao PBMC reference AnnData from disk
adata = sc.read_h5ad(
    '/Users/evanli/Documents/Research_datasets/PBMC_Hao/GSE164378_Hao/batch_corrected/Hao_PBMC_Harmony_unscaled.h5ad'
)
print('Original adata:', adata.shape)

# Make the 'celltype.l1' labels whitespace-free (' ' -> '_')
adata.obs['celltype.l1'] = adata.obs['celltype.l1'].str.replace(' ', '_')

# Per-cell label list, then the sorted unique label names
label = list(adata.obs['celltype.l1'])
types = np.unique(label).tolist()
print('all cell types:', types)


Original adata: (161764, 33538)
all cell types: ['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T']


In [26]:
# List the metadata columns stored in adata.obs for the Hao dataset
adata.obs.columns

Index(['celltype.l1', 'celltype.l2', 'celltype.l3', 'Batch', 'donor', 'time',
       'lane', 'Phase', 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA',
       'nFeature_RNA', 'leiden'],
      dtype='object')

In [4]:
# Load the Zheng PBMC 68k dataset, used here as the query set
query_adata = sc.read_h5ad(
    '/Users/evanli/Documents/Research_datasets/PBMC_68k_Zheng/Zheng_PBMC.h5ad'
)
# Display its dimensions
query_adata.shape

(68579, 32738)

In [6]:
# Peek at the Hao feature names (the displayed index holds gene symbols)
adata.var_names

Index(['MIR1302-2HG', 'FAM138A', 'OR4F5', 'AL627309.1', 'AL627309.3',
       'AL627309.2', 'AL627309.4', 'AL732372.1', 'OR4F29', 'AC114498.1',
       ...
       'AC007325.2', 'BX072566.1', 'AL354822.1', 'AC023491.2', 'AC004556.1',
       'AC233755.2', 'AC233755.1', 'AC240274.1', 'AC213203.1', 'FAM231C'],
      dtype='object', length=33538)

In [7]:
# Peek at the Zheng feature names (the displayed index holds Ensembl ENSG IDs)
query_adata.var_names

Index(['ENSG00000243485', 'ENSG00000237613', 'ENSG00000186092',
       'ENSG00000238009', 'ENSG00000239945', 'ENSG00000237683',
       'ENSG00000239906', 'ENSG00000241599', 'ENSG00000228463',
       'ENSG00000237094',
       ...
       'ENSG00000217792', 'ENSG00000268276', 'ENSG00000148828',
       'ENSG00000215700', 'ENSG00000215699', 'ENSG00000215635',
       'ENSG00000268590', 'ENSG00000251180', 'ENSG00000215616',
       'ENSG00000215611'],
      dtype='object', length=32738)

In [8]:
# Preview the Zheng var table: Ensembl-ID index plus a 'gene_symbols' column
query_adata.var.head()

Unnamed: 0,gene_symbols
ENSG00000243485,MIR1302-10
ENSG00000237613,FAM138A
ENSG00000186092,OR4F5
ENSG00000238009,RP11-34P13.7
ENSG00000239945,RP11-34P13.8


In [17]:
# Gene symbols of the Zheng query genes, in var order
symbols_query = list(query_adata.var['gene_symbols'])

In [18]:
# Gene symbols of the Hao genes (its var_names), in var order
symbols_adata = list(adata.var_names)

In [24]:
# Number of distinct gene symbols present in both datasets
len(set(symbols_query).intersection(symbols_adata))

20453

## Convert Hao gene symbols to Ensembl gene IDs (ENSG)
using the gene annotation table provided by CellxGene

In [27]:
# Re-check the Hao AnnData dimensions (n_obs, n_vars) before annotating genes
adata.shape

(161764, 33538)

In [28]:
# Preview adata.var before the 'ensembl' column is added in a later cell
adata.var.head()

MIR1302-2HG
FAM138A
OR4F5
AL627309.1
AL627309.3


In [29]:
# Load the CellxGene gene table; the first CSV column becomes the index
ensembl2symbol = pd.read_csv(
    'Hao_PBMC_ensembl2symbol_cellxgene.csv',
    index_col=0,
)
ensembl2symbol.head()

Unnamed: 0_level_0,feature_is_filtered,feature_name,feature_reference,feature_biotype
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000238009,False,RP11-34P13.7,NCBITaxon:9606,gene
ENSG00000237491,False,LINC01409,NCBITaxon:9606,gene
ENSG00000225880,False,LINC00115,NCBITaxon:9606,gene
ENSG00000230368,False,FAM41C,NCBITaxon:9606,gene
ENSG00000188976,False,NOC2L,NCBITaxon:9606,gene


In [35]:
# Invert the table into a {gene symbol -> Ensembl ID} lookup.
# If a symbol appears more than once, the last row's ID is kept.
symbol2ensembl = dict(zip(ensembl2symbol['feature_name'], ensembl2symbol.index))
symbol2ensembl

{'RP11-34P13.7': 'ENSG00000238009',
 'LINC01409': 'ENSG00000237491',
 'LINC00115': 'ENSG00000225880',
 'FAM41C': 'ENSG00000230368',
 'NOC2L': 'ENSG00000188976',
 'KLHL17': 'ENSG00000187961',
 'PLEKHN1': 'ENSG00000187583',
 'RP11-54O7.17': 'ENSG00000272512',
 'HES4': 'ENSG00000188290',
 'ISG15': 'ENSG00000187608',
 'AGRN': 'ENSG00000188157',
 'C1orf159': 'ENSG00000131591',
 'TTLL10': 'ENSG00000162571',
 'TNFRSF18': 'ENSG00000186891',
 'TNFRSF4': 'ENSG00000186827',
 'SDF4': 'ENSG00000078808',
 'B3GALT6': 'ENSG00000176022',
 'C1QTNF12': 'ENSG00000184163',
 'RP5-902P8.12': 'ENSG00000260179',
 'UBE2J2': 'ENSG00000160087',
 'SCNN1D': 'ENSG00000162572',
 'ACAP3': 'ENSG00000131584',
 'PUSL1': 'ENSG00000169972',
 'INTS11': 'ENSG00000127054',
 'CPTP': 'ENSG00000224051',
 'TAS1R3': 'ENSG00000169962',
 'DVL1': 'ENSG00000107404',
 'MXRA8': 'ENSG00000162576',
 'AURKAIP1': 'ENSG00000175756',
 'CCNL2': 'ENSG00000221978',
 'MRPL20': 'ENSG00000242485',
 'MRPL20-DT': 'ENSG00000272455',
 'ANKRD65': 'ENSG0

In [38]:
# Look up each Hao gene symbol in the symbol -> Ensembl ID mapping;
# symbols without a match come back as missing values
mapped_ids = adata.var.index.map(symbol2ensembl)
# Store the IDs on adata.var, using '' for genes that have no mapping
adata.var['ensembl'] = mapped_ids.fillna('')
adata.var

Unnamed: 0,ensembl
MIR1302-2HG,
FAM138A,
OR4F5,
AL627309.1,
AL627309.3,
...,...
AC233755.2,
AC233755.1,
AC240274.1,
AC213203.1,


In [40]:
# Keep only the genes that received a non-empty Ensembl ID
adata.var.loc[adata.var['ensembl'].ne('')]

Unnamed: 0,ensembl
FAM87B,ENSG00000177757
LINC00115,ENSG00000225880
FAM41C,ENSG00000230368
SAMD11,ENSG00000187634
NOC2L,ENSG00000188976
...,...
MT-ND4L,ENSG00000212907
MT-ND4,ENSG00000198886
MT-ND5,ENSG00000198786
MT-ND6,ENSG00000198695


In [41]:
# Check how many of the CellxGene gene symbols also appear among the Hao gene symbols
len(set(adata.var_names).intersection(ensembl2symbol['feature_name']))

16494

In [42]:
# Same overlap count, but comparing lower-cased symbols to rule out
# mismatches caused purely by letter case
len(
    {symbol.lower() for symbol in adata.var_names}
    & {symbol.lower() for symbol in ensembl2symbol['feature_name']}
)

16494

## Compare CellxGene Ensembl IDs with Zheng Ensembl IDs

In [43]:
# Ensembl IDs on each side: the CellxGene table index vs. the Zheng var_names
Hao_cellxgene = list(ensembl2symbol.index)
Zheng_ensembl = list(query_adata.var_names)

print('Hao_cellxgene:', len(Hao_cellxgene))
print('Zheng_ensembl:', len(Zheng_ensembl))
# Distinct IDs that occur in both lists
print('intersection:', len(set(Hao_cellxgene).intersection(Zheng_ensembl)))

Hao_cellxgene: 20568
Zheng_ensembl: 32738
intersection: 19377
