In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mygene



In [2]:
# Read Hao dataset
adata = sc.read_h5ad('/Users/evanli/Documents/Research_datasets/PBMC_Hao/GSE164378_Hao/batch_corrected/Hao_PBMC_Harmony_unscaled.h5ad')
print('Original adata:', adata.shape)
adata.obs['celltype.l1'] = adata.obs['celltype.l1'].str.replace(' ', '_')
label = adata.obs['celltype.l1'].tolist()
types = np.unique(label).tolist()
print('all cell types:', types)


Original adata: (161764, 33538)
all cell types: ['B', 'CD4_T', 'CD8_T', 'DC', 'Mono', 'NK', 'other', 'other_T']


In [3]:
adata.var.head()

MIR1302-2HG
FAM138A
OR4F5
AL627309.1
AL627309.3


In [4]:
gene_symbols = adata.var_names.tolist()

# Initialize the mygene query object
mg = mygene.MyGeneInfo()

# Query the gene symbols to get Ensembl IDs
result = mg.querymany(gene_symbols, scopes=['symbol', 'alias', 'refseq'], fields=['ensembl.gene', 'entrezgene', 'symbol', 'name'], species='human', as_dataframe=True)
result.head(30)

2505 input query terms found dup hits:	[('MIR1302-2HG', 2), ('OR4F5', 2), ('LINC00115', 2), ('LINC01786', 2), ('DVL1', 2), ('LINC01770', 2)
10182 input query terms found no hit:	['AL627309.1', 'AL627309.3', 'AL627309.2', 'AL627309.4', 'AL732372.1', 'AC114498.1', 'AL669831.2', '


Unnamed: 0_level_0,_id,_score,name,symbol,ensembl.gene,entrezgene,notfound,ensembl
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MIR1302-2HG,ENSG00000243485,26.271759,MIR1302-2 host gene,MIR1302-2HG,ENSG00000243485,,,
MIR1302-2HG,107985730,26.271759,MIR1302-2 host gene,MIR1302-2HG,,107985730.0,,
FAM138A,645520,27.063152,family with sequence similarity 138 member A,FAM138A,ENSG00000237613,645520.0,,
OR4F5,79501,25.752413,olfactory receptor family 4 subfamily F member 5,OR4F5,ENSG00000186092,79501.0,,
OR4F5,81099,24.230534,olfactory receptor family 4 subfamily F member 17,OR4F17,ENSG00000176695,81099.0,,
AL627309.1,,,,,,,True,
AL627309.3,,,,,,,True,
AL627309.2,,,,,,,True,
AL627309.4,,,,,,,True,
AL732372.1,,,,,,,True,


In [5]:
print(result.shape)
result.to_csv('mygene_result_raw_v3.csv')

(36434, 8)


In [6]:
# result = pd.read_csv('mygene_result_raw_v3.csv', index_col=0)
# result.shape

In [7]:
# Fill all NaN with None
result = result.fillna('None')
# Fill all None in "notfound" with False
result['notfound'] = result['notfound'].replace('None', False)
result.head()

Unnamed: 0_level_0,_id,_score,name,symbol,ensembl.gene,entrezgene,notfound,ensembl
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MIR1302-2HG,ENSG00000243485,26.271759,MIR1302-2 host gene,MIR1302-2HG,ENSG00000243485,,False,
MIR1302-2HG,107985730,26.271759,MIR1302-2 host gene,MIR1302-2HG,,107985730.0,False,
FAM138A,645520,27.063152,family with sequence similarity 138 member A,FAM138A,ENSG00000237613,645520.0,False,
OR4F5,79501,25.752413,olfactory receptor family 4 subfamily F member 5,OR4F5,ENSG00000186092,79501.0,False,
OR4F5,81099,24.230534,olfactory receptor family 4 subfamily F member 17,OR4F17,ENSG00000176695,81099.0,False,


In [9]:
import ast
row = result[result['ensembl'] != 'None'].iloc[0]
# ast.literal_eval(row['ensembl'])[0]['gene']
row['ensembl'][0]['gene']

'ENSG00000149527'

In [10]:
import ast
from tqdm import tqdm

count = 0
for i, row in tqdm(result.iterrows(), total=result.shape[0]):
    if row['ensembl'] != 'None':
        # result.at[i, 'ensembl.gene'] = ast.literal_eval(row['ensembl'])[0]['gene']
        result.at[i, 'ensembl.gene'] = row['ensembl'][0]['gene']
        count += 1
    else:
        pass

100%|██████████| 36434/36434 [00:02<00:00, 16943.32it/s]


In [11]:
count

1694

In [12]:
# Create a mask for duplicated rows
duplicated_mask = result.index.duplicated(keep='first')

# Create a mask for rows where "ensembl.gene" is not None
not_none_mask = result['ensembl.gene'] != 'None'

# Use the masks to select the desired rows
# result = result[(not_none_mask & ~duplicated_mask) | (not_none_mask & duplicated_mask)]
result = result[(~duplicated_mask) | (not_none_mask & duplicated_mask)]
print(result.shape)
result.head(30)

(35331, 8)


Unnamed: 0_level_0,_id,_score,name,symbol,ensembl.gene,entrezgene,notfound,ensembl
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MIR1302-2HG,ENSG00000243485,26.271759,MIR1302-2 host gene,MIR1302-2HG,ENSG00000243485,,False,
FAM138A,645520,27.063152,family with sequence similarity 138 member A,FAM138A,ENSG00000237613,645520.0,False,
OR4F5,79501,25.752413,olfactory receptor family 4 subfamily F member 5,OR4F5,ENSG00000186092,79501.0,False,
OR4F5,81099,24.230534,olfactory receptor family 4 subfamily F member 17,OR4F17,ENSG00000176695,81099.0,False,
AL627309.1,,,,,,,True,
AL627309.3,,,,,,,True,
AL627309.2,,,,,,,True,
AL627309.4,,,,,,,True,
AL732372.1,,,,,,,True,
OR4F29,729759,25.746635,olfactory receptor family 4 subfamily F member 29,OR4F29,ENSG00000284733,729759.0,False,


In [13]:
# check for further duplicates and keep first instance
dup2 = result.index.duplicated(keep='first')
result = result[~dup2]
result.shape

(33538, 8)

## Check the genes in results

In [14]:
# check whether the index of result in corporates most of the gene symbols in Hao
gene_symbols = adata.var_names.tolist()
result_symbols = result.index.tolist()
print(len(gene_symbols), len(result_symbols))
print(len(set(gene_symbols) & set(result_symbols)))

33538 33538
33538


In [15]:
set(gene_symbols) == set(result_symbols)

True

In [29]:
gene_symbols == result_symbols

True

In [30]:
# Manually modify two genes
result.loc['PBK', 'ensembl.gene'] = 'ENSG00000168078'
result.loc['ZNF573', 'ensembl.gene'] = 'ENSG00000189144'

In [31]:
result.to_csv('mygene_result_v3.csv')

In [34]:
# How many genes are matched with ensembl ID
# sum(result['ensembl.gene'] != 'None')
sum([g.startswith('ENSG') for g in result['ensembl.gene']])

23192

In [35]:
# How many genes are found
sum(result['notfound'] == False)

23356

In [22]:
df = result[(result['notfound'] == False) & (result['ensembl.gene'] == 'None')]
df.shape

(164, 8)

In [24]:
df

Unnamed: 0_level_0,_id,_score,name,symbol,ensembl.gene,entrezgene,notfound,ensembl
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LINC00337,148645,25.039667,ICMT divergent transcript,ICMT-DT,,148645,False,
NPPA-AS1,100379251,27.063152,NPPA antisense RNA 1,NPPA-AS1,,100379251,False,
NBL1,100532736,20.138813,MICOS10-NBL1 readthrough,MICOS10-NBL1,,100532736,False,
MINOS1-NBL1,100532736,25.044989,MICOS10-NBL1 readthrough,MICOS10-NBL1,,100532736,False,
TRNP1,7207,22.07316,tRNA-Leu (anticodon AAG) 2-3,TRL-AAG2-3,,7207,False,
...,...,...,...,...,...,...,...,...
PCAT14,101978785,27.065735,prostate cancer associated transcript 14,PCAT14,,101978785,False,
IGLVIVOR22-2,84087,8.730177,immunoglobulin lambda variable (IV)/OR22-2 (ps...,IGLVIVOR22-2,,84087,False,
ISX-AS1,101926957,24.21738,long intergenic non-protein coding RNA 2885,LINC02885,,101926957,False,
LINC02246,107987295,25.044989,adipocyte associated metabolic related lncRNA 1,ASMER1,,107987295,False,


## Entrez as query for the remaining 164 genes (failed)

In [26]:
entrez = df['entrezgene'].tolist()

In [27]:
# Initialize the mygene query object
mg = mygene.MyGeneInfo()

# Query the gene symbols to get Ensembl IDs
new_result = mg.querymany(entrez, scopes=['entrezgene'], fields=['ensembl.gene', 'symbol', 'name'], species='human', as_dataframe=True)

2 input query terms found dup hits:	[('100532736', 2), ('100302652', 2)]


In [28]:
new_result

Unnamed: 0_level_0,_id,_score,name,symbol
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
148645,148645,26.992018,ICMT divergent transcript,ICMT-DT
100379251,100379251,26.988840,NPPA antisense RNA 1,NPPA-AS1
100532736,100532736,26.989302,MICOS10-NBL1 readthrough,MICOS10-NBL1
100532736,100532736,26.989254,MICOS10-NBL1 readthrough,MICOS10-NBL1
7207,7207,26.989716,tRNA-Leu (anticodon AAG) 2-3,TRL-AAG2-3
...,...,...,...,...
101978785,101978785,26.985958,prostate cancer associated transcript 14,PCAT14
84087,84087,8.705148,immunoglobulin lambda variable (IV)/OR22-2 (ps...,IGLVIVOR22-2
101926957,101926957,26.197802,long intergenic non-protein coding RNA 2885,LINC02885
107987295,107987295,26.989582,adipocyte associated metabolic related lncRNA 1,ASMER1
