In [1]:
import requests
import sys
from tqdm import tqdm

import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad

In [2]:
# Load the dataset with accession "E-MTAB-5061" through scanpy's
# EBI Expression Atlas helper.
adata = sc.datasets.ebi_expression_atlas("E-MTAB-5061")



In [3]:
# Show which annotation columns are available in `adata.obs`.
adata.obs.columns

Index(['Sample Characteristic[organism]',
       'Sample Characteristic Ontology Term[organism]',
       'Sample Characteristic[individual]',
       'Sample Characteristic Ontology Term[individual]',
       'Sample Characteristic[sex]',
       'Sample Characteristic Ontology Term[sex]',
       'Sample Characteristic[age]',
       'Sample Characteristic Ontology Term[age]',
       'Sample Characteristic[body mass index]',
       'Sample Characteristic Ontology Term[body mass index]',
       'Sample Characteristic[organism status]',
       'Sample Characteristic Ontology Term[organism status]',
       'Sample Characteristic[clinical information]',
       'Sample Characteristic Ontology Term[clinical information]',
       'Sample Characteristic[organism part]',
       'Sample Characteristic Ontology Term[organism part]',
       'Sample Characteristic[cell type]',
       'Sample Characteristic Ontology Term[cell type]',
       'Sample Characteristic[disease]',
       'Sample Characteristic

In [4]:
# Columns that are exported to a separate CSV and, because they are not
# listed in `keep`, no longer present in `adata.obs` afterwards.
hide = [
    "Factor Value[inferred cell type - authors labels]",
    "Sample Characteristic[submitted single cell quality]",
]

# Columns that remain in `adata.obs`.
keep = [
    "Sample Characteristic[individual]",
    "Sample Characteristic[sex]",
    "Sample Characteristic[age]",
    "Sample Characteristic[body mass index]",
    "Sample Characteristic[disease]",
]

In [5]:
# Save the columns listed in `hide` to a CSV file; the next cell removes
# them from `adata.obs`.
adata.obs[hide].to_csv("./E-MTAB-5061.hidden.csv")

In [6]:
# Keep only the columns listed in `keep`; every other obs column is dropped.
# NOTE(review): this overwrites `adata.obs`, so the cell that exports the
# `hide` columns cannot be re-run afterwards without reloading `adata`.
adata.obs = adata.obs[keep].copy()

In [7]:
def get_gene_names(gene_ids, timeout=120):
    """Look up a batch of IDs with the Ensembl REST ``/lookup/id`` endpoint.

    All IDs are sent in a single POST request as ``{"ids": [...]}``.

    Parameters
    ----------
    gene_ids : iterable
        IDs to look up (e.g. a slice of ``adata.var.index``). Each element
        is converted with ``str`` before being sent.
    timeout : float or None, optional
        Seconds to wait for the server, forwarded to ``requests.post``.
        ``None`` waits indefinitely.

    Returns
    -------
    The decoded JSON body of the response. For an empty ``gene_ids`` an
    empty dict is returned without contacting the server.
    NOTE(review): the caller treats the result as a mapping (it iterates
    ``.values()``) — confirm against the Ensembl response format.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    ids = [str(gene_id) for gene_id in gene_ids]
    if not ids:
        # Nothing to look up; avoid a pointless (and possibly rejected) request.
        return {}

    server = "https://rest.ensembl.org"
    ext = "/lookup/id"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}

    # Let requests serialise the payload. The previous hand-concatenated JSON
    # string was invalid for IDs containing quotes or backslashes.
    r = requests.post(
        server + ext,
        headers=headers,
        json={"ids": ids},
        timeout=timeout,
    )

    # Raises requests.HTTPError for 4xx/5xx responses and is a no-op otherwise.
    r.raise_for_status()
    return r.json()

In [8]:
def decode_gene_names(decoded):
    """Build an ``{id: display_name}`` dict from a decoded lookup response.

    Parameters
    ----------
    decoded : mapping
        Only its values are read. Each value is expected to support
        ``value["id"]`` and ``value.get("display_name", "")``.

    Returns
    -------
    dict
        Maps each record's ``"id"`` to its ``"display_name"``, or to ``""``
        when the record has no ``"display_name"`` key.

    Values that raise ``TypeError`` while being processed (for example
    ``None``, which cannot be subscripted) are skipped.
    """
    symbol_by_id = {}
    for record in decoded.values():
        try:
            ensembl_id = record["id"]
            symbol_by_id[ensembl_id] = record.get("display_name", "")
        except TypeError:
            # Not a usable record; leave this entry out of the result.
            pass
    return symbol_by_id

Translate the gene IDs in `adata.var` into gene names, one batch per request:

In [9]:
# Number of IDs sent per lookup request.
# NOTE(review): presumably chosen to respect the endpoint's per-request ID
# limit — confirm against the Ensembl REST documentation.
# Slow cell: the recorded run needed about six minutes for 31 requests.
step = 1000
for i in tqdm(range(0, adata.shape[1], step)):
    # Index labels of the current batch of variables.
    gene_ids = adata.var.index[i:i+step]
    decoded = get_gene_names(gene_ids)
    gene_names = decode_gene_names(decoded)
    # Assigning a Series through .loc aligns on its index labels, so IDs
    # missing from `gene_names` end up as NaN in the "gene_names" column.
    adata.var.loc[gene_ids, "gene_names"] = pd.Series(gene_names)

100%|███████████████████████████████████████████| 31/31 [06:12<00:00, 12.02s/it]


In [11]:
# Export the variable table (`adata.var`), including the "gene_names"
# column filled in by the translation loop, to CSV.
adata.var.to_csv("pancreas_genes.csv")

In [12]:
# Write the AnnData object, with the reduced `obs` and the extended `var`,
# to "blank.h5ad".
adata.write("blank.h5ad")