# Obtain Projects and specimens from Single Cell Expression Atlas

## Imports

In [49]:
import requests
import json
import re
import time

from lxml import html
from IPython.display import clear_output

## Get the projects and save them

In [54]:
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [55]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [56]:
answer = requests.get(seed_url, headers=headers)

In [None]:
accessing_error = []
projects = []
n_experiments = len(answer.json()["experiments"])

avoid_collections = ["Human Cell Atlas"]

for n, experiment in enumerate(answer.json()['experiments']):
    experiment_ID = experiment['experimentAccession']
    seed_url = "https://www.ebi.ac.uk/gxa/sc/experiments/" + experiment_ID + "/results/tsne"
    
    # Print loop information
    print("Getting project \"" + experiment_ID + "\"")
    print("Number of errors: " + str(len(accessing_error)))
    print(f"{n+1}/{n_experiments}")
    
    # If the projects is in a repository we already have we skip it
    if [i for i in experiment["experimentProjects"] if i in avoid_collections]:
        clear_output(wait=True)
        continue
    
    # Wait between request so we dont overcharge server
    time.sleep(2.0)
    
    # Get the project samples information
    answer = requests.get(seed_url, headers=headers)
    
    # If couldn't get the information save the error
    if answer.status_code != requests.codes.ok:
        accessing_error.append(answer)
        clear_output(wait=True)
        continue
        
    # Parse response so we get the useful information
    parser = html.fromstring(answer.text)
    publication_link = parser.xpath(".//a[@class='pubmed-id']/@href")
    publication_title = parser.xpath(".//a[@class='pubmed-id']/text()")

    experiment['publication_link'] = publication_link
    experiment['publication_title'] = publication_title
    experiment['supplementary_link'] = [seed_url]
    
    # Get supplementary information
    seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments/" + experiment_ID + "/resources/SUPPLEMENTARY_INFORMATION"
    ## Wait between request so we dont overcharge server
    time.sleep(2.0)
    
    answer = requests.get(seed_url, headers=headers)
    answer_json = answer.json()
    experiment['ENA_ID'] = None
    experiment['ArrayExpress_ID'] = None

    # For each repository information, save the id and the url of the project
    for group in answer.json():
        if group['type'] == 'icon-ena': # If ENA repository
            ena_id_re = re.search('([SE]RP\d+)', group['description'])
            
            if not ena_id_re:
                continue
            
            ena_id = ena_id_re.group(1)
            
            experiment['ENA_ID'] = ena_id
            experiment['supplementary_link'].append(group['url'])
            
        elif group['type'] == 'icon-ae': # If Array Expression repository
            ae_id_re = re.search('(E-MTAB-\d+)', group['description'])
            
            if not ae_id_re:
                continue
            
            ae_id = ae_id_re.group(1)
            
            experiment['ArrayExpress_ID'] = ae_id
            experiment['supplementary_link'].append(group['url'])
            
        else:
            continue
    
    # Add the project to the list
    projects.append(experiment)
    
    clear_output(wait=True)

Getting project "E-MTAB-4547"
Number of errors: 0
99/175


In [None]:
print(len(projects))

In [None]:
with open('../SingleCell-Files/raw_data/SCAE_projects.json', 'w') as outfile:
    json.dump({"experiments": projects}, outfile)

# Get the specimens and save them 

In [38]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [41]:
# https://www.ebi.ac.uk/gxa/sc/experiments/*NOMBRE EXPERIMENTO*/experiment-design
experiment_ID = projects[100]['experimentAccession']
seed_url = "https://www.ebi.ac.uk/gxa/sc/experiments/" + experiment_ID + "/experiment-design"

In [42]:
answer = requests.get(seed_url, headers=headers)

In [43]:
parser = html.fromstring(answer.text)

In [44]:
script_text = parser.xpath(".//div[@id='content']//script[3]/text()")

In [45]:
print(answer.text)




<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="utf-8">
    <title>Experiment < Single Cell Expression Atlas &lt; EMBL-EBI</title>
    <meta name="description" content="EMBL-EBI Single Cell Expression Atlas, an open public repository of single cell gene expression data">
    <meta name="keywords" content="expression atlas, single cell expression, gene expression, baseline expression, functional genomics, public repository, repository, bioinformatics, europe, institute">
    <meta name="author" content="EBI Gene Expression Team – https://www.ebi.ac.uk/about/people/irene-papatheodorou">
    <meta name="HandheldFriendly" content="true" />
    <meta name="MobileOptimized" content="width" />
    <meta name="viewport" content="width=device-width,initial-scale=1">
    <meta name="theme-color" content="#70BDBD"> <!-- Android Chrome mobile browser tab color -->

    <!-- Add information on the life cycle of this page -->
    <meta name="ebi:owner" content="Irene Papatheodorou <ir

In [46]:
match = re.search(r'content: (?P<value>{.*})', script_text[0])

print(match.group('value'))

{"experimentAccession":"E-MTAB-6386","accessKey":"","species":"homo sapiens","disclaimer":"","tabs":[{"type":"results","name":"Results","props":{"ks":[1,2,3,18,33,45,59],"ksWithMarkerGenes":[2,3,18],"selectedK":3,"perplexities":[10,35,1,25,45,5,30,15,50,40,20],"metadata":[{"value":"cell_type","label":"Cell type"}],"units":["CPM"],"suggesterEndpoint":"json/suggestions"}},{"type":"experiment-design","name":"Experiment Design","props":{"table":{"headers":[{"name":"","values":["Assay"]},{"name":"Sample Characteristics","values":["organism","developmental stage","individual","age","sex","disease","organism part","cell type","phenotype","single cell quality"]},{"name":"Experimental Variables","values":["cell type","single cell identifier"]}],"data":[{"properties":{"analysed":true},"values":[["ERR2632411"],["Homo sapiens","adult","A","33 year","female","normal","blood","memory B cell","CD19+ IgD- CD27+ IgG+","OK"],["memory B cell","cell 13"]]},{"properties":{"analysed":true},"values":[["ERR26

In [49]:
specimens = []
accessing_error = []

for n, experiment in enumerate(projects):
    # Get the ID of the proyect and make the url
    experiment_ID = experiment['experimentAccession']
    seed_url = "https://www.ebi.ac.uk/gxa/sc/experiments/" + experiment_ID + "/experiment-design"
  
    # Print loop information
    print("Getting specimens from experiment \"" + experiment_ID + "\"")
    print("Number of errors: " + str(len(accessing_error)))
    print(f"{n+1}/{len(projects)}")
    # Wait between request so we dont overcharge server
    time.sleep(2.0)

    # Get the project samples information
    answer = requests.get(seed_url, headers=headers)
    
    # If couldn't get the information save the error
    if answer.status_code != requests.codes.ok:
        accessing_error.append(answer)
        clear_output(wait=True)
        continue
    
    # Parse response so we get the sample information
    parser = html.fromstring(answer.text)
    script_text = parser.xpath(".//div[@id='content']//script[3]/text()") 
    match = re.search(r'content: (?P<value>{.*})', script_text[0])
    value = match.group('value')
    
    # Add the samples to the list
    specimens.append(json.loads(value))
    
    clear_output(wait=True)

Getting specimens from experiment "E-MTAB-9221"
Number of errors: 0
153/153


In [50]:
accessing_error[0].url

IndexError: list index out of range

In [51]:
specimens_json = {'specimens': specimens}

In [52]:
with open('../SingleCell-Files/raw_data/SCAE_samples.json', 'w') as outfile:
    json.dump(specimens_json, outfile)

# Parse project to ontology format

In [53]:
specimens_json['specimens'][0]

{'experimentAccession': 'E-CURD-10',
 'accessKey': '',
 'species': 'homo sapiens',
 'disclaimer': '',
 'tabs': [{'type': 'results',
   'name': 'Results',
   'props': {'ks': [2, 3, 4, 20, 37, 50],
    'ksWithMarkerGenes': [2, 3, 4],
    'selectedK': 3,
    'perplexities': [10, 35, 1, 25, 45, 5, 30, 15, 50, 40, 20],
    'metadata': [{'value': 'growth_condition', 'label': 'Growth condition'}],
    'units': ['CPM'],
    'suggesterEndpoint': 'json/suggestions'}},
  {'type': 'experiment-design',
   'name': 'Experiment Design',
   'props': {'table': {'headers': [{'name': '', 'values': ['Assay']},
      {'name': 'Sample Characteristics',
       'values': ['organism',
        'age',
        'developmental stage',
        'sex',
        'growth condition',
        'organism part',
        'metastatic site',
        'sampling site',
        'disease',
        'clinical history',
        'single cell quality']},
      {'name': 'Experimental Variables',
       'values': ['single cell identifier', '