# Obtain Projects and specimens from Single Cell Expression Atlas

## Imports

In [7]:
import requests
import json
import re
import time

from pprint import pprint
from lxml import html
from IPython.display import clear_output

## Get the projects and save them

In [12]:
def get_info_from_html(experiment_ID):
    seed_url = "https://www.ebi.ac.uk/gxa/sc/experiments/" + experiment_ID + "/experiment-design"
  
    # Get the project samples information
    answer = requests.get(seed_url, headers=headers)
    
    # If couldn't get the information save the error
    if answer.status_code != requests.codes.ok:
        accessing_error.append(answer)
        clear_output(wait=True)
        return None
    
    # Parse response so we get the sample information
    parser = html.fromstring(answer.text)
    script_text = parser.xpath(".//div[@id='content']//script[3]/text()") 
    match = re.search(r'content: (?P<value>{.*})', script_text[0])
    value = match.group('value')
    
    return value

In [14]:
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [15]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [16]:
answer = requests.get(seed_url, headers=headers)

In [17]:
accessing_error = []
projects = []
n_experiments = len(answer.json()["experiments"])

avoid_collections = ["Human Cell Atlas"]

for n, experiment in enumerate(answer.json()['experiments']):
    experiment_ID = experiment['experimentAccession']
    seed_url = "https://www.ebi.ac.uk/gxa/sc/experiments/" + experiment_ID + "/results/tsne"
    
    # Print loop information
    print("Getting project \"" + experiment_ID + "\"")
    print("Number of errors: " + str(len(accessing_error)))
    print(f"{n+1}/{n_experiments}")
    
    # If the projects is in a repository we already have we skip it
    if [i for i in experiment["experimentProjects"] if i in avoid_collections]:
        clear_output(wait=True)
        continue
    
    # Wait between request so we dont overcharge server
    time.sleep(2.0)
    
    # Get the project samples information
    answer = requests.get(seed_url, headers=headers)
    
    # If couldn't get the information save the error
    if answer.status_code != requests.codes.ok:
        accessing_error.append(answer)
        clear_output(wait=True)
        continue
        
    # Parse response so we get the useful information
    parser = html.fromstring(answer.text)
    publication_link = parser.xpath(".//a[@class='pubmed-id']/@href")
    publication_title = parser.xpath(".//a[@class='pubmed-id']/text()")

    experiment['publication_link'] = publication_link
    experiment['publication_title'] = publication_title
    experiment['repository_link'] = seed_url
    experiment['supplementary_link'] = []
    
    # Get supplementary information
    seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments/" + experiment_ID + "/resources/SUPPLEMENTARY_INFORMATION"
    ## Wait between request so we dont overcharge server
    time.sleep(2.0)
    
    answer = requests.get(seed_url, headers=headers)
    answer_json = answer.json()
    experiment['ENA_ID'] = None
    experiment['ArrayExpress_ID'] = None

    # For each repository information, save the id and the url of the project
    for group in answer.json():
        if group['type'] == 'icon-ena': # If ENA repository
            ena_id_re = re.search('([SE]RP\d+)', group['description'])
            
            if not ena_id_re:
                continue
            
            ena_id = ena_id_re.group(1)
            
            experiment['ENA_ID'] = ena_id
            experiment['supplementary_link'].append(group['url'])
            
        elif group['type'] == 'icon-ae': # If Array Expression repository
            ae_id_re = re.search('(E-MTAB-\d+)', group['description'])
            
            if not ae_id_re:
                continue
            
            ae_id = ae_id_re.group(1)
            
            experiment['ArrayExpress_ID'] = ae_id
            experiment['supplementary_link'].append(group['url'])
            
        else:
            continue
    
    # Get download information
    time.sleep(2.0)
   
    info = get_info_from_html(experiment_ID)
    
    print(info)
    
    experiment['downloads'] = info['tabs'][3]['props']['data']
    
    # Add the project to the list
    projects.append(experiment)
    
    clear_output(wait=True)

Getting project "E-CURD-10"
Number of errors: 0
1/175


TypeError: string indices must be integers

In [None]:
print(len(projects))

In [None]:
with open('../SingleCell-Files/raw_data/SCAE_projects.json', 'w') as outfile:
    json.dump({"experiments": projects}, outfile)

# Get the specimens and save them 

In [8]:
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [9]:
# https://www.ebi.ac.uk/gxa/sc/experiments/*NOMBRE EXPERIMENTO*/experiment-design
experiment_ID = projects[100]['experimentAccession']
seed_url = "https://www.ebi.ac.uk/gxa/sc/experiments/" + experiment_ID + "/experiment-design"

In [10]:
answer = requests.get(seed_url, headers=headers)

In [11]:
parser = html.fromstring(answer.text)

In [12]:
script_text = parser.xpath(".//div[@id='content']//script[3]/text()")

In [13]:
print(answer.text)




<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="utf-8">
    <title>Experiment < Single Cell Expression Atlas &lt; EMBL-EBI</title>
    <meta name="description" content="EMBL-EBI Single Cell Expression Atlas, an open public repository of single cell gene expression data">
    <meta name="keywords" content="expression atlas, single cell expression, gene expression, baseline expression, functional genomics, public repository, repository, bioinformatics, europe, institute">
    <meta name="author" content="EBI Gene Expression Team – https://www.ebi.ac.uk/about/people/irene-papatheodorou">
    <meta name="HandheldFriendly" content="true" />
    <meta name="MobileOptimized" content="width" />
    <meta name="viewport" content="width=device-width,initial-scale=1">
    <meta name="theme-color" content="#70BDBD"> <!-- Android Chrome mobile browser tab color -->

    <!-- Add information on the life cycle of this page -->
    <meta name="ebi:owner" content="Irene Papatheodorou <ir

In [18]:
match = re.search(r'content: (?P<value>{.*})', script_text[0])

pprint(match.group('value'))

('{"experimentAccession":"E-ENAD-21","accessKey":"","species":"homo '
 'sapiens","disclaimer":"","tabs":[{"type":"results","name":"Results","props":{"ks":[3,4,7,9,10,14,17,24,28],"ksWithMarkerGenes":[3,4,7,9,10,14,17,24],"selectedK":3,"perplexities":[10,35,1,25,45,5,30,15,50,40,20],"metadata":[{"value":"cell_type","label":"Cell '
 'type"},{"value":"individual","label":"Individual"},{"value":"age","label":"Age"}],"units":["CPM"],"suggesterEndpoint":"json/suggestions"}},{"type":"experiment-design","name":"Experiment '
 'Design","props":{"table":{"headers":[{"name":"","values":["Assay"]},{"name":"Sample '
 'Characteristics","values":["organism","individual","sex","age","ethnic '
 'group","clinical information","organism part","facs marker","cell '
 'type","fluidigm c1 run"]},{"name":"Experimental Variables","values":["single '
 'cell identifier","cell '
 'type"]}],"data":[{"properties":{"analysed":true},"values":[["SRR7008457"],["Homo '
 'sapiens","individual1","female","37 year","Caucasi

 'epithelial cell of mammary gland","L4_I1"],["L4_I1_LUM_A10","luminal '
 'epithelial cell of mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7008691"],["Homo '
 'sapiens","individual1","female","37 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","luminal '
 'epithelial cell of mammary gland","L4_I1"],["L4_I1_LUM_A11","luminal '
 'epithelial cell of mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7008692"],["Homo '
 'sapiens","individual1","female","37 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","luminal '
 'epithelial cell of mammary gland","L4_I1"],["L4_I1_LUM_A12","luminal '
 'epithelial cell of mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7008693"],["Homo '
 'sapiens","individual1","female","37 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","luminal '
 'epithelial cell of mammary

 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","myoepithelial '
 'cell of mammary gland","L1_I2"],["L1_I2_BAS_D11","myoepithelial cell of '
 'mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7008797"],["Homo '
 'sapiens","individual2","female","35 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","myoepithelial '
 'cell of mammary gland","L1_I2"],["L1_I2_BAS_D12","myoepithelial cell of '
 'mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7008798"],["Homo '
 'sapiens","individual2","female","35 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","myoepithelial '
 'cell of mammary gland","L1_I2"],["L1_I2_BAS_E1","myoepithelial cell of '
 'mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7008799"],["Homo '
 'sapiens","individual2","female","35 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49

 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","luminal '
 'epithelial cell of mammary gland","L2_I2"],["L2_I2_LUM_G1","luminal '
 'epithelial cell of mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7008897"],["Homo '
 'sapiens","individual2","female","35 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","luminal '
 'epithelial cell of mammary gland","L2_I2"],["L2_I2_LUM_G2","luminal '
 'epithelial cell of mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7008898"],["Homo '
 'sapiens","individual2","female","35 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","luminal '
 'epithelial cell of mammary gland","L2_I2"],["L2_I2_LUM_G4","luminal '
 'epithelial cell of mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7008899"],["Homo '
 'sapiens","individual2","female","35 year","Caucasian","reduction '
 'mammoplasty","breast epitheli

 'sapiens","individual2","female","35 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","myoepithelial '
 'cell of mammary gland","L3_I2"],["L3_I2_BAS_H3","myoepithelial cell of '
 'mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7008997"],["Homo '
 'sapiens","individual2","female","35 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","myoepithelial '
 'cell of mammary gland","L3_I2"],["L3_I2_BAS_H4","myoepithelial cell of '
 'mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7008998"],["Homo '
 'sapiens","individual2","female","35 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","myoepithelial '
 'cell of mammary gland","L3_I2"],["L3_I2_BAS_H5","myoepithelial cell of '
 'mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7008999"],["Homo '
 'sapiens","individual2","female","35 year","Caucasian","redu

 'sapiens","individual3","female","20 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","luminal '
 'epithelial cell of mammary gland","L2_I3"],["L2_I3_LUM_A12","luminal '
 'epithelial cell of mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7010636"],["Homo '
 'sapiens","individual3","female","20 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","luminal '
 'epithelial cell of mammary gland","L2_I3"],["L2_I3_LUM_B1","luminal '
 'epithelial cell of mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7010637"],["Homo '
 'sapiens","individual3","female","20 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","luminal '
 'epithelial cell of mammary gland","L2_I3"],["L2_I3_LUM_B2","luminal '
 'epithelial cell of mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7010638"],["Homo '
 'sapiens","individual3","fema

 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7010763"],["Homo '
 'sapiens","individual3","female","20 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","myoepithelial '
 'cell of mammary gland","L3_I3"],["L3_I3_BAS_F6","myoepithelial cell of '
 'mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7010764"],["Homo '
 'sapiens","individual3","female","20 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","myoepithelial '
 'cell of mammary gland","L3_I3"],["L3_I3_BAS_F7","myoepithelial cell of '
 'mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7010765"],["Homo '
 'sapiens","individual3","female","20 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","myoepithelial '
 'cell of mammary gland","L3_I3"],["L3_I3_BAS_F9","myoepithelial cell of '
 'mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR70

 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7010835"],["Homo '
 'sapiens","individual3","female","20 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","myoepithelial '
 'cell of mammary gland","L5_I3"],["L5_I3_BAS_E6","myoepithelial cell of '
 'mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7010836"],["Homo '
 'sapiens","individual3","female","20 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","myoepithelial '
 'cell of mammary gland","L5_I3"],["L5_I3_BAS_E7","myoepithelial cell of '
 'mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR7010837"],["Homo '
 'sapiens","individual3","female","20 year","Caucasian","reduction '
 'mammoplasty","breast epithelium","CD31, CD45, EpCAM, CD49f","myoepithelial '
 'cell of mammary gland","L5_I3"],["L5_I3_BAS_E8","myoepithelial cell of '
 'mammary '
 'gland"]]},{"properties":{"analysed":true},"values":[["SRR70

In [15]:
specimens = []
accessing_error = []

for n, experiment in enumerate(projects):
    # Get the ID of the proyect and make the url
    experiment_ID = experiment['experimentAccession']
    seed_url = "https://www.ebi.ac.uk/gxa/sc/experiments/" + experiment_ID + "/experiment-design"
  
    # Print loop information
    print("Getting specimens from experiment \"" + experiment_ID + "\"")
    print("Number of errors: " + str(len(accessing_error)))
    print(f"{n+1}/{len(projects)}")
    # Wait between request so we dont overcharge server
    time.sleep(2.0)

    # Get the project samples information
    answer = requests.get(seed_url, headers=headers)
    
    # If couldn't get the information save the error
    if answer.status_code != requests.codes.ok:
        accessing_error.append(answer)
        clear_output(wait=True)
        continue
    
    # Parse response so we get the sample information
    parser = html.fromstring(answer.text)
    script_text = parser.xpath(".//div[@id='content']//script[3]/text()") 
    match = re.search(r'content: (?P<value>{.*})', script_text[0])
    value = match.group('value')
    
    # Add the samples to the list
    specimens.append(json.loads(value))
    
    clear_output(wait=True)

Getting specimens from experiment "E-ENAD-27"
Number of errors: 0
24/153


KeyboardInterrupt: 

In [None]:
accessing_error[0].url

In [None]:
specimens_json = {'specimens': specimens}

In [None]:
with open('../SingleCell-Files/raw_data/SCAE_samples.json', 'w') as outfile:
    json.dump(specimens_json, outfile)