# Obtain projects and specimens from the Single Cell Expression Atlas

## Imports

In [1]:
import requests
import json
import re
import time
import numpy as np
import pandas as pd

from pprint import pprint
from lxml import html
from IPython.display import clear_output

## Get the projects and save them

In [2]:
def get_info_from_html(experiment_ID):
    """Fetch the JSON object embedded in an experiment's "experiment-design" page.

    Relies on the notebook-level globals ``headers`` (HTTP request headers)
    and ``accessing_error`` (list collecting the responses that failed).

    Args:
        experiment_ID: Experiment accession used to build the page URL.

    Returns:
        The decoded JSON object that follows ``content: `` in the page script,
        or ``None`` when the request does not succeed or the page does not
        contain the expected script. In both failure cases the response is
        appended to ``accessing_error``.
    """
    seed_url = "https://www.ebi.ac.uk/gxa/sc/experiments/" + experiment_ID + "/experiment-design"

    # Get the project samples information
    answer = requests.get(seed_url, headers=headers)

    if answer.status_code == requests.codes.ok:
        # Parse response so we get the sample information
        parser = html.fromstring(answer.text)
        script_text = parser.xpath(".//div[@id='content']//script[3]/text()")

        # The xpath may match nothing and the script may lack the "content:"
        # payload; both cases are treated like a failed request below
        match = None
        if script_text:
            match = re.search(r'content: (?P<value>{.*})', script_text[0])
        if match is not None:
            return json.loads(match.group('value'))

    # If couldn't get the information save the error
    accessing_error.append(answer)
    clear_output(wait=True)
    return None

In [3]:
def get_headers_dict_from_sample(sample):
    """Build the empty per-header containers for an experiment-design table.

    Args:
        sample: Experiment info object; the table headers are read from
            ``sample['tabs'][1]['props']['table']['headers']``.

    Returns:
        A tuple ``(headers_aux, header_types, header_order)`` where, for each
        of the keys ``'sample_characteristics'`` and
        ``'experimental_variables'``:

        * ``headers_aux[key]`` maps every header name to an empty ``set``,
        * ``header_types[key]`` lists the header names in table order
          (repeated names are kept),
        * ``header_order[key]`` is the index of that header group in the
          table; the key is absent when the group is not in the table.
    """
    # Table group name -> key used in the returned dictionaries
    section_keys = {
        'Sample Characteristics': 'sample_characteristics',
        'Experimental Variables': 'experimental_variables',
    }

    headers_aux = {key: {} for key in section_keys.values()}
    header_types = {key: [] for key in section_keys.values()}
    header_order = {}

    sample_headers = sample['tabs'][1]['props']['table']['headers']

    for n, sample_header in enumerate(sample_headers):
        key = section_keys.get(sample_header['name'])
        if key is None:
            # Any other header group (e.g. the unnamed one) is ignored
            continue

        header_order[key] = n
        for header_name in sample_header['values']:
            # If key has not been viewed yet, add it to dictionary
            headers_aux[key].setdefault(header_name, set())
            # Add the type of the header
            header_types[key].append(header_name)

    return headers_aux, header_types, header_order

In [4]:
def sample_to_dict(sample):
    """Collect the distinct values seen under every header of a sample table.

    The containers, header names and header group positions come from
    ``get_headers_dict_from_sample``; the rows are read from
    ``sample['tabs'][1]['props']['table']['data']``.

    Returns:
        The dictionary of sets, filled with the values of every row.
    """
    collected, names_by_type, position_by_type = get_headers_dict_from_sample(sample)
    rows = sample['tabs'][1]['props']['table']['data']

    for row in rows:
        row_values = row['values']
        for section in ('sample_characteristics', 'experimental_variables'):
            # Values of this row for the current header group
            section_values = row_values[position_by_type[section]]
            for header_name, value in zip(names_by_type[section], section_values):
                collected[section][header_name].add(value)

    return collected

In [5]:
def process_dict(dictionary):
    """Replace, in place, every second-level value by a list of its items.

    Returns:
        The same ``dictionary`` object that was passed in.
    """
    for inner in dictionary.values():
        for name in inner:
            inner[name] = list(inner[name])
    return dictionary

In [14]:
# JSON endpoint queried below to obtain the list of experiments
seed_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments"

In [15]:
# HTTP headers sent with every request (only a browser user-agent string)
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [16]:
# Download the experiments listing; its "experiments" entry is iterated in the next cell
answer = requests.get(seed_url, headers=headers)

In [17]:
# Responses that could not be used, and the projects gathered so far
accessing_error = []
projects = []

# Keep the listing under its own name: the per-experiment requests below use
# `response` / dedicated URL names so `answer` and `seed_url` are not
# overwritten and this cell can be re-run without re-running the previous ones
experiments = answer.json()['experiments']
n_experiments = len(experiments)

avoid_collections = ["Human Cell Atlas"]

for n, experiment in enumerate(experiments):
    experiment_ID = experiment['experimentAccession']
    repository_url = "https://www.ebi.ac.uk/gxa/sc/experiments/" + experiment_ID + "/results/tsne"

    # Print loop information
    print("Getting project \"" + experiment_ID + "\"")
    print("Number of errors: " + str(len(accessing_error)))
    print(f"{n+1}/{n_experiments}")

    # If the project is in a collection we already have, skip it
    if any(i in avoid_collections for i in experiment["experimentProjects"]):
        clear_output(wait=True)
        continue

    # Wait between requests so we don't overload the server
    time.sleep(2.0)

    # Get the project results page
    response = requests.get(repository_url, headers=headers)

    # If couldn't get the information save the error
    if response.status_code != requests.codes.ok:
        accessing_error.append(response)
        clear_output(wait=True)
        continue

    # Parse response so we get the useful information
    parser = html.fromstring(response.text)
    publication_link = parser.xpath(".//a[@class='pubmed-id']/@href")
    publication_title = parser.xpath(".//a[@class='pubmed-id']/text()")

    experiment['publication_link'] = publication_link
    experiment['publication_title'] = publication_title
    experiment['repository_link'] = repository_url
    experiment['supplementary_link'] = []

    # Get supplementary information
    supplementary_url = "https://www.ebi.ac.uk/gxa/sc/json/experiments/" + experiment_ID + "/resources/SUPPLEMENTARY_INFORMATION"
    # Wait between requests so we don't overload the server
    time.sleep(2.0)

    response = requests.get(supplementary_url, headers=headers)

    # If couldn't get the information save the error
    if response.status_code != requests.codes.ok:
        accessing_error.append(response)
        clear_output(wait=True)
        continue

    experiment['ENA_ID'] = None
    experiment['ArrayExpress_ID'] = None

    # For each repository entry, save the id and the url of the project
    for group in response.json():
        if group['type'] == 'icon-ena':  # If ENA repository
            ena_id_re = re.search(r'([SE]RP\d+)', group['description'])
            if ena_id_re:
                experiment['ENA_ID'] = ena_id_re.group(1)
                experiment['supplementary_link'].append(group['url'])

        elif group['type'] == 'icon-ae':  # If ArrayExpress repository
            ae_id_re = re.search(r'(E-MTAB-\d+)', group['description'])
            if ae_id_re:
                experiment['ArrayExpress_ID'] = ae_id_re.group(1)
                experiment['supplementary_link'].append(group['url'])

    # Get download information
    time.sleep(2.0)

    exp_info = get_info_from_html(experiment_ID)

    # get_info_from_html returns None (after recording the response in
    # accessing_error) when the page could not be fetched: skip the project
    if exp_info is None:
        clear_output(wait=True)
        continue

    experiment['downloads'] = exp_info['tabs'][3]['props']['data']

    # Experiment design: column names come from the three header groups
    exp_info_headers = exp_info['tabs'][1]['props']['table']['headers']
    column_names = exp_info_headers[0]['values'] + \
                   exp_info_headers[1]['values'] + \
                   exp_info_headers[2]['values']

    # Read tsv with experiment design data
    df = pd.read_csv(
        "https://www.ebi.ac.uk/gxa/sc/experiment/" + experiment_ID + "/download?fileType=experiment-design&accessKey=",
        sep="\t",
        low_memory=False)

    # Delete Ontology terms columns
    cols = [c for c in df.columns if 'ontology term' not in c.lower()]
    df = df[cols]

    # Rename columns
    df.columns = column_names
    df = df.loc[:, ~df.columns.duplicated()]  # Drop duplicated columns

    # Remove assay column from df
    df = df.drop('Assay', axis=1)

    # Remove duplicated rows
    df = df.drop_duplicates()

    if 'single cell identifier' in column_names:
        column_names.remove('single cell identifier')
        df = df.drop('single cell identifier', axis=1)

    # get_sample_type is defined in a later cell of this notebook
    sample_type = get_sample_type(df)
    experiment['sample_type'] = sample_type

    # Unique values of every remaining column, stored as strings
    experiment['project_info'] = {}
    for column in df:
        experiment['project_info'][column] = list(df[column].unique().astype(str))

    # Number of distinct individuals, or -1 when the column is not available
    if 'individual' in column_names:
        experiment['donors'] = len(experiment['project_info']['individual'])
    else:
        experiment['donors'] = -1

    # Add the project to the list
    projects.append(experiment)

    clear_output(wait=True)

Getting project "E-MTAB-9221"
Number of errors: 0
181/181


In [18]:
# Number of projects that were appended to the list by the loop above
print(len(projects))

159


In [19]:
def convert(o):
    """Turn a NumPy integer scalar into a built-in ``int``.

    Passed as the ``default`` argument of ``json.dump`` in this notebook.

    Args:
        o: Object to convert.

    Returns:
        ``int(o)`` when ``o`` is a NumPy integer of any width.

    Raises:
        TypeError: If ``o`` is not a NumPy integer.
    """
    # np.integer covers np.int64 as well as the other NumPy integer widths
    if isinstance(o, np.integer):
        return int(o)
    raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")

In [20]:
# Save the collected projects as JSON under the "experiments" key
with open('../../SingleCell-Files/raw_data/SCAE_projects.json', 'w') as outfile:
    json.dump({"experiments": projects}, outfile)

# Get the specimens and save them 

In [54]:
# Load the list stored under the "experiments" key of the projects file.
# NOTE(review): this reads from '../SingleCell-Files/...' while the cells that
# write the JSON files use '../../SingleCell-Files/...' — confirm which
# relative path is the right one for this notebook's location.
with open('../SingleCell-Files/raw_data/SCAE_projects.json') as f:
    projects = json.load(f)['experiments']

In [55]:
# HTTP headers sent with every request (only a browser user-agent string)
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [56]:
# URL pattern: https://www.ebi.ac.uk/gxa/sc/experiments/*EXPERIMENT NAME*/experiment-design
# The project at index 100 is used as the example for the exploratory cells below
experiment_ID = projects[100]['experimentAccession']
seed_url = "https://www.ebi.ac.uk/gxa/sc/experiments/" + experiment_ID + "/experiment-design"

In [57]:
# Download the experiment-design page of the example experiment
answer = requests.get(seed_url, headers=headers)

In [58]:
# Parse the downloaded HTML so it can be queried with xpath
parser = html.fromstring(answer.text)

In [228]:
# Text of the third <script> under the element with id "content" (same xpath as get_info_from_html)
script_text = parser.xpath(".//div[@id='content']//script[3]/text()")

In [229]:
# Show the raw HTML of the page for manual inspection
print(answer.text)




<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="utf-8">
    <title>Experiment < Single Cell Expression Atlas &lt; EMBL-EBI</title>
    <meta name="description" content="EMBL-EBI Single Cell Expression Atlas, an open public repository of single cell gene expression data">
    <meta name="keywords" content="expression atlas, single cell expression, gene expression, baseline expression, functional genomics, public repository, repository, bioinformatics, europe, institute">
    <meta name="author" content="EBI Gene Expression Team – https://www.ebi.ac.uk/about/people/irene-papatheodorou">
    <meta name="HandheldFriendly" content="true" />
    <meta name="MobileOptimized" content="width" />
    <meta name="viewport" content="width=device-width,initial-scale=1">
    <meta name="theme-color" content="#70BDBD"> <!-- Android Chrome mobile browser tab color -->

    <!-- Add information on the life cycle of this page -->
    <meta name="ebi:owner" content="Irene Papatheodorou <ir

In [230]:
# Capture the {...} object that follows "content: " in the script text
match = re.search(r'content: (?P<value>{.*})', script_text[0])

# Show the captured JSON text
pprint(match.group('value'))

('{"experimentAccession":"E-CURD-55","accessKey":"","species":"homo '
 'sapiens","disclaimer":"","tabs":[{"type":"results","name":"Results","props":{"ks":[7,12,17,21,26,38,50,61,72],"ksWithMarkerGenes":[7,12,17,21,26,38,50,61,72],"selectedK":26,"perplexities":[10,35,1,25,45,5,30,15,50,40,20],"metadata":[{"value":"individual","label":"Individual"},{"value":"disease","label":"Disease"},{"value":"disease_staging","label":"Disease '
 'staging"}],"units":["CPM"],"suggesterEndpoint":"json/suggestions"}},{"type":"experiment-design","name":"Experiment '
 'Design","props":{"table":{"headers":[{"name":"","values":["Assay"]},{"name":"Sample '
 'Characteristics","values":["organism","developmental '
 'stage","individual","disease","disease staging","sampling time '
 'point","organism part"]},{"name":"Experimental '
 'Variables","values":["disease","disease '
 'staging"]}],"data":[{"properties":{"analysed":true},"values":[["SAMC150711-AAAAACGACGCTCTTC"],["Homo '
 'sapiens","adult","ERS1","COVID-19"

 'sapiens","adult","ERS1","COVID-19","early recovery stage","less than 7 days '
 'after negative nucleic acid blood test","blood"],["COVID-19","early recovery '
 'stage"]]},{"properties":{"analysed":true},"values":[["SAMC150711-ACTGAGTCAGCCAGAA"],["Homo '
 'sapiens","adult","ERS1","COVID-19","early recovery stage","less than 7 days '
 'after negative nucleic acid blood test","blood"],["COVID-19","early recovery '
 'stage"]]},{"properties":{"analysed":true},"values":[["SAMC150711-ACTGAGTGTAAATGAC"],["Homo '
 'sapiens","adult","ERS1","COVID-19","early recovery stage","less than 7 days '
 'after negative nucleic acid blood test","blood"],["COVID-19","early recovery '
 'stage"]]},{"properties":{"analysed":true},"values":[["SAMC150711-ACTGAGTGTCTGATCA"],["Homo '
 'sapiens","adult","ERS1","COVID-19","early recovery stage","less than 7 days '
 'after negative nucleic acid blood test","blood"],["COVID-19","early recovery '
 'stage"]]},{"properties":{"analysed":true},"values":[["SAMC150711-ACTG

In [25]:
# Specimens gathered from every project; accessing_error is reset because
# get_info_from_html (called through get_specimens) appends failed responses to it
specimens = []
accessing_error = []

n_projects = len(projects)

for n, experiment in enumerate(projects):
    # Get the ID of the project (get_specimens builds the URLs it needs itself)
    experiment_ID = experiment['experimentAccession']

    # Print loop information
    print("Getting specimens from experiment \"" + experiment_ID + "\"")
    print("Number of errors: " + str(len(accessing_error)))
    print(f"{n+1}/{n_projects}")

    # Wait between requests so we don't overload the server
    time.sleep(2.0)

    # Get specimens info and tag each specimen with its project data
    experiment_specimens = get_specimens(experiment_ID)
    for item in experiment_specimens:
        item.update({
            "project_title": experiment['experimentDescription'],
            "experiment_projects": experiment['experimentProjects']
        })

    # Add the samples to the list
    specimens += experiment_specimens

    clear_output(wait=True)

Getting specimens from experiment "E-MTAB-9221"
Number of errors: 0
159/159


In [26]:
# Wrap the list under a "specimens" key for the output file
specimens_json = {'specimens': specimens}

In [27]:
# Save the specimens as JSON; `convert` is called for values json cannot serialize by itself
with open('../../SingleCell-Files/raw_data/SCAE_samples.json', 'w') as outfile:
    json.dump(specimens_json, outfile, default=convert)

# Tests

In [31]:
import requests
import json
import re
import time
import pandas as pd

from pprint import pprint
from lxml import html
from IPython.display import clear_output

In [10]:
def get_gb_columns(columns):
    """Choose the columns used to group the experiment design into specimens.

    Each tuple below is a list of alternatives in priority order: at most one
    name per tuple, the first one found in ``columns``, is selected.

    Args:
        columns: Collection of available column names.

    Returns:
        List with the selected column names, in the order of the tuples.
    """
    alternatives = (
        ('individual', 'disease', 'strain', 'cell line'),
        ('organism part',),
        ('sampling site', 'metastatic site'),
        ('growth condition',),
        ('stimulus',),
    )

    gb_columns = []
    for candidates in alternatives:
        chosen = next((name for name in candidates if name in columns), None)
        if chosen is not None:
            gb_columns.append(chosen)

    return gb_columns

In [11]:
def get_sample_type(dataframe):
    """Classify the rows of an experiment design as cell lines, organoids or specimens.

    * ``'CellLines'`` is added when a ``'cell line'`` column exists.
    * ``'Organoids'`` is added when the first existing column among
      ``'growth condition'``, ``'individual'`` and ``'organism part'`` holds a
      value whose string form contains ``'organoid'``; the remaining columns
      are not inspected.
    * ``['Specimens']`` is returned when none of the above applies.

    Args:
        dataframe: pandas DataFrame with the experiment design columns.

    Returns:
        List of sample type labels.
    """
    available = dataframe.columns
    labels = []

    # Cell line
    if 'cell line' in available:
        labels.append('CellLines')

    # Organoid: only the first available candidate column is checked
    candidates = ('growth condition', 'individual', 'organism part')
    checked_column = next((name for name in candidates if name in available), None)
    if checked_column is not None:
        if any('organoid' in str(value) for value in dataframe[checked_column]):
            labels.append('Organoids')

    # Specimen
    return labels if labels else ['Specimens']

In [23]:
def _specimen_from_rows(project_id, rows, stringify):
    """Build one specimen dictionary from the design rows that belong to it.

    Args:
        project_id: Experiment accession stored in the specimen.
        rows: DataFrame with an ``Assay`` column plus the descriptive columns.
        stringify: When true, the unique values of each column are converted
            to strings before being stored.
    """
    cells = list(rows.Assay)
    info_rows = rows.drop('Assay', axis=1).drop_duplicates()

    specimen_info = {}
    for column in info_rows:
        unique_values = info_rows[column].unique()
        if stringify:
            unique_values = unique_values.astype(str)
        specimen_info[column] = list(unique_values)

    return {
        'project_id': project_id,
        'cells': cells,
        'num_cells': len(cells),
        'sample_type': get_sample_type(info_rows),
        'specimen_info': specimen_info,
    }


def get_specimens(project_id):
    """Download an experiment design and split it into specimens.

    The design rows are grouped by the columns chosen by ``get_gb_columns``;
    every group becomes one specimen. When no grouping column is available the
    whole experiment is returned as a single specimen, whose values are stored
    as strings.

    Args:
        project_id: Experiment accession.

    Returns:
        List of specimen dictionaries with the keys ``project_id``, ``cells``,
        ``num_cells``, ``sample_type`` and ``specimen_info``. The list is empty
        when ``get_info_from_html`` returns ``None`` for the experiment.
    """
    exp_info = get_info_from_html(project_id)

    # get_info_from_html returns None when the page could not be fetched
    if exp_info is None:
        return []

    # Column names come from the three header groups of the design table
    exp_info_headers = exp_info['tabs'][1]['props']['table']['headers']
    column_names = exp_info_headers[0]['values'] +\
                   exp_info_headers[1]['values'] +\
                   exp_info_headers[2]['values']

    # Get columns for the group by
    gb_columns = get_gb_columns(column_names)

    # Read tsv with experiment design data
    df = pd.read_csv("https://www.ebi.ac.uk/gxa/sc/experiment/" + project_id + "/download?fileType=experiment-design&accessKey=", sep="\t", low_memory=False)

    # Delete Ontology terms columns
    cols = [c for c in df.columns if 'ontology term' not in c.lower()]
    df = df[cols]

    # Rename columns
    df.columns = column_names
    df = df.loc[:, ~df.columns.duplicated()]  # Drop duplicated columns

    if 'single cell identifier' in column_names:
        column_names.remove('single cell identifier')
        df = df.drop('single cell identifier', axis=1)

    # If there is no column to group by, the whole design is one specimen
    if not gb_columns:
        return [_specimen_from_rows(project_id, df, stringify=True)]

    # Each group is one specimen.
    # NOTE(review): pandas groupby skips rows whose group key is missing (NaN)
    # by default — confirm the group-by columns are always filled in.
    return [
        _specimen_from_rows(project_id, group, stringify=False)
        for _, group in df.groupby(by=gb_columns)
    ]

In [24]:
# Manual check of get_specimens on a single experiment
get_specimens('E-CURD-11')

[{'project_id': 'E-CURD-11',
  'cells': ['SRR2049340',
   'SRR2049341',
   'SRR2049342',
   'SRR2049343',
   'SRR2049344',
   'SRR2049345',
   'SRR2049346',
   'SRR2049347',
   'SRR2049348',
   'SRR2049349',
   'SRR2049350',
   'SRR2049351',
   'SRR2049352',
   'SRR2049353',
   'SRR2049354',
   'SRR2049355',
   'SRR2049356',
   'SRR2049357',
   'SRR2049358',
   'SRR2049359',
   'SRR2049360',
   'SRR2049361',
   'SRR2049362',
   'SRR2049363',
   'SRR2049364',
   'SRR2049365',
   'SRR2049366',
   'SRR2049367',
   'SRR2049368',
   'SRR2049369',
   'SRR2049370',
   'SRR2049371',
   'SRR2049372',
   'SRR2049373',
   'SRR2049374',
   'SRR2049375',
   'SRR2049376',
   'SRR2049377',
   'SRR2049378',
   'SRR2049379',
   'SRR2049380',
   'SRR2049381',
   'SRR2049382',
   'SRR2049383',
   'SRR2049384',
   'SRR2049385',
   'SRR2049386',
   'SRR2049387',
   'SRR2049388',
   'SRR2049389'],
  'num_cells': 50,
  'sample_type': ['CellLines'],
  'specimen_info': {'organism': ['Homo sapiens'],
   'indivi