In [1]:
import json

In [2]:
with open('../SingleCell-Files/raw_data/SCAE_projects.json') as f:
    SCAE_projects = json.load(f)

In [3]:
projects = SCAE_projects['experiments']

In [4]:
print(json.dumps(projects[0], indent=2, sort_keys=True))

{
  "ArrayExpress_ID": null,
  "ENA_ID": "SRP063840",
  "downloads": [
    {
      "files": [
        {
          "description": "Experiment metadata (SDRF and IDF files archive)",
          "isDownload": true,
          "type": "icon-tsv",
          "url": "experiment/E-CURD-10/download/zip?fileType=experiment-metadata&accessKey="
        },
        {
          "description": "Experiment design file (TSV format)",
          "isDownload": true,
          "type": "icon-experiment-design",
          "url": "experiment/E-CURD-10/download?fileType=experiment-design&accessKey="
        }
      ],
      "title": "Metadata files"
    },
    {
      "files": [
        {
          "description": "Clustering file (TSV format)",
          "isDownload": true,
          "type": "icon-tsv",
          "url": "experiment/E-CURD-10/download?fileType=cluster&accessKey="
        },
        {
          "description": "Filtered TPMs files (MatrixMarket archive)",
          "isDownload": true,
          "ty

In [28]:
experiment_accession = {}
experiment_description = {}
experiment_projects = {}
experiment_type = {}
experimental_factors = {}
kingdoms = {}
raw_experiment_type = {}
species = []
technology_types = []

In [29]:
for project in projects:
    technology_types += project['technologyType']
    species.append(project['species'])

In [30]:
species = set(species)
print(species)

{'Mus Musculus', 'Anopheles gambiae', 'Schistosoma mansoni', 'Homo sapiens', 'Danio rerio', 'Mus musculus', 'Rattus norvegicus', 'Callithrix jacchus', 'Gallus gallus', 'Caenorhabditis elegans', 'Drosophila melanogaster', 'Saccharomyces cerevisiae', 'Arabidopsis thaliana', 'Plasmodium berghei', 'Plasmodium falciparum 3D7'}


In [31]:
technology_types = set(technology_types)
print(technology_types)

{'smart-seq2', 'seq-well', '10xv2', '10xv3', 'smart-like', 'smart-seq', 'drop-seq'}


# Samples

In [3]:
with open('../SingleCell-Files/raw_data/SCAE_samples.json') as f:
    SCAE_samples = json.load(f)

In [7]:
print(json.dumps(SCAE_samples['specimens'][0], indent=2, sort_keys=True))

{
  "accessKey": "",
  "disclaimer": "",
  "experimentAccession": "E-CURD-10",
  "species": "homo sapiens",
  "tabs": [
    {
      "name": "Results",
      "props": {
        "ks": [
          2,
          3,
          4,
          20,
          37,
          50
        ],
        "ksWithMarkerGenes": [
          2,
          3,
          4
        ],
        "metadata": [
          {
            "label": "Growth condition",
            "value": "growth_condition"
          }
        ],
        "perplexities": [
          10,
          35,
          1,
          25,
          45,
          5,
          30,
          15,
          50,
          40,
          20
        ],
        "selectedK": 3,
        "suggesterEndpoint": "json/suggestions",
        "units": [
          "CPM"
        ]
      },
      "type": "results"
    },
    {
      "name": "Experiment Design",
      "props": {
        "downloadUrl": "experiment/E-CURD-10/download?fileType=experiment-design&accessKey=",
        "

In [4]:
samples = SCAE_samples['specimens']

In [5]:
len(samples)

153

## Downloads

In [35]:
import pprint
data_format = {}

for sample in samples:
    samples_data = sample['tabs'][3]['props']['data']
    for sample_data in samples_data:
        titulo = sample_data['title']
        
        if titulo not in data_format.keys():
            data_format[titulo] = set()
        
        files = sample_data['files']
        
        for file in files:
            if file['isDownload']:
                data_format[titulo].add(file['description'])

pprint.pprint(data_format)

{'Metadata files': {'Experiment design file (TSV format)',
                    'Experiment metadata (SDRF and IDF files archive)'},
 'Result files': {'Clustering file (TSV format)',
                  'Filtered TPMs files (MatrixMarket archive)',
                  'Marker gene files (TSV files archive)',
                  'Normalised counts files (MatrixMarket archive)',
                  'Raw counts files (MatrixMarket archive)'}}


## Experiment design

Headers

In [27]:
import pprint

headers = {
    'sample_characteristics': set(),
    'experimental_variables': set()
}

len_headers = set()

for sample in samples:
    sample_headers = sample['tabs'][1]['props']['table']['headers']
    
    for sample_header in sample_headers:
        if sample_header['name'] == 'Sample Characteristics':
            headers['sample_characteristics'] |= set(sample_header['values'])
        elif sample_header['name'] == 'Experimental Variables':
            headers['experimental_variables'] |= set(sample_header['values'])
    
pprint.pprint(headers)

{'experimental_variables': {'age',
                            'biopsy site',
                            'block',
                            'cell cycle phase',
                            'cell line',
                            'cell type',
                            'clinical history',
                            'clinical information',
                            'compound',
                            'developmental stage',
                            'disease',
                            'dose',
                            'environmental stress',
                            'facs sorting',
                            'fraction',
                            'genotype',
                            'growth condition',
                            'histology',
                            'immunophenotype',
                            'individual',
                            'infect',
                            'infection',
                            'inferred cell type - author

In [6]:
def get_headers_dict_from_sample(sample):
    headers_aux = {
        'sample_characteristics': {},
        'experimental_variables': {}
    }

    header_order = {}
    header_types = {
        'sample_characteristics': [],
        'experimental_variables': []        
    }

    # Init dictionary and the header types for the sample
    sample_headers = sample['tabs'][1]['props']['table']['headers']
    sample_characteristics_headers = []
    experimental_variables_headers = []

    for n, sample_header in enumerate(sample_headers):
        if sample_header['name'] == 'Sample Characteristics':
            header_order['sample_characteristics'] = n
            for sample_characteristic in sample_header['values']:
                # If key has not been viewed yet, add it to dictionary
                if sample_characteristic not in headers_aux['sample_characteristics'].keys():
                    headers_aux['sample_characteristics'][sample_characteristic] = set()

                # Add the type of the header
                header_types['sample_characteristics'] += [sample_characteristic]
        elif sample_header['name'] == 'Experimental Variables':
            header_order['experimental_variables'] = n
            for experimental_variable in sample_header['values']:
                # If key has not been viewed yet, add it to dictionary
                if sample_characteristic not in headers_aux['experimental_variables'].keys():
                    headers_aux['experimental_variables'][sample_characteristic] = set()

                # Add the types of the header
                header_types['experimental_variables'] += [sample_characteristic]
                
    return headers_aux, header_types, header_order

In [7]:
def sample_to_dict(sample):
    # Get headers, initiating dict, headers types and the order of the headers
    dictionary, header_types, header_order = get_headers_dict_from_sample(sample)
    
    # Get sample data
    sample_data = sample['tabs'][1]['props']['table']['data']

    # For each cell in data
    for cell in sample_data:
        # For each header type
        for header_type in ['sample_characteristics', 'experimental_variables']:
            # Add the values to the dictionary
            values = cell['values'][header_order[header_type]]
            headers = header_types[header_type]
            for value, header_name in zip(values, headers):
                dictionary[header_type][header_name].add(value)
        
    return dictionary

In [66]:
sample_to_dict(samples[0])

{'sample_characteristics': {'organism': {'Homo sapiens'},
  'age': {'43 year'},
  'developmental stage': {'adult'},
  'sex': {'male'},
  'growth condition': {'biopsy from metastatic neoplasm',
   'biopsy from primary neoplasm',
   'patient-derived mouse xenograft from mixed primary and metastatic neoplasm'},
  'organism part': {'kidney'},
  'metastatic site': {'lung', 'not applicable'},
  'sampling site': {'neoplasm'},
  'disease': {'renal cell carcinoma'},
  'clinical history': {'tumor refractory to pazopanib, everolimus and high-dose interleukin-2'},
  'single cell quality': {'OK'}},
 'experimental_variables': {'single cell quality': {'PDX_mRCC_SC_01',
   'PDX_mRCC_SC_03',
   'PDX_mRCC_SC_04',
   'PDX_mRCC_SC_05',
   'PDX_mRCC_SC_06',
   'PDX_mRCC_SC_08',
   'PDX_mRCC_SC_12',
   'PDX_mRCC_SC_14',
   'PDX_mRCC_SC_15',
   'PDX_mRCC_SC_20',
   'PDX_mRCC_SC_21',
   'PDX_mRCC_SC_34',
   'PDX_mRCC_SC_41',
   'PDX_mRCC_SC_44',
   'PDX_mRCC_SC_52',
   'PDX_mRCC_SC_56',
   'PDX_mRCC_SC_60',
 

In [90]:
def merge_dict(dict1, dict2):
    ''' Merge dictionaries and keep values of common keys in list'''
    dict3 = {}
    for key in dict1.keys():
        dict3[key] = {**dict1[key], **dict2[key]}
        
        for key2, value in dict3[key].items():
            if key2 in dict1[key] and key2 in dict2[key]:
                dict3[key][key2] |= dict1[key][key2]
    
    return dict3

{'a': {'f': {1, 2, 10, 11, 12}}, 'b': {'u': {10, 14, 15}, 'f': {11}}}

In [8]:
def process_dict(dictionary):
    for key in dictionary:
        for key2 in dictionary[key]:
            dictionary[key][key2] = list(dictionary[key][key2])
    return dictionary

In [109]:
dictionary_values = {
    'sample_characteristics': {},
    'experimental_variables': {}
}

for sample in samples:
    sample_dict = sample_to_dict(sample)
    
    dictionary_values = merge_dict(dictionary_values, sample_dict)

dictionary_values = process_dict(dictionary_values)

with open('../SingleCell-Files/values.json', 'w') as outfile:
    json.dump(dictionary_values, outfile)

In [34]:
headers_aux = {
    'sample_characteristics': {},
    'experimental_variables': {}
}


for sample in [samples[0]]:
    # Init dictionary and the header types for the sample
    sample_headers = sample['tabs'][1]['props']['table']['headers']
    header_types = []

    for sample_header in sample_headers:
        if sample_header['name'] == 'Sample Characteristics':
            for sample_characteristic in sample_header['values']:
                # If key has not been viewed yet, add it to dictionary
                if sample_characteristic not in headers_aux['sample_characteristics'].keys():
                    headers_aux['sample_characteristics'][sample_characteristic] = set()
                
            # Add the types of the header
            header_types += ['sample_characteristics'] * len(sample_header['values'])
        elif sample_header['name'] == 'Experimental Variables':
            for experimental_variable in sample_header['values']:
                # If key has not been viewed yet, add it to dictionary
                if sample_characteristic not in headers_aux['experimental_variables'].keys():
                    headers_aux['experimental_variables'][sample_characteristic] = set()
            
            # Add the types of the header
            header_types += ['experimental_variables'] * len(sample_header['values'])    
    
    # Read the experiment data from the sample
    sample_data = sample['tabs'][1]['props']['table']['data']
    
    
pprint.pprint(headers_aux)
pprint.pprint(header_types)

{'experimental_variables': {'single cell quality': set()},
 'sample_characteristics': {'age': set(),
                            'clinical history': set(),
                            'developmental stage': set(),
                            'disease': set(),
                            'growth condition': set(),
                            'metastatic site': set(),
                            'organism': set(),
                            'organism part': set(),
                            'sampling site': set(),
                            'sex': set(),
                            'single cell quality': set()}}
['sample_characteristics',
 'sample_characteristics',
 'sample_characteristics',
 'sample_characteristics',
 'sample_characteristics',
 'sample_characteristics',
 'sample_characteristics',
 'sample_characteristics',
 'sample_characteristics',
 'sample_characteristics',
 'sample_characteristics',
 'experimental_variables',
 'experimental_variables']


In [9]:
import re
import requests
from lxml import html

def get_info_from_html(experiment_ID):
    seed_url = "https://www.ebi.ac.uk/gxa/sc/experiments/" + experiment_ID + "/experiment-design"
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
    }
    # Get the project samples information
    answer = requests.get(seed_url, headers=headers)
    
    # If couldn't get the information save the error
    if answer.status_code != requests.codes.ok:
       # accessing_error.append(answer)
        clear_output(wait=True)
        return None
    
    # Parse response so we get the sample information
    parser = html.fromstring(answer.text)
    script_text = parser.xpath(".//div[@id='content']//script[3]/text()") 
    match = re.search(r'content: (?P<value>{.*})', script_text[0])
    value = match.group('value')
    
    return value

In [49]:
sample = get_info_from_html('E-CURD-10')
sample

'{"experimentAccession":"E-CURD-10","accessKey":"","species":"homo sapiens","disclaimer":"","tabs":[{"type":"results","name":"Results","props":{"ks":[2,3,4,20,37,50],"ksWithMarkerGenes":[2,3,4],"selectedK":3,"perplexities":[10,35,1,25,45,5,30,15,50,40,20],"metadata":[{"value":"growth_condition","label":"Growth condition"}],"units":["CPM"],"suggesterEndpoint":"json/suggestions"}},{"type":"experiment-design","name":"Experiment Design","props":{"table":{"headers":[{"name":"","values":["Assay"]},{"name":"Sample Characteristics","values":["organism","age","developmental stage","sex","growth condition","organism part","metastatic site","sampling site","disease","clinical history","single cell quality"]},{"name":"Experimental Variables","values":["single cell identifier","growth condition"]}],"data":[{"properties":{"analysed":true},"values":[["SRR2431344"],["Homo sapiens","43 year","adult","male","patient-derived mouse xenograft from mixed primary and metastatic neoplasm","kidney","lung","neo

In [51]:
json.loads(sample)

{'experimentAccession': 'E-CURD-10',
 'accessKey': '',
 'species': 'homo sapiens',
 'disclaimer': '',
 'tabs': [{'type': 'results',
   'name': 'Results',
   'props': {'ks': [2, 3, 4, 20, 37, 50],
    'ksWithMarkerGenes': [2, 3, 4],
    'selectedK': 3,
    'perplexities': [10, 35, 1, 25, 45, 5, 30, 15, 50, 40, 20],
    'metadata': [{'value': 'growth_condition', 'label': 'Growth condition'}],
    'units': ['CPM'],
    'suggesterEndpoint': 'json/suggestions'}},
  {'type': 'experiment-design',
   'name': 'Experiment Design',
   'props': {'table': {'headers': [{'name': '', 'values': ['Assay']},
      {'name': 'Sample Characteristics',
       'values': ['organism',
        'age',
        'developmental stage',
        'sex',
        'growth condition',
        'organism part',
        'metastatic site',
        'sampling site',
        'disease',
        'clinical history',
        'single cell quality']},
      {'name': 'Experimental Variables',
       'values': ['single cell identifier', '

# Check if a donor has diferent values

In [6]:
samples[0]

{'experimentAccession': 'E-CURD-10',
 'accessKey': '',
 'species': 'homo sapiens',
 'disclaimer': '',
 'tabs': [{'type': 'results',
   'name': 'Results',
   'props': {'ks': [2, 3, 4, 20, 37, 50],
    'ksWithMarkerGenes': [2, 3, 4],
    'selectedK': 3,
    'perplexities': [10, 35, 1, 25, 45, 5, 30, 15, 50, 40, 20],
    'metadata': [{'value': 'growth_condition', 'label': 'Growth condition'}],
    'units': ['CPM'],
    'suggesterEndpoint': 'json/suggestions'}},
  {'type': 'experiment-design',
   'name': 'Experiment Design',
   'props': {'table': {'headers': [{'name': '', 'values': ['Assay']},
      {'name': 'Sample Characteristics',
       'values': ['organism',
        'age',
        'developmental stage',
        'sex',
        'growth condition',
        'organism part',
        'metastatic site',
        'sampling site',
        'disease',
        'clinical history',
        'single cell quality']},
      {'name': 'Experimental Variables',
       'values': ['single cell identifier', '

# Convert data to dicts

In [9]:
specimen_dict, headers, headers_order = get_headers_dict_from_sample(samples[0])

({'sample_characteristics': {'organism': set(),
   'age': set(),
   'developmental stage': set(),
   'sex': set(),
   'growth condition': set(),
   'organism part': set(),
   'metastatic site': set(),
   'sampling site': set(),
   'disease': set(),
   'clinical history': set(),
   'single cell quality': set()},
  'experimental_variables': {'single cell quality': set()}},
 {'sample_characteristics': ['organism',
   'age',
   'developmental stage',
   'sex',
   'growth condition',
   'organism part',
   'metastatic site',
   'sampling site',
   'disease',
   'clinical history',
   'single cell quality'],
  'experimental_variables': ['single cell quality', 'single cell quality']},
 {'sample_characteristics': 1, 'experimental_variables': 2})

In [16]:
samples_values = {}
for sample in samples:
    
    sample_id = sample['experimentAccession']
    
    sample_values = sample_to_dict(sample)
    
    samples_values[sample_id] = process_dict(sample_values)
    
with open('../SingleCell-Files/projects_values.json', 'w') as outfile:
    json.dump(samples_values, outfile)

In [15]:
samples_values = {}
for sample in samples:
    
    sample_id = sample['experimentAccession']
    
    sample_values = sample_to_dict(sample)
    
    if 'individual' not in sample_values['sample_characteristics']:
        samples_values[sample_id] = process_dict(sample_values)
    
with open('../SingleCell-Files/projects_values_without_individual.json', 'w') as outfile:
    json.dump(samples_values, outfile)

In [11]:
sample_to_dict(samples[0])

{'sample_characteristics': {'organism': {'Homo sapiens'},
  'age': {'43 year'},
  'developmental stage': {'adult'},
  'sex': {'male'},
  'growth condition': {'biopsy from metastatic neoplasm',
   'biopsy from primary neoplasm',
   'patient-derived mouse xenograft from mixed primary and metastatic neoplasm'},
  'organism part': {'kidney'},
  'metastatic site': {'lung', 'not applicable'},
  'sampling site': {'neoplasm'},
  'disease': {'renal cell carcinoma'},
  'clinical history': {'tumor refractory to pazopanib, everolimus and high-dose interleukin-2'},
  'single cell quality': {'OK'}},
 'experimental_variables': {'single cell quality': {'PDX_mRCC_SC_01',
   'PDX_mRCC_SC_03',
   'PDX_mRCC_SC_04',
   'PDX_mRCC_SC_05',
   'PDX_mRCC_SC_06',
   'PDX_mRCC_SC_08',
   'PDX_mRCC_SC_12',
   'PDX_mRCC_SC_14',
   'PDX_mRCC_SC_15',
   'PDX_mRCC_SC_20',
   'PDX_mRCC_SC_21',
   'PDX_mRCC_SC_34',
   'PDX_mRCC_SC_41',
   'PDX_mRCC_SC_44',
   'PDX_mRCC_SC_52',
   'PDX_mRCC_SC_56',
   'PDX_mRCC_SC_60',
 

In [14]:
'individual' in sample_to_dict(samples[0])['sample_characteristics']

False