# Read data from HCA API

In [1]:
import requests
import json
import sys

### API URL Request

The database contains 800 samples, so we request `size=801` to retrieve them all in a single request.

In [2]:
# Samples endpoint of the HCA service. `filters=%7B%7D` is the URL-encoded
# empty filter `{}`; `size=801` asks for more records than the 800 available.
seed_url = (
    "https://service.explore.data.humancellatlas.org/repository/samples"
    "?filters=%7B%7D&size=801"
)

We must set a browser-like user agent in the request so that the server does not detect that we are web scraping.

In [3]:
# User-Agent string taken from the course material.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.83 Safari/537.36"
    ),
}

We send the request and receive all the data in JSON format.

In [4]:
# Fetch the sample listing from the API.
# - `timeout` keeps the cell from hanging indefinitely if the server stops
#   responding (requests applies no timeout unless one is given).
# - `raise_for_status()` turns an HTTP error response into an immediate
#   exception instead of a confusing failure when the body is parsed later.
answer = requests.get(seed_url, headers=headers, timeout=60)
answer.raise_for_status()

If we take a look at the JSON, we can see that the data is stored under the `"hits"` key.

In [5]:
# print(json.dumps(answer.json(), indent=2, sort_keys=True))

In [6]:
# Decode the response body and keep only the records stored under "hits".
payload = answer.json()
hits = payload["hits"]

# Report how many records were returned.
print(len(hits))

800


We save the hits to disk in this raw JSON format.

In [7]:
# Write the unmodified `hits` list to disk as JSON.
with open('../SingleCell-Files/hits_raw.json', 'w') as raw_file:
    json.dump(hits, raw_file)

### Format data

For a more in-depth analysis of the data, as well as the conversion to the ontology format, see the `HCA_conversor` notebook.

In [8]:
from OntologyConversor import format_HCD

In [9]:
# Convert only the first record and pretty-print it as a preview of the
# formatted structure.
first_formatted = format_HCD(hits[0])
print(json.dumps(first_formatted, sort_keys=True, indent=2))

{
  "Classes": {
    "Accesion": null,
    "AnalysisProtocol": [
      null
    ],
    "CellType": [
      "CD4+TCell"
    ],
    "Collection": [
      "HumanCellAtlas"
    ],
    "Disease": [
      "Melanoma"
    ],
    "Experimental": null,
    "FileFormat": [
      "fastq.gz"
    ],
    "FileType": null,
    "GenusSpecies": [
      "MusMusculus"
    ],
    "InstrumentModel": [
      "IlluminaHiSeq2500",
      null
    ],
    "Kingdom": null,
    "Library": [
      "Smart-seq2",
      null
    ],
    "Organ": [
      "LymphNode"
    ],
    "OrganPart": [
      null
    ],
    "Preservation": [
      null
    ],
    "Repository": [
      "HumanCellAtlasDataCoordinationPlatform",
      "InstituteOfCellularMedicine",
      "MRCCancerUnit",
      "SarahTeichmann"
    ],
    "SampleType": [
      "Specimens"
    ]
  },
  "DataProperties": {
    "hasAccessId": null,
    "hasAgeOf": [
      "6-12"
    ],
    "hasAgeRangeOf": [
      {
        "gte": 3628800.0,
        "lte": 7257600.0
     

In [10]:
# Apply the ontology formatter to every raw record, preserving order.
hits_processed = [format_HCD(hit) for hit in hits]

# NOTE(review): this prints every processed record; consider previewing a
# small slice (e.g. hits_processed[:1]) to keep the notebook output short.
print(json.dumps(hits_processed, sort_keys=True, indent=2))

[
  {
    "Classes": {
      "Accesion": null,
      "AnalysisProtocol": [
        null
      ],
      "CellType": [
        "CD4+TCell"
      ],
      "Collection": [
        "HumanCellAtlas"
      ],
      "Disease": [
        "Melanoma"
      ],
      "Experimental": null,
      "FileFormat": [
        "fastq.gz"
      ],
      "FileType": null,
      "GenusSpecies": [
        "MusMusculus"
      ],
      "InstrumentModel": [
        "IlluminaHiSeq2500",
        null
      ],
      "Kingdom": null,
      "Library": [
        "Smart-seq2",
        null
      ],
      "Organ": [
        "LymphNode"
      ],
      "OrganPart": [
        null
      ],
      "Preservation": [
        null
      ],
      "Repository": [
        "HumanCellAtlasDataCoordinationPlatform",
        "InstituteOfCellularMedicine",
        "MRCCancerUnit",
        "SarahTeichmann"
      ],
      "SampleType": [
        "Specimens"
      ]
    },
    "DataProperties": {
      "hasAccessId": null,
      "hasAgeOf":

In [11]:
# Wrap the processed records under a single top-level "hits" key.
json_processed = {'hits': hits_processed}

# NOTE(review): this prints the whole processed dataset; consider previewing
# a small slice to keep the notebook output short.
print(json.dumps(json_processed, sort_keys=True, indent=2))

{
  "hits": [
    {
      "Classes": {
        "Accesion": null,
        "AnalysisProtocol": [
          null
        ],
        "CellType": [
          "CD4+TCell"
        ],
        "Collection": [
          "HumanCellAtlas"
        ],
        "Disease": [
          "Melanoma"
        ],
        "Experimental": null,
        "FileFormat": [
          "fastq.gz"
        ],
        "FileType": null,
        "GenusSpecies": [
          "MusMusculus"
        ],
        "InstrumentModel": [
          "IlluminaHiSeq2500",
          null
        ],
        "Kingdom": null,
        "Library": [
          "Smart-seq2",
          null
        ],
        "Organ": [
          "LymphNode"
        ],
        "OrganPart": [
          null
        ],
        "Preservation": [
          null
        ],
        "Repository": [
          "HumanCellAtlasDataCoordinationPlatform",
          "InstituteOfCellularMedicine",
          "MRCCancerUnit",
          "SarahTeichmann"
        ],
        "SampleType

In [12]:
# Write the wrapped, processed records to disk as JSON.
with open('../SingleCell-Files/hits_processed.json', 'w') as processed_file:
    json.dump(json_processed, processed_file)