# Read data from HCA API

In [1]:
import requests
import json
import sys

### API URL Request

The database has 800 samples, so we set `size=801` to retrieve them all in a single request.

In [2]:
# Samples endpoint of the HCA repository service; `%7B%7D` is the URL-encoded
# empty filter `{}` and `size=801` is the requested page size.
seed_url = (
    "https://service.explore.data.humancellatlas.org/repository/samples"
    "?filters=%7B%7D"
    "&size=801"
)

The request must include a user agent so that the server does not detect that we are web scraping.

In [3]:
# User-Agent string taken from the course resource.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.83 Safari/537.36"
    ),
}

We make the request and get all the data in JSON format.

In [4]:
# Send the GET request for the samples listing.
# - `timeout` keeps the cell from hanging forever if the server stalls
#   (requests waits indefinitely when no timeout is given).
# - `raise_for_status()` surfaces HTTP 4xx/5xx errors here, instead of as a
#   confusing JSON-decoding or KeyError failure in a later cell.
answer = requests.get(seed_url, headers=headers, timeout=60)
answer.raise_for_status()

If we take a look at the JSON, we can see that the data is stored under the "hits" key.

In [14]:
# print(json.dumps(answer.json(), indent=2, sort_keys=True))

{
  "hits": [
    {
      "cellLines": [],
      "cellSuspensions": [
        {
          "organ": [
            "lymph node"
          ],
          "organPart": [
            null
          ],
          "selectedCellType": [
            "CD4+ T cell"
          ],
          "totalCells": 91
        }
      ],
      "donorOrganisms": [
        {
          "biologicalSex": [
            "female"
          ],
          "disease": [
            "melanoma (disease)"
          ],
          "donorCount": 1,
          "genusSpecies": [
            "Mus musculus"
          ],
          "id": [
            "1104"
          ],
          "organismAge": [
            "6-12"
          ],
          "organismAgeRange": [
            {
              "gte": 3628800.0,
              "lte": 7257600.0
            }
          ],
          "organismAgeUnit": [
            "week"
          ]
        }
      ],
      "entryId": "46b58d7b-7143-4c0a-88f3-73d0409eb453",
      "fileTypeSummaries": [
        {
    

In [6]:
# Decode the response body and keep only the list stored under "hits";
# printing its length shows how many records were returned.
response_body = answer.json()
hits = response_body['hits']
print(len(hits))

800


We save the hits in their raw JSON format.

In [7]:
# Persist the unmodified hits so the raw API data can be reloaded later
# without repeating the request.
with open('../SingleCell-Files/hits_raw.json', mode='w') as raw_file:
    json.dump(hits, raw_file)

### Format data

For a more in-depth analysis of the data, as well as of its conversion to the ontology format, see the HCA_conversor notebook.

In [8]:
from OntologyConversor import format_HCD

In [9]:
# Pretty-print one raw hit (index 555) to inspect the structure of a record.
example_hit = hits[555]
print(json.dumps(example_hit, indent=2))

{
  "protocols": [
    {
      "libraryConstructionApproach": [
        "10X v2 sequencing",
        null
      ],
      "instrumentManufacturerModel": [
        "Illumina HiSeq 4000",
        null
      ],
      "pairedEnd": [
        false,
        null
      ],
      "workflow": [
        "optimus_v1.3.5",
        null
      ],
      "assayType": [
        null
      ]
    }
  ],
  "entryId": "a57ebdf9-1225-47a8-81cc-384ac452d83d",
  "projects": [
    {
      "projectTitle": [
        "Ischaemic sensitivity of human tissue by single cell RNA seq"
      ],
      "projectShortname": [
        "TissueStability"
      ],
      "laboratory": [
        "CGaP",
        "Cambridge Biorepository for Translational Medicine",
        "Human Cell Atlas (Mike Stubbington)",
        "Human Cell Atlas (Sarah Teichmann)",
        "Human Cell Atlas Data Coordination Platform",
        "Human Cell Atlas UK",
        "Molecular Immunity Unit, Department of Medicine",
        "Oliver Stegle Group",
   

In [10]:
# Pretty-print the hit at index 555 after passing it through format_HCD,
# to see what the converted record looks like.
converted_hit = format_HCD(hits[555])
print(json.dumps(converted_hit, indent=2))

{
  "ID": "A8-Spl-0-TL-12h-1",
  "ObjectProperties": {
    "SR.hasAnalysisProtocol": [
      "Optimus_v1.3.5",
      null
    ],
    "SR.hasCellLineType": null,
    "SR.hasDiseaseStatus": [
      "Normal"
    ],
    "SR.hasGenusSpecie": [
      "HomoSapiens"
    ],
    "SR.hasInstrument": [
      "IlluminaHiSeq4000",
      null
    ],
    "SR.hasLibrary": [
      "10Xv2Sequencing",
      null
    ],
    "SR.hasObjectOfStudy": [
      "Spleen"
    ],
    "SR.hasPreservation": [
      "HypothermicPreservationMediaAt2-8C"
    ],
    "SR.hasSampleType": "Specimens",
    "SR.hasSelectedCellType": [
      "Splenocyte"
    ]
  },
  "DataProperties": {
    "hasAgeUnit": [
      "year"
    ],
    "hasBiologicalSex": [
      "male"
    ],
    "hasAvailableDownloadsFormat": [
      "fastq.gz",
      "csv.gz",
      "npz",
      "npy",
      "bam",
      "csv"
    ],
    "hasAvailableDownloadsType": [
      "metadata",
      "results",
      "results",
      "results",
      "results",
      "resu

In [11]:
# Apply format_HCD to every hit, keeping the original order, then
# pretty-print the whole converted list.
hits_processed = [format_HCD(hit) for hit in hits]
print(json.dumps(hits_processed, indent=2))

[
  {
    "ID": "1104_LN",
    "ObjectProperties": {
      "SR.hasAnalysisProtocol": [
        null
      ],
      "SR.hasCellLineType": null,
      "SR.hasDiseaseStatus": [
        "Melanoma"
      ],
      "SR.hasGenusSpecie": [
        "MusMusculus"
      ],
      "SR.hasInstrument": [
        "IlluminaHiSeq2500",
        null
      ],
      "SR.hasLibrary": [
        "Smart-seq2",
        null
      ],
      "SR.hasObjectOfStudy": [
        "LymphNode"
      ],
      "SR.hasPreservation": [
        null
      ],
      "SR.hasSampleType": "Specimens",
      "SR.hasSelectedCellType": [
        "CD4+TCell"
      ]
    },
    "DataProperties": {
      "hasAgeUnit": [
        "week"
      ],
      "hasBiologicalSex": [
        "female"
      ],
      "hasAvailableDownloadsFormat": [
        "fastq.gz"
      ],
      "hasAvailableDownloadsType": [
        "metadata",
        "results"
      ],
      "hasLaboratory": [
        "HumanCellAtlasDataCoordinationPlatform",
        "InstituteOf

In [12]:
# Wrap the converted records in a dict under the 'hits' key.
json_processed = {'hits': hits_processed}

In [13]:
# Write the wrapped, converted records to disk as JSON.
with open('../SingleCell-Files/hits_processed.json', mode='w') as processed_file:
    json.dump(json_processed, processed_file)