# Read samples from HCA API

In [1]:
import requests
import json
import sys

### API URL Request

The database has 800 samples, so we pass `size=801` to get them all.

In [2]:
# Samples endpoint: empty filter set (`%7B%7D` is the URL-encoded `{}`) and a page size of 801.
seed_url = (
    "https://service.explore.data.humancellatlas.org/repository/samples"
    "?filters=%7B%7D&size=801"
)

It is necessary to set the User-Agent header in the request so that the server does not detect that we are web scraping.

In [3]:
# User-Agent string taken from the course resources.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.83 Safari/537.36"
    )
}

We make the request and get all the data in JSON format.

In [4]:
# Fetch the sample listing. The timeout (in seconds) keeps the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() surfaces HTTP
# errors here instead of as a confusing failure when the body is parsed later.
answer = requests.get(seed_url, headers=headers, timeout=60)
answer.raise_for_status()

If we take a look at the JSON, we can see that the data is stored under the "hits" key.

In [5]:
# print(json.dumps(answer.json(), indent=2, sort_keys=True))

In [6]:
# Decode the response body and keep only the list stored under "hits",
# then report how many entries were returned.
payload = answer.json()
hits = payload['hits']
print(len(hits))

800


We save the hits in their raw JSON format.

In [7]:
# Persist the untouched API hits so they can be reused without another request.
raw_hits_path = '../SingleCell-Files/hits_raw.json'
with open(raw_hits_path, 'w') as raw_file:
    json.dump(hits, raw_file)

### Format data

For a more in-depth analysis of the data, as well as its conversion to the ontology format, see the HCA_conversor notebook.

In [8]:
# OntologyCreator provides the create_hca_specimen / create_hca_project
# converters used further down in this notebook.
from OntologyCreator import OntologyCreator

# A single instance is reused for both the specimen and the project conversion.
ont_creator = OntologyCreator()

In [9]:
# print(json.dumps(hits[555], indent=2))

In [10]:
# print(json.dumps(ont_creator.create_hca_individual(hits[555]), indent=2))

In [11]:
# Convert every raw hit, in order, into its specimen record.
hits_processed = [ont_creator.create_hca_specimen(hit) for hit in hits]

# Dump the converted records as indented JSON for inspection.
print(json.dumps(hits_processed, indent=2))

[
  {
    "ID": "SPECIMEN_ID_0",
    "ObjectProperties": {
      "SPR.hasSpecie": [
        "MusMusculus"
      ],
      "SPR.hasAnalysisProtocol": [
        null
      ],
      "SPR.hasCellLineType": null,
      "SPR.hasDiseaseStatus": [
        "Melanoma"
      ],
      "SPR.hasInstrument": [
        "IlluminaHiSeq2500",
        null
      ],
      "SPR.hasLibrary": [
        "Smart-seq2",
        null
      ],
      "SPR.hasModel": null,
      "SPR.hasObjectOfStudy": [
        null,
        "LymphNode"
      ],
      "SPR.hasPreservation": [
        null
      ],
      "SPR.hasSampleType": "Specimens",
      "SPR.hasSelectedCellType": [
        "CD4+TCell"
      ]
    },
    "DataProperties": {
      "SPR.hasAgeUnit": [
        "week"
      ],
      "SPR.hasBiologicalSex": [
        "female"
      ],
      "SPR.hasMaxAge": 12,
      "SPR.hasMinAge": 6,
      "SPR.hasTotalCellCount": 91,
      "SPR.hasTotalSizeOfFilesInMB": 7825.95156288147,
      "SPR.isPairedEnd": [
        true,
 

In [12]:
# Container for the data written out at the end of the notebook;
# the converted specimens are stored first.
json_processed = {'specimens': hits_processed}

# print(json.dumps(json_processed, indent=2, sort_keys=True))

# Read projects from HCA

In [13]:
# Projects endpoint: empty filter set, 30 results, sorted by project title in ascending order.
seed_url = (
    "https://service.explore.data.humancellatlas.org/repository/projects"
    "?filters=%7B%7D&size=30&sort=projectTitle&order=asc"
)

In [14]:
# User-Agent string taken from the course resources.
headers = {
    "user-agent": " ".join([
        "Mozilla/5.0",
        "(Macintosh; Intel Mac OS X 10_15_6)",
        "AppleWebKit/537.36",
        "(KHTML, like Gecko)",
        "Chrome/85.0.4183.83",
        "Safari/537.36",
    ])
}

In [15]:
# Fetch the project listing. The timeout (in seconds) prevents the cell from
# hanging on a stalled connection, and raise_for_status() reports HTTP errors
# immediately instead of letting an error body flow into the cells below.
answer = requests.get(seed_url, headers=headers, timeout=60)
answer.raise_for_status()

In [16]:
# Show the whole project listing as indented JSON with keys sorted alphabetically.
listing_text = json.dumps(answer.json(), sort_keys=True, indent=2)
print(listing_text)

{
  "hits": [
    {
      "cellLines": [],
      "cellSuspensions": [
        {
          "organ": [
            "brain"
          ],
          "organPart": [
            "cortex"
          ],
          "selectedCellType": [
            "neuron"
          ],
          "totalCells": 1330000
        }
      ],
      "donorOrganisms": [
        {
          "biologicalSex": [
            "unknown"
          ],
          "disease": [
            "normal"
          ],
          "donorCount": 2,
          "genusSpecies": [
            "Mus musculus"
          ],
          "id": [
            "E18_20160930",
            "E18_20161004"
          ],
          "organismAge": [
            "18"
          ],
          "organismAgeRange": [
            {
              "gte": 1555200.0,
              "lte": 1555200.0
            }
          ],
          "organismAgeUnit": [
            "day"
          ]
        }
      ],
      "entryId": "74b6d569-3b11-42ef-b6b1-a0454522b4a0",
      "fileTypeSummari

In [17]:
import time

# Download the detailed record of every project in the listing, one request per project.
project_hits = []

for hit in answer.json()["hits"]:
    entry_id = hit["entryId"]
    print("Getting " + entry_id + "...")
    # Pause before each request so the server is not hit in a tight loop.
    time.sleep(2)
    # Use a dedicated name: reusing `seed_url` here would overwrite the listing
    # URL, so re-running the listing request cell would fetch a single project.
    project_url = "https://service.explore.data.humancellatlas.org/repository/projects/" + entry_id

    answer_project = requests.get(project_url, headers=headers, timeout=60)
    # Fail loudly instead of storing an error payload among the project records.
    answer_project.raise_for_status()
    project_hits.append(answer_project.json())

print(len(project_hits))

Getting 74b6d569-3b11-42ef-b6b1-a0454522b4a0...
Getting f86f1ab4-1fbb-4510-ae35-3ffd752d4dfc...
Getting 1defdada-a365-44ad-9b29-443b06bd11d6...
Getting 4a95101c-9ffc-4f30-a809-f04518a23803...
Getting 8185730f-4113-40d3-9cc3-929271784c2b...
Getting 005d611a-14d5-4fbf-846e-571a1f874f70...
Getting a29952d9-925e-40f4-8a1c-274f118f1f51...
Getting f81efc03-9f56-4354-aabb-6ce819c3d414...
Getting cc95ff89-2e68-4a08-a234-480eca21ce79...
Getting a9c022b4-c771-4468-b769-cabcf9738de3...
Getting 4d6f6c96-2a83-43d8-8fe1-0f53bffd4674...
Getting c4077b3c-5c98-4d26-a614-246d12c2e5d7...
Getting 8c3c290d-dfff-4553-8868-54ce45f4ba7f...
Getting 90bd6933-40c0-48d4-8d76-778c103bf545...
Getting 091cf39b-01bc-42e5-9437-f419a66c8a45...
Getting f83165c5-e2ea-4d15-a5cf-33f3550bffde...
Getting 027c51c6-0719-469f-a7f5-640fe57cbece...
Getting 116965f3-f094-4769-9d28-ae675c1b569c...
Getting cddab57b-6868-4be4-806f-395ed9dd635a...
Getting 2043c65a-1cf8-4828-a656-9e247d4e64f1...
Getting ae71be1d-ddd8-4feb-9bed-24c3ddb6

In [18]:
# Persist the raw project records, wrapped under a "hits" key.
projects_raw_path = '../SingleCell-Files/projects_raw.json'
with open(projects_raw_path, 'w') as raw_file:
    json.dump({"hits": project_hits}, raw_file)

In [19]:
# print(json.dumps(project_hits[0], indent=2))

In [20]:
# Convert every raw project record, in order, with the project converter.
# Note: this rebinds `hits_processed`, which previously held the specimen
# records (those are already stored in `json_processed`).
hits_processed = [ont_creator.create_hca_project(project) for project in project_hits]

# print(json.dumps(hits_processed, indent=2))

In [21]:
# Add the converted projects next to the specimens in the export container.
json_processed.update(projects=hits_processed)

In [22]:
# Write the combined specimens + projects structure to disk.
processed_path = '../SingleCell-Files/hits_processed.json'
with open(processed_path, 'w') as processed_file:
    json.dump(json_processed, processed_file)