# Read samples from HCA API

In [10]:
import requests
import json
import sys

### API URL Request

The database has 800 samples, so we set `size=801` to get them all.

In [11]:
# Samples endpoint of the HCA service. `%7B%7D` is the URL-encoded form of `{}`
# (presumably "no filters" -- confirm against the API docs).
seed_url = (
    "https://service.explore.data.humancellatlas.org"
    "/repository/samples"
    "?filters=%7B%7D&size=801"
)

It is necessary to set the user agent in the request so that the server does not detect that we are web scraping.

In [12]:
# User-Agent string taken from the course material.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.83 Safari/537.36"
    ),
}

We make the request and get all the data in JSON format.

In [13]:
# Fetch the sample listing with the browser-like headers defined above.
# requests applies no timeout by default, so one is set explicitly to keep the
# cell from hanging forever if the service stalls.
answer = requests.get(seed_url, headers=headers, timeout=60)
# Fail here on an HTTP error (4xx/5xx) instead of later, when the body is
# parsed and indexed.
answer.raise_for_status()

If we take a look at the JSON, we can see that the data is under the "hits" key.

In [14]:
# print(json.dumps(answer.json(), indent=2, sort_keys=True))

In [15]:
# Parse the response body as JSON and keep only the value under its "hits" key.
hits = answer.json()['hits']
# Sanity check: how many records came back.
print(len(hits))

800


We save the hits in this raw JSON format.

In [16]:
# Persist the unmodified records returned by the API as one JSON document.
with open('../SingleCell-Files/hits_raw.json', mode='w') as outfile:
    json.dump(obj=hits, fp=outfile)

### Format data

For a more in-depth analysis of the data, as well as the formatting to the ontology format, see the HCA_conversor notebook.

In [17]:
# OntologyCreator comes from a project-local module that is not shown in this
# notebook.
from OntologyCreator import OntologyCreator

# A single instance is created here and reused by the conversion cells below.
ont_creator = OntologyCreator()

In [18]:
# print(json.dumps(hits[555], indent=2))

In [19]:
# print(json.dumps(ont_creator.create_hca_individual(hits[555]), indent=2))

In [20]:
# Convert every raw hit with the creator's create_hca_individual method,
# preserving the original order.
hits_processed = [ont_creator.create_hca_individual(raw_hit) for raw_hit in hits]

# print(json.dumps(hits_processed, indent=2))

In [21]:
# Wrap the converted records under a top-level "hits" key.
json_processed = {'hits': hits_processed}

# print(json.dumps(json_processed, indent=2, sort_keys=True))

In [22]:
# Persist the converted records (wrapped under "hits") as one JSON document.
with open('../SingleCell-Files/hits_processed.json', mode='w') as outfile:
    json.dump(obj=json_processed, fp=outfile)

# Read projects from HCA

In [1]:
# Projects endpoint of the HCA service. `%7B%7D` is the URL-encoded form of `{}`.
# NOTE: this rebinds `seed_url`, replacing the samples URL defined earlier.
seed_url = (
    "https://service.explore.data.humancellatlas.org"
    "/repository/projects"
    "?filters=%7B%7D&size=30&sort=projectTitle&order=asc"
)

In [2]:
# User-Agent string taken from the course material.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.83 Safari/537.36"
    ),
}

In [5]:
# Fetch the project listing. NOTE: this rebinds `answer`, replacing the samples
# response obtained earlier in the notebook.
# requests applies no timeout by default, so one is set explicitly to keep the
# cell from hanging forever if the service stalls.
answer = requests.get(seed_url, headers=headers, timeout=60)
# Fail here on an HTTP error (4xx/5xx) instead of later, when the body is
# parsed and indexed.
answer.raise_for_status()

In [7]:
# Pretty-print the whole parsed response (2-space indent, keys sorted) to
# inspect its structure; the output can be very long.
print(json.dumps(answer.json(), indent=2, sort_keys=True))

{
  "hits": [
    {
      "cellLines": [],
      "cellSuspensions": [
        {
          "organ": [
            "brain"
          ],
          "organPart": [
            "cortex"
          ],
          "selectedCellType": [
            "neuron"
          ],
          "totalCells": 1330000
        }
      ],
      "donorOrganisms": [
        {
          "biologicalSex": [
            "unknown"
          ],
          "disease": [
            "normal"
          ],
          "donorCount": 2,
          "genusSpecies": [
            "Mus musculus"
          ],
          "id": [
            "E18_20160930",
            "E18_20161004"
          ],
          "organismAge": [
            "18"
          ],
          "organismAgeRange": [
            {
              "gte": 1555200.0,
              "lte": 1555200.0
            }
          ],
          "organismAgeUnit": [
            "day"
          ]
        }
      ],
      "entryId": "74b6d569-3b11-42ef-b6b1-a0454522b4a0",
      "fileTypeSummari

In [8]:
# Parse the response body as JSON and keep only the value under its "hits" key.
# NOTE: this rebinds `hits`, replacing the sample records loaded earlier.
hits = answer.json()['hits']
# Sanity check: how many records came back.
print(len(hits))

28


In [9]:
# Persist the unmodified project records returned by the API as one JSON document.
with open('../SingleCell-Files/projects_raw.json', mode='w') as outfile:
    json.dump(obj=hits, fp=outfile)