# Obtain Projects and specimens from Human Cell Atlas

In [1]:
import requests
import json
import time

from IPython.display import clear_output

## Get the samples and save them

The database has 800 samples; we request `size=999` (more than the total) so that a single request returns them all.

In [26]:
# Samples index endpoint; `filters=%7B%7D` is the URL-encoded form of `{}`.
seed_url = (
    "https://service.azul.data.humancellatlas.org/index/samples"
    "?filters=%7B%7D&size=999&catalog=dcp1"
)

It is necessary to set a user agent in the request so that the server does not detect that we are web scraping.

In [27]:
# User-Agent taken from the course resource.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.83 Safari/537.36"
    )
}

We make the request and get all the data in JSON format.

In [28]:
# Fetch the samples index. Without a timeout, `requests` waits indefinitely on
# a stalled connection; `raise_for_status()` turns an HTTP error into an
# exception here instead of a confusing failure when the body is parsed later.
answer = requests.get(seed_url, headers=headers, timeout=60)
answer.raise_for_status()

If we take a look at the JSON, we can see that the data is under the "hits" key.

In [29]:
# Uncomment to inspect the full JSON response:
# print(json.dumps(answer.json(), indent=2, sort_keys=True))

In [30]:
# Parse the response body once, then pull out the list stored under "hits".
samples_response = answer.json()
hits = samples_response["hits"]
print(len(hits))

800


We save the hits in this raw json format.

In [31]:
# Write the raw hits to disk, wrapped under a top-level "samples" key.
with open('../../SingleCell-Files/raw_data/HCA_samples.json', mode='w') as samples_file:
    json.dump(obj={'samples': hits}, fp=samples_file)

## Get the projects and save them

In [36]:
# Projects index endpoint (same catalog as the samples request).
seed_url = (
    "https://service.azul.data.humancellatlas.org/index/projects"
    "?size=999&catalog=dcp1"
)

In [37]:
# User-Agent taken from the course resource.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.83 Safari/537.36"
    )
}

In [38]:
# Fetch the projects index. Without a timeout, `requests` waits indefinitely on
# a stalled connection; `raise_for_status()` turns an HTTP error into an
# exception here instead of a confusing failure when the body is parsed later.
answer = requests.get(seed_url, headers=headers, timeout=60)
answer.raise_for_status()

In [39]:
# The project entries are listed under the "hits" key of the index response.
projects_index = answer.json()
project_hits = projects_index["hits"]

# print(json.dumps(project_hits, indent=2, sort_keys=True))

In [41]:
# Download the full record of every project listed in `project_hits`.
#   projects        -- parsed JSON body of each successful request
#   accessing_error -- response objects of the requests that did not return OK
projects = []
accessing_error = []

n_projects = len(project_hits)

for n, hit in enumerate(project_hits):
    entry_id = hit["entryId"]
    seed_url = f"https://service.azul.data.humancellatlas.org/index/projects/{entry_id}?catalog=dcp1"

    # Print loop information
    print(f'Getting project with id "{entry_id}"...')
    print(f"Number of errors: {len(accessing_error)}")
    print(f"{n+1}/{n_projects}")

    # Pause between requests (presumably to avoid overloading the server).
    time.sleep(2)

    # The timeout prevents a stalled connection from blocking the loop forever.
    answer_project = requests.get(seed_url, headers=headers, timeout=60)

    # Check THIS project's response. The previous code tested `answer` (the
    # earlier index request), so failed project fetches were never detected.
    if answer_project.status_code == requests.codes.ok:
        projects.append(answer_project.json())
    else:
        # If we couldn't get the information, keep the failed response.
        accessing_error.append(answer_project)

    # Replace the progress text instead of piling up one block per project.
    clear_output(wait=True)
        

Getting project with id "9c20a245-f2c0-43ae-82c9-2232ec6b594f"...
Number of errors: 0
28/28


In [42]:
# Write the collected project records to disk under a top-level "projects" key.
with open('../../SingleCell-Files/raw_data/HCA_projects.json', mode='w') as projects_file:
    json.dump(obj={"projects": projects}, fp=projects_file)