# Obtain Projects and specimens from Human Cell Atlas

In [2]:
import requests
import json
import time

from IPython.display import clear_output

## Get the samples and save them

The data base has 800 samples, we indicate `size=801` to get them all.

In [3]:
seed_url = "https://service.explore.data.humancellatlas.org/repository/samples?filters=%7B%7D&size=801"

It is necessary to indicate the user agent in the request so the server do not detect we are web scraping.

In [4]:
# User-Agent obtenido del recurso del curso.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

We do the request and get all the data in json format

In [5]:
answer = requests.get(seed_url, headers=headers)

If we take a look into the json, we can see that the data is in the object "hits".

In [6]:
# print(json.dumps(answer.json(), indent=2, sort_keys=True))

In [7]:
hits = answer.json()['hits']
print(len(hits))

800


We save the hits in this raw json format.

In [8]:
with open('../SingleCell-Files/raw_data/HCA_samples.json', 'w') as outfile:
    json.dump({'samples': hits}, outfile)

## Get the projects and save them

In [9]:
seed_url = "https://service.explore.data.humancellatlas.org/repository/projects?filters=%7B%7D&size=30&sort=projectTitle&order=asc"

In [10]:
# User-Agent obtenido del recurso del curso.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

In [11]:
answer = requests.get(seed_url, headers=headers)

In [14]:
project_hits = answer.json()["hits"]

print(json.dumps(projects_hits, indent=2, sort_keys=True))

[
  {
    "cellLines": [],
    "cellSuspensions": [
      {
        "organ": [
          "brain"
        ],
        "organPart": [
          "cortex"
        ],
        "selectedCellType": [
          "neuron"
        ],
        "totalCells": 1330000
      }
    ],
    "donorOrganisms": [
      {
        "biologicalSex": [
          "unknown"
        ],
        "disease": [
          "normal"
        ],
        "donorCount": 2,
        "genusSpecies": [
          "Mus musculus"
        ],
        "id": [
          "E18_20160930",
          "E18_20161004"
        ],
        "organismAge": [
          "18"
        ],
        "organismAgeRange": [
          {
            "gte": 1555200.0,
            "lte": 1555200.0
          }
        ],
        "organismAgeUnit": [
          "day"
        ]
      }
    ],
    "entryId": "74b6d569-3b11-42ef-b6b1-a0454522b4a0",
    "fileTypeSummaries": [
      {
        "count": 16377,
        "fileType": "fastq",
        "totalSize": 3206177550920
     

In [15]:
projects = []
accessing_error = []

n_projects = len(project_hits)

for n, hit in enumerate(project_hits):
    entry_id = hit["entryId"]
    seed_url = "https://service.explore.data.humancellatlas.org/repository/projects/" + entry_id

    # Print loop information
    print("Getting project with id \"" + entry_id + "\"...")
    print("Number of errors: " + str(len(accessing_error)))
    print(f"{n+1}/{n_projects}")
    
    time.sleep(2)
    
    answer_project = requests.get(seed_url, headers=headers)
    
    # If couldn't get the information save the error
    if answer.status_code != requests.codes.ok:
        accessing_error.append(answer)
        clear_output(wait=True)
        continue
    
    projects.append(answer_project.json())
    
    clear_output(wait=True)
        

Getting project with id "9c20a245-f2c0-43ae-82c9-2232ec6b594f"...
Number of errors: 0
28/28


In [16]:
with open('../SingleCell-Files/raw_data/HCA_projects.json', 'w') as outfile:
    json.dump({"projects": projects}, outfile)

# Parse HCA

In [1]:
import json

with open('../SingleCell-Files/hits_raw.json') as f:
    hits = json.load(f)

In [2]:
def hca2ont(text):
    if text is None:
        return None
    
    if type(text) is list:
        return list(map(hca2ont, text))
    
    aux = list(text.title())
    
    for i in range(len(text)):
        if text[i].isupper():
            aux[i] = text[i]
    
    return ''.join(aux).replace(' ', '')

# Full json individual

In [50]:
print(json.dumps(hits[559], indent=2, sort_keys=True))

{
  "cellLines": [
    {
      "cellLineType": [
        "primary"
      ],
      "id": [
        "Cell_line_1"
      ],
      "modelOrgan": [
        "immune system"
      ]
    }
  ],
  "cellSuspensions": [
    {
      "organ": [
        "embryo",
        "immune system"
      ],
      "organPart": [
        "skin epidermis",
        null
      ],
      "selectedCellType": [
        "dendritic cell"
      ],
      "totalCells": 0
    }
  ],
  "donorOrganisms": [
    {
      "biologicalSex": [
        "unknown"
      ],
      "disease": [
        "normal"
      ],
      "donorCount": 1,
      "genusSpecies": [
        "Homo sapiens"
      ],
      "id": [
        "Donor1"
      ],
      "organismAge": [
        null
      ],
      "organismAgeRange": null,
      "organismAgeUnit": [
        null
      ]
    }
  ],
  "entryId": "6a28bd71-9910-47e3-b01c-f9e6fd720341",
  "fileTypeSummaries": [
    {
      "count": 12,
      "fileType": "fastq.gz",
      "totalSize": 12689741805
    },
  

In [None]:
individual = {}

# Cell Lines

Nada que cambiar

In [3]:
for i in range(len(hits)):
    if hits[i]['cellLines']:
        print("#"*30)
        print("\nHIT NUMBER " + str(i))
        print(json.dumps(hits[i]['cellLines'], indent=2, sort_keys=True))
        
        if len(hits[i]['cellLines']) > 1:
            print("@"*60)
            print("More than 1")
            print("@"*60)
           
        
        # break

##############################

HIT NUMBER 559
[
  {
    "cellLineType": [
      "primary"
    ],
    "id": [
      "Cell_line_1"
    ],
    "modelOrgan": [
      "immune system"
    ]
  }
]
##############################

HIT NUMBER 560
[
  {
    "cellLineType": [
      "primary"
    ],
    "id": [
      "Cell_line_2"
    ],
    "modelOrgan": [
      "immune system"
    ]
  }
]
##############################

HIT NUMBER 590
[
  {
    "cellLineType": [
      "stem cell"
    ],
    "id": [
      "HS_BM_1_cell_line"
    ],
    "modelOrgan": [
      "hematopoietic system"
    ]
  }
]
##############################

HIT NUMBER 591
[
  {
    "cellLineType": [
      "stem cell"
    ],
    "id": [
      "HS_BM_2_cell_line"
    ],
    "modelOrgan": [
      "hematopoietic system"
    ]
  }
]
##############################

HIT NUMBER 592
[
  {
    "cellLineType": [
      "stem cell"
    ],
    "id": [
      "HS_BM_3_cell_line"
    ],
    "modelOrgan": [
      "hematopoietic system"
    ]
  }
]


In [None]:
cell_line_type = hits[559]['cellLines'][0]['cellLineType']
model_organ = hits[559]['cellLines'][0]['modelOrgan']
                         
individual['cell_line_type'] = hca2ont(cell_line_type)
individual['model_organ'] = hca2ont(model_organ)
                         
print(json.dumps(individual ,indent=2, sort_keys=True))

# Cell Suspensions

- Organ: 1P Mayus y sin espacios.
- Organ part: 1P Mayus y sin espacios.
- Selected Cell Types: 1P Mayus y sin espacios.
- Total Cells: OK

In [26]:
print(json.dumps(hits[2]['cellSuspensions'], indent=2, sort_keys=True))

[
  {
    "organ": [
      "lymph node"
    ],
    "organPart": [
      null
    ],
    "selectedCellType": [
      "CD8-positive, alpha-beta T cell"
    ],
    "totalCells": 91
  }
]


In [None]:
organ = hits[2]['cellSuspensions'][0]['organ']
organ_part = hits[2]['cellSuspensions'][0]['organPart']
selected_cell_type = hits[2]['cellSuspensions'][0]['selectedCellType']
total_cells = hits[2]['cellSuspensions'][0]['totalCells']
                         
individual['organ'] = hca2ont(organ)
individual['organ_part'] = hca2ont(organ_part)
individual['selected_cell_type'] = hca2ont(selected_cell_type)
individual['total_cells'] = total_cells
                         
print(json.dumps(individual ,indent=2, sort_keys=True))

# Donor Organism

- Biological Sex: 1P Mayus
- Disease: 1P Mayus sin espacios.
- Donor Count: OK
- Genus Species: 1P Mayus sin espacios.
- Organism Age: OK

In [25]:
print(json.dumps(hits[0]['donorOrganisms'], indent=2, sort_keys=True))

[
  {
    "biologicalSex": [
      "female"
    ],
    "disease": [
      "melanoma (disease)"
    ],
    "donorCount": 1,
    "genusSpecies": [
      "Mus musculus"
    ],
    "id": [
      "1104"
    ],
    "organismAge": [
      "6-12"
    ],
    "organismAgeRange": [
      {
        "gte": 3628800.0,
        "lte": 7257600.0
      }
    ],
    "organismAgeUnit": [
      "week"
    ]
  }
]


In [None]:
biological_sex = hits[0]['donorOrganisms'][0]['biologicalSex']
disease = hits[0]['donorOrganisms'][0]['disease']
donor_count = hits[0]['donorOrganisms'][0]['donorCount']
genus_species = hits[0]['donorOrganisms'][0]['genusSpecies']
organism_age = hits[0]['donorOrganisms'][0]['organismAge']
organism_age_range = hits[0]['donorOrganisms'][0]['organismAgeRange']
organism_age_unit = hits[0]['donorOrganisms'][0]['organismAgeUnit']

individual['biological_sex'] = hca2ont(biological_sex)
individual['disease'] = hca2ont(disease)
individual['donor_count'] = donor_count
individual['genus_species'] = hca2ont(genus_species)
individual['organism_age'] = organism_age
individual['organism_age_range'] = organism_age_range
individual['organism_age_unit'] = hca2ont(organism_age_unit)
                         
print(json.dumps(individual ,indent=2, sort_keys=True))

# Entry Id

Nada de aqui se usa.

In [None]:
print(json.dumps(hits[0]['entryId'], indent=2, sort_keys=True))

# File Type Summaries

- File Type: OK

In [None]:
print(json.dumps(hits[559]['fileTypeSummaries'], indent=2, sort_keys=True))

In [None]:
file_type = []
count = []
total_size = []

for i in range(len(hits[559]['fileTypeSummaries'])):
    file_type.append(hits[559]['fileTypeSummaries'][i]['fileType'])
    count.append(hits[559]['fileTypeSummaries'][i]['count'])
    total_size.append(hits[559]['fileTypeSummaries'][i]['totalSize'])
    
individual['file_type'] = file_type
individual['count'] = count
individual['total_size'] = total_size

print(json.dumps(individual ,indent=2, sort_keys=True))

# Organoids

Nada de aqui

In [None]:
for i in range(len(hits)):
    if hits[i]['organoids']:
        print("#"*30)
        print("\nHIT NUMBER " + str(i))
        print(json.dumps(hits[i]['organoids'], indent=2, sort_keys=True))
        
        if len(hits[i]['organoids']) > 1:
            print("@"*60)
            print("More than 1")
            print("@"*60)


In [None]:
model_organ = hits[626]['organoids'][0]['modelOrgan']
model_organ_part = hits[626]['organoids'][0]['modelOrganPart']

individual['model_organ'] = hca2ont(model_organ)
individual['model_organ_part'] = hca2ont(model_organ_part)
                         
print(json.dumps(individual ,indent=2, sort_keys=True))

# Projects

- Laboratory: 1P Mayus sin espacios ("Human Cell Atlas" en vez de "Human Cell Atlas Data Coordination Platform")

In [3]:
print(json.dumps(hits[0]['projects'], indent=2, sort_keys=True))

[
  {
    "laboratory": [
      "Human Cell Atlas Data Coordination Platform",
      "Institute of Cellular Medicine",
      "MRC Cancer Unit",
      "Sarah Teichmann"
    ],
    "projectShortname": [
      "Mouse Melanoma"
    ],
    "projectTitle": [
      "Melanoma infiltration of stromal and immune cells"
    ]
  }
]


In [None]:
laboratory = hits[0]['projects'][0]['laboratory']
project_shortname = hits[0]['projects'][0]['projectShortname']
project_title = hits[0]['projects'][0]['projectTitle']

individual['laboratory'] = hca2ont(laboratory)
individual['project_shortname'] = project_shortname
individual['project_title'] = project_title

print(json.dumps(individual ,indent=2, sort_keys=True))

# Protocols

- Instrument Manufactured Model: Sin espcios.
- Library Construct approach: 1P en mayus y sin '-'.
- Paired End: OK

In [None]:
for i in range(len(hits)):
    if hits[i]['protocols'] and hits[i]['protocols'][0]['workflow'][0] is not None:
        print("#"*30)
        print("\nHIT NUMBER " + str(i))
        print(json.dumps(hits[i]['protocols'][0]['workflow'], indent=2, sort_keys=True))
        
        if len(hits[i]['protocols']) > 1:
            print("@"*60)
            print("More than 1")
            print("@"*60)

# print(json.dumps(hits[0]['protocols'], indent=2, sort_keys=True))

In [None]:
instrument_manufacturer_model = hits[0]['protocols'][0]['instrumentManufacturerModel']
library_construction_approach = hits[0]['protocols'][0]['libraryConstructionApproach']
paired_end = hits[0]['protocols'][0]['pairedEnd']

individual['instrument_manufacturer_model'] = hca2ont(instrument_manufacturer_model)
individual['library_construction_approach'] = hca2ont(library_construction_approach)
individual['paired_end'] = paired_end

print(json.dumps(individual ,indent=2, sort_keys=True))

# Samples

Nada

In [49]:
print(json.dumps(hits[559]['samples'], indent=2, sort_keys=True))

for i in range(len(hits)):
    if hits[i]['samples'] and hits[i]['samples'][0]['preservationMethod'] is not None:
        print("#"*30)
        print("\nHIT NUMBER " + str(i))
        print(json.dumps(hits[i]['samples'][0]['preservationMethod'], indent=2, sort_keys=True))
        
        if len(hits[i]['protocols']) > 1:
            print("@"*60)
            print("More than 1")
            print("@"*60)



[
  {
    "cellLineType": "primary",
    "effectiveOrgan": "immune system",
    "id": "Cell_line_1",
    "modelOrgan": "immune system",
    "sampleEntityType": "cellLines"
  }
]
##############################

HIT NUMBER 78
"cryopreservation, other"
##############################

HIT NUMBER 79
"cryopreservation, other"
##############################

HIT NUMBER 96
"cryopreservation, other"
##############################

HIT NUMBER 97
"cryopreservation, other"
##############################

HIT NUMBER 98
"cryopreservation, other"
##############################

HIT NUMBER 99
"cryopreservation, other"
##############################

HIT NUMBER 100
"cryopreservation, other"
##############################

HIT NUMBER 101
"cryopreservation, other"
##############################

HIT NUMBER 102
"cryopreservation, other"
##############################

HIT NUMBER 103
"cryopreservation, other"
##############################

HIT NUMBER 104
"cryopreservation, other"
#########################

KeyError: 'preservationMethod'

# Specimens

- id: OK

In [27]:
print(json.dumps(hits[659]['specimens'], indent=2, sort_keys=True))

[
  {
    "disease": [
      null
    ],
    "id": [
      "PP019"
    ],
    "organ": [
      "blood"
    ],
    "organPart": [
      "venous blood"
    ],
    "preservationMethod": [
      "fresh"
    ],
    "source": [
      "specimen_from_organism"
    ]
  }
]


In [None]:
individual_id = hits[0]['specimens'][0]['id'][0]
preservation_method = hits[659]['specimens'][0]['preservationMethod'][0]

individual['id'] = individual_id
individual['preservation_method'] = hca2ont(preservation_method)

print(json.dumps(individual ,indent=2, sort_keys=True))