In [120]:
%%file a.py
# Connection to MGnify API
from jsonapi_client import Session as APISession
from jsonapi_client import Modifier
import requests

# Dataframes and display
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Data transformation
from functools import reduce
from collections import defaultdict

Writing a.py


In [121]:
%%file b.py
def get_all_biome_names():
    """Download every page of the MGnify ``biomes`` listing.

    The first (unpaged) response is used both to read the page count and as
    page 1 of the result, so each page is requested exactly once.

    Returns:
        list: one decoded JSON document per page, in page order.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    endpoint_name = 'biomes'
    base_url = f"https://www.ebi.ac.uk/metagenomics/api/latest/{endpoint_name}"

    r = requests.get(base_url)
    r.raise_for_status()
    result = r.json()
    num_pages = result['meta']['pagination']['pages']

    xx = []
    for page in range(1, num_pages + 1):
        print(f'working on page: {page}')
        if page == 1:
            # Already downloaded above while discovering the page count.
            xx.append(result)
            continue
        r = requests.get(f"{base_url}?page={page}")
        # Fail loudly here rather than with a confusing JSON/Key error later.
        r.raise_for_status()
        xx.append(r.json())

    print(f'got {len(xx)}')
    return xx

# Fetch the complete biome listing once; the raw page documents are kept in `z`.
z = get_all_biome_names()

Writing b.py


In [122]:
%%file c.py
# Flatten the paged biome listing in `z` into (biome id, sample count) tuples.
all_biomes = []
for result in z:
    data = result['data']
    for record in data:
        print(record['id'])
        print(record['attributes']['samples-count'])
        all_biomes.append((record['id'], record['attributes']['samples-count']))

# Keep an independent snapshot: a plain assignment would only alias the list,
# so any later in-place change to all_biomes would also alter the "original".
orig_all_biomes = list(all_biomes)

Writing c.py


In [88]:
# Work on a sorted copy: sorting in place would also reorder orig_all_biomes,
# because both names would refer to the same list object.
all_biomes = sorted(orig_all_biomes, key=lambda x: x[1])
print("all:", len(all_biomes))
# Progressively drop biomes with few samples, reporting how many survive each cut.
all_biomes = [ (x, y) for (x, y) in all_biomes if y > 0 ]
print("0-filtered:", len(all_biomes))
all_biomes = [ (x, y) for (x, y) in all_biomes if y >= 10 ]
print("10-filtered:", len(all_biomes))
all_biomes = [ (x, y) for (x, y) in all_biomes if y >= 50 ]
print("50-filtered:", len(all_biomes))

# Keep only ids with exactly three ':' separators (e.g. root:A:B:C).
all_biomes = [ (x, y) for (x, y) in all_biomes if x.count(':') == 3 ]
print("hierarchy filtered:", len(all_biomes))

all_biomes[:50]

all: 475
0-filtered: 289
10-filtered: 242
50-filtered: 196
hierarchy filtered: 44


[('root:Host-associated:Fish:Skin', 51),
 ('root:Environmental:Terrestrial:Agricultural field', 52),
 ('root:Engineered:Bioremediation:Persistent organic pollutants (POP)', 53),
 ('root:Engineered:Modeled:Simulated communities (DNA mixture)', 59),
 ('root:Engineered:Bioremediation:Hydrocarbon', 65),
 ('root:Engineered:Modeled:Simulated communities (sequence read mixture)', 67),
 ('root:Host-associated:Birds:Respiratory system', 67),
 ('root:Environmental:Aquatic:Estuary', 70),
 ('root:Environmental:Aquatic:Non-marine Saline and Alkaline', 71),
 ('root:Host-associated:Mammals:Milk', 72),
 ('root:Engineered:Lab enrichment:Defined media', 80),
 ('root:Environmental:Terrestrial:Geologic', 85),
 ('root:Host-associated:Human:Respiratory system', 105),
 ('root:Engineered:Biogas plant:Wet fermentation', 111),
 ('root:Engineered:Solid waste:Composting', 159),
 ('root:Host-associated:Algae:Brown Algae', 179),
 ('root:Host-associated:Animal:Digestive system', 233),
 ('root:Engineered:Food product

In [123]:
%%file d.py
def get_samples_for_biome(biome):
    """Download every page of the sample listing for one biome.

    The first (unpaged) response is used both to read the page count and as
    page 1 of the result, so each page is requested exactly once.

    Args:
        biome: biome id as it appears in the API path,
            e.g. ``'root:Host-associated:Fish:Skin'``.

    Returns:
        list: one decoded JSON document per page, in page order.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    endpoint_name = 'biomes'
    base_url = f"https://www.ebi.ac.uk/metagenomics/api/latest/{endpoint_name}/{biome}/samples"

    r = requests.get(base_url)
    r.raise_for_status()
    result = r.json()
    num_pages = result['meta']['pagination']['pages']

    xx = []
    for page in range(1, num_pages + 1):
        print(f'working on page: {page} of {num_pages}')
        if page == 1:
            # Already downloaded above while discovering the page count.
            xx.append(result)
            continue
        r = requests.get(f"{base_url}?page={page}")
        # Fail loudly here rather than with a confusing JSON/Key error later.
        r.raise_for_status()
        xx.append(r.json())

    print(f'got {len(xx)} pages')
    return xx

#biome_name = all_biomes[0][0]
#samples_from_biomes = get_samples_for_biome(biome_name)

Writing d.py


In [124]:
%%file e.py
def get_runs_from_samples(samples_data):
    """Collect the related-runs link of every sample record.

    Args:
        samples_data: iterable of page documents; each must have a ``'data'``
            list whose entries carry ``relationships.runs.links.related``.

    Returns:
        list: the ``related`` link of each sample, in page then record order.
    """
    return [
        sample['relationships']['runs']['links']['related']
        for page_doc in samples_data
        for sample in page_doc['data']
    ]

#runs_from_samples = get_runs_from_samples(samples_from_biomes)

Writing e.py


In [65]:
#runs_from_samples

['https://www.ebi.ac.uk/metagenomics/api/v1/samples/SRS9995124/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS5886824/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS5886826/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS5886825/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS5886828/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS5886827/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4409919/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4413232/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4413234/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4413248/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4413260/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4413262/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4413265/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4413276/runs',
 'https://www.ebi.ac

In [125]:
%%file g.py
def get_run_info_for_runs(run_urls):
    """Download the run listing behind each URL, following pagination.

    Args:
        run_urls: sized sequence of run-listing URLs (its length is used for
            the progress message).

    Returns:
        list: one decoded JSON document per downloaded page, across all URLs.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    zz = []
    for n, run_url in enumerate(run_urls):
        print(f"{n + 1} of {len(run_urls)}")
        r = requests.get(run_url)
        r.raise_for_status()
        result = r.json()
        num_pages = result['meta']['pagination']['pages']
        if num_pages == 1:
            zz.append(result)
        else:
            for page in range(1, num_pages + 1):
                print(f'working on page: {page} of {num_pages} for run URL {run_url}')
                if page == 1:
                    # The unpaged response above already is page 1.
                    zz.append(result)
                    continue
                r = requests.get(run_url + f"?page={page}")
                r.raise_for_status()
                # Call .json(): storing the bound method itself makes later
                # ri['data'] lookups fail with TypeError.
                zz.append(r.json())

    return zz

#run_info = get_run_info_for_runs(runs_from_samples)

Writing g.py


In [67]:
# Number of entries in run_info (presumably the list returned by
# get_run_info_for_runs; that assignment is commented out in this notebook).
len(run_info)

89

In [72]:
# Peek at the attributes of the first record in the first run_info entry.
run_info[0]['data'][0]['attributes']

{'experiment-type': 'metagenomic',
 'is-private': False,
 'accession': 'SRR15691272',
 'secondary-accession': 'SRR15691272',
 'ena-study-accession': None,
 'instrument-platform': 'ILLUMINA',
 'instrument-model': 'Illumina NovaSeq 6000'}

In [75]:
# One output line per run record: accession, experiment type, platform, model.
for ri in run_info:
    for item in ri['data']:
        attr = item.get('attributes')
        print(*[
            attr[field]
            for field in ('accession', 'experiment-type', 'instrument-platform', 'instrument-model')
        ])

SRR15691272 metagenomic ILLUMINA Illumina NovaSeq 6000
ERR5416478 unknown ILLUMINA Illumina HiSeq 3000
ERR5416927 unknown ILLUMINA Illumina HiSeq 3000
ERR5416479 unknown ILLUMINA Illumina HiSeq 3000
ERR5416480 unknown ILLUMINA Illumina HiSeq 3000
ERR5416520 unknown ILLUMINA Illumina HiSeq 3000
ERR4007995 amplicon ILLUMINA Illumina MiSeq
ERR4008002 amplicon ILLUMINA Illumina MiSeq
ERR4008004 amplicon ILLUMINA Illumina MiSeq
ERR4008006 amplicon ILLUMINA Illumina MiSeq
ERR4008008 amplicon ILLUMINA Illumina MiSeq
ERR4008009 amplicon ILLUMINA Illumina MiSeq
ERR4008051 amplicon ILLUMINA Illumina MiSeq
ERR4008052 amplicon ILLUMINA Illumina MiSeq
ERR4008054 amplicon ILLUMINA Illumina MiSeq
ERR4008058 amplicon ILLUMINA Illumina MiSeq
ERR4008061 amplicon ILLUMINA Illumina MiSeq
ERR4008062 amplicon ILLUMINA Illumina MiSeq
SRR3341855 metagenomic ILLUMINA Illumina Genome Analyzer IIx
SRR3341747 metagenomic ILLUMINA Illumina Genome Analyzer IIx
SRR3341746 metagenomic ILLUMINA Illumina Genome Analyze

In [126]:
%%file h.py
# Fetch the sample pages for each selected biome, keyed by biome id.
samples_by_biome = {}
for n, (biome_name, count) in enumerate(all_biomes):
    if n <= 10:
        print(f"getting: {biome_name} ({n + 1} of {len(all_biomes)})")
        samples_by_biome[biome_name] = get_samples_for_biome(biome_name)
        print(f"got: {len(samples_by_biome[biome_name])}")
        continue
    # Testing cap: stop once the entries at indices 0..10 have been fetched.
    print(f'stopping at {n} for testing purposes')
    break

Writing h.py


In [82]:
# TODO: dump to file!
# - [x] get only level 2 or level 3 biomes, since presumably the hierarchy is inclusive :sweat:

In [112]:
# Evict this biome's cached run URLs; raises KeyError if the key is not present.
del runs_by_biome['root:Engineered:Modeled:Simulated communities (sequence read mixture)']

In [113]:
# Reuse the cache built by earlier runs of this cell, but create it on a fresh
# kernel instead of failing with NameError.
try:
    runs_by_biome
except NameError:
    from collections import defaultdict
    runs_by_biome = defaultdict(list)

# Extract the run-listing URLs for every biome that is not cached yet.
for biome_name, samples_vv in samples_by_biome.items():
    if biome_name in runs_by_biome:
        print(f'skipping {biome_name} - already present.')
        continue
    zz = get_runs_from_samples(samples_vv)
    runs_by_biome[biome_name].extend(zz)

skipping root:Host-associated:Fish:Skin - already present.
skipping root:Environmental:Terrestrial:Agricultural field - already present.
skipping root:Engineered:Bioremediation:Persistent organic pollutants (POP) - already present.
skipping root:Engineered:Modeled:Simulated communities (DNA mixture) - already present.
skipping root:Engineered:Bioremediation:Hydrocarbon - already present.
skipping root:Host-associated:Birds:Respiratory system - already present.
skipping root:Environmental:Aquatic:Estuary - already present.
skipping root:Environmental:Aquatic:Non-marine Saline and Alkaline - already present.
skipping root:Host-associated:Mammals:Milk - already present.
skipping root:Engineered:Lab enrichment:Defined media - already present.


In [95]:
# Preview the first five (biome id, run URL list) entries.
list(runs_by_biome.items())[:5]

[('root:Host-associated:Fish:Skin',
  ['https://www.ebi.ac.uk/metagenomics/api/v1/samples/SRS1347802/runs',
   'https://www.ebi.ac.uk/metagenomics/api/v1/samples/SRS1347803/runs',
   'https://www.ebi.ac.uk/metagenomics/api/v1/samples/SRS1347808/runs',
   'https://www.ebi.ac.uk/metagenomics/api/v1/samples/SRS1347809/runs',
   'https://www.ebi.ac.uk/metagenomics/api/v1/samples/SRS1347810/runs',
   'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS2676801/runs',
   'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS2676802/runs',
   'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS2676803/runs',
   'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS2676804/runs',
   'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS2676805/runs',
   'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS2676806/runs',
   'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS2676807/runs',
   'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS2676808/runs',
   'https://www.ebi.ac.uk/me

In [117]:
# Evict this biome's cached run info; raises KeyError if the key is not present.
del runinfo_by_biome['root:Engineered:Modeled:Simulated communities (sequence read mixture)']

In [127]:
%%file m.py
# Reuse results cached by earlier runs, but create the cache when it does not
# exist yet instead of failing with NameError.
try:
    runinfo_by_biome
except NameError:
    from collections import defaultdict
    runinfo_by_biome = defaultdict(list)

# Download run info for every biome that is not cached yet.
for biome_name, runlist in runs_by_biome.items():
    if biome_name in runinfo_by_biome:
        print(f"skipping {biome_name} - already present.")
        continue
    print(f"working on {biome_name} - {len(runlist)} runs")
    zz = get_run_info_for_runs(runlist)
    runinfo_by_biome[biome_name] = zz

Writing m.py


In [119]:

# One output line per run record, prefixed with the biome it belongs to.
for biome_name, run_info in runinfo_by_biome.items():
    for n, ri in enumerate(run_info):
        try:
            for item in ri['data']:
                attr = item.get('attributes')
                print(biome_name, *[
                    attr[field]
                    for field in ('accession', 'experiment-type', 'instrument-platform', 'instrument-model')
                ])
        except TypeError:
            # Report which entry is malformed before letting the error propagate.
            print(f'ERROR: biome {biome_name}, entry {n}')
            raise


root:Host-associated:Fish:Skin SRR3236773 amplicon ILLUMINA Illumina MiSeq
root:Host-associated:Fish:Skin SRR3236772 amplicon ILLUMINA Illumina MiSeq
root:Host-associated:Fish:Skin SRR3236767 amplicon ILLUMINA Illumina MiSeq
root:Host-associated:Fish:Skin SRR3236766 amplicon ILLUMINA Illumina MiSeq
root:Host-associated:Fish:Skin SRR3236765 amplicon ILLUMINA Illumina MiSeq
root:Host-associated:Fish:Skin ERR2754619 amplicon PACBIO_SMRT Sequel
root:Host-associated:Fish:Skin ERR2754620 amplicon PACBIO_SMRT Sequel
root:Host-associated:Fish:Skin ERR2754621 amplicon PACBIO_SMRT Sequel
root:Host-associated:Fish:Skin ERR2754622 amplicon PACBIO_SMRT Sequel
root:Host-associated:Fish:Skin ERR2754623 amplicon PACBIO_SMRT Sequel
root:Host-associated:Fish:Skin ERR2754624 amplicon PACBIO_SMRT Sequel
root:Host-associated:Fish:Skin ERR2754625 amplicon PACBIO_SMRT Sequel
root:Host-associated:Fish:Skin ERR2754626 amplicon PACBIO_SMRT Sequel
root:Host-associated:Fish:Skin ERR2754627 amplicon PACBIO_SMRT Se

In [None]:
# next: pickle it!