In [1]:
# Connection to MGnify API
from jsonapi_client import Session as APISession
from jsonapi_client import Modifier
import requests

# Dataframes and display
import pandas as pd

# Display settings: passing None removes pandas' limit on the number of columns
# shown and on the rendered width of each column's contents.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Data transformation
from functools import reduce

In [20]:
def get_all_biome_names(timeout=60):
    """Download every page of the MGnify ``biomes`` listing.

    The response for page 1 is reused both to read the total page count
    (``meta.pagination.pages``) and as the first element of the result, so
    each page is requested exactly once.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        list: One decoded JSON response (a dict) per page, in page order.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If a request exceeds ``timeout`` seconds.
    """
    endpoint_name = 'biomes'
    url = f"https://www.ebi.ac.uk/metagenomics/api/latest/{endpoint_name}"

    def fetch_page(page):
        # Fail loudly on HTTP errors instead of surfacing them later as a
        # KeyError on a missing 'meta' key.
        print(f'working on page: {page}')
        response = requests.get(url, params={'page': page}, timeout=timeout)
        response.raise_for_status()
        return response.json()

    first_page = fetch_page(1)
    num_pages = first_page['meta']['pagination']['pages']

    pages = [first_page]
    for page in range(2, num_pages + 1):
        pages.append(fetch_page(page))

    print(f'got {len(pages)}')
    return pages

# `z` is the list of per-page JSON responses returned by the biome listing download.
z = get_all_biome_names()

working on page: 1
working on page: 2
working on page: 3
working on page: 4
working on page: 5
working on page: 6
working on page: 7
working on page: 8
working on page: 9
working on page: 10
working on page: 11
working on page: 12
working on page: 13
working on page: 14
working on page: 15
working on page: 16
working on page: 17
working on page: 18
working on page: 19
got 19


In [32]:
# Flatten the paginated biome responses into (biome id, samples-count) tuples,
# echoing each id and count as it is collected.
all_biomes = []
for page in z:
    for biome in page['data']:
        biome_id = biome['id']
        print(biome_id)
        sample_count = biome['attributes']['samples-count']
        print(sample_count)
        all_biomes.append((biome_id, sample_count))

# Note: this binds a second name to the same list object; it is not a copy.
orig_all_biomes = all_biomes

root:Engineered
284
root:Engineered:Biogas plant
126
root:Engineered:Biogas plant:Wet fermentation
111
root:Engineered:Bioreactor
1003
root:Engineered:Bioreactor:Continuous culture
648
root:Engineered:Bioreactor:Continuous culture:Marine intertidal flat sediment inoculum
23
root:Engineered:Bioreactor:Continuous culture:Marine intertidal flat sediment inoculum:Wadden Sea-Germany
5
root:Engineered:Bioreactor:Continuous culture:Marine sediment inoculum
35
root:Engineered:Bioreactor:Continuous culture:Marine sediment inoculum:Wadden Sea-Germany
8
root:Engineered:Bioremediation
49
root:Engineered:Bioremediation:Hydrocarbon
65
root:Engineered:Bioremediation:Hydrocarbon:Benzene
0
root:Engineered:Bioremediation:Hydrocarbon:Benzene:Bioreactor
2
root:Engineered:Bioremediation:Metal
0
root:Engineered:Bioremediation:Persistent organic pollutants (POP)
53
root:Engineered:Bioremediation:Persistent organic pollutants (POP):Polycyclic aromatic hydrocarbons
0
root:Engineered:Bioremediation:Terephthalat

In [36]:
# Sort a *copy* by sample count. `sorted` leaves `orig_all_biomes` untouched,
# whereas sorting in place through an alias would reorder the "original" list too.
all_biomes = sorted(orig_all_biomes, key=lambda pair: pair[1])
print("all:", len(all_biomes))

# Apply increasingly strict minimum-sample cut-offs and report how many biomes
# survive each one.
all_biomes = [(name, count) for (name, count) in all_biomes if count > 0]
print("0-filtered:", len(all_biomes))
all_biomes = [(name, count) for (name, count) in all_biomes if count >= 10]
print("10-filtered:", len(all_biomes))
all_biomes = [(name, count) for (name, count) in all_biomes if count >= 50]
print("50-filtered:", len(all_biomes))

# Show the 50 smallest remaining biomes (the list is in ascending count order).
all_biomes[:50]

all: 475
0-filtered: 289
10-filtered: 242
50-filtered: 196


[('root:Environmental:Aquatic:Non-marine Saline and Alkaline:Hypersaline', 50),
 ('root:Environmental:Terrestrial:Soil:Mine', 50),
 ('root:Environmental:Terrestrial:Soil:Loam', 51),
 ('root:Host-associated:Fish:Skin', 51),
 ('root:Environmental:Aquatic:Freshwater:Groundwater:Acid Mine Drainage', 52),
 ('root:Environmental:Terrestrial:Agricultural field', 52),
 ('root:Engineered:Bioremediation:Persistent organic pollutants (POP)', 53),
 ('root:Engineered:Lab enrichment:Defined media:Marine media', 53),
 ('root:Engineered:Modeled:Simulated communities (DNA mixture)', 59),
 ('root:Environmental:Aquatic:Freshwater:Wetlands:Marsh', 59),
 ('root:Environmental:Terrestrial:Soil:Wetlands', 59),
 ('root:Engineered:Lab enrichment:Defined media:Anaerobic media', 63),
 ('root:Environmental', 63),
 ('root:Environmental:Aquatic:Freshwater:Storm water', 63),
 ('root:Environmental:Aquatic:Marine:Cold seeps:Sediment', 63),
 ('root:Environmental:Aquatic:Lentic:Brackish', 64),
 ('root:Engineered:Bioremedi

In [63]:
def get_samples_for_biome(biome, timeout=60):
    """Download every page of the sample listing for one MGnify biome.

    The response for page 1 is reused both to read the total page count
    (``meta.pagination.pages``) and as the first element of the result, so
    each page is requested exactly once.

    Args:
        biome: Biome identifier placed in the URL path, e.g.
            ``'root:Environmental:Terrestrial:Soil:Mine'``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        list: One decoded JSON response (a dict) per page, in page order.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If a request exceeds ``timeout`` seconds.
    """
    endpoint_name = 'biomes'
    url = f"https://www.ebi.ac.uk/metagenomics/api/latest/{endpoint_name}/{biome}/samples"

    def fetch_page(page):
        # Fail loudly on HTTP errors instead of surfacing them later as a
        # KeyError on a missing 'meta' key.
        response = requests.get(url, params={'page': page}, timeout=timeout)
        response.raise_for_status()
        return response.json()

    # The page count is only known once the first page has arrived, so its
    # progress line is printed after the fetch rather than before.
    first_page = fetch_page(1)
    num_pages = first_page['meta']['pagination']['pages']
    print(f'working on page: 1 of {num_pages}')

    pages = [first_page]
    for page in range(2, num_pages + 1):
        print(f'working on page: {page} of {num_pages}')
        pages.append(fetch_page(page))

    print(f'got {len(pages)} pages')
    return pages

#biome_name = all_biomes[0][0]
#samples_from_biomes = get_samples_for_biome(biome_name)

working on page: 1 of 4
working on page: 2 of 4
working on page: 3 of 4
working on page: 4 of 4
got 4


In [64]:
def get_runs_from_samples(samples_data):
    """Collect the "related runs" URL of every sample record.

    Args:
        samples_data: Iterable of response dicts, each with a ``'data'`` list
            of sample records.

    Returns:
        list: The value at ``relationships -> runs -> links -> related`` for
        each sample, in the order the samples appear.
    """
    return [
        sample['relationships']['runs']['links']['related']
        for page in samples_data
        for sample in page['data']
    ]

# Fetch the samples of the first biome in the filtered list explicitly, so this
# cell does not depend on a variable left over from an earlier kernel session
# and still works after Restart & Run All.
samples_from_biomes = get_samples_for_biome(all_biomes[0][0])
runs_from_samples = get_runs_from_samples(samples_from_biomes)

In [65]:
# Display the collected per-sample run-listing URLs.
runs_from_samples

['https://www.ebi.ac.uk/metagenomics/api/v1/samples/SRS9995124/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS5886824/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS5886826/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS5886825/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS5886828/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS5886827/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4409919/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4413232/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4413234/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4413248/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4413260/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4413262/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4413265/runs',
 'https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS4413276/runs',
 'https://www.ebi.ac

In [66]:
def get_run_info_for_runs(run_urls, timeout=60):
    """Download the run listing behind each URL, following pagination.

    Args:
        run_urls: Sequence of run-listing URLs (one per sample).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        list: Decoded JSON responses (dicts). A URL whose listing fits on one
        page contributes one element; a multi-page listing contributes one
        element per page, so the result can be longer than ``run_urls``.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If a request exceeds ``timeout`` seconds.
    """
    responses = []
    total = len(run_urls)
    for n, run_url in enumerate(run_urls, start=1):
        print(f"{n} of {total}")
        r = requests.get(run_url, timeout=timeout)
        r.raise_for_status()
        result = r.json()

        # The first response already is page 1; keep it rather than re-fetching.
        responses.append(result)

        num_pages = result['meta']['pagination']['pages']
        for page in range(2, num_pages + 1):
            print(f'working on page: {page} of {num_pages} for run URL {run_url}')
            r = requests.get(run_url, params={'page': page}, timeout=timeout)
            r.raise_for_status()
            # Call .json() so the decoded body is stored, not the bound method.
            responses.append(r.json())

    return responses

# Issue the run-listing requests for every sample URL gathered above.
run_info = get_run_info_for_runs(runs_from_samples)

0 of 89
1 of 89
2 of 89
3 of 89
4 of 89
5 of 89
6 of 89
7 of 89
8 of 89
9 of 89
10 of 89
11 of 89
12 of 89
13 of 89
14 of 89
15 of 89
16 of 89
17 of 89
18 of 89
19 of 89
20 of 89
21 of 89
22 of 89
23 of 89
24 of 89
25 of 89
26 of 89
27 of 89
28 of 89
29 of 89
30 of 89
31 of 89
32 of 89
33 of 89
34 of 89
35 of 89
36 of 89
37 of 89
38 of 89
39 of 89
40 of 89
41 of 89
42 of 89
43 of 89
44 of 89
45 of 89
46 of 89
47 of 89
48 of 89
49 of 89
50 of 89
51 of 89
52 of 89
53 of 89
54 of 89
55 of 89
56 of 89
57 of 89
58 of 89
59 of 89
60 of 89
61 of 89
62 of 89
63 of 89
64 of 89
65 of 89
66 of 89
67 of 89
68 of 89
69 of 89
70 of 89
71 of 89
72 of 89
73 of 89
74 of 89
75 of 89
76 of 89
77 of 89
78 of 89
79 of 89
80 of 89
81 of 89
82 of 89
83 of 89
84 of 89
85 of 89
86 of 89
87 of 89
88 of 89


In [67]:
# Number of responses collected.
len(run_info)

89

In [72]:
# Inspect the attributes of the first run record in the first response.
run_info[0]['data'][0]['attributes']

{'experiment-type': 'metagenomic',
 'is-private': False,
 'accession': 'SRR15691272',
 'secondary-accession': 'SRR15691272',
 'ena-study-accession': None,
 'instrument-platform': 'ILLUMINA',
 'instrument-model': 'Illumina NovaSeq 6000'}

In [75]:
# One line per run: accession, experiment type, sequencing platform and model.
summary_fields = ('accession', 'experiment-type', 'instrument-platform', 'instrument-model')
for response in run_info:
    for run in response['data']:
        attributes = run.get('attributes')
        print(*[attributes[field] for field in summary_fields])

SRR15691272 metagenomic ILLUMINA Illumina NovaSeq 6000
ERR5416478 unknown ILLUMINA Illumina HiSeq 3000
ERR5416927 unknown ILLUMINA Illumina HiSeq 3000
ERR5416479 unknown ILLUMINA Illumina HiSeq 3000
ERR5416480 unknown ILLUMINA Illumina HiSeq 3000
ERR5416520 unknown ILLUMINA Illumina HiSeq 3000
ERR4007995 amplicon ILLUMINA Illumina MiSeq
ERR4008002 amplicon ILLUMINA Illumina MiSeq
ERR4008004 amplicon ILLUMINA Illumina MiSeq
ERR4008006 amplicon ILLUMINA Illumina MiSeq
ERR4008008 amplicon ILLUMINA Illumina MiSeq
ERR4008009 amplicon ILLUMINA Illumina MiSeq
ERR4008051 amplicon ILLUMINA Illumina MiSeq
ERR4008052 amplicon ILLUMINA Illumina MiSeq
ERR4008054 amplicon ILLUMINA Illumina MiSeq
ERR4008058 amplicon ILLUMINA Illumina MiSeq
ERR4008061 amplicon ILLUMINA Illumina MiSeq
ERR4008062 amplicon ILLUMINA Illumina MiSeq
SRR3341855 metagenomic ILLUMINA Illumina Genome Analyzer IIx
SRR3341747 metagenomic ILLUMINA Illumina Genome Analyzer IIx
SRR3341746 metagenomic ILLUMINA Illumina Genome Analyze

In [81]:
# Download the sample pages of every biome that survived filtering,
# keyed by biome name.
samples_by_biome = {}
total_biomes = len(all_biomes)
for position, (biome_name, _count) in enumerate(all_biomes, start=1):
    print(f"getting: {biome_name} ({position} of {total_biomes})")
    biome_pages = get_samples_for_biome(biome_name)
    samples_by_biome[biome_name] = biome_pages
    print(f"got: {len(biome_pages)}")

getting: root:Environmental:Aquatic:Non-marine Saline and Alkaline:Hypersaline (1 of 196)
working on page: 1 of 4
working on page: 2 of 4
working on page: 3 of 4
working on page: 4 of 4
got 4
got: 4
getting: root:Environmental:Terrestrial:Soil:Mine (2 of 196)
working on page: 1 of 2
working on page: 2 of 2
got 2
got: 2
getting: root:Environmental:Terrestrial:Soil:Loam (3 of 196)
working on page: 1 of 50
working on page: 2 of 50
working on page: 3 of 50
working on page: 4 of 50
working on page: 5 of 50
working on page: 6 of 50
working on page: 7 of 50
working on page: 8 of 50
working on page: 9 of 50
working on page: 10 of 50
working on page: 11 of 50
working on page: 12 of 50
working on page: 13 of 50
working on page: 14 of 50
working on page: 15 of 50
working on page: 16 of 50
working on page: 17 of 50
working on page: 18 of 50
working on page: 19 of 50
working on page: 20 of 50
working on page: 21 of 50
working on page: 22 of 50
working on page: 23 of 50
working on page: 24 of 50
wor

KeyboardInterrupt: 

In [None]:
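# TODO: dump the downloaded data (e.g. `samples_by_biome`) to a file so the long-running fetch does not have to be repeated.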