In [1]:
import requests
from pprint import pprint

In [2]:
BASE_URL = "https://api-dev.microbiomedata.org"

### Give me stuff by this identifier

In [3]:
def get_by_id(id_: str):
    """Fetch one document by identifier.

    Issues a GET to `{BASE_URL}/nmdcschema/ids/{id_}` and returns the
    decoded JSON body of the response, whatever its status.
    """
    url = f"{BASE_URL}/nmdcschema/ids/{id_}"
    return requests.get(url).json()

In [12]:
# Look up a study by its identifier, then show its type, its name, and the
# (name, ORCID) pair of each person listed in its credit associations.
doc = get_by_id("nmdc:sty-11-dcqce727")
print(doc["type"], doc["name"], sep="\n")
pprint([
    (person["name"], person["orcid"])
    for person in (
        ca["applies_to_person"] for ca in doc["has_credit_associations"]
    )
])

nmdc:Study
Bulk soil microbial communities from the East River watershed near Crested Butte, Colorado, United States
[('Harry Beller', 'orcid:0000-0001-9637-3650'),
 ('Shi Wang', 'orcid:0000-0002-2408-2544'),
 ('Eoin Brodie', 'orcid:0000-0002-8453-8435'),
 ('Rosalie Chu', 'orcid:0000-0001-7428-7647'),
 ('Emiley Eloe-Fadrosh', 'orcid:0000-0002-8162-1276'),
 ('David Hoyt', 'orcid:0000-0002-2857-719X'),
 ('Ulas Karaoz', 'orcid:0000-0002-8238-6757'),
 ('Patrick Sorensen', 'orcid:0000-0002-0558-2789'),
 ('Tijana Glavina del Rio', 'orcid:0000-0003-1411-9814'),
 ('Malak Tfaily', 'orcid:0000-0002-3036-2833')]


In [16]:
# Look up a biosample by its identifier and show its type, name, location,
# and the studies it is associated with.
doc = get_by_id("nmdc:bsm-11-hbdmpd66")
print(doc["type"], doc["name"], sep="\n")
pprint(doc["lat_lon"])
print(doc["associated_studies"])

nmdc:Biosample
Bulk soil microbial communities from the East River watershed near Crested Butte, Colorado, United States - ER_153
{'has_raw_value': '38.920576363 -106.948748019',
 'latitude': 38.920576363,
 'longitude': -106.948748019,
 'type': 'nmdc:GeolocationValue'}
['nmdc:sty-11-dcqce727']


In [18]:
# TODO: Decide whether to update the rest of this notebook to reflect the current schema.

### Give me all the MetaG/MAGS/metaT from Study ABC

In [6]:
import json

def get(
    collection: str,
    filter_: dict,
    max_page_size=20,
    page_token=None,
    limit=0
):
    """Fetch documents from `/nmdcschema/{collection}`, following pagination.

    Args:
        collection: Name of the collection to query, e.g. "biosample_set".
        filter_: Filter document; JSON-encoded into the `filter` query
            parameter.
        max_page_size: Page size requested from the API on every call.
        page_token: Optional token to resume from; when given it is sent as
            `page_token` on the first request.
        limit: Maximum number of documents to return; 0 means no limit.

    Returns:
        A list of the documents gathered from each page's "resources".
        If any response lacks a "resources" key (for example an error
        payload), that response dict is returned as-is instead.
    """
    params = {
        "filter": json.dumps(filter_),
        "max_page_size": max_page_size,
    }
    # Honor a caller-supplied starting point (previously accepted but ignored).
    if page_token:
        params["page_token"] = page_token
    results = []
    while True:
        response = requests.get(
            f"{BASE_URL}/nmdcschema/{collection}",
            params=params
        ).json()
        if "resources" not in response:
            return response
        results.extend(response["resources"])
        # Stop as soon as the limit is reached (not one page later) and never
        # hand back more than the caller asked for.
        if limit and len(results) >= limit:
            results = results[:limit]
            break
        next_page_token = response.get("next_page_token")
        if not next_page_token:
            break
        print(f"Got {len(results)} so far. Getting more...")
        params["page_token"] = next_page_token
    print(f"Got {len(results)}")
    return results

In [7]:
# An unknown collection name: the API's error payload is returned as-is.
get(collection="lollipops", filter_={})

{'detail': "Collection name must be one of {'study_set', 'read_QC_analysis_activity_set', 'data_object_set', 'biosample_set', 'metabolomics_analysis_activity_set', 'omics_processing_set', 'metagenome_assembly_set', 'genome_feature_set', 'mags_activity_set', 'read_based_analysis_activity_set', 'functional_annotation_set', 'metaproteomics_analysis_activity_set', 'nom_analysis_activity_set', 'metatranscriptome_activity_set', 'metagenome_annotation_activity_set'}"}

In [8]:
# All biosamples whose `part_of` matches the study identifier below.
biosamples = get(
    collection="biosample_set",
    filter_={"part_of": "gold:Gs0135149"},
    max_page_size=100,
)

Got 53


In [9]:
def ids_of(documents):
    """Return the "id" value of every document, in the order given."""
    collected = []
    for document in documents:
        collected.append(document["id"])
    return collected

In [10]:
# Omics processing records that took any of the biosamples above as input.
processings = get(
    collection="omics_processing_set",
    filter_={"has_input": {"$in": ids_of(biosamples)}},
    max_page_size=100,
)

Got 100 so far. Getting more...
Got 200 so far. Getting more...
Got 300 so far. Getting more...
Got 400 so far. Getting more...
Got 500 so far. Getting more...
Got 600 so far. Getting more...
Got 700 so far. Getting more...
Got 800 so far. Getting more...
Got 831


In [11]:
"""
Request is too large with 831 IDs, so breaking it up here.
"""

# Query in batches of IDs. The batch boundaries are derived from the actual
# number of IDs, so nothing past a fixed cutoff is dropped and no request is
# sent for an empty batch.
docs = []
processing_ids = ids_of(processings)
batch_size = 100

for i in range(0, len(processing_ids), batch_size):
    ids = processing_ids[i:i + batch_size]
    batch_docs = get(
        "metatranscriptome_activity_set",
        {"was_informed_by": {"$in": ids}},
        max_page_size=100)
    # `get` hands back the raw response dict when the API reports a problem;
    # extending a list with a dict would silently add its keys.
    if not isinstance(batch_docs, list):
        raise RuntimeError(f"Query failed for batch starting at {i}: {batch_docs}")
    docs.extend(batch_docs)

Got 0
Got 0
Got 0
Got 0
Got 0
Got 0
Got 0
Got 12
Got 27
Got 0


In [12]:
# Total number of documents collected across all batches.
len(docs)

39

In [13]:
def get_turbo(
    collection: str,
    filter_: dict,
    max_page_size=1000,
    limit=0,
    authorization=None
):
    """Run a `find` query through the `/queries:run` endpoint.

    Args:
        collection: Name of the collection to search.
        filter_: Filter document sent in the request body. It is not
            modified by this function.
        max_page_size: Per-request `limit` used when `limit` is 0, and the
            batch length that signals another page may exist.
        limit: Maximum number of documents to return; 0 means no limit.
        authorization: Optional value for the `Authorization` header.

    Returns:
        A list of documents with their `_id` field removed. If a response
        does not carry a truthy "ok", that response dict is returned as-is.

    NOTE(review): another page is requested only when a batch is exactly
    `max_page_size` long. If the server caps `firstBatch` below that
    (MongoDB's default first batch is 101 documents), results would be cut
    short without warning - confirm against the API.
    """
    json_in = {
        "find": collection,
        "filter": filter_,
        "limit": limit or max_page_size,
        "sort": {"id": 1}
    }
    headers = {"Authorization": authorization} if authorization else None
    results = []
    while True:
        response = requests.post(
            f"{BASE_URL}/queries:run",
            json=json_in,
            headers=headers
        ).json()
        if not response.get("ok"):
            return response
        batch = response["cursor"]["firstBatch"]
        for d in batch:
            d.pop("_id", None)
        results.extend(batch)
        # Stop once the limit is reached and never return more than requested.
        if limit and len(results) >= limit:
            results = results[:limit]
            break
        if len(batch) != max_page_size:
            break
        print(f"Got {len(results)} so far. Getting more...")
        # Results are sorted by `id`, so resume after the last `id` seen.
        # Assumes every returned document has an `id` field, as the sort key
        # implies. The caller's filter is wrapped rather than mutated.
        after_last = {"id": {"$gt": batch[-1]["id"]}}
        json_in["filter"] = (
            {"$and": [filter_, after_last]} if filter_ else after_last
        )
    print(f"Got {len(results)}")
    return results

In [14]:
def get_authorization(username, password):
    """Exchange a username and password for a bearer-token header value.

    Posts a password-grant form to `{BASE_URL}/token`, prints the "expires"
    field of the reply, and returns the string "Bearer <access_token>".

    Raises:
        Exception: with the response text, when the status code does not
            start with "2".
    """
    credentials = {
        "grant_type": "password",
        "username": username,
        "password": password,
    }
    token_reply = requests.post(f"{BASE_URL}/token", data=credentials)
    succeeded = str(token_reply.status_code).startswith("2")
    if not succeeded:
        raise Exception(token_reply.text)
    payload = token_reply.json()
    print("Expires", payload["expires"])
    return f'Bearer {payload["access_token"]}'

In [15]:
# Placeholders: substitute real credentials at run time and do not save them
# in the notebook.
authorization = get_authorization(username="<USERNAME>", password="<PASSWORD>")

Expires {'days': 0, 'hours': 0, 'minutes': 30}


In [16]:
# Metatranscriptome activities informed by any of the omics processing
# records, requested with all IDs in one authorized query.
docs = get_turbo(
    collection="metatranscriptome_activity_set",
    filter_={"was_informed_by": {"$in": ids_of(processings)}},
    authorization=authorization,
)

Got 39


In [17]:
# Metagenome assemblies informed by any of the omics processing records.
docs = get_turbo(
    collection="metagenome_assembly_set",
    filter_={"was_informed_by": {"$in": ids_of(processings)}},
    authorization=authorization,
)

Got 92


In [18]:
# Metagenome annotation activities informed by any of the omics processing
# records.
docs = get_turbo(
    collection="metagenome_annotation_activity_set",
    filter_={"was_informed_by": {"$in": ids_of(processings)}},
    authorization=authorization,
)

Got 90


In [19]:
# NOM analysis activities informed by any of the omics processing records.
docs = get_turbo(
    collection="nom_analysis_activity_set",
    filter_={"was_informed_by": {"$in": ids_of(processings)}},
    authorization=authorization,
)

Got 101
