# Scraping metadata from MGnify API

modified from: https://gist.github.com/SandyRogers/5d9eff7f1f7b08cfa40265f5e2adf9cd#file-fetch_paginated_mgnify_data-py

MGnify provides curated biome terms for some metagenomes (e.g. `root:Engineered:Modeled:Simulated communities (DNA mixture)`). This notebook downloads that biome metadata.

## Download using jsonapi_client and pandas (fails for the "samples" endpoint)

In [4]:
from jsonapi_client import Session
import pandas as pd

In [9]:
# Endpoint names and the full API reference are listed at
# https://www.ebi.ac.uk/metagenomics/api/docs/
endpoint = 'samples'

# Iterate over every resource of the endpoint, flatten the JSON records into a
# table, and save that table as "<endpoint>.csv".
# NOTE: the recorded run of this cell ended with `DocumentError: Error 400`.
with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
    resources = pd.json_normalize(r.json for r in mgnify.iterate(endpoint))
    resources.to_csv(f"{endpoint}.csv")

DocumentError: Error 400: ?

## Download without pandas

In [10]:
import urllib.request
import json
import csv

In [11]:
# See https://www.ebi.ac.uk/metagenomics/api/docs/ for endpoints and API documentation
# including attributes you may want as CSV columns.
endpoint = 'samples'

def get_page(url):
    """Yield the ``data`` payload of each page of a paginated JSON response.

    Starting at ``url``, every page is fetched with ``urllib``, decoded as
    JSON, and its ``data`` member is yielded. The next page is taken from
    ``links.next`` of the response just read.

    Args:
        url: URL of the first page to fetch.

    Yields:
        The value stored under ``data`` in each page, in page order.

    Raises:
        urllib.error.URLError: if a page cannot be fetched.
        json.JSONDecodeError: if a page body is not valid JSON.
        KeyError: if a page has no ``data`` member.
    """
    next_url = url
    while next_url:
        with urllib.request.urlopen(next_url) as page:
            response = json.loads(page.read().decode())
        # Yield only after the response is closed so the connection is not
        # held open while the caller processes the page.
        yield response['data']
        # The last page may carry `next: null`, omit `next`, or omit `links`
        # altogether; all three end the iteration instead of raising KeyError.
        next_url = (response.get('links') or {}).get('next')

In [78]:
# Keys looked up in each resource's 'attributes' mapping, in CSV column order.
attribute_columns = [
    "accession",
    "analysis-completed",
    "biome",
    "biosample",
    "collection-date",
    "environment-biome",
    "environment-feature",
    "environment-material",
    "geo-loc-name",
    "host-tax-id",
    "last-update",
    "latitude",
    "longitude",
    "runs",
    "sample-alias",
    "sample-desc",
    'experiment-type',
    'study',
]
# CSV header: every attribute column followed by one extra trailing 'biome'
# column. NOTE(review): 'biome' therefore appears twice in the header.
all_columns = [*attribute_columns, 'biome']


In [88]:
# Stream every page of the endpoint into "<endpoint>.csv", one row per resource.
# newline="" hands line-ending control to the csv module (as the csv docs
# require; otherwise extra "\r" characters appear on Windows), and the explicit
# UTF-8 encoding keeps non-ASCII text from depending on the platform default.
with open(f"{endpoint}.csv", "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(all_columns)
    for page in get_page(f"https://www.ebi.ac.uk/metagenomics/api/v1/{endpoint}"):
        for resource in page:
            # Attribute values in header order; an attribute the resource lacks
            # becomes None, which csv writes as an empty field.
            lst = [resource['attributes'].get(col) for col in attribute_columns]
            lst.append(resource['relationships']['biome']['data']['id']) # biome
            #lst.append(resource['relationships']['studies']['data'][0]['id']) # study
            writer.writerow(lst)

In [86]:
# Example of what a single `resource` looks like. `resource` is the loop
# variable of the CSV-writing cell, so this cell only works after that cell
# has bound it at least once.
resource

{'type': 'samples',
 'id': 'SRS458854',
 'attributes': {'longitude': None,
  'sample-metadata': [{'key': 'host taxid', 'value': '9606', 'unit': None},
   {'key': 'NCBI sample classification', 'value': '646099', 'unit': None},
   {'key': 'instrument model', 'value': '454 GS FLX Titanium', 'unit': None},
   {'key': 'sample identifier', 'value': 'UAB049_8_5', 'unit': None},
   {'key': 'host scientific name', 'value': 'Homo sapiens', 'unit': None}],
  'latitude': None,
  'biosample': 'SAMN02254088',
  'accession': 'SRS458854',
  'analysis-completed': '2016-06-13',
  'collection-date': None,
  'geo-loc-name': None,
  'sample-desc': 'vaginal_metagenome UAB049_8_5',
  'environment-biome': None,
  'environment-feature': None,
  'environment-material': None,
  'sample-name': 'RAV134_UAB049_8_5',
  'sample-alias': 'RAV134_UAB049_8_5',
  'host-tax-id': 9606,
  'species': 'Homo sapiens',
  'last-update': '2022-03-17T21:59:42'},
 'relationships': {'runs': {'links': {'related': 'https://www.ebi.ac.u