# Download MGnify analysis results

![MGnify](../assets/figs/mgnify_banner.png)

1. search studies
2. get analyses in a dataframe and save them as `.parquet`
3. compare to local data

In [1]:
# Connection to MGnify API

# this repo is not maintained, or less than jsonapi-requests
# consider a dep change
from jsonapi_client import Session as APISession
from jsonapi_client import Modifier
import requests

# Dataframes and display
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Data transformation
from functools import reduce

# Plots
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
%matplotlib inline 

# Create signature of MAGs for comparison against database
import sourmash
import glob
import time
from pathlib import PurePath as pp
from Bio import SeqIO

# Warning verbosity
import warnings 
warnings.filterwarnings(action="ignore")



## Query analyses

Use analyses endpoint.
A complete list of endpoints can be found at https://www.ebi.ac.uk/metagenomics/api/v1/.

In [2]:
# GET /metagenomics/api/v1/
r = requests.get(f"https://www.ebi.ac.uk/metagenomics/api/v1/")
r.json()['data']



{'biomes': 'https://www.ebi.ac.uk/metagenomics/api/v1/biomes',
 'studies': 'https://www.ebi.ac.uk/metagenomics/api/v1/studies',
 'super-studies': 'https://www.ebi.ac.uk/metagenomics/api/v1/super-studies',
 'samples': 'https://www.ebi.ac.uk/metagenomics/api/v1/samples',
 'runs': 'https://www.ebi.ac.uk/metagenomics/api/v1/runs',
 'assemblies': 'https://www.ebi.ac.uk/metagenomics/api/v1/assemblies',
 'analyses': 'https://www.ebi.ac.uk/metagenomics/api/v1/analyses',
 'experiment-types': 'https://www.ebi.ac.uk/metagenomics/api/v1/experiment-types',
 'pipelines': 'https://www.ebi.ac.uk/metagenomics/api/v1/pipelines',
 'pipeline-tools': 'https://www.ebi.ac.uk/metagenomics/api/v1/pipeline-tools',
 'publications': 'https://www.ebi.ac.uk/metagenomics/api/v1/publications',
 'genomes': 'https://www.ebi.ac.uk/metagenomics/api/v1/genomes',
 'genome-search': 'https://www.ebi.ac.uk/metagenomics/api/v1/genome-search',
 'genomes-search/gather': 'https://www.ebi.ac.uk/metagenomics/api/v1/genomes-search/g

In [3]:
endpoint_name = 'analyses'
r = requests.get(f"https://www.ebi.ac.uk/metagenomics/api/v1/{endpoint_name}")
r.json()['data'][0]


{'type': 'analysis-jobs',
 'id': 'MGYA00383253',
 'attributes': {'pipeline-version': '5.0',
  'analysis-summary': [{'key': 'Submitted nucleotide sequences',
    'value': '55799'},
   {'key': 'Nucleotide sequences after format-specific filtering',
    'value': '55789'},
   {'key': 'Nucleotide sequences after length filtering', 'value': '55789'},
   {'key': 'Nucleotide sequences after undetermined bases filtering',
    'value': '55789'},
   {'key': 'Predicted SSU sequences', 'value': '0'},
   {'key': 'Predicted LSU sequences', 'value': '461'}],
  'accession': 'MGYA00383253',
  'experiment-type': 'amplicon',
  'analysis-status': 'completed',
  'is-private': False,
  'last-update': '2024-01-29T15:29:19.757516',
  'mgx-accession': None,
  'complete-time': '2020-06-11T10:51:27',
  'instrument-platform': 'ILLUMINA',
  'instrument-model': 'Illumina MiSeq'},
 'relationships': {'taxonomy-lsu': {'links': {'related': 'https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00383253/taxonomy/lsu'}},

In [10]:
# by accession
endpoint_name = 'studies'
accession = 'MGYS00002392'
r = requests.get(f"https://www.ebi.ac.uk/metagenomics/api/v1/{endpoint_name}/{accession}")
r.json()['data']

{'type': 'studies',
 'id': 'MGYS00002392',
 'attributes': {'samples-count': 1073,
  'bioproject': 'PRJEB6610',
  'accession': 'MGYS00002392',
  'is-private': False,
  'last-update': '2024-04-15T20:15:45',
  'secondary-accession': 'ERP006157',
  'centre-name': 'GSC',
  'public-release-date': None,
  'study-abstract': 'Analysis of 18S DNA in Tara Oceans Protists size fractions through amplicon sequencing: Seawater was filtered from different depths to retain small and large cell sizes. The DNA was extracted and amplified by PCR.',
  'study-name': 'Amplicon sequencing of Tara Oceans DNA samples corresponding to size fractions for protists.',
  'data-origination': 'SUBMITTED'},
 'relationships': {'analyses': {'links': {'related': 'https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00002392/analyses'}},
  'publications': {'links': {'related': 'https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00002392/publications'}},
  'downloads': {'links': {'related': 'https://www.ebi.ac.uk/metage

In [11]:
r = requests.get(f"https://www.ebi.ac.uk/metagenomics/api/v1/{endpoint_name}/{accession}/analyses")
r.json()['data']

[{'type': 'analysis-jobs',
  'id': 'MGYA00722743',
  'attributes': {'pipeline-version': '5.0',
   'analysis-summary': [{'key': 'Submitted nucleotide sequences',
     'value': '2050402'},
    {'key': 'Nucleotide sequences after format-specific filtering',
     'value': '1357633'},
    {'key': 'Nucleotide sequences after length filtering', 'value': '1357633'},
    {'key': 'Nucleotide sequences after undetermined bases filtering',
     'value': '1357633'},
    {'key': 'Predicted LSU sequences', 'value': '1'},
    {'key': 'Predicted SSU sequences', 'value': '1355389'}],
   'accession': 'MGYA00722743',
   'experiment-type': 'amplicon',
   'analysis-status': 'completed',
   'is-private': False,
   'last-update': '2024-02-28T22:10:08.862460',
   'mgx-accession': 'MGX0000886',
   'complete-time': '2024-02-28T22:10:07',
   'instrument-platform': 'ILLUMINA',
   'instrument-model': 'Illumina Genome Analyzer IIx'},
  'relationships': {'taxonomy-lsu': {'links': {'related': 'https://www.ebi.ac.uk/me

In [12]:
data = pd.json_normalize(r.json()['data'])

In [15]:
data.shape

(25, 35)

In [16]:
data.head(2)

Unnamed: 0,type,id,attributes.pipeline-version,attributes.analysis-summary,attributes.accession,attributes.experiment-type,attributes.analysis-status,attributes.is-private,attributes.last-update,attributes.mgx-accession,attributes.complete-time,attributes.instrument-platform,attributes.instrument-model,relationships.taxonomy-lsu.links.related,relationships.taxonomy-itsunite.links.related,relationships.interpro-identifiers.links.related,relationships.study.data.type,relationships.study.data.id,relationships.study.links.related,relationships.taxonomy-itsonedb.links.related,relationships.run.data.type,relationships.run.data.id,relationships.run.links.related,relationships.taxonomy-ssu.links.related,relationships.downloads.links.related,relationships.genome-properties.links.related,relationships.sample.data.type,relationships.sample.data.id,relationships.sample.links.related,relationships.antismash-gene-clusters.links.related,relationships.assembly.data,relationships.taxonomy.links.related,relationships.go-slim.links.related,relationships.go-terms.links.related,links.self
0,analysis-jobs,MGYA00722743,5.0,"[{'key': 'Submitted nucleotide sequences', 'value': '2050402'}, {'key': 'Nucleotide sequences after format-specific filtering', 'value': '1357633'}, {'key': 'Nucleotide sequences after length filtering', 'value': '1357633'}, {'key': 'Nucleotide sequences after undetermined bases filtering', 'value': '1357633'}, {'key': 'Predicted LSU sequences', 'value': '1'}, {'key': 'Predicted SSU sequences', 'value': '1355389'}]",MGYA00722743,amplicon,completed,False,2024-02-28T22:10:08.862460,MGX0000886,2024-02-28T22:10:07,ILLUMINA,Illumina Genome Analyzer IIx,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722743/taxonomy/lsu,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722743/taxonomy/unite,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722743/interpro-identifiers,studies,MGYS00002392,https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00002392,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722743/taxonomy/itsonedb,runs,ERR566174,https://www.ebi.ac.uk/metagenomics/api/v1/runs/ERR566174,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722743/taxonomy/ssu,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722743/downloads,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722743/genome-properties,samples,ERS506046,https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS506046,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722743/antismash-gene-clusters,,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722743/taxonomy,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722743/go-slim,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722743/go-terms,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722743
1,analysis-jobs,MGYA00722744,5.0,"[{'key': 'Submitted nucleotide sequences', 'value': '2899522'}, {'key': 'Nucleotide sequences after format-specific filtering', 'value': '1261859'}, {'key': 'Nucleotide sequences after length filtering', 'value': '1261859'}, {'key': 'Nucleotide sequences after undetermined bases filtering', 'value': '1261859'}, {'key': 'Predicted LSU sequences', 'value': '0'}, {'key': 'Predicted SSU sequences', 'value': '1261348'}]",MGYA00722744,amplicon,completed,False,2024-02-28T22:10:26.857803,MGX0000887,2024-02-28T22:10:26,ILLUMINA,Illumina Genome Analyzer IIx,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/taxonomy/lsu,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/taxonomy/unite,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/interpro-identifiers,studies,MGYS00002392,https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00002392,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/taxonomy/itsonedb,runs,ERR562730,https://www.ebi.ac.uk/metagenomics/api/v1/runs/ERR562730,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/taxonomy/ssu,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/downloads,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/genome-properties,samples,ERS492004,https://www.ebi.ac.uk/metagenomics/api/v1/samples/ERS492004,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/antismash-gene-clusters,,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/taxonomy,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/go-slim,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/go-terms,https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744


In [19]:
r = requests.get(f"https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/downloads")
r.json()

{'links': {'first': 'https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/downloads?page=1',
  'last': 'https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/downloads?page=1',
  'next': None,
  'prev': None},
 'data': [{'type': 'analysis-job-downloads',
   'id': 'ERR562730_MERGED_FASTQ.fasta.gz',
   'attributes': {'alias': 'ERR562730_MERGED_FASTQ.fasta.gz',
    'file-format': {'name': 'FASTA',
     'extension': 'fasta',
     'compression': True},
    'description': {'label': 'Processed nucleotide reads',
     'description': 'Processed nucleotide reads'},
    'group-type': 'Sequence data',
    'file-checksum': {'checksum': '', 'checksum-algorithm': ''}},
   'relationships': {'pipeline': {'data': {'type': 'pipelines', 'id': '5.0'},
     'links': {'related': 'https://www.ebi.ac.uk/metagenomics/api/v1/pipelines/5.0'}}},
   'links': {'self': 'https://www.ebi.ac.uk/metagenomics/api/v1/analyses/MGYA00722744/file/ERR562730_MERGED_FASTQ.fasta.gz'}},
  {'type': 'analysis-jo


## Get information for a specific genus or species

Examples: Search for available resources for a specific genus or species of interest.

- Listeria
- Listeria monocytogenes

The taxon-lineage field contains domain, phylum, class, order, family, genus, species and subspecies, formatted as in this example for Listeria monocytogenes:

`d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Listeriaceae;g__Listeria;s__Listeria monocytogenes`

The filter can use the full lineage
`d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Listeriaceae;g__Listeria` or only part of it `g__Listeria` or `Listeria`.


In [None]:
# `taxon_lineage` is a filter of the `genomes` endpoint. Set the endpoint explicitly here:
# `endpoint_name` was last assigned 'studies' in an earlier cell, and the genome queries
# in the following cells reuse this variable.
endpoint_name = 'genomes'
genus_filter = 'Listeria'
species_filter = 'Listeria monocytogenes'

with APISession("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
    search_filter = Modifier(f"taxon_lineage={genus_filter}")
    # Collect the records into a list while the session is still open
    resources = [r.json for r in mgnify.iterate(endpoint_name, filter=search_filter)]
    resources_df = pd.json_normalize(resources)

In [None]:
# Display the table containing the results of the query
resources_df


Query the database with the 'Listeria monocytogenes' filter and store the results in a Pandas DataFrame.



In [None]:
# Same query as for the genus, this time filtered on the species name
search_filter_2 = Modifier(f"taxon_lineage={species_filter}")
with APISession("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
    # Lazily pull the JSON payload out of every record returned by the API
    resources_2 = (
        record.json
        for record in mgnify.iterate(endpoint_name, filter=search_filter_2)
    )
    resources_df_2 = pd.json_normalize(resources_2)

In [None]:
resources_df_2

In [None]:
resources_df.to_parquet('Listeria_resources.parquet')
listeria_df = pd.read_parquet('Listeria_resources.parquet')
listeria_df


Query and save the dataset as parquet file

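To query the whole dataset, we can use the same method as previously; the only difference is that no filter is passed to the query. The cell below still passes an order-level filter (`Lactobacillales`) as a smaller example; remove the `filter` argument to fetch everything.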

Warning: Querying without filter is computationally expensive and will take time.

A pre-fetched copy of the data (as of 8 November 2022) is available in ../example-data/genomes/all_genome_resources.parquet.

In [None]:
# Careful, this takes a while to run, even when the query is restricted to a single order
domain_filter = 'Bacteria'
order_filter = 'Lactobacillales'

with APISession("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
    search_filter_3 = Modifier(f"taxon_lineage={order_filter}")
    resources_all = map(lambda r: r.json, mgnify.iterate(endpoint_name, filter=search_filter_3))
    resources_all_df = pd.json_normalize(resources_all)

# Save first, then preview: a bare expression is only displayed when it is the last
# statement of the cell, and a few rows are enough to check the result
resources_all_df.to_parquet('latest_genome_resources.parquet')
resources_all_df.head()

## Spark session to load all that data

In [11]:
from pyspark.sql import SparkSession

import pyspark.sql.functions as F
import pyspark

In [None]:
# NOTE(review): this creates a new SparkConf object and sets the option on it, but the object is
# neither assigned to a name nor passed to a session in this cell, so it presumably has no effect
# on the SparkSession created afterwards — confirm, and set the option on the session instead.
pyspark.conf.SparkConf().set("spark.sql.debug.maxToStringFields", 500)

In [None]:
# Create (or reuse) the Spark session. `spark.sql.debug.maxToStringFields` is set on the
# builder so that it actually reaches the session; the wide genome table has many columns.
spark = (
    SparkSession.builder
    .config("spark.sql.debug.maxToStringFields", "500")
    .getOrCreate()
)

In [21]:
all_genomes_df = spark.read.parquet('latest_genome_resources.parquet')

In [None]:
all_genomes_df.count(), len(all_genomes_df.columns)

In [None]:
all_genomes_df.describe().show(truncate=False, vertical=True)

### Get most represented genus

In [None]:
# To see a sample of taxon-lineages present in the dataset:
all_genomes_df.select(f'`attributes.taxon-lineage`').show(truncate=False)

In [None]:
# The total number of genomes in the dataset:
all_genomes_df.select('`id`').distinct().count()

In [None]:
# The number of distinct lineages:
all_genomes_df.select('`attributes.taxon-lineage`').distinct().count()

 ### Split taxon-lineage column into 7 columns

In [None]:
features = ['domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']

# Split the ';'-separated lineage once, then expose each rank as a column of its own
all_genomes_tax_df = all_genomes_df.withColumn(
    'lineage_split',
    F.split(F.col('`attributes.taxon-lineage`'), ';'),
)
for position, rank in enumerate(features):
    all_genomes_tax_df = all_genomes_tax_df.withColumn(rank, F.col('lineage_split')[position])

all_genomes_tax_df.select(features).show(n=5)

### Query examples

In [None]:
# To search the most represented taxon:
all_genomes_tax_df.groupby('`attributes.taxon-lineage`').count().filter(F.col('count')>100).show(truncate=False)

In [None]:
# To search for a particular lineage and count how many times it appears:
all_genomes_tax_df.filter(F.col('`attributes.taxon-lineage`').startswith('d__Bacteria;p__Actinobacteriota;c__Coriobacteriia;o__Coriobacteriales;f__Coriobacteriaceae;g__Collinsella')).count()

In [None]:
# To search for a particular genus and count how many times it appears:
all_genomes_tax_df.filter(F.col('`attributes.taxon-lineage`').contains('Collinsella')).count()

In [None]:
# To search for the most or least represented genus, species, ... in this dataset for example. The search is more flexible than for the full taxon.
all_genomes_tax_df.groupby('genus').count().filter(F.col('count')>100).show()

In [None]:
all_genomes_tax_df.filter(F.col('genus').isin('g__Prevotella', 'g__RC9', 'g__Collinsella')).groupby('genus').agg(F.countDistinct('species')).show()

In [None]:
# To see some of the Collinsella species in the dataset:
all_genomes_tax_df.filter(F.col('genus')=='g__Collinsella').select('species').distinct().show(truncate=False)

## Graphics

In [None]:
all_genomes_tax_df.count()

In [None]:
# Number of distinct values found at each taxonomic rank
rank_counts = [F.count_distinct(rank).alias(f'{rank}_count') for rank in features]
all_genomes_tax_df.select(rank_counts).show()

In [None]:
import holoviews as hv
hv.extension('bokeh')

def get_sankey(df, cat_cols=None, value_cols='', title='Sankey Diagram'):
    """Draw a plotly Sankey diagram linking each column of `cat_cols` to the next one.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the category columns and the weight column.
    cat_cols : sequence of str
        Columns of `df` to chain, in order: every column is linked to the one that
        follows it. At least two columns are required.
    value_cols : str
        Name of the column holding the link weights; weights of identical
        (source, target) pairs are summed.
    title : str
        Title of the figure.

    Returns
    -------
    The result of ``fig.show(renderer='iframe')``; the figure is displayed as a side effect.

    Raises
    ------
    ValueError
        If fewer than two columns are given in `cat_cols`.
    """
    cat_cols = list(cat_cols) if cat_cols is not None else []
    if len(cat_cols) < 2:
        raise ValueError('get_sankey needs at least two columns in cat_cols to link together')

    # One base colour per level of `cat_cols`; the palette is cycled when there are
    # more levels than colours instead of running off its end
    palette_rgb = [
        '31, 119, 180',
        '255, 127, 14',
        '44, 160, 44',
        '214, 39, 40',
        '148, 103, 189',
        '140, 86, 75',
        '227, 119, 194',
        '127, 127, 127',
    ]
    node_opacity = 0.8
    link_opacity = 0.4

    # Register every distinct label once, coloured after the first level it appears in.
    # Labels and colours are built together so they stay aligned even when the same
    # label occurs in several columns.
    label_rgb = {}
    for level, col in enumerate(cat_cols):
        level_rgb = palette_rgb[level % len(palette_rgb)]
        for label in pd.unique(df[col]):
            label_rgb.setdefault(label, level_rgb)
    labels = list(label_rgb)
    # Dictionary lookup instead of a list search for every link
    label_position = {label: position for position, label in enumerate(labels)}

    # Stack every (level, next level) pair as source/target rows, then sum the
    # weights of identical links
    pair_frames = []
    for source_col, target_col in zip(cat_cols[:-1], cat_cols[1:]):
        pair = df[[source_col, target_col, value_cols]].copy()
        pair.columns = ['source', 'target', 'count']
        pair_frames.append(pair)
    links = (
        pd.concat(pair_frames, ignore_index=True)
        .groupby(['source', 'target'], as_index=False)['count']
        .sum()
    )

    source_ids = [label_position[label] for label in links['source']]
    target_ids = [label_position[label] for label in links['target']]
    node_colors = [f'rgba({label_rgb[label]}, {node_opacity})' for label in labels]
    # Links take the colour of their source node, with a lower opacity
    link_colors = [f'rgba({label_rgb[label]}, {link_opacity})' for label in links['source']]

    fig = go.Figure(data=[go.Sankey(
        # Define nodes
        node=dict(
            pad=15,
            thickness=15,
            line=dict(color='black', width=0.5),
            label=labels,
            color=node_colors,
        ),
        # Add links
        link=dict(
            source=source_ids,
            target=target_ids,
            value=links['count'].tolist(),
            color=link_colors,
        ),
    )])

    fig.update_layout(title_text=title, font_size=10)

    return fig.show(renderer='iframe')


# To create a Sankey diagram for the taxon lineage:
def sankey_hv(source_col='phylum', target_col='species'):
    """Build a HoloViews Sankey diagram between two taxonomic ranks.

    Counts the genomes of `all_genomes_tax_df` per lineage, sums them per
    (`source_col`, `target_col`) pair and returns the resulting diagram.

    Parameters
    ----------
    source_col : str
        Rank (one of `features`) drawn on the left-hand side.
    target_col : str
        Rank (one of `features`) drawn on the right-hand side.

    Returns
    -------
    The `hv.Sankey` element with its display options applied.
    """
    sankey_df = all_genomes_tax_df.groupby(features).count().toPandas()
    sankey_df = sankey_df.rename(columns={'count': 'value'})
    # One edge per (source, target) pair. The columns are renamed to source/target
    # because the `edge_color='target'` option below refers to a column of that name.
    edges = (
        sankey_df
        .groupby([source_col, target_col], as_index=False)['value']
        .sum()
        .rename(columns={source_col: 'source', target_col: 'target'})
    )
    sankey = hv.Sankey(edges, label='Taxon Lineage')
    return sankey.opts(label_position='left', edge_color='target', node_color='index', cmap='tab20')

In [None]:
sankey_df = all_genomes_tax_df.groupby(features).count().toPandas()
sankey_df = sankey_df.rename(columns={'count': 'value'})
sankey_df.head()

In [None]:
sankey = hv.Sankey(sankey_df[['phylum', "species", "value"]], label='Taxon Lineage')
sankey.opts(width=600, height=400)

In [52]:
# Convert Spark DataFrame to Pandas DataFrame:
pdf = all_genomes_tax_df.select(features).groupby(features).count().toPandas()

### Representation of a sample of the genomes present in the dataset: example from the order of the Lactobacillales.

In [None]:
pdf_lactobacillales = all_genomes_tax_df.filter(F.col('order')=='o__Lactobacillales').select(features).groupby(features).count().toPandas()
# fig_l = get_sankey(pdf_lactobacillales,cat_cols=features[0:6], value_cols='count',title='Genomes from the Lactobacillales order')

# Note that there are too many distinct species in the Lactobacillales order to show individually:
all_genomes_tax_df.filter(F.col('order')=='o__Lactobacillales').select('species').distinct().count()



### Information such as genome length or GC-content can also be represented

We can group and visualise these at different levels like family, genus, species... depending on the number of sequences available and on the biological significance.


In [None]:
# Genomes of the Lactobacillales order, shared by the detailed table and the per-family counts
lactobacillales_sdf = all_genomes_tax_df.filter(F.col('order')=='o__Lactobacillales')

lactobacillales_df = lactobacillales_sdf.orderBy('family').toPandas()
lactobacillales_count = (
    lactobacillales_sdf
    .groupby('family')
    .count()
    .orderBy('family')
    .toPandas()
)
lactobacillales_count

In [None]:
fig = plt.figure(figsize=(10, 10), layout="constrained")
spec = fig.add_gridspec(3, 1)

# Top panel: number of genomes per family
ax00 = fig.add_subplot(spec[0, 0])
sns.barplot(data=lactobacillales_count, x='family', y='count', ax=ax00)
ax00.set_ylabel("Number of genome available")

# Middle panel: genome length per family
ax10 = fig.add_subplot(spec[1, 0])
sns.boxplot(data=lactobacillales_df, x='family', y='attributes.length', ax=ax10)
ax10.set_ylabel("Genome length (bp)")

# Bottom panel: GC-content per family; the only panel given an x-axis label
ax20 = fig.add_subplot(spec[2, 0])
sns.boxplot(data=lactobacillales_df, x='family', y='attributes.gc-content', ax=ax20)
ax20.set_ylabel("GC-content (%)")
ax20.set_xlabel("Family of the Lactobacillales order")

fig.suptitle('Number of genomes avalaible, genome length and GC-content of bacteria belonging the Lactobacillales order')


In [None]:
fig = plt.figure(figsize=(20, 5))
spec = fig.add_gridspec(1, 2)

# The biome panel (histogram of 'relationships.biome.data.id' in spec[0, 0]) is disabled,
# so the catalogue histogram spans the whole grid
ax01 = fig.add_subplot(spec[0:])
lactobacillales_df['relationships.catalogue.data.id'].hist(ax=ax01)
ax01.set_xlabel("Catalogue")
ax01.grid(False)

fig.suptitle('Biome and Catalogue related to bacteria belonging the Lactobacillales order')

### Another example: produce a quality control figure similar to Extended Data Fig. 4a of [Almeida et al 2020](https://www.nature.com/articles/s41587-020-0603-3/figures/10)

In [None]:
qc_df = all_genomes_tax_df.toPandas()
qc_df[['attributes.completeness', 'attributes.contamination']].describe()

In [None]:
fig = plt.figure(figsize=(5, 10), layout="constrained")
spec = fig.add_gridspec(1, 1)

# Both quality metrics are percentages, so they share a single axis
qc_columns = ['attributes.completeness', 'attributes.contamination']
ax00 = fig.add_subplot(spec[0, 0])
sns.boxplot(data=qc_df[qc_columns], ax=ax00)
ax00.set_ylabel("%")

fig.suptitle('Quality of genomes avalaible')


## Find out whether your own MAGs are novel compared to the MGnify catalogues

Another use for the MGnify genomes resource is to query your own MAG against MGnify's MAG catalogues, to see whether they are novel or already represented.
List directories of the files to be analysed:

Replace the string with the path to the folder containing your own files. The `*` wildcard selects every file with the `.fa` extension.


In [None]:
files = glob.glob('../data/input_gecco/*.fa')
files


### Compute a sourmash sketch for each MAG

Create "sketches" for each MAG using Sourmash

A sketch goes into a signature, that we will use for searching. The signature is a sort of collection of hashes that are well suited for calculating the containment of your MAGs within the catalogue's MAGs.


In [None]:
for mag in files:
    # The sourmash parameters are chosen to match those used within MGnify
    sketch = sourmash.MinHash(n=0, ksize=31, scaled=1000)

    # A fasta file may have multiple records in it. Add them all to the sourmash signature.
    # `record` is reset for every file so that a name can never carry over from the previous MAG.
    record = None
    for record in SeqIO.parse(mag, 'fasta'):
        sketch.add_sequence(str(record.seq))

    if record is None:
        # Nothing was parsed: there is no sequence to sketch and no record to name the signature after
        print(f'Skipping {mag}: no FASTA records found')
        continue

    # Save the sourmash sketch as a "signature" file, named after the last record of the file
    signature = sourmash.SourmashSignature(sketch, name=record.name)
    with open(pp(mag).stem + '.sig', 'wt') as fp:
        sourmash.save_signatures([signature], fp)


## Fetch all of the catalogue IDs currently available on MGnify

To fetch the catalogue IDs from the MGnify API, use the following endpoint: https://www.ebi.ac.uk/metagenomics/api/v1/genome-catalogues.


In [62]:
catalogue_endpoint = "genome-catalogues"

In [63]:
with APISession("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
    # Flatten the JSON payload of every catalogue record into one table
    catalogues = pd.json_normalize(
        record.json for record in mgnify.iterate(catalogue_endpoint)
    )

In [None]:
catalogue_ids = list(catalogues['id'])
catalogue_ids


## Submit a search job to the MGnify API

To submit a job to the MGnify API, use the following endpoint: https://www.ebi.ac.uk/metagenomics/api/v1/genomes-search/gather.
Data will be sent to the API, which is called "POST"ing data in the API world.
This part of the API is quite specialized and so is not a formal JSON:API; the `requests` Python package is therefore used to communicate with it.


In [65]:
endpoint = 'https://www.ebi.ac.uk/metagenomics/api/v1/genomes-search/gather'

In [None]:
# Create a list of file uploads, and attach them to the API request
signatures = [open(sig, 'rb') for sig in glob.glob('*.sig')]
sketch_uploads = [('file_uploaded', signature) for signature in signatures]

# Send the API request - it specifies which catalogue to search against and attaches all of the signature files.
submitted_job = requests.post(endpoint, data={'mag_catalogues': catalogue_ids}, files=sketch_uploads).json()


map(lambda fp: fp.close(), signatures)  # tidy up open file pointers

print(submitted_job)

In [None]:
# Poll the status URL until every signature's search reports 'SUCCESS'.
POLL_INTERVAL_SECONDS = 2
# Give up eventually instead of polling forever if a job never succeeds; raise this limit if needed
MAX_WAIT_SECONDS = 30 * 60

# The status_URL is another API endpoint that's unique for the submitted search job
status_url = submitted_job['data']['status_URL']
deadline = time.monotonic() + MAX_WAIT_SECONDS

queries_status = {}
job_done = False
while not job_done:
    print('Checking status...')
    query_result = requests.get(status_url)

    # A Response is falsy for HTTP error codes: only read the body of a successful reply
    if query_result:
        queries_status = {sig['job_id']: sig['status'] for sig in query_result.json()['data']['signatures']}
        job_done = all(status == 'SUCCESS' for status in queries_status.values())

    if not job_done:
        if time.monotonic() > deadline:
            raise TimeoutError(
                f'Search jobs did not complete within {MAX_WAIT_SECONDS} seconds. Last known status: {queries_status}'
            )
        print(f'Still waiting for jobs to complete. Current status of jobs: {queries_status}')
        print(f'Will check again in {POLL_INTERVAL_SECONDS} seconds')
        time.sleep(POLL_INTERVAL_SECONDS)

print('Job done!')

In [None]:
query_result_df = pd.json_normalize(query_result.json()['data']['signatures'])
query_result_df

In [None]:
matches = query_result_df.dropna(subset=['result.match'])
matches