In [1]:
import os
import pandas as pd
import requests
from tqdm.notebook import trange, tqdm

# Show up to 100 columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = 100

# EBI MGnify catalogs

In [2]:
def select_representatives(df):
    """Return the rows of ``df`` that describe species representatives.

    A row is kept when its ``Genome`` value occurs anywhere in the
    ``Species_rep`` column of the same table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``Genome`` and ``Species_rep``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order and with their original
        index labels.
    """
    rep_ids = df['Species_rep'].unique()
    is_representative = df['Genome'].isin(rep_ids)
    return df[is_representative]

## Human oral catalog

In [3]:
# Local directory for the MGnify human-oral v1.0 catalogue; the genome FASTA
# files are downloaded into its fna/ sub-directory by the cells below.
path_ebi_oral = '../raw_data/EBI/human-oral-v1-0'

In [4]:
!mkdir -p $path_ebi_oral/fna
!wget -P files/input/mgnify_meta http://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-oral/v1.0/genomes-all_metadata.tsv
!mv 'files/input/mgnify_meta/genomes-all_metadata.tsv' 'files/input/mgnify_meta/genomes-all_metadata_oral.tsv'

In [5]:
# Read the complete oral-catalogue metadata table and keep only the rows of
# species representatives.
df_ebi_oral = (
    pd.read_csv('files/input/mgnify_meta/genomes-all_metadata_oral.tsv', sep='\t')
    .pipe(select_representatives)
)

In [6]:
# Store the metadata of the oral species representatives as a tab-separated file.
# to_csv does not create missing parent directories, so make sure the output
# directory exists first.
os.makedirs('files/output/mgnify_meta', exist_ok=True)
df_ebi_oral.to_csv('files/output/mgnify_meta/representative-all_metadata_oral.tsv', index=False, sep='\t')

In [7]:
for mag in df_ebi_oral['Genome']:
    url = f"http://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-oral/v1.0/species_catalogue/{mag[:-2]}/{mag}/genome/{mag}.fna"
    !wget -P "$path_ebi_oral/fna" "$url"

--2023-10-18 19:29:22--  http://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-oral/v1.0/species_catalogue/MGYG0002980/MGYG000298013/genome/MGYG000298013.fna
Resolving www-proxy.uni-saarland.de (www-proxy.uni-saarland.de)... 134.96.7.90, 134.96.7.92
Connecting to www-proxy.uni-saarland.de (www-proxy.uni-saarland.de)|134.96.7.90|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: 1481016 (1.4M)
Saving to: ‘../raw_data/EBI/human-oral-v1-0/fna/MGYG000298013.fna.1’


2023-10-18 19:29:22 (110 MB/s) - ‘../raw_data/EBI/human-oral-v1-0/fna/MGYG000298013.fna.1’ saved [1481016/1481016]

--2023-10-18 19:29:22--  http://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-oral/v1.0/species_catalogue/MGYG0002980/MGYG000298020/genome/MGYG000298020.fna
Resolving www-proxy.uni-saarland.de (www-proxy.uni-saarland.de)... 134.96.7.90, 134.96.7.92
Connecting to www-proxy.uni-saarland.de (www-proxy.uni-saarland.de)|134.96.7.90|:3128... connected.
Proxy reques

## Human gut catalog

In [8]:
# Local directory for the MGnify human-gut v2.0.1 catalogue; the genome FASTA
# files are downloaded into its fna/ sub-directory by the cells below.
path_ebi_gut = '../raw_data/EBI/human-gut-v2-0-1'

In [9]:
!mkdir -p $path_ebi_gut/fna
!wget -P files/input/mgnify_meta http://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v2.0.1/genomes-all_metadata.tsv 
!mv files/input/mgnify_meta/genomes-all_metadata.tsv files/input/mgnify_meta/genomes-all_metadata_gut.tsv

In [10]:
# Read the complete gut-catalogue metadata table and keep only the rows of
# species representatives.
df_ebi_gut = (
    pd.read_csv('files/input/mgnify_meta/genomes-all_metadata_gut.tsv', sep='\t')
    .pipe(select_representatives)
)

In [11]:
# Store the metadata of the gut species representatives as a tab-separated file.
# to_csv does not create missing parent directories, so make sure the output
# directory exists first.
os.makedirs('files/output/mgnify_meta', exist_ok=True)
df_ebi_gut.to_csv('files/output/mgnify_meta/representative-all_metadata_gut.tsv', index=False, sep='\t')

In [12]:
for mag in df_ebi_gut['Genome']:
    url = f"http://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v2.0.1/species_catalogue/{mag[:-2]}/{mag}/genome/{mag}.fna"
    !wget -P "$path_ebi_gut/fna" "$url"

--2023-10-18 19:29:37--  http://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v2.0.1/species_catalogue/MGYG0000000/MGYG000000001/genome/MGYG000000001.fna
Resolving www-proxy.uni-saarland.de (www-proxy.uni-saarland.de)... 134.96.7.90, 134.96.7.92
Connecting to www-proxy.uni-saarland.de (www-proxy.uni-saarland.de)|134.96.7.90|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: 3277816 (3.1M)
Saving to: ‘../raw_data/EBI/human-gut-v2-0-1/fna/MGYG000000001.fna’


2023-10-18 19:29:39 (1.83 MB/s) - ‘../raw_data/EBI/human-gut-v2-0-1/fna/MGYG000000001.fna’ saved [3277816/3277816]

--2023-10-18 19:29:39--  http://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-gut/v2.0.1/species_catalogue/MGYG0000000/MGYG000000002/genome/MGYG000000002.fna
Resolving www-proxy.uni-saarland.de (www-proxy.uni-saarland.de)... 134.96.7.90, 134.96.7.92
Connecting to www-proxy.uni-saarland.de (www-proxy.uni-saarland.de)|134.96.7.90|:3128... connected.
Proxy reque

# JGI GEM

In [13]:
# Local directory for the JGI GEM data: genome_metadata.tsv is stored here and
# the genome FASTA files go into its fna/ sub-directory.
path_jgi = '../raw_data/JGI'

In [14]:
!mkdir -p $path_jgi/fna
!wget -P $path_jgi https://portal.nersc.gov/GEM/genomes/genome_metadata.tsv 

--2023-10-18 19:30:11--  https://portal.nersc.gov/GEM/genomes/genome_metadata.tsv
Resolving www-proxy.uni-saarland.de (www-proxy.uni-saarland.de)... 134.96.7.90, 134.96.7.92
Connecting to www-proxy.uni-saarland.de (www-proxy.uni-saarland.de)|134.96.7.90|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: 12350919 (12M) [text/tab-separated-values]
Saving to: ‘../raw_data/JGI/genome_metadata.tsv’


2023-10-18 19:30:18 (1.97 MB/s) - ‘../raw_data/JGI/genome_metadata.tsv’ saved [12350919/12350919]



In [15]:
# Load the tab-separated GEM genome metadata downloaded above.
df_jgi = pd.read_csv(
    os.path.join(path_jgi, 'genome_metadata.tsv'),
    delimiter='\t',
)

In [16]:
# Select the genomes whose ecosystem_category is exactly 'Human' and store
# their metadata as a tab-separated file.
mask = df_jgi['ecosystem_category'] == 'Human'
# to_csv does not create missing parent directories, so make sure the output
# directory exists first.
os.makedirs('files/input/jgi_meta', exist_ok=True)
df_jgi.loc[mask].to_csv('files/input/jgi_meta/genome_metadata_human.tsv', sep='\t', index=False)

In [20]:
for mag in df_jgi['genome_id']:
    url = f'https://portal.nersc.gov/GEM/genomes/fna/{mag}.fna.gz'
    !wget -P "$path_jgi/fna" "$url" && gunzip $path_jgi/fna/$(basename $url)

--2023-10-18 19:36:06--  https://portal.nersc.gov/GEM/genomes/fna/3300025516_6.fna.gz
Resolving www-proxy.uni-saarland.de (www-proxy.uni-saarland.de)... 134.96.7.90, 134.96.7.92
Connecting to www-proxy.uni-saarland.de (www-proxy.uni-saarland.de)|134.96.7.90|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: 823351 (804K) [application/x-gzip]
Saving to: ‘../raw_data/JGI/fna/3300025516_6.fna.gz’


2023-10-18 19:36:07 (923 KB/s) - ‘../raw_data/JGI/fna/3300025516_6.fna.gz’ saved [823351/823351]

--2023-10-18 19:36:08--  https://portal.nersc.gov/GEM/genomes/fna/3300025516_8.fna.gz
Resolving www-proxy.uni-saarland.de (www-proxy.uni-saarland.de)... 134.96.7.90, 134.96.7.92
Connecting to www-proxy.uni-saarland.de (www-proxy.uni-saarland.de)|134.96.7.90|:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: 687600 (671K) [application/x-gzip]
Saving to: ‘../raw_data/JGI/fna/3300025516_8.fna.gz’


2023-10-18 19:36:09 (926 KB/s) - ‘../raw_data/JGI/fna/

# HMP

In [21]:
# Local directory for the HMP data: the download lists are written here and the
# dwnl/ sub-directory is created below.
path_hmp = '../raw_data/HMP'

In [22]:
# Create the HMP download directory (and any missing parents); a no-op when it
# already exists.
os.makedirs(os.path.join(path_hmp, 'dwnl'), exist_ok=True)

## Retrieving files from the wgs_assembled_seq_set category

### Metadata

In [23]:
# Query the HMP portal file API for all FASTA files of node type
# wgs_assembled_seq_set and collect the per-file records into one DataFrame.
url = "https://portal.hmpdacc.org/api/files"

params = {
    "fields": "file_format,file_type,file_annotation_pipeline,file_matrix_type",
    "filters": '{"op":"and","content":[{"op":"in","content":{"field":"file.format","value":["FASTA"]}},{"op":"in","content":{"field":"file.node_type","value":["wgs_assembled_seq_set"]}}]}',
    "from": 0,
    "save": "",
    "size": "100",
    "sort": "file_id:asc",
}
page_size = int(params["size"])

# Page through the listing until the API returns an empty page. The previous
# fixed range(0, 4000, 100) silently dropped every record beyond the 4000th and
# kept issuing requests after the last page.
all_dfs = []
with tqdm(desc='HMP metadata pages', unit='page') as progress:
    while True:
        response = requests.get(url, params=params, timeout=60)
        # Fail loudly on HTTP errors instead of trying to parse an error body.
        response.raise_for_status()
        hits = response.json()['data']['hits']
        if not hits:
            break
        all_dfs.append(pd.DataFrame([h['file'] for h in hits]))
        params['from'] += page_size
        progress.update(1)

df = pd.concat(all_dfs).reset_index(drop=True)

  0%|          | 0/40 [00:00<?, ?it/s]

### Deduplication

Keep only the latest assembly for each sample.

In [25]:
# Derive the bare file name (last path component) of every download URL.
df['basename'] = df['file_name'].apply(os.path.basename)

In [26]:
# Example: every file whose base name mentions sample SRS011061.
is_example_sample = df['basename'].str.contains('SRS011061')
df[is_example_sample]

Unnamed: 0,study,gender,assembly_name,node_type,paper,sop,subtype,https,id,s3,format_doc,organism_type,format,assembler,body_site,prep_id,data_modality,version,sequence_type,size,srs,rand_subject_id,data_type,name,abbrev,md5,file_name,access,basename
2319,Human microbiome project WGS production phase I.,Female,SRS011061.fna.bz2,wgs_assembled_seq_set,2017_paper,http://hmpdacc.org/doc/HMP_IDBA_Assembly_SOP.pdf,wgs_assembly,https://downloads.hmpdacc.org/dacc/hhs/genome/...,54a24ca84a57a7d5b06687939f1678b2,s3://nih-hmp-hhs/hmasm2/SRS011061.fna.bz2,http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml,bacterial,FASTA,IDBAUD,feces,700014562,whole metagenome,v2,nucleotide,32953525,SRS011061,158458797.0,sequence,700014555,HHS,e8c5506d5eebc27945615ccc171ae3b7,https://downloads.hmpdacc.org/dacc/hhs/genome/...,open,SRS011061.fna.bz2
2935,Human microbiome project WGS production phase I.,Female,SRS011061.scaffolds.fa.bz2,wgs_assembled_seq_set,2012_paper,http://hmpdacc.org/doc/HMP_Assembly_SOP.pdf,wgs_assembly,https://downloads.hmpdacc.org/dacc/hhs/genome/...,596fc2de57601ec08a01fdee59fe4039,s3://nih-hmp-hhs/HMASM/PGAs/stool/SRS011061.sc...,http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml,bacterial,FASTA,SOAPdenovo v.1.04,feces,700014562,whole metagenome,v1,nucleotide,26014986,SRS011061,,sequence,700014555,HHS,fa5942b5669aa360c22c6cf143fde32b,https://downloads.hmpdacc.org/dacc/hhs/genome/...,open,SRS011061.scaffolds.fa.bz2


In [27]:
# Rows that carry a sample accession (srs) are reduced to one row per sample;
# rows without one are appended unchanged.
mask = df['srs'].notna()
# Descending sort puts the highest 'version' value first within each srs, and
# drop_duplicates then keeps exactly that first row.
# NOTE(review): 'version' holds strings such as 'v1'/'v2' in the example output,
# so the order is lexicographic - confirm no two-digit versions exist.
latest_per_srs = (
    df[mask]
    .sort_values(['srs', 'version'], ascending=False)
    .drop_duplicates(subset='srs', keep='first')
)
df_derep = pd.concat([latest_per_srs, df[~mask]])

In [None]:
# Store the de-duplicated HMP file metadata as a tab-separated file.
# to_csv does not create missing parent directories, so make sure the output
# directory exists first.
os.makedirs('files/input/hmp_meta', exist_ok=True)
df_derep.to_csv('files/input/hmp_meta/wgs_assembled_seq_set.tsv', sep='\t', index=False)

### Download links

In [None]:
# Write the download list and the matching checksum list for the selected files.
df_derep['basename'] = df_derep['file_name'].apply(os.path.basename)

links_file = os.path.join(path_hmp, 'links_latest_wgs_assembled_seq_set.txt')
md5_file = os.path.join(path_hmp, 'links_latest_wgs_assembled_seq_set.md5')

# One download URL per line.
df_derep['file_name'].to_csv(links_file, sep='\t', header=False, index=False)
# One "<md5> <basename>" pair per line, separated by a single space.
df_derep[['md5', 'basename']].to_csv(md5_file, sep=' ', header=False, index=False)

The files were retrieved with `cd $path_hmp/dwnl && parallel -j 20 -a ../links_latest_wgs_assembled_seq_set.txt wget {}` and unpacked to `$path_hmp/fna` under names following the `{id}.fna` pattern.
