# Pulling data from MGnify

Drew Honson

13 October 2023

In [18]:
import pandas as pd

from tqdm import tqdm
import os, re, sys, wget

def split_taxa(x):
    """Parse a semicolon-delimited lineage string into a rank -> name mapping.

    The first ';'-separated field is always skipped (in this notebook's data it
    is the literal 'Root'). Every remaining field is expected to look like
    '<prefix>__<name>', where the prefix is one of k, p, c, o, f, g, s.

    Parameters
    ----------
    x : str
        Lineage string, e.g. 'Root;k__Bacteria;p__TM7;c__TM7-1;o__;f__;g__;s__'.

    Returns
    -------
    dict
        Maps the full rank name ('Kingdom', 'Phylum', ...) to the taxon name.
        Ranks present with no name (e.g. 'o__') map to the empty string; ranks
        that do not appear in the string are absent from the dict.

    Raises
    ------
    KeyError
        If a field carries a prefix that is not one of the seven known ranks.
    """
    tax_dict = {'k':'Kingdom',
                'p':'Phylum',
                'c':'Class',
                'o':'Order',
                'f':'Family',
                'g':'Genus',
                's':'Species'}

    out_dict = {}

    for field in x.split(';')[1:]:
        field = field.strip()
        # A trailing or doubled ';' yields an empty field; there is nothing to record
        if not field:
            continue
        # Split on the first '__' only, so a name that itself contains '__' stays intact
        prefix, _, name = field.partition('__')
        out_dict[tax_dict[prefix]] = name

    return out_dict

## Use wget to pull data

This study examined the diversity of mouse microbiomes at eight sites in Western Europe. The study is available on MGnify as [MGYS00000516](https://www.ebi.ac.uk/metagenomics/studies/MGYS00000516#overview).

In [28]:
# Make data directory (exist_ok makes this a no-op when the cell is re-run)
DATA_DIR = 'data'
os.makedirs(DATA_DIR, exist_ok=True)

# Get taxonomy, sample, and run information
TAXON_URL = "https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00000516/pipelines/2.0/file/ERP001970_taxonomy_abundances_v2.0.tsv"
SAMPLE_URL = "https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00000516/analyses?include=sample&ordering=&format=csv"

# The value returned by wget.download is used below as the local file path
TAXON_PATH = wget.download(TAXON_URL, out = DATA_DIR)
SAMPLE_PATH = wget.download(SAMPLE_URL, out = DATA_DIR)

# Read taxonomy data
TAXON_DATA = pd.read_csv(TAXON_PATH, sep='\t')

# Drop root (row 0) and renumber the remaining rows from 0
TAXON_DATA = TAXON_DATA.drop(0).reset_index(drop=True)

# Write the cleaned table back tab-delimited, matching the .tsv file it replaces
# and the sep='\t' used to read it above
TAXON_DATA.to_csv(TAXON_PATH, sep='\t', index=False)

TAXON_DATA.head()

-1 / unknown...........................................] 156156 / 156156

Unnamed: 0,#SampleID,ERR197719,ERR197720,ERR197721,ERR197722,ERR197723,ERR197724,ERR197725,ERR197726,ERR197727,...,ERR197910,ERR197911,ERR197912,ERR197913,ERR197914,ERR197915,ERR197916,ERR197917,ERR197918,ERR197919
0,Root;k__Bacteria;p__Acidobacteria;c__Acidobact...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Root;k__Bacteria;p__Acidobacteria;c__Acidobact...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Root;k__Bacteria;p__Actinobacteria;c__Actinoba...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Root;k__Bacteria;p__Actinobacteria;c__Actinoba...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Root;k__Bacteria;p__Actinobacteria;c__Actinoba...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Add one column per taxonomic rank to every row of the abundance table
for row_label, lineage in TAXON_DATA['#SampleID'].items():
    for rank, taxon_name in split_taxa(lineage).items():
        TAXON_DATA.loc[row_label, rank] = taxon_name

TAXON_DATA.tail()

Unnamed: 0,#SampleID,ERR197719,ERR197720,ERR197721,ERR197722,ERR197723,ERR197724,ERR197725,ERR197726,ERR197727,...,ERR197917,ERR197918,ERR197919,Kingdom,Phylum,Class,Order,Family,Genus,Species
288,Root;k__Bacteria;p__TM7;c__TM7-1;o__;f__;g__;s__,0,0,0,0,0,0,0,0,0,...,0,0,0,Bacteria,TM7,TM7-1,,,,
289,Root;k__Bacteria;p__Tenericutes;c__Mollicutes;...,0,0,0,2,0,18,12,27,10,...,0,0,0,Bacteria,Tenericutes,Mollicutes,Anaeroplasmatales,Anaeroplasmataceae,Anaeroplasma,
290,Root;k__Bacteria;p__Tenericutes;c__Mollicutes;...,0,0,0,0,0,0,0,0,0,...,0,0,0,Bacteria,Tenericutes,Mollicutes,Mycoplasmatales,Mycoplasmataceae,Mycoplasma,
291,Root;k__Bacteria;p__Tenericutes;c__Mollicutes;...,0,0,1,0,0,0,0,0,0,...,0,2,0,Bacteria,Tenericutes,Mollicutes,RF39,,,
292,Root;k__Bacteria;p__Verrucomicrobia;c__Verruco...,0,0,0,0,0,0,0,0,0,...,0,0,0,Bacteria,Verrucomicrobia,Verrucomicrobiae,Verrucomicrobiales,Verrucomicrobiaceae,Akkermansia,muciniphila


In [27]:
# Save the abundance table together with its taxonomy columns
annotated_csv = f'{DATA_DIR}/sampleWithTaxonomy.csv'
TAXON_DATA.to_csv(annotated_csv, index=False)

## Make a convenient dataframe for exploring taxa

In [9]:
# Taxonomy dataframe: one row per taxon, one column per rank
rank_names = ['Kingdom','Phylum','Class',
              'Order','Family','Genus','Species']

# Use every row of TAXON_DATA. The root row is dropped and the index reset when
# the table is loaded, so slicing from label 1 would discard the first taxon.
# For each lineage, skip the leading field and keep only the text after '__'.
taxons = [
    [level.split('__')[-1] for level in lineage.split(';')[1:]]
    for lineage in TAXON_DATA['#SampleID']
]

taxonomy = pd.DataFrame(taxons, columns=rank_names)

taxonomy.head()

Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus,Species
0,Bacteria,Acidobacteria,Acidobacteria-6,iii1-15,,,
1,Bacteria,Acidobacteria,Acidobacteria-6,iii1-15,mb2424,,
2,Bacteria,Actinobacteria,Actinobacteria,Actinomycetales,ACK-M1,,
3,Bacteria,Actinobacteria,Actinobacteria,Actinomycetales,Bogoriellaceae,Georgenia,
4,Bacteria,Actinobacteria,Actinobacteria,Actinomycetales,Brevibacteriaceae,Brevibacterium,


In [10]:
# Save the rank-per-column taxonomy table
taxonomy_csv = 'data/taxonomy.csv'
taxonomy.to_csv(taxonomy_csv, index=False)

## Pull individual sample data

This study has 200 samples in total. The following code extracts the location data for each of them.

In [5]:
# Load the run -> sample table that was downloaded earlier
SAMPLE_DATA = pd.read_csv(SAMPLE_PATH, usecols=['run','sample'])

# Reduce each run entry to the last path segment, without any '?query' suffix
SAMPLE_DATA['run'] = [run_url.split('/')[-1].split('?')[0]
                      for run_url in SAMPLE_DATA['run']]

# Download every sample record and remember where it was saved
for row_label in tqdm(SAMPLE_DATA.index):
    sample_url = SAMPLE_DATA.loc[row_label,'sample']
    SAMPLE_DATA.loc[row_label,'sampfile'] = wget.download(sample_url, out = DATA_DIR, bar=False)

SAMPLE_DATA.head()

100%|█████████████████████████████████████████| 201/201 [05:53<00:00,  1.76s/it]


Unnamed: 0,run,sample,sampfile
0,ERR197740,https://www.ebi.ac.uk/metagenomics/api/v1/samp...,data/ERS194057
1,ERR197741,https://www.ebi.ac.uk/metagenomics/api/v1/samp...,data/ERS194058
2,ERR197742,https://www.ebi.ac.uk/metagenomics/api/v1/samp...,data/ERS194059
3,ERR197743,https://www.ebi.ac.uk/metagenomics/api/v1/samp...,data/ERS194060
4,ERR197744,https://www.ebi.ac.uk/metagenomics/api/v1/samp...,data/ERS194061


In [6]:
# Copy the first row's location fields from each downloaded sample file
for row_label in tqdm(SAMPLE_DATA.index):
    sample_meta = pd.read_csv(SAMPLE_DATA.loc[row_label,'sampfile'])
    for field in ('geo_loc_name', 'latitude', 'longitude'):
        SAMPLE_DATA.loc[row_label, field] = sample_meta[field][0]

SAMPLE_DATA.head()

100%|████████████████████████████████████████| 201/201 [00:00<00:00, 236.52it/s]


Unnamed: 0,run,sample,sampfile,geo_loc_name,latitude,longitude
0,ERR197740,https://www.ebi.ac.uk/metagenomics/api/v1/samp...,data/ERS194057,France,47.4532,0.5949
1,ERR197741,https://www.ebi.ac.uk/metagenomics/api/v1/samp...,data/ERS194058,France,47.4532,0.5949
2,ERR197742,https://www.ebi.ac.uk/metagenomics/api/v1/samp...,data/ERS194059,France,47.4532,0.5949
3,ERR197743,https://www.ebi.ac.uk/metagenomics/api/v1/samp...,data/ERS194060,France,47.4532,0.5949
4,ERR197744,https://www.ebi.ac.uk/metagenomics/api/v1/samp...,data/ERS194061,France,47.4532,0.5949


In [7]:
# Replace the sample URL column with the bare name taken from the downloaded file's path
del SAMPLE_DATA['sample']
SAMPLE_DATA['sample'] = SAMPLE_DATA['sampfile'].map(lambda path: path.split('/')[-1])

# Delete the downloaded per-sample files, then drop the column that pointed at them
for sample_path in SAMPLE_DATA['sampfile']:
    os.remove(sample_path)
del SAMPLE_DATA['sampfile']

# Save the per-run location table
SAMPLE_DATA.to_csv('data/sampleLocs.csv', index=False)

In [11]:
# Reload the per-run location table from disk
locations = pd.read_csv('data/sampleLocs.csv')

# Build a "(latitude, longitude)" string label for every row
loc_ls = [str(pair)
          for pair in zip(locations['latitude'], locations['longitude'])]
locations['coord'] = loc_ls

locations.head()

Unnamed: 0,run,geo_loc_name,latitude,longitude,sample,coord
0,ERR197740,France,47.4532,0.5949,ERS194057,"(47.4532, 0.5949)"
1,ERR197741,France,47.4532,0.5949,ERS194058,"(47.4532, 0.5949)"
2,ERR197742,France,47.4532,0.5949,ERS194059,"(47.4532, 0.5949)"
3,ERR197743,France,47.4532,0.5949,ERS194060,"(47.4532, 0.5949)"
4,ERR197744,France,47.4532,0.5949,ERS194061,"(47.4532, 0.5949)"


In [12]:
# Overwrite the locations file so that it includes the coord column
locations_csv = 'data/sampleLocs.csv'
locations.to_csv(locations_csv, index=False)

In [4]:
# Record the Python/IPython versions and the versions of the imported packages
%load_ext watermark
%watermark -v --iversions

Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.15.0

wget  : 3.2
sys   : 3.11.5 (main, Sep 11 2023, 08:19:27) [Clang 14.0.6 ]
pandas: 2.0.3
re    : 2.2.1

