### Libraries

In [8]:
from jsonapi_client import Session
import pandas as pd
from io import StringIO
import requests
from tqdm import tqdm
import asyncio
import aiohttp
import nest_asyncio

### Functions for asynchronous requests

In [25]:
async def fetch_taxonomy(session, analysis_id):
    """Fetch the SSU taxonomy table of a single MGnify analysis.

    Args:
        session: An open ``aiohttp.ClientSession`` (any object whose
            ``get(url)`` returns an async context manager exposing ``status``
            and an awaitable ``json()`` works as well).
        analysis_id: MGnify analysis accession used to build the request URL.

    Returns:
        A ``pandas.DataFrame`` built by flattening the ``"data"`` entry of the
        JSON response, with an extra ``analysis_id`` column, or ``None`` when
        the request fails, times out, or the payload cannot be used.
    """
    url = f"https://www.ebi.ac.uk/metagenomics/api/v1/analyses/{analysis_id}/taxonomy/ssu"
    try:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Error fetching data for {analysis_id}: HTTP {response.status}")
                return None
            data = await response.json()
    except asyncio.TimeoutError:
        print(f"Timeout error fetching taxonomy for {analysis_id}")
        return None
    except (aiohttp.ClientError, ValueError) as e:
        # Connection problems or a body that is not valid JSON: report this
        # analysis as failed instead of aborting every other pending request.
        print(f"Error fetching data for {analysis_id}: {e}")
        return None

    try:
        records = data["data"]
    except (KeyError, TypeError):
        print(f"Error fetching data for {analysis_id}: response has no 'data' entry")
        return None

    df_temp = pd.json_normalize(records)
    # Tag every row so tables of different analyses can be concatenated later.
    df_temp["analysis_id"] = analysis_id
    return df_temp

async def fetch_metadata(session, analysis_id):
    """Fetch sample metadata for the sample linked to one MGnify analysis.

    Two requests are made: the analysis record (to find the sample id) and
    then the sample record itself.

    Args:
        session: An open ``aiohttp.ClientSession`` (any object whose
            ``get(url)`` returns an async context manager exposing ``status``
            and an awaitable ``json()`` works as well).
        analysis_id: MGnify analysis accession used to build the request URL.

    Returns:
        * ``dict`` with keys ``analysis_id``, ``sample_id``, ``sample_name``,
          ``collection_date`` and ``geographic_location`` on success.
          ``collection_date`` is ``None`` when the sample metadata has no
          entry whose key contains "collection date".
        * The sample id (``str``) when the sample request answers with a
          non-200 status, so the caller can list the unreachable samples.
        * ``None`` for every other failure (analysis request failed, timeout,
          unexpected JSON layout).
    """
    url = f"https://www.ebi.ac.uk/metagenomics/api/v1/analyses/{analysis_id}"
    try:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Error fetching analysis {analysis_id}: HTTP {response.status}")
                return None
            data = await response.json()

        try:
            sample_id = data["data"]["relationships"]["sample"]["data"]["id"]
            sample_url = f"https://www.ebi.ac.uk/metagenomics/api/v1/samples/{sample_id}"
            async with session.get(sample_url) as sample_response:
                if sample_response.status != 200:
                    print(f"Error fetching sample {sample_id}: HTTP {sample_response.status}")
                    return sample_id
                sample_data = await sample_response.json()

            sample_attributes = sample_data["data"]["attributes"]
            # First metadata entry whose key mentions "collection date";
            # defaults to None so a sample without that entry is still kept.
            collection_date = next(
                (
                    entry.get("value")
                    for entry in sample_attributes.get("sample-metadata", [])
                    if "collection date" in (entry.get("key") or "").lower()
                ),
                None,
            )
            return {
                "analysis_id": analysis_id,
                "sample_id": sample_id,
                "sample_name": sample_attributes.get("sample-name"),
                "collection_date": collection_date,
                "geographic_location": sample_attributes.get("geo-loc-name"),
            }
        except asyncio.TimeoutError:
            # Let the dedicated timeout handler below report it.
            raise
        except Exception as e:
            # Best-effort boundary: one malformed record must not stop the run.
            print(f"Error processing analysis JSON for {analysis_id}: {e}")
            return None

    except asyncio.TimeoutError:
        print(f"Timeout error fetching metadata for {analysis_id}")
        return None
    except (aiohttp.ClientError, ValueError) as e:
        # Connection problems or a non-JSON body on the analysis request.
        print(f"Error fetching analysis {analysis_id}: {e}")
        return None

async def main(analysis_ids, *, timeout_total=600000000, connection_limit=100000000):
    """Download taxonomy tables and sample metadata for many analyses.

    Args:
        analysis_ids: Sequence of MGnify analysis accessions.
        timeout_total: Value passed as ``total`` to ``aiohttp.ClientTimeout``.
            The default is so large that it effectively disables the timeout.
        connection_limit: Value passed as ``limit`` to ``aiohttp.TCPConnector``.
            The default is so large that it effectively removes the cap on
            simultaneous connections; lower it to be gentler on the API.

    Returns:
        A tuple ``(taxonomy_df, metadata_df, error_samples_list)``:

        * ``taxonomy_df`` - concatenation of every non-``None`` result of
          ``fetch_taxonomy`` (empty ``DataFrame`` if there is none).
        * ``metadata_df`` - one row per dict returned by ``fetch_metadata``
          (empty ``DataFrame`` if there is none).
        * ``error_samples_list`` - every non-dict result of ``fetch_metadata``
          (sample ids whose lookup failed, or ``None`` for other failures).
    """
    timeout = aiohttp.ClientTimeout(total=timeout_total)
    conn = aiohttp.TCPConnector(limit=connection_limit)
    async with aiohttp.ClientSession(timeout=timeout, connector=conn) as session:
        # Phase 1: taxonomy. The coroutines only start running once
        # as_completed() wraps them, so the two phases run one after the other.
        taxonomy_tasks = [fetch_taxonomy(session, aid) for aid in analysis_ids]
        taxonomy_dfs = []
        for task in tqdm(asyncio.as_completed(taxonomy_tasks), total=len(taxonomy_tasks), desc="Fetching Taxonomy"):
            result = await task
            if result is not None:  # drop failed requests
                taxonomy_dfs.append(result)

        # Phase 2: metadata. Created only now so that no coroutine is left
        # un-awaited if phase 1 raises.
        metadata_tasks = [fetch_metadata(session, aid) for aid in analysis_ids]
        metadata_list = []
        error_samples_list = []
        for task in tqdm(asyncio.as_completed(metadata_tasks), total=len(metadata_tasks), desc="Fetching Metadata"):
            result = await task
            if isinstance(result, dict):
                metadata_list.append(result)
            else:
                error_samples_list.append(result)
                # tqdm.write keeps the message from breaking up the progress bar.
                tqdm.write("Added to error list")

        taxonomy_df = pd.concat(taxonomy_dfs, ignore_index=True) if taxonomy_dfs else pd.DataFrame()
        metadata_df = pd.DataFrame(metadata_list) if metadata_list else pd.DataFrame()

        return taxonomy_df, metadata_df, error_samples_list

### Global surveillance of antimicrobial resistance (DTU-GE)

In [5]:
# Accession of the MGnify study whose analyses are downloaded
study_accession = "MGYS00001312"

# Open a session against the MGnify API endpoint
with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
    # Lazily walk through every analysis record of the study
    analyses_iter = mgnify.iterate(f"studies/{study_accession}/analyses")
    # Keep only the raw JSON payload of each record
    analyses_json = list(map(lambda rec: rec.json, analyses_iter))
    # Flatten the nested JSON into a pd.DataFrame
    df = pd.json_normalize(analyses_json)

# Column of analysis accessions used by the cells below
analysis_ids = df["id"]

#### Extract data from analysis

In [None]:
# dfs = []

# with tqdm(total=len(analysis_ids), desc="Fetching Taxonomy SSU Data", unit="analysis") as pbar:
#     for analysis_id in analysis_ids:

#         url = f"https://www.ebi.ac.uk/metagenomics/api/v1/analyses/{analysis_id}/taxonomy/ssu"
#         response = requests.get(url)

#         if response.status_code == 200:

#             try:
#                 data = response.json()
#                 df_temp = pd.json_normalize(data["data"])
            
#             except Exception as e:
#                 tqdm.write(f"Error processing JSON for {analysis_id}: {e}")
#                 pbar.update(1)
#                 continue

#             df_temp["analysis_id"] = analysis_id
#             dfs.append(df_temp)

#         else:
#             tqdm.write(f"Error fetching data for {analysis_id}: HTTP {response.status_code}")
        
#         pbar.set_postfix_str(f"Remaining: {len(analysis_ids) - pbar.n - 1}")
#         pbar.update(1)

# if dfs:
#     final_df = pd.concat(dfs, ignore_index=True)
#     print("Combined DataFrame:")
#     final_df.head

# else:
#     print("No data was retrieved.")

Fetching Taxonomy SSU Data: 100%|██████████| 413/413 [41:26<00:00,  6.02s/analysis, Remaining: 0]    


Combined DataFrame:


#### Getting the sample metadata for each analysis ID

In [20]:
# One dict of sample metadata per analysis; converted to a DataFrame at the end.
metadata_df = []

with tqdm(total=len(analysis_ids), desc="Fetching Analysis Sample Data", unit="analysis") as pbar:
    for analysis_id in analysis_ids:
        url = f"https://www.ebi.ac.uk/metagenomics/api/v1/analyses/{analysis_id}"
        response = requests.get(url)

        if response.status_code == 200:
            try:
                data = response.json()
                # The analysis record links to its sample; fetch that record next.
                sample_id = data["data"]["relationships"]["sample"]["data"]["id"]
                sample_url = f"https://www.ebi.ac.uk/metagenomics/api/v1/samples/{sample_id}"
                sample_response = requests.get(sample_url)

                if sample_response.status_code == 200:
                    try:
                        sample_data = sample_response.json()
                        sample_attributes = sample_data["data"]["attributes"]

                        # Reset on every iteration: without this a sample that
                        # has no collection date would silently reuse the date
                        # of the previously processed sample.
                        collection_date = None

                        # Look for the first key containing "collection date"
                        for entry in sample_attributes.get("sample-metadata", []):
                            key = (entry.get("key") or "").lower()
                            if "collection date" in key:
                                collection_date = entry.get("value")
                                break  # Stop after first match

                        sample_metadata = {
                            "analysis_id": analysis_id,
                            "sample_id": sample_id,
                            "sample_name": sample_attributes.get("sample-name"),
                            "collection_date": collection_date,
                            "geographic_location": sample_attributes.get("geo-loc-name"),
                        }

                        metadata_df.append(sample_metadata)

                    except Exception as e:
                        tqdm.write(f"Error processing sample JSON for {sample_id}: {e}")

                else:
                    tqdm.write(f"Error fetching sample {sample_id}: HTTP {sample_response.status_code}")

            except Exception as e:
                tqdm.write(f"Error processing analysis JSON for {analysis_id}: {e}")

        else:
            tqdm.write(f"Error fetching analysis {analysis_id}: HTTP {response.status_code}")

        # Advance the bar whether or not this analysis succeeded.
        pbar.update(1)

final_metadata_df = pd.DataFrame(metadata_df) if metadata_df else pd.DataFrame()
final_metadata_df

Fetching Analysis Sample Data:   1%|          | 5/413 [00:04<06:12,  1.10analysis/s]


KeyboardInterrupt: 

In [21]:
# Display the current contents of metadata_df.
metadata_df

[{'analysis_id': 'MGYA00216443',
  'sample_id': 'ERS1426837',
  'sample_name': 'wastewater metagenome',
  'collection_date': '2016-01-25',
  'geographic_location': 'Slovenia'},
 {'analysis_id': 'MGYA00216444',
  'sample_id': 'ERS1426784',
  'sample_name': 'wastewater metagenome',
  'collection_date': '2016-02-15',
  'geographic_location': 'Brazil'},
 {'analysis_id': 'MGYA00216445',
  'sample_id': 'ERS1426839',
  'sample_name': 'wastewater metagenome',
  'collection_date': '2016-02-11',
  'geographic_location': 'Sweden'},
 {'analysis_id': 'MGYA00216446',
  'sample_id': 'ERS1426784',
  'sample_name': 'wastewater metagenome',
  'collection_date': '2016-02-15',
  'geographic_location': 'Brazil'},
 {'analysis_id': 'MGYA00216447',
  'sample_id': 'ERS1426787',
  'sample_name': 'wastewater metagenome',
  'collection_date': '2016-02-01',
  'geographic_location': 'Canada'}]

#### Asyncio - extraction of data and metadata

In [27]:
# Turn the accession column of the study table `df` into a plain Python list
analysis_ids = df["id"].tolist()

# nest_asyncio patches asyncio so that asyncio.run() can be called while an
# event loop is already running (the usual situation inside a notebook kernel)
nest_asyncio.apply()

# Download taxonomy tables and sample metadata for every analysis
taxonomy_df, metadata_df, error_sample_list = asyncio.run(main(analysis_ids))

# Preview both results
print("Combined Taxonomy DataFrame:", taxonomy_df.head(), sep="\n")
print("\nMetadata DataFrame:", metadata_df.head(), sep="\n")

Fetching Taxonomy: 100%|██████████| 413/413 [03:26<00:00,  2.00it/s]
Fetching Metadata:   1%|          | 4/413 [00:39<41:39,  6.11s/it]  

Error fetching sample ERS1443997: HTTP 404
Added to error list


Fetching Metadata:   3%|▎         | 11/413 [00:57<18:04,  2.70s/it]

Error fetching sample ERS1443986: HTTP 404
Added to error list


Fetching Metadata:   7%|▋         | 28/413 [01:27<09:58,  1.55s/it]

Error fetching sample ERS1443988: HTTP 404
Added to error list


Fetching Metadata:   7%|▋         | 30/413 [01:30<10:06,  1.58s/it]

Error fetching sample ERS1443927: HTTP 404
Added to error list


Fetching Metadata:   8%|▊         | 33/413 [01:34<08:31,  1.35s/it]

Error fetching sample ERS1443960: HTTP 404
Added to error list


Fetching Metadata:   9%|▉         | 39/413 [01:43<12:22,  1.98s/it]

Error fetching sample ERS1444004: HTTP 404
Added to error list


Fetching Metadata:  11%|█         | 45/413 [01:53<12:47,  2.09s/it]

Error fetching sample ERS1443963: HTTP 404
Added to error list


Fetching Metadata:  11%|█         | 46/413 [01:54<10:49,  1.77s/it]

Error fetching sample ERS1443916: HTTP 404
Added to error list


Fetching Metadata:  12%|█▏        | 48/413 [02:00<13:55,  2.29s/it]

Error fetching sample ERS1443999: HTTP 404
Added to error list


Fetching Metadata:  12%|█▏        | 51/413 [02:04<09:39,  1.60s/it]

Error fetching sample ERS1443966: HTTP 404
Added to error list


Fetching Metadata:  13%|█▎        | 53/413 [02:06<07:43,  1.29s/it]

Error fetching sample ERS1443921: HTTP 404
Added to error list


Fetching Metadata:  13%|█▎        | 54/413 [02:07<07:10,  1.20s/it]

Error fetching sample ERS1444008: HTTP 404
Added to error list


Fetching Metadata:  15%|█▍        | 61/413 [02:20<13:30,  2.30s/it]

Error fetching sample ERS1444011: HTTP 404
Added to error list


Fetching Metadata:  15%|█▌        | 62/413 [02:21<11:12,  1.92s/it]

Error fetching sample ERS1443919: HTTP 404
Added to error list


Fetching Metadata:  16%|█▌        | 66/413 [02:23<05:23,  1.07it/s]

Error fetching sample ERS1444003: HTTP 404
Added to error list


Fetching Metadata:  17%|█▋        | 70/413 [02:28<06:02,  1.06s/it]

Error fetching sample ERS1443962: HTTP 404
Added to error list


Fetching Metadata:  18%|█▊        | 74/413 [02:34<09:23,  1.66s/it]

Error fetching sample ERS1443943: HTTP 404
Added to error list


Fetching Metadata:  20%|██        | 83/413 [02:42<05:27,  1.01it/s]

Error fetching sample ERS1443964: HTTP 404
Added to error list
Error fetching sample ERS1443935: HTTP 404
Added to error list


Fetching Metadata:  21%|██        | 85/413 [02:44<05:26,  1.00it/s]

Error fetching sample ERS1443995: HTTP 404
Added to error list


Fetching Metadata:  21%|██▏       | 88/413 [02:47<05:46,  1.07s/it]

Error fetching sample ERS1443993: HTTP 404
Added to error list


Fetching Metadata:  22%|██▏       | 89/413 [02:48<05:39,  1.05s/it]

Error fetching sample ERS1443969: HTTP 404
Added to error list


Fetching Metadata:  22%|██▏       | 92/413 [02:51<05:25,  1.01s/it]

Error fetching sample ERS1443983: HTTP 404
Added to error list


Fetching Metadata:  23%|██▎       | 96/413 [02:55<05:22,  1.02s/it]

Error fetching sample ERS1444007: HTTP 404
Added to error list


Fetching Metadata:  24%|██▎       | 98/413 [02:56<04:03,  1.29it/s]

Error fetching sample ERS1443948: HTTP 404
Added to error list
Error fetching sample ERS1443959: HTTP 404
Added to error list


Fetching Metadata:  25%|██▌       | 104/413 [03:00<04:03,  1.27it/s]

Error fetching sample ERS1443976: HTTP 404
Added to error list


Fetching Metadata:  26%|██▋       | 109/413 [03:05<04:39,  1.09it/s]

Error fetching sample ERS1443945: HTTP 404
Added to error list
Error fetching sample ERS1443946: HTTP 404
Added to error list


Fetching Metadata:  27%|██▋       | 113/413 [03:08<04:14,  1.18it/s]

Error fetching sample ERS1443922: HTTP 404
Added to error list


Fetching Metadata:  28%|██▊       | 115/413 [03:09<03:26,  1.44it/s]

Error fetching sample ERS1443981: HTTP 404
Added to error list


Fetching Metadata:  29%|██▉       | 120/413 [03:13<03:57,  1.23it/s]

Error fetching sample ERS1443936: HTTP 404
Added to error list


Fetching Metadata:  31%|███       | 129/413 [03:18<03:05,  1.53it/s]

Error fetching sample ERS1443947: HTTP 404
Added to error list


Fetching Metadata:  34%|███▎      | 139/413 [03:26<04:29,  1.02it/s]

Error fetching sample ERS1444010: HTTP 404
Added to error list


Fetching Metadata:  34%|███▍      | 142/413 [03:28<03:43,  1.21it/s]

Error fetching sample ERS1443979: HTTP 404
Added to error list


Fetching Metadata:  35%|███▌      | 145/413 [03:31<04:01,  1.11it/s]

Error fetching sample ERS1443931: HTTP 404
Added to error list


Fetching Metadata:  36%|███▌      | 148/413 [03:33<03:30,  1.26it/s]

Error fetching sample ERS1443996: HTTP 404
Added to error list


Fetching Metadata:  37%|███▋      | 151/413 [03:35<03:19,  1.31it/s]

Error fetching sample ERS1443978: HTTP 404
Added to error list


Fetching Metadata:  37%|███▋      | 152/413 [03:36<03:31,  1.23it/s]

Error fetching sample ERS1443923: HTTP 404
Added to error list


Fetching Metadata:  38%|███▊      | 155/413 [03:38<02:34,  1.67it/s]

Error fetching sample ERS1443973: HTTP 404
Added to error list


Fetching Metadata:  40%|███▉      | 164/413 [03:45<02:58,  1.39it/s]

Error fetching sample ERS1443994: HTTP 404
Added to error list


Fetching Metadata:  43%|████▎     | 176/413 [03:53<02:24,  1.64it/s]

Error fetching sample ERS1443958: HTTP 404
Added to error list


Fetching Metadata:  43%|████▎     | 179/413 [03:55<02:38,  1.48it/s]

Error fetching sample ERS1443982: HTTP 404
Added to error list


Fetching Metadata:  44%|████▍     | 181/413 [03:57<03:01,  1.27it/s]

Error fetching sample ERS1443971: HTTP 404
Added to error list


Fetching Metadata:  44%|████▍     | 182/413 [03:58<03:15,  1.18it/s]

Error fetching sample ERS1443955: HTTP 404
Added to error list


Fetching Metadata:  45%|████▍     | 184/413 [03:59<02:43,  1.40it/s]

Error fetching sample ERS1444006: HTTP 404
Added to error list


Fetching Metadata:  48%|████▊     | 199/413 [04:10<03:27,  1.03it/s]

Error fetching sample ERS1443985: HTTP 404
Added to error list


Fetching Metadata:  48%|████▊     | 200/413 [04:11<02:52,  1.23it/s]

Error fetching sample ERS1443930: HTTP 404
Added to error list


Fetching Metadata:  49%|████▊     | 201/413 [04:11<02:45,  1.28it/s]

Error fetching sample ERS1443942: HTTP 404
Added to error list


Fetching Metadata:  52%|█████▏    | 215/413 [04:22<02:48,  1.17it/s]

Error fetching sample ERS1443970: HTTP 404
Added to error list


Fetching Metadata:  53%|█████▎    | 218/413 [04:26<03:37,  1.11s/it]

Error fetching sample ERS1444002: HTTP 404
Added to error list


Fetching Metadata:  54%|█████▍    | 222/413 [04:30<03:01,  1.05it/s]

Error fetching sample ERS1443992: HTTP 404
Added to error list


Fetching Metadata:  54%|█████▍    | 225/413 [04:31<02:31,  1.24it/s]

Error fetching sample ERS1443952: HTTP 404
Added to error list


Fetching Metadata:  55%|█████▍    | 227/413 [04:32<02:02,  1.52it/s]

Error fetching sample ERS1443937: HTTP 404
Added to error list


Fetching Metadata:  57%|█████▋    | 235/413 [04:37<01:52,  1.58it/s]

Error fetching sample ERS1443974: HTTP 404
Added to error list


Fetching Metadata:  57%|█████▋    | 237/413 [04:38<01:41,  1.74it/s]

Error fetching sample ERS1443954: HTTP 404
Added to error list


Fetching Metadata:  60%|█████▉    | 247/413 [04:45<01:54,  1.45it/s]

Error fetching sample ERS1443917: HTTP 404
Added to error list


Fetching Metadata:  63%|██████▎   | 260/413 [04:54<02:10,  1.17it/s]

Error fetching sample ERS1443940: HTTP 404
Added to error list


Fetching Metadata:  65%|██████▍   | 268/413 [04:59<01:18,  1.85it/s]

Error fetching sample ERS1443918: HTTP 404
Added to error list
Error fetching sample ERS1443967: HTTP 404
Added to error list


Fetching Metadata:  65%|██████▌   | 270/413 [05:01<01:40,  1.42it/s]

Error fetching sample ERS1443980: HTTP 404
Added to error list


Fetching Metadata:  66%|██████▋   | 274/413 [05:10<03:43,  1.61s/it]

Error fetching sample ERS1444009: HTTP 404
Added to error list


Fetching Metadata:  67%|██████▋   | 276/413 [05:11<02:32,  1.11s/it]

Error fetching sample ERS1443991: HTTP 404
Added to error list


Fetching Metadata:  68%|██████▊   | 279/413 [05:13<02:00,  1.11it/s]

Error fetching sample ERS1443990: HTTP 404
Added to error list


Fetching Metadata:  68%|██████▊   | 282/413 [05:16<02:09,  1.01it/s]

Error fetching sample ERS1443951: HTTP 404
Added to error list


Fetching Metadata:  69%|██████▉   | 284/413 [05:17<01:37,  1.32it/s]

Error fetching sample ERS1444000: HTTP 404
Added to error list


Fetching Metadata:  70%|██████▉   | 289/413 [05:22<01:47,  1.15it/s]

Error fetching sample ERS1443925: HTTP 404
Added to error list


Fetching Metadata:  72%|███████▏  | 297/413 [05:27<01:15,  1.53it/s]

Error fetching sample ERS1444005: HTTP 404
Added to error list


Fetching Metadata:  73%|███████▎  | 300/413 [05:29<01:16,  1.47it/s]

Error fetching sample ERS1443938: HTTP 404
Added to error list


Fetching Metadata:  74%|███████▎  | 304/413 [05:32<01:06,  1.64it/s]

Error fetching sample ERS1443977: HTTP 404
Added to error list


Fetching Metadata:  74%|███████▍  | 306/413 [05:32<00:56,  1.88it/s]

Error fetching sample ERS1443975: HTTP 404
Added to error list


Fetching Metadata:  76%|███████▋  | 315/413 [05:41<01:27,  1.11it/s]

Error fetching sample ERS1443972: HTTP 404
Added to error list


Fetching Metadata:  77%|███████▋  | 317/413 [05:43<01:13,  1.31it/s]

Error fetching sample ERS1443998: HTTP 404
Added to error list
Error fetching sample ERS1443950: HTTP 404
Added to error list


Fetching Metadata:  78%|███████▊  | 322/413 [05:46<01:01,  1.48it/s]

Error fetching sample ERS1443968: HTTP 404
Added to error list


Fetching Metadata:  78%|███████▊  | 324/413 [05:49<01:31,  1.03s/it]

Error fetching sample ERS1443956: HTTP 404
Added to error list


Fetching Metadata:  79%|███████▉  | 328/413 [05:52<01:12,  1.17it/s]

Error fetching sample ERS1443961: HTTP 404
Added to error list


Fetching Metadata:  80%|████████  | 332/413 [05:54<00:50,  1.59it/s]

Error fetching sample ERS1443928: HTTP 404
Added to error list
Error fetching sample ERS1444001: HTTP 404
Added to error list


Fetching Metadata:  83%|████████▎ | 341/413 [05:59<00:44,  1.61it/s]

Error fetching sample ERS1443949: HTTP 404
Added to error list


Fetching Metadata:  83%|████████▎ | 343/413 [06:00<00:39,  1.79it/s]

Error fetching sample ERS1443929: HTTP 404
Added to error list


Fetching Metadata:  84%|████████▍ | 347/413 [06:03<00:41,  1.58it/s]

Error fetching sample ERS1443987: HTTP 404
Added to error list


Fetching Metadata:  85%|████████▌ | 352/413 [06:07<00:47,  1.28it/s]

Error fetching sample ERS1443924: HTTP 404
Added to error list


Fetching Metadata:  86%|████████▌ | 354/413 [06:08<00:39,  1.48it/s]

Error fetching sample ERS1443914: HTTP 404
Added to error list


Fetching Metadata:  89%|████████▉ | 367/413 [06:20<00:41,  1.11it/s]

Error fetching sample ERS1443957: HTTP 404
Added to error list


Fetching Metadata:  89%|████████▉ | 368/413 [06:22<00:54,  1.21s/it]

Error fetching sample ERS1443953: HTTP 404
Added to error list


Fetching Metadata:  90%|████████▉ | 371/413 [06:25<00:45,  1.07s/it]

Error fetching sample ERS1443920: HTTP 404
Added to error list


Fetching Metadata:  90%|█████████ | 372/413 [06:27<00:53,  1.30s/it]

Error fetching sample ERS1443989: HTTP 404
Added to error list


Fetching Metadata:  91%|█████████ | 374/413 [06:29<00:37,  1.05it/s]

Error fetching sample ERS1443965: HTTP 404
Added to error list


Fetching Metadata:  91%|█████████▏| 377/413 [06:31<00:34,  1.05it/s]

Error fetching sample ERS1443984: HTTP 404
Added to error list


Fetching Metadata:  92%|█████████▏| 381/413 [06:33<00:20,  1.53it/s]

Error fetching sample ERS1443915: HTTP 404
Added to error list


Fetching Metadata:  93%|█████████▎| 383/413 [06:35<00:18,  1.66it/s]

Error fetching sample ERS1443926: HTTP 404
Added to error list


Fetching Metadata:  93%|█████████▎| 386/413 [06:37<00:21,  1.25it/s]

Error fetching sample ERS1443932: HTTP 404
Added to error list


Fetching Metadata:  94%|█████████▍| 390/413 [06:40<00:16,  1.37it/s]

Error fetching sample ERS1443939: HTTP 404
Added to error list


Fetching Metadata:  96%|█████████▌| 395/413 [06:43<00:12,  1.47it/s]

Error fetching sample ERS1443934: HTTP 404
Added to error list


Fetching Metadata:  96%|█████████▌| 397/413 [06:45<00:12,  1.25it/s]

Error fetching sample ERS1443933: HTTP 404
Added to error list


Fetching Metadata:  97%|█████████▋| 401/413 [06:47<00:07,  1.62it/s]

Error fetching sample ERS1443944: HTTP 404
Added to error list


Fetching Metadata:  99%|█████████▉| 410/413 [06:52<00:01,  2.40it/s]

Error fetching sample ERS1443941: HTTP 404
Added to error list


Fetching Metadata: 100%|██████████| 413/413 [06:53<00:00,  1.00s/it]


Combined Taxonomy DataFrame:
    analysis_id       type                                                 id  \
0  MGYA00216537  organisms             Archaea::Euryarchaeota:Methanomicrobia   
1  MGYA00216537  organisms  Archaea::Euryarchaeota:Methanomicrobia:Methano...   
2  MGYA00216537  organisms                                           Bacteria   
3  MGYA00216537  organisms                      Bacteria:::::::bacterium_LF-3   
4  MGYA00216537  organisms  Bacteria:::::::bacterium_enrichment_culture_cl...   

   attributes.count                                 attributes.lineage  \
0               1.0             Archaea::Euryarchaeota:Methanomicrobia   
1              33.0  Archaea::Euryarchaeota:Methanomicrobia:Methano...   
2             423.0                                           Bacteria   
3               1.0                      Bacteria:::::::bacterium_LF-3   
4               1.0  Bacteria:::::::bacterium_enrichment_culture_cl...   

  attributes.hierarchy.kingdom attribut

#### Define the taxonomic rank and create the count data table

In [35]:
# Taxonomic level at which counts are aggregated
rank = "family"
rank_column = f"attributes.hierarchy.{rank}"

# Keep only the taxa that are actually assigned at the chosen rank
has_rank = taxonomy_df[rank_column].notna() & (taxonomy_df[rank_column] != '')
ranked_df = taxonomy_df[has_rank]

# Total count per (analysis, taxon) pair, in long format
grouped_df = (
    ranked_df
    .groupby(["analysis_id", rank_column], as_index=False)["attributes.count"]
    .sum()
)

# Long -> wide: one row per analysis, one column per taxon, 0 where absent
wide_df = grouped_df.pivot_table(
    index="analysis_id",
    columns=rank_column,
    values="attributes.count",
    fill_value=0,
).reset_index()

# Attach the counts to the sample metadata and discard incomplete rows
merged_df = (
    metadata_df
    .merge(wide_df, on="analysis_id", how="left")
    .dropna()
    .reset_index(drop=True)
)

merged_df

Unnamed: 0,analysis_id,sample_id,sample_name,collection_date,geographic_location,Acanthamoebidae,Acaridae,Acetobacteraceae,Acholeplasmataceae,Acidaminococcaceae,...,Vorticellidae,Vulgatibacteraceae,Wenzhouxiangellaceae,Williamsiaceae,Woeseiaceae,Xanthobacteraceae,Xanthomonadaceae,Yersiniaceae,Zoogloeaceae,Zoothamniidae
0,MGYA00216461,ERS1426787,wastewater metagenome,2016-02-01,Canada,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,2.0,0.0
1,MGYA00216522,ERS1426804,wastewater metagenome,2016-01-25,Georgia,0.0,0.0,0.0,1.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,78.0,0.0,18.0,0.0
2,MGYA00216638,ERS1426851,wastewater metagenome,2016-02-22,USA,0.0,0.0,1.0,0.0,28.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,6.0,0.0
3,MGYA00216447,ERS1426787,wastewater metagenome,2016-02-01,Canada,0.0,0.0,19.0,2.0,55.0,...,0.0,0.0,0.0,1.0,0.0,3.0,167.0,0.0,20.0,0.0
4,MGYA00216652,ERS1426821,wastewater metagenome,2016-02-04,Moldova,3.0,0.0,4.0,0.0,115.0,...,0.0,0.0,0.0,0.0,0.0,1.0,163.0,0.0,23.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,MGYA00216634,ERS1426794,wastewater metagenome,2016-02-01,Czech Republic,0.0,0.0,10.0,0.0,89.0,...,0.0,0.0,0.0,0.0,0.0,0.0,120.0,0.0,13.0,0.0
228,MGYA00216625,ERS1426779,wastewater metagenome,2016-02-04,Australia,0.0,0.0,6.0,0.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,103.0,0.0,16.0,0.0
229,MGYA00216668,ERS1426846,wastewater metagenome,2016-02-24,USA,0.0,0.0,18.0,0.0,377.0,...,3.0,1.0,0.0,0.0,0.0,2.0,219.0,1.0,20.0,0.0
230,MGYA00216453,ERS1426849,wastewater metagenome,2016-02-22,USA,0.0,1.0,13.0,16.0,270.0,...,0.0,0.0,0.0,0.0,0.0,3.0,106.0,1.0,74.0,0.0


#### Save .csv file

In [36]:
from pathlib import Path

# Build the file name from the study accession and rank chosen in the cells
# above so it cannot drift out of sync with the data being saved.
output_path = Path("datasets/Global_surveillance") / f"{study_accession}_taxon_{rank}.csv"
# to_csv does not create missing directories, so make sure the folder exists.
output_path.parent.mkdir(parents=True, exist_ok=True)
merged_df.to_csv(output_path, index=False)