### Libraries

In [None]:
from jsonapi_client import Session
import pandas as pd
from io import StringIO
import requests
from tqdm import tqdm

### Global surveillance of antimicrobial resistance (DTU-GE)

In [None]:
# MGnify accession of the study whose analyses are listed below
study_accession = "MGYS00001312"

# Open a session against the MGnify API endpoint for the duration of the block
with Session("https://www.ebi.ac.uk/metagenomics/api/v1") as mgnify:
    # Walk every analysis record of the study and keep its JSON payload
    analyses_json = [
        record.json
        for record in mgnify.iterate(f"studies/{study_accession}/analyses")
    ]

# Flatten the nested JSON records into a pd.DataFrame (one row per analysis)
df = pd.json_normalize(analyses_json)

# Analysis identifiers, iterated by the later fetch loops
analysis_ids = df["id"]

#### Extract data from analysis

In [None]:
dfs = []


def _fetch_taxonomy_ssu(analysis_id, timeout=60):
    """Fetch all pages of SSU taxonomy records for one analysis.

    Args:
        analysis_id: MGnify analysis identifier used to build the request URL.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A DataFrame of the flattened records with an added ``analysis_id``
        column, or ``None`` (after reporting the problem through
        ``tqdm.write``) when any page cannot be fetched or decoded. A partial
        result is discarded so that downstream counts are never silently
        truncated.
    """
    url = f"https://www.ebi.ac.uk/metagenomics/api/v1/analyses/{analysis_id}/taxonomy/ssu"
    records = []
    seen_urls = set()

    # MGnify list endpoints return results in pages (see the MGnify API docs);
    # follow `links.next` until it is absent. `seen_urls` guards against a
    # self-referencing `next` link causing an endless loop.
    while url and url not in seen_urls:
        seen_urls.add(url)

        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            tqdm.write(f"Error fetching data for {analysis_id}: {e}")
            return None

        if response.status_code != 200:
            tqdm.write(f"Error fetching data for {analysis_id}: HTTP {response.status_code}")
            return None

        try:
            data = response.json()
            page = data["data"]
            next_url = (data.get("links") or {}).get("next")
        except Exception as e:
            tqdm.write(f"Error processing JSON for {analysis_id}: {e}")
            return None

        # Accept a single record object as well as a list of records
        records.extend(page if isinstance(page, list) else [page])
        url = next_url

    try:
        df_temp = pd.json_normalize(records)
    except Exception as e:
        tqdm.write(f"Error processing JSON for {analysis_id}: {e}")
        return None

    df_temp["analysis_id"] = analysis_id
    return df_temp


with tqdm(total=len(analysis_ids), desc="Fetching Taxonomy SSU Data", unit="analysis") as pbar:
    for analysis_id in analysis_ids:
        df_temp = _fetch_taxonomy_ssu(analysis_id)
        if df_temp is not None:
            dfs.append(df_temp)

        pbar.set_postfix_str(f"Remaining: {len(analysis_ids) - pbar.n - 1}")
        pbar.update(1)

if dfs:
    final_df = pd.concat(dfs, ignore_index=True)
    print("Combined DataFrame:")
    # Call head() and print it: a bare expression inside `if` displays nothing
    print(final_df.head())

else:
    print("No data was retrieved.")

#### Get sample metadata for each analysis ID

In [None]:
# One dict of sample metadata per successfully processed analysis
metadata_df = []

# Column order of the metadata table; also applied when nothing was retrieved
# so that the table always has an "analysis_id" column to merge on
metadata_columns = [
    "analysis_id",
    "sample_id",
    "sample_name",
    "collection_date",
    "geographic_location",
]


def _get_json(url, kind, ident, timeout=60):
    """Return the decoded JSON body of *url*, or ``None`` after reporting a failure.

    *kind* ("analysis" or "sample") and *ident* are only used to label the
    messages written through ``tqdm.write``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        tqdm.write(f"Error fetching {kind} {ident}: {e}")
        return None

    if response.status_code != 200:
        tqdm.write(f"Error fetching {kind} {ident}: HTTP {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as e:
        tqdm.write(f"Error processing {kind} JSON for {ident}: {e}")
        return None


def _fetch_sample_metadata(analysis_id):
    """Return the metadata dict for the sample linked to *analysis_id*, or ``None``.

    Looks up the analysis, follows its ``sample`` relationship and extracts
    the sample name, collection date and the first sample-metadata value whose
    key contains "country".
    """
    data = _get_json(
        f"https://www.ebi.ac.uk/metagenomics/api/v1/analyses/{analysis_id}",
        "analysis",
        analysis_id,
    )
    if data is None:
        return None

    try:
        sample_id = data["data"]["relationships"]["sample"]["data"]["id"]
    except (KeyError, TypeError) as e:
        tqdm.write(f"Error processing analysis JSON for {analysis_id}: {e}")
        return None

    sample_data = _get_json(
        f"https://www.ebi.ac.uk/metagenomics/api/v1/samples/{sample_id}",
        "sample",
        sample_id,
    )
    if sample_data is None:
        return None

    try:
        sample_attributes = sample_data["data"]["attributes"]

        # Use the first sample-metadata entry whose key mentions "country"
        # as the geographic location
        geographic_location = None
        for entry in sample_attributes.get("sample-metadata") or []:
            key = (entry.get("key") or "").lower()
            if "country" in key:
                geographic_location = entry.get("value")
                break  # Stop after first match

        return {
            "analysis_id": analysis_id,
            "sample_id": sample_id,
            "sample_name": sample_attributes.get("sample-name"),
            "collection_date": sample_attributes.get("collection-date"),
            "geographic_location": geographic_location,
        }
    except (KeyError, TypeError, AttributeError) as e:
        tqdm.write(f"Error processing sample JSON for {sample_id}: {e}")
        return None


with tqdm(total=len(analysis_ids), desc="Fetching Analysis Sample Data", unit="analysis") as pbar:
    for analysis_id in analysis_ids:
        sample_metadata = _fetch_sample_metadata(analysis_id)
        if sample_metadata is not None:
            metadata_df.append(sample_metadata)

        pbar.update(1)

final_metadata_df = pd.DataFrame(metadata_df, columns=metadata_columns)
final_metadata_df

#### Define the taxonomic rank and create the count data table

In [None]:
# Taxonomic rank at which the counts are aggregated
rank = "family"

# Name of the flattened hierarchy column holding that rank
rank_col = f"attributes.hierarchy.{rank}"

# Keep only rows that are actually assigned at this rank (neither NaN nor empty)
has_rank = final_df[rank_col].notna() & final_df[rank_col].ne("")
ranked_df = final_df.loc[has_rank]

# Total count per (analysis, taxon) pair
grouped_df = ranked_df.groupby(
    ["analysis_id", rank_col],
    as_index=False,
)["attributes.count"].sum()

# Reshape to one row per analysis and one column per taxon; absent taxa become 0
wide_df = grouped_df.pivot_table(
    index="analysis_id",
    columns=rank_col,
    values="attributes.count",
    fill_value=0,
).reset_index()

# Attach the counts to the sample metadata; a left join keeps every metadata row
final_merged_df = pd.merge(
    final_metadata_df,
    wide_df,
    on="analysis_id",
    how="left",
)

final_merged_df

#### Save .csv file

In [None]:
# Export of the merged table is disabled; uncomment the line below to write it.
# NOTE(review): the file name uses MGYS00005846 while this notebook sets
# study_accession = "MGYS00001312" — confirm which accession is intended.
# final_merged_df.to_csv("datasets/Global_surveillance/MGYS00005846_taxon_family.csv", index = False)