In [4]:
import pandas as pd
import requests

In [24]:
df = pd.read_csv("AnalysisJob.csv", usecols=["accession", "pipeline_version"])
df = df[df["pipeline_version"] == 4.1]
accession = df["accession"].dropna().unique().tolist()

In [26]:
BASE = "https://www.ebi.ac.uk/metagenomics/api/v1/analyses"

def fetch_taxonomy(accession, as_csv=False, BASE=BASE):
    """
    Fetch taxonomic abundance for a given analysis accession.
    If as_csv=True, return a pandas DataFrame from the CSV endpoint.
    Otherwise, return the raw JSON.
    """
    # Read the ssu taxonomy CSV directly
    url = f"{BASE}/{accession}/taxonomy/ssu.csv"
    df_ssu = pd.read_csv(url)
    return df_ssu

In [27]:
all_dfs = []  # to combine across accessions

for acc in accession:
    print(f"Fetching {acc} …")
    try:
        # Option A: CSV
        df_tax = fetch_taxonomy(acc, as_csv=True)
        
        # Option B: JSON
        # json_res = fetch_taxonomy(acc, as_csv=False)
        # df_tax = parse_taxonomy_json(json_res)
        
        df_tax["accession"] = acc
        all_dfs.append(df_tax)
    except Exception as e:
        print(f"  → failed: {e}")

# Combine
master_taxonomy = pd.concat(all_dfs, ignore_index=True)

Fetching MGYA00216443 …
Fetching MGYA00216444 …
Fetching MGYA00216445 …
Fetching MGYA00216446 …
Fetching MGYA00216447 …
Fetching MGYA00216448 …
Fetching MGYA00216449 …
Fetching MGYA00216450 …
Fetching MGYA00216451 …
Fetching MGYA00216452 …
Fetching MGYA00216453 …
Fetching MGYA00216454 …
Fetching MGYA00216455 …
Fetching MGYA00216456 …
Fetching MGYA00216457 …
Fetching MGYA00216458 …
Fetching MGYA00216459 …
Fetching MGYA00216460 …
Fetching MGYA00216461 …
Fetching MGYA00216462 …
Fetching MGYA00216463 …
Fetching MGYA00216464 …
Fetching MGYA00216465 …
Fetching MGYA00216466 …
Fetching MGYA00216467 …
Fetching MGYA00216468 …
Fetching MGYA00216469 …
Fetching MGYA00216470 …
Fetching MGYA00216471 …
Fetching MGYA00216472 …
Fetching MGYA00216473 …
Fetching MGYA00216474 …
Fetching MGYA00216475 …
  → failed: HTTP Error 413: Request Entity Too Large
Fetching MGYA00216476 …
Fetching MGYA00216477 …
Fetching MGYA00216478 …
Fetching MGYA00216479 …
Fetching MGYA00216480 …
Fetching MGYA00216481 …
Fetching MG

In [54]:
master_taxonomy.dropna(subset=["hierarchy.phylum"])

Unnamed: 0,count,domain,hierarchy.class,hierarchy.family,hierarchy.genus,hierarchy.kingdom,hierarchy.order,hierarchy.phylum,hierarchy.species,hierarchy.super kingdom,lineage,name,parent,pipeline_version,rank,url,accession
1,1,Archaea,,,,,,Euryarchaeota,,Archaea,Archaea::Euryarchaeota,Euryarchaeota,,4.1,phylum,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00216443
2,3,Archaea,Methanobacteria,,,,Methanobacteriales,Euryarchaeota,,Archaea,Archaea::Euryarchaeota:Methanobacteria:Methano...,Methanobacteriales,Methanobacteria,4.1,order,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00216443
3,105,Archaea,Methanobacteria,Methanobacteriaceae,,,Methanobacteriales,Euryarchaeota,,Archaea,Archaea::Euryarchaeota:Methanobacteria:Methano...,Methanobacteriaceae,Methanobacteriales,4.1,family,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00216443
4,9,Archaea,Methanobacteria,Methanobacteriaceae,Methanobacterium,,Methanobacteriales,Euryarchaeota,,Archaea,Archaea::Euryarchaeota:Methanobacteria:Methano...,Methanobacterium,Methanobacteriaceae,4.1,genus,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00216443
5,246,Archaea,Methanobacteria,Methanobacteriaceae,Methanobrevibacter,,Methanobacteriales,Euryarchaeota,,Archaea,Archaea::Euryarchaeota:Methanobacteria:Methano...,Methanobrevibacter,Methanobacteriaceae,4.1,genus,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00216443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147544,1,Eukaryota,Chromadorea,Diploscapteridae,,Metazoa,Rhabditida,Nematoda,,Eukaryota,Eukaryota:Metazoa:Nematoda:Chromadorea:Rhabdit...,Diploscapteridae,Rhabditida,4.1,family,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00216676
147545,1,Eukaryota,Bdelloidea,,,Metazoa,,Rotifera,,Eukaryota,Eukaryota:Metazoa:Rotifera:Bdelloidea,Bdelloidea,Rotifera,4.1,class,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00216676
147546,1,Eukaryota,Bdelloidea,Adinetidae,Adineta,Metazoa,Adinetida,Rotifera,Adineta_vaga,Eukaryota,Eukaryota:Metazoa:Rotifera:Bdelloidea:Adinetid...,Adineta_vaga,Adineta,4.1,species,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00216676
147548,3,Eukaryota,,,,Viridiplantae,,Chlorophyta,,Eukaryota,Eukaryota:Viridiplantae:Chlorophyta,Chlorophyta,Viridiplantae,4.1,phylum,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00216676


In [69]:
selected_tax_4_1 = master_taxonomy[["count", "domain", "hierarchy.phylum", "accession"]].dropna(subset=["hierarchy.phylum"])

selected_tax_4_1["phylum_level"] = selected_tax_4_1["domain"].astype(str) + " " + selected_tax_4_1["hierarchy.phylum"].astype(str)

selected_tax_4_1 = selected_tax_4_1[["count", "phylum_level", "accession"]]

selected_tax_4_1

# pivot to wide format
df_wide_4_1 = selected_tax_4_1.pivot_table(
    index='accession',
    columns='phylum_level',
    values='count',
    aggfunc='sum',
    fill_value=0        # fill missing combinations with zero
)

# (optional) if you want the lineage levels back as normal columns rather than a MultiIndex:
df_wide_4_1 = df_wide_4_1.reset_index().rename_axis(columns=None)

df_wide_4_1["pipeline"] = "4.1"

# Reorder

cols = df_wide_4_1.columns.tolist()
# build a new order: your column first, then all others in original order
new_order = [cols[0]] + ['pipeline'] + [c for c in cols if c not in (cols[0], 'pipeline')]

# reindex df
df_wide_4_1 = df_wide_4_1[new_order]

df_wide_4_1

Unnamed: 0,accession,pipeline,Archaea Candidatus_Diapherotrites,Archaea Candidatus_Micrarchaeota,Archaea Candidatus_Nanohaloarchaeota,Archaea Candidatus_Verstraetearchaeota,Archaea Crenarchaeota,Archaea Euryarchaeota,Archaea Nanoarchaeota,Archaea Thaumarchaeota,...,Eukaryota Nematoda,Eukaryota Phaeophyceae,Eukaryota Picozoa,Eukaryota Platyhelminthes,Eukaryota Porifera,Eukaryota Rotifera,Eukaryota Streptophyta,Eukaryota Tardigrada,Eukaryota Xenacoelomorpha,Eukaryota Zoopagomycota
0,MGYA00216443,4.1,0,0,0,0,0,926,0,0,...,204,1,0,0,0,10,43,2,0,0
1,MGYA00216444,4.1,0,0,0,0,4,338,0,4,...,26,0,0,10,0,2,18,0,0,1
2,MGYA00216445,4.1,0,0,0,0,0,104,0,0,...,33,1,0,0,0,4,25,0,0,0
3,MGYA00216446,4.1,0,0,0,0,0,423,0,13,...,39,1,0,25,0,1,12,0,0,1
4,MGYA00216447,4.1,1,0,0,0,1,159,0,0,...,165,0,0,0,0,1,10,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,MGYA00216672,4.1,0,0,0,0,0,10,0,0,...,15,0,0,0,0,1,2,0,0,0
226,MGYA00216673,4.1,0,0,0,0,0,147,0,0,...,19,0,0,0,0,1,1,0,0,0
227,MGYA00216674,4.1,3,0,0,0,0,19,0,0,...,25,0,0,0,0,9,0,0,0,0
228,MGYA00216675,4.1,0,0,0,0,0,114,0,0,...,4,0,0,0,0,1,3,0,0,0


In [71]:
df = pd.read_csv("AnalysisJob.csv", usecols=["accession", "pipeline_version"])
df = df[df["pipeline_version"] == 3.0]
accession = df["accession"].dropna().unique().tolist()

In [70]:
def fetch_taxonomy_alt(accession, as_csv=False, BASE=BASE):
    """
    Fetch taxonomic abundance for a given analysis accession.
    If as_csv=True, return a pandas DataFrame from the CSV endpoint.
    Otherwise, return the raw JSON.
    """
    # Read the ssu taxonomy CSV directly
    url = f"{BASE}/{accession}/taxonomy.csv"
    df_ssu = pd.read_csv(url)
    return df_ssu

In [72]:
all_dfs = []  # to combine across accessions

for acc in accession:
    print(f"Fetching {acc} …")
    try:
        # Option A: CSV
        df_tax = fetch_taxonomy_alt(acc, as_csv=True)
        
        # Option B: JSON
        # json_res = fetch_taxonomy(acc, as_csv=False)
        # df_tax = parse_taxonomy_json(json_res)
        
        df_tax["accession"] = acc
        all_dfs.append(df_tax)
    except Exception as e:
        print(f"  → failed: {e}")

# Combine
master_taxonomy = pd.concat(all_dfs, ignore_index=True)

Fetching MGYA00085622 …
Fetching MGYA00085623 …
Fetching MGYA00085624 …
Fetching MGYA00085625 …
Fetching MGYA00085626 …
Fetching MGYA00085627 …
Fetching MGYA00085628 …
Fetching MGYA00085629 …
Fetching MGYA00085630 …
Fetching MGYA00085631 …
Fetching MGYA00085632 …
Fetching MGYA00085633 …
Fetching MGYA00085634 …
Fetching MGYA00085635 …
Fetching MGYA00085636 …
Fetching MGYA00085637 …
Fetching MGYA00085638 …
Fetching MGYA00085639 …
Fetching MGYA00085640 …
Fetching MGYA00085641 …
Fetching MGYA00085642 …
Fetching MGYA00085643 …
Fetching MGYA00085644 …
Fetching MGYA00085645 …
Fetching MGYA00085646 …
Fetching MGYA00085647 …
Fetching MGYA00085648 …
Fetching MGYA00085649 …
Fetching MGYA00085650 …
Fetching MGYA00085651 …
Fetching MGYA00085652 …
Fetching MGYA00085653 …
Fetching MGYA00085654 …
Fetching MGYA00085655 …
Fetching MGYA00085656 …
Fetching MGYA00085657 …
Fetching MGYA00085658 …
Fetching MGYA00085659 …
Fetching MGYA00085660 …
Fetching MGYA00085661 …
Fetching MGYA00085662 …
Fetching MGYA000

In [None]:
master_taxonomy

Unnamed: 0,count,domain,hierarchy.class,hierarchy.family,hierarchy.genus,hierarchy.kingdom,hierarchy.order,hierarchy.phylum,hierarchy.species,lineage,name,parent,pipeline_version,rank,url,accession
0,5,Bacteria,Synergistia,Dethiosulfovibrionaceae,PD-UASB-13,Bacteria,Synergistales,Synergistetes,,Bacteria:Synergistetes:Synergistia:Synergistal...,PD-UASB-13,Dethiosulfovibrionaceae,3.0,genus,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00085622
1,20,Bacteria,Betaproteobacteria,,,Bacteria,MND1,Proteobacteria,,Bacteria:Proteobacteria:Betaproteobacteria:MND1,MND1,Betaproteobacteria,3.0,order,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00085622
2,1,Bacteria,Alphaproteobacteria,Rhodobacteraceae,Amaricoccus,Bacteria,Rhodobacterales,Proteobacteria,,Bacteria:Proteobacteria:Alphaproteobacteria:Rh...,Amaricoccus,Rhodobacteraceae,3.0,genus,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00085622
3,8,Bacteria,Bacteroidia,S24-7,,Bacteria,Bacteroidales,Bacteroidetes,,Bacteria:Bacteroidetes:Bacteroidia:Bacteroidal...,S24-7,Bacteroidales,3.0,family,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00085622
4,23,Bacteria,Alphaproteobacteria,Rhizobiaceae,,Bacteria,Rhizobiales,Proteobacteria,,Bacteria:Proteobacteria:Alphaproteobacteria:Rh...,Rhizobiaceae,Rhizobiales,3.0,family,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00085622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130228,1,Bacteria,Clostridia,Clostridiaceae,Clostridium,Bacteria,Clostridiales,Firmicutes,tetani,Bacteria:Firmicutes:Clostridia:Clostridiales:C...,tetani,Clostridium,3.0,species,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00085800
130229,9,Bacteria,Gammaproteobacteria,Oceanospirillaceae,,Bacteria,Oceanospirillales,Proteobacteria,,Bacteria:Proteobacteria:Gammaproteobacteria:Oc...,Oceanospirillaceae,Oceanospirillales,3.0,family,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00085800
130230,4,Bacteria,Alphaproteobacteria,Sphingomonadaceae,Sphingobium,Bacteria,Sphingomonadales,Proteobacteria,,Bacteria:Proteobacteria:Alphaproteobacteria:Sp...,Sphingobium,Sphingomonadaceae,3.0,genus,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00085800
130231,12,Bacteria,Betaproteobacteria,Comamonadaceae,Variovorax,Bacteria,Burkholderiales,Proteobacteria,,Bacteria:Proteobacteria:Betaproteobacteria:Bur...,Variovorax,Comamonadaceae,3.0,genus,https://www.ebi.ac.uk/metagenomics/api/v1/anno...,MGYA00085800


In [76]:
selected_tax_3_0 = master_taxonomy[["count", "domain", "hierarchy.phylum", "accession"]].dropna(subset=["hierarchy.phylum"])

selected_tax_3_0["phylum_level"] = selected_tax_3_0["domain"].astype(str) + " " + selected_tax_3_0["hierarchy.phylum"].astype(str)

selected_tax_3_0 = selected_tax_3_0[["count", "phylum_level", "accession"]]

selected_tax_3_0

# pivot to wide format
df_wide_3_0 = selected_tax_3_0.pivot_table(
    index='accession',
    columns='phylum_level',
    values='count',
    aggfunc='sum',
    fill_value=0        # fill missing combinations with zero
)

# (optional) if you want the lineage levels back as normal columns rather than a MultiIndex:
df_wide_3_0 = df_wide_3_0.reset_index().rename_axis(columns=None)

df_wide_3_0["pipeline"] = "3.0"

# Reorder

cols = df_wide_3_0.columns.tolist()
# build a new order: your column first, then all others in original order
new_order = [cols[0]] + ['pipeline'] + [c for c in cols if c not in (cols[0], 'pipeline')]

# reindex df
df_wide_3_0 = df_wide_3_0[new_order]

df_wide_3_0

Unnamed: 0,accession,pipeline,Archaea Crenarchaeota,Archaea Euryarchaeota,Archaea Parvarchaeota,Bacteria AC1,Bacteria Acidobacteria,Bacteria Actinobacteria,Bacteria Aquificae,Bacteria Armatimonadetes,...,Bacteria Verrucomicrobia,Bacteria WPS-2,Bacteria WS1,Bacteria WS2,Bacteria WS3,Bacteria WS4,Bacteria WS5,Bacteria WS6,Bacteria WWE1,Bacteria ZB3
0,MGYA00085622,3.0,0,95,0,0,4,1236,0,1,...,169,4,0,1,0,0,0,0,0,0
1,MGYA00085623,3.0,0,69,0,0,24,1046,0,4,...,222,10,0,0,1,0,0,2,3,0
2,MGYA00085624,3.0,0,16,0,0,29,403,0,3,...,74,12,0,0,0,0,0,0,0,0
3,MGYA00085625,3.0,0,26,0,0,4,377,0,5,...,384,3,3,0,0,0,0,6,18,0
4,MGYA00085626,3.0,0,10,0,0,3,88,0,0,...,13,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,MGYA00085796,3.0,0,209,0,0,32,3202,0,9,...,817,11,0,0,1,1,0,24,1,0
175,MGYA00085797,3.0,0,78,0,0,1,323,0,2,...,98,2,0,0,1,0,0,8,2,0
176,MGYA00085798,3.0,0,33,0,0,3,478,0,2,...,55,3,0,0,1,0,0,3,2,0
177,MGYA00085799,3.0,7,323,0,0,306,1016,0,13,...,726,12,2,0,10,0,0,244,16,1


In [78]:
concat_pd = pd.concat([df_wide_4_1, df_wide_3_0], axis=0, ignore_index=True, sort=False).fillna(0)

concat_pd

Unnamed: 0,accession,pipeline,Archaea Candidatus_Diapherotrites,Archaea Candidatus_Micrarchaeota,Archaea Candidatus_Nanohaloarchaeota,Archaea Candidatus_Verstraetearchaeota,Archaea Crenarchaeota,Archaea Euryarchaeota,Archaea Nanoarchaeota,Archaea Thaumarchaeota,...,Bacteria Thermi,Bacteria WPS-2,Bacteria WS1,Bacteria WS2,Bacteria WS3,Bacteria WS4,Bacteria WS5,Bacteria WS6,Bacteria WWE1,Bacteria ZB3
0,MGYA00216443,4.1,0.0,0.0,0.0,0.0,0,926,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,MGYA00216444,4.1,0.0,0.0,0.0,0.0,4,338,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,MGYA00216445,4.1,0.0,0.0,0.0,0.0,0,104,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,MGYA00216446,4.1,0.0,0.0,0.0,0.0,0,423,0.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,MGYA00216447,4.1,1.0,0.0,0.0,0.0,1,159,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,MGYA00085796,3.0,0.0,0.0,0.0,0.0,0,209,0.0,0.0,...,3.0,11.0,0.0,0.0,1.0,1.0,0.0,24.0,1.0,0.0
405,MGYA00085797,3.0,0.0,0.0,0.0,0.0,0,78,0.0,0.0,...,0.0,2.0,0.0,0.0,1.0,0.0,0.0,8.0,2.0,0.0
406,MGYA00085798,3.0,0.0,0.0,0.0,0.0,0,33,0.0,0.0,...,0.0,3.0,0.0,0.0,1.0,0.0,0.0,3.0,2.0,0.0
407,MGYA00085799,3.0,0.0,0.0,0.0,0.0,7,323,0.0,0.0,...,0.0,12.0,2.0,0.0,10.0,0.0,0.0,244.0,16.0,1.0


In [80]:
df = pd.read_csv("AnalysisJob.csv", usecols=["accession"])
accession = df["accession"].dropna().unique().tolist()

In [88]:
BASE = "https://www.ebi.ac.uk/metagenomics/api/v1"

def get_run_id_for_analysis(accession):
    """Return the MGnify sample ID (e.g. ERS…) for a given analysis accession."""
    url = f"{BASE}/analyses/{accession}"
    resp = requests.get(url)
    resp.raise_for_status()
    data = resp.json()
    # Navigate into the JSON: relationships → sample → data → id
    return data["data"]["relationships"]["run"]["data"]["id"]

def get_country_for_sample(sample_id):
    """
    Fetch the sample record and return its 'country' attribute.
    """
    url = f"{BASE}/samples/{sample_id}"
    resp = requests.get(url)
    resp.raise_for_status()
    attrs = resp.json()["data"]["attributes"]
    # the exact key used by the API:
    return attrs.get("geographic location (country and/or sea,region)")

In [89]:
# 2) Fetch country for each
records = []
for acc in accession:
    try:
        samp_id = get_run_id_for_analysis(acc)
        records.append({"accession": acc, "sample_id": samp_id})
        print(f"Sample ID for {acc}: {samp_id}")
    except Exception as e:
        # you can choose to log or skip
        records.append({"accession": acc, "sample_id": None})
        print(f"Warning fetching {acc}: {e}")

# 3) Build and save
df_samples = pd.DataFrame(records)

df_samples

Sample ID for MGYA00216443: ERR1713390
Sample ID for MGYA00216444: ERR1713337
Sample ID for MGYA00216445: ERR1726005
Sample ID for MGYA00216446: ERR1725950
Sample ID for MGYA00216447: ERR1725956
Sample ID for MGYA00216448: ERR1725963
Sample ID for MGYA00216449: ERR1713357
Sample ID for MGYA00216450: ERR1713382
Sample ID for MGYA00216451: ERR1725952
Sample ID for MGYA00216452: ERR1725987
Sample ID for MGYA00216453: ERR1713402
Sample ID for MGYA00216454: ERR1725949
Sample ID for MGYA00216455: ERR1725984
Sample ID for MGYA00216456: ERR2592272
Sample ID for MGYA00216457: ERR1713360
Sample ID for MGYA00216458: ERR1726002
Sample ID for MGYA00216459: ERR1725982
Sample ID for MGYA00216460: ERR2592276
Sample ID for MGYA00216461: ERR1725957
Sample ID for MGYA00216462: ERR2592250
Sample ID for MGYA00216463: ERR1713385
Sample ID for MGYA00216464: ERR2592271
Sample ID for MGYA00216465: ERR2592331
Sample ID for MGYA00216466: ERR1713381
Sample ID for MGYA00216467: ERR2592247
Sample ID for MGYA0021646

Unnamed: 0,accession,sample_id
0,MGYA00216443,ERR1713390
1,MGYA00216444,ERR1713337
2,MGYA00216445,ERR1726005
3,MGYA00216446,ERR1725950
4,MGYA00216447,ERR1725956
...,...,...
408,MGYA00085796,ERR1713384
409,MGYA00085797,ERR1725965
410,MGYA00085798,ERR1725966
411,MGYA00085799,ERR1725950


In [99]:
metadata = pd.read_csv("SraRunTable-4.csv", usecols=["Run", "geographic_location_(country_and/or_sea)"])

metadata = metadata.rename(columns={"Run": "sample_id", "geographic_location_(country_and/or_sea)": "location"})

merged_data = pd.merge(df_samples, metadata, on=["sample_id"], how="left")

final_pd = pd.merge(merged_data, concat_pd, on=["accession"], how="inner")

final_pd.to_csv("../count/DTU-GE_phylum_count_data.csv", index=False)

In [112]:
country_counts = final_pd["location"].value_counts()
keep = country_counts[country_counts >= 15].index
df_filtered = final_pd[final_pd["location"].isin(keep)]
df_filtered["location"].nunique()

4