In [2]:
import pandas as pd
import requests
from Bio import Entrez

# this script was used to investigate the 'all taxa with neither family nor genus' classification in CZID

In [None]:
from Bio import Entrez

Entrez.email = "plogan@chanzuckerberg.com"


def get_child_taxids(taxid):
    handle = Entrez.efetch(db="taxonomy", id=taxid, retmode="xml")
    records = Entrez.read(handle)
    handle.close()
    
    # This assumes the first record is the one we're interested in
    children = records[0].get("LineageEx")
    
    child_taxids = []
    if children:
        for child in children:
            child_taxids.append((child["TaxId"], child["ScientificName"]))
    
    return child_taxids

# Betacoronavirus taxid
# betacoronavirus_taxid = "694002"
# child_taxids = get_child_taxids(betacoronavirus_taxid)

# for taxid, name in child_taxids:
#     print(taxid, name)


In [46]:
def search_taxonomy(query):
    # Use Entrez.esearch to search the taxonomy database
    handle = Entrez.esearch(db="taxonomy", term=query)
    record = Entrez.read(handle)
    handle.close()
    
    # Print the ID of the first match
    id_list = record["IdList"]
    if id_list:
        print("Found IDs:", id_list)
        return id_list[0]
    else:
        print("No matches found.")
        return None

def fetch_taxonomy_details(tax_id):
    # Use Entrez.efetch to get detailed taxonomy information by ID
    handle = Entrez.efetch(db="taxonomy", id=tax_id, retmode="xml")
    records = Entrez.read(handle)
    handle.close()
    
    # Print some details about the first record
    if records:
        print("Details for ID:", tax_id)
        for record in records:
            print("Scientific Name:", record["ScientificName"])
            print("Rank:", record["Rank"])
            print("Lineage:", record["Lineage"])
            return records
    else:
        print("No details found for ID:", tax_id)

def fetch_taxonomy_details_by_taxid(tax_id):
    # Use Entrez.efetch to get detailed taxonomy information by ID
    handle = Entrez.efetch(db="taxonomy", id=tax_id, retmode="xml")
    records = Entrez.read(handle)
    handle.close()
    
    # Print some details about the record
    if records:
        for record in records:
            print("TaxID:", tax_id)
            print("Scientific Name:", record["ScientificName"])
            print("Rank:", record["Rank"])
            print("Lineage:", record["Lineage"])
            return records
    else:
        print("No details found for TaxID:", tax_id)


# query = "Burkholderiales"
# tax_id = search_taxonomy(query)
# if tax_id:
#     records = fetch_taxonomy_details(tax_id)

In [50]:
cookies = {
    'Cookie': 'x'
}

def get_sample_report_data(sample_id):
    url = f'https://staging.czid.org/samples/{sample_id}/report_v2.json?&id={sample_id}&merge_nt_nr=false'
    response = requests.get(url, cookies=cookies)
    if response.status_code == 200:
        unnamed = []
        for count, info in response.json()["counts"].items():
            for taxid, info_tax in info.items():
                if info_tax["name"] == "all taxa with neither family nor genus classification":
                    unnamed.append(info_tax)
                    # print(taxid, info_tax, info_tax["species_tax_ids"])
                    # print("-----------------")
        return unnamed


def get_sample_ids_for_a_project(project_id):
    url = f"https://staging.czid.org/samples/index_v2.json?projectId={project_id}&domain=my_data&offset=0&listAllIds=true&basic=false&workflow=short-read-mngs"
    response = requests.get(url, cookies=cookies)
    if response.status_code == 200:
        return response


def find_family_genus_info(record):
    # in the NCBI taxonomy is there a family or a genus ranking?
    found_family_genus = False
    for i in record[0]["LineageEx"]:
        if i["Rank"] == 'family' or i["Rank"] =="genus":
            found_family_genus = True
    return found_family_genus
    


In [123]:
from collections import defaultdict

report_data = defaultdict(list)
tax_id_to_tax_id = defaultdict(list)
# get all sample ids within a project
samples_response = get_sample_ids_for_a_project(1281)
sample_ids = samples_response.json()["all_samples_ids"]

for sample_id in sample_ids:
    # get the species_tax_ids when the 'name' is 'all taxa with neither family nor genus classification'
    unnamed_records = get_sample_report_data(sample_id)
    # fetch the taxonomy from NCBI for each taxid in the 'all taxa with neither family nor genus classification' bucket
    for tax_id in unnamed_records[0]["species_tax_ids"]:
        try: 
            records = fetch_taxonomy_details_by_taxid(tax_id)
            tax_id_to_tax_id[tax_id].append(records[0]['TaxId'])
            report_data[sample_id].append(records)
        except Exception as e:
            print(e)
            print(f"taxid {tax_id} not found")

TaxID: 3036250
Scientific Name: uncultured Vicinamibacterales bacterium
Rank: species
Lineage: cellular organisms; Bacteria; Acidobacteriota; Vicinamibacteria; Vicinamibacterales; environmental samples
TaxID: 3036248
Scientific Name: uncultured Thermoanaerobaculia bacterium
Rank: species
Lineage: cellular organisms; Bacteria; Acidobacteriota; Thermoanaerobaculia; environmental samples
TaxID: 3034326
Scientific Name: Opitutae bacterium KCR 482
Rank: species
Lineage: cellular organisms; Bacteria; PVC group; Verrucomicrobiota; Opitutae; unclassified Opitutae
TaxID: 3003696
Scientific Name: Pseudomonas phage Fyn8
Rank: species
Lineage: Viruses; Duplodnaviria; Heunggongvirae; Uroviricota; Caudoviricetes; unclassified Caudoviricetes
TaxID: 2996180
Scientific Name: Pseudomonas phage zjk6
Rank: species
Lineage: Viruses; Duplodnaviria; Heunggongvirae; Uroviricota; Caudoviricetes; unclassified Caudoviricetes
TaxID: 2969644
Scientific Name: Pseudomonas phage vB_PaeS-D14P
Rank: species
Lineage: Vi

In [126]:
report_data[37678]

[[{'TaxId': '3036250', 'ScientificName': 'uncultured Vicinamibacterales bacterium', 'ParentTaxId': '3036249', 'Rank': 'species', 'Division': 'Environmental samples', 'GeneticCode': {'GCId': '11', 'GCName': 'Bacterial, Archaeal and Plant Plastid'}, 'MitoGeneticCode': {'MGCId': '0', 'MGCName': 'Unspecified'}, 'Lineage': 'cellular organisms; Bacteria; Acidobacteriota; Vicinamibacteria; Vicinamibacterales; environmental samples', 'LineageEx': [{'TaxId': '131567', 'ScientificName': 'cellular organisms', 'Rank': 'no rank'}, {'TaxId': '2', 'ScientificName': 'Bacteria', 'Rank': 'superkingdom'}, {'TaxId': '57723', 'ScientificName': 'Acidobacteriota', 'Rank': 'phylum'}, {'TaxId': '1813735', 'ScientificName': 'Vicinamibacteria', 'Rank': 'class'}, {'TaxId': '2910145', 'ScientificName': 'Vicinamibacterales', 'Rank': 'order'}, {'TaxId': '3036249', 'ScientificName': 'environmental samples', 'Rank': 'no rank'}], 'CreateDate': '2023/03/23 06:57:02', 'UpdateDate': '2023/03/23 06:57:02', 'PubDate': '2023

In [124]:
tax_id_to_tax_id

defaultdict(list,
            {3036250: ['3036250', '3036250'],
             3036248: ['3036248', '3036248'],
             3034326: ['3034326'],
             3003696: ['3003696', '3003696'],
             2996180: ['2996180'],
             2969644: ['2969644'],
             2969642: ['2969642', '2969642'],
             2968558: ['2968558', '2968558'],
             2965061: ['2965061'],
             2960088: ['2960088', '2960088', '2960088', '2960088'],
             2950078: ['2950078'],
             2935773: ['2935773'],
             2935771: ['2935771'],
             2928017: ['2928017', '2928017'],
             2913493: ['2913493', '2913493'],
             2910168: ['2910168', '2910168'],
             2910147: ['2910147', '2910147'],
             2900548: ['2900548',
              '2900548',
              '2900548',
              '2900548',
              '2900548',
              '2900548',
              '2900548',
              '2900548',
              '2900548',
              '290054

In [131]:
# are there any records that have family/ genus that are in the 'all taxa with neither family nor genus classification' bucket?
records_w_family_genus = defaultdict(list)
for sample_id, records in report_data.items():
    for record in records:
        is_family_genus_found = find_family_genus_info(record)
        if is_family_genus_found:
            # print(record)
            # print(record[0]["TaxId"])
            records_w_family_genus[sample_id].append(record)
    

In [137]:
records_that_have_genus_species_unique_taxid_to_name = defaultdict(str)

for k, v in records_w_family_genus.items():
    for r in v:
        records_that_have_genus_species_unique_taxid_to_name[r[0]["TaxId"]] = r[0]["ScientificName"]

In [138]:
records_that_have_genus_species_unique_taxid_to_name

defaultdict(str,
            {'2447898': 'Candidatus Kapaibacterium sp.',
             '100': 'Ancylobacter aquaticus',
             '2776816': 'Streptococcus phage SA01',
             '2735749': 'Campylobacter sp. LMG 7929',
             '2735748': 'Campylobacter sp. LMG 17559',
             '2735747': 'Campylobacter sp. RM5063',
             '2735734': 'Campylobacter sp. RM12637',
             '2509768': 'Rhizobium phage RHph_TM39'})

In [143]:
records_w_family_genus_cleaned = {}
for sample_id, records in records_w_family_genus.items():
    taxid_name = {}
    for record in records:
        taxid_name[record[0]["TaxId"]] = record[0]["ScientificName"]
    records_w_family_genus_cleaned[sample_id] = taxid_name

In [144]:
records_w_family_genus_cleaned

{37678: {'2447898': 'Candidatus Kapaibacterium sp.',
  '100': 'Ancylobacter aquaticus'},
 37677: {'100': 'Ancylobacter aquaticus'},
 37676: {'100': 'Ancylobacter aquaticus'},
 37675: {'100': 'Ancylobacter aquaticus'},
 37674: {'100': 'Ancylobacter aquaticus'},
 37673: {'2447898': 'Candidatus Kapaibacterium sp.',
  '100': 'Ancylobacter aquaticus'},
 37672: {'2776816': 'Streptococcus phage SA01',
  '100': 'Ancylobacter aquaticus'},
 37671: {'2735749': 'Campylobacter sp. LMG 7929',
  '2735748': 'Campylobacter sp. LMG 17559',
  '2735747': 'Campylobacter sp. RM5063',
  '2735734': 'Campylobacter sp. RM12637',
  '2447898': 'Candidatus Kapaibacterium sp.',
  '100': 'Ancylobacter aquaticus'},
 37670: {'100': 'Ancylobacter aquaticus'},
 37669: {'100': 'Ancylobacter aquaticus'},
 37668: {'100': 'Ancylobacter aquaticus'},
 37605: {'100': 'Ancylobacter aquaticus'},
 37604: {'100': 'Ancylobacter aquaticus'},
 37603: {'2447898': 'Candidatus Kapaibacterium sp.',
  '100': 'Ancylobacter aquaticus'},
 37

In [152]:
records_w_family_genus_cleaned[37602]

{'2447898': 'Candidatus Kapaibacterium sp.', '100': 'Ancylobacter aquaticus'}

In [155]:
# look into 37602 a bit more

# from CZID data:

# get the species_tax_ids when the 'name' is 'all taxa with neither family nor genus classification'
unnamed_records = get_sample_report_data(37602)
# fetch the taxonomy from NCBI for each taxid in the 'all taxa with neither family nor genus classification' bucket
species_tax_id_unnamed_records_37602 = unnamed_records[0]["species_tax_ids"]
# check to see if record is there
[i for i in species_tax_id_unnamed_records_37602 if i == 2447898 or i == 100]

[2447898]

In [158]:
records_w_family_genus_extra_cleaned = {}
for sample_id, tax_to_name in records_w_family_genus_cleaned.items():
    new_tax_to_name = {}
    for t, n in tax_to_name.items():
        if t != '100': # this was happening because i was querying -100 from the sample report which means something different in CZID
           new_tax_to_name[t] = n
    if new_tax_to_name:
        records_w_family_genus_extra_cleaned[sample_id]=new_tax_to_name


In [159]:
records_w_family_genus_extra_cleaned

{37678: {'2447898': 'Candidatus Kapaibacterium sp.'},
 37673: {'2447898': 'Candidatus Kapaibacterium sp.'},
 37672: {'2776816': 'Streptococcus phage SA01'},
 37671: {'2735749': 'Campylobacter sp. LMG 7929',
  '2735748': 'Campylobacter sp. LMG 17559',
  '2735747': 'Campylobacter sp. RM5063',
  '2735734': 'Campylobacter sp. RM12637',
  '2447898': 'Candidatus Kapaibacterium sp.'},
 37603: {'2447898': 'Candidatus Kapaibacterium sp.'},
 37602: {'2447898': 'Candidatus Kapaibacterium sp.'},
 37593: {'2509768': 'Rhizobium phage RHph_TM39'}}

In [54]:
# this is a record that is not in the 'all taxa with neither family nor genus classification' bucket 
# (just to see what the different responses look like)

record_286 = fetch_taxonomy_details_by_taxid(286)

TaxID: 286
Scientific Name: Pseudomonas
Rank: genus
Lineage: cellular organisms; Bacteria; Pseudomonadota; Gammaproteobacteria; Pseudomonadales; Pseudomonadaceae


In [81]:
find_family_genus_info(record_286)

True

In [79]:
record_286[0]["LineageEx"]


[{'TaxId': '131567', 'ScientificName': 'cellular organisms', 'Rank': 'no rank'}, {'TaxId': '2', 'ScientificName': 'Bacteria', 'Rank': 'superkingdom'}, {'TaxId': '1224', 'ScientificName': 'Pseudomonadota', 'Rank': 'phylum'}, {'TaxId': '1236', 'ScientificName': 'Gammaproteobacteria', 'Rank': 'class'}, {'TaxId': '72274', 'ScientificName': 'Pseudomonadales', 'Rank': 'order'}, {'TaxId': '135621', 'ScientificName': 'Pseudomonadaceae', 'Rank': 'family'}]