# __Extracting all Bifidobacterium strains__

### Loading and cleaning the required datasets

Import required libraries

In [1]:
import pandas as pd
import numpy as np
import time
from pathlib import Path
from collections import defaultdict

We will now load `names.dmp` and `nodes.dmp` as pandas DataFrames from the `new_taxdump_2025-12-01.zip` archive.  
This allows us to work with the data in a clean and structured way.

> Note: These files are not included in the repository due to their large size.  
> You can download the `new_taxdump_2025-12-01` snapshot from the NCBI FTP site:  
> [https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump_archive/](https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump_archive/)


In [2]:
# names.dmp separates its fields with "\t|\t", so a regex separator (python engine) is used.
# The file has no header row; the four column names are supplied while reading.
names_path = Path('../data/raw/new_taxdump_2025-12-01/names.dmp')
names = pd.read_csv(
    names_path,
    sep=r"\t\|\t",
    engine='python',
    header=None,
    names=['tax_id', 'name_txt', 'unique_name', 'name_class'],
)

In [3]:
# Preview the first rows; at this point name_class still ends with the "\t|" row terminator.
names.head() 

Unnamed: 0,tax_id,name_txt,unique_name,name_class
0,1,all,,synonym\t|
1,1,root,,scientific name\t|
2,2,Bacteria,Bacteria <bacteria>,scientific name\t|
3,2,bacteria,bacteria <blast name>,blast name\t|
4,2,bacteria,bacteria <genbank common name>,genbank common name\t|


In [4]:
# The separator regex leaves the row terminator "\t|" attached to the last field;
# trim those characters from the 'name_class' column.
names = names.assign(name_class=names['name_class'].str.strip('\t|'))

In [5]:
# Check that the "\t|" terminator has been removed from name_class.
names.head(3)

Unnamed: 0,tax_id,name_txt,unique_name,name_class
0,1,all,,synonym
1,1,root,,scientific name
2,2,Bacteria,Bacteria <bacteria>,scientific name


In [6]:
# Only the first three fields of nodes.dmp are read and named here:
# the taxon id, the id of its parent, and its rank.
nodes_path = Path('../data/raw/new_taxdump_2025-12-01/nodes.dmp')
nodes = pd.read_csv(
    nodes_path,
    sep=r'\t\|\t',
    engine='python',
    header=None,
    usecols=[0, 1, 2],
    names=['tax_id', 'parent_tax_id', 'rank'],
)

In [7]:
# Preview the tax_id / parent_tax_id / rank columns read from nodes.dmp.
nodes.head()

Unnamed: 0,tax_id,parent_tax_id,rank
0,1,1,no rank
1,2,131567,domain
2,6,335928,genus
3,7,6,species
4,9,32199,species


### Traversing the Bifidobacterium sub-tree

Since the NCBI taxonomy nodes form a hierarchical tree, I will construct a dictionary mapping parent IDs to their children to efficiently retrieve all descendants of the genus *Bifidobacterium*.

Now that we have cleaned __names__ and __nodes__, we need to identify the tax_id corresponding to the genus *Bifidobacterium*.

In [8]:
# Look up the genus by its exact scientific name. A substring search for "bifido" also
# matches every other taxon whose scientific name contains it, so taking the first hit
# afterwards would only return the genus by virtue of row order.
# The result is kept in `_` because the following cells read it under that name.
_ = names[(names['name_txt'] == 'Bifidobacterium') & (names['name_class'] == 'scientific name')]

In [9]:
# Show the first matching row (selected by position); it should be the genus entry itself.
_.iloc[0]

tax_id                    1678
name_txt       Bifidobacterium
unique_name                NaN
name_class     scientific name
Name: 7682, dtype: object

The tax_id of the genus *Bifidobacterium* is 1678.

In [10]:
# tax_id of the first matching row, used as the root of the sub-tree traversal.
bf_tax_id = _['tax_id'].iloc[0]
bf_tax_id

np.int64(1678)

In [11]:
# Build a mapping from each parent tax_id to the list of its direct children.
# The two columns are iterated directly instead of using DataFrame.iterrows(),
# which constructs a Series for every row and is very slow on large tables.
children = defaultdict(list)
for parent, child in zip(nodes['parent_tax_id'].tolist(), nodes['tax_id'].tolist()):
    children[parent].append(child)

In [12]:
# Iterative traversal of the sub-tree rooted at bf_tax_id.
# `found` collects every descendant tax_id; the root itself is never added.
found = set()
unseen = [bf_tax_id]  # stack of nodes whose children still have to be visited

while unseen:
    current = unseen.pop()

    for child in children.get(current, []):
        if child in found:
            # already recorded, so its children are (or will be) handled elsewhere
            continue
        found.add(child)
        unseen.append(child)

In [13]:
# Number of descendant tax_ids collected below the genus (the genus itself is not in `found`).
len(found)

903

In [14]:
# tax_id column of `nodes`, restricted to the descendants collected in `found`.
all_bifido_taxids = nodes['tax_id'].loc[lambda ids: ids.isin(found)]

### Determining whether the strains possess the hutH gene

To address this question, we will use the Biopython library, specifically the `Entrez` module, to retrieve relevant genomic information.

In [15]:
from Bio import Entrez

Entrez.email = "eliaszorrillagaldon@gmail.com"


def count_protein_hits(term):
    """Return the number of NCBI protein records matching the Entrez search `term`.

    The search is sent with retmax=0 because only the hit count is needed.
    The response handle is closed even when parsing the response raises.
    """
    handle = Entrez.esearch(
        db="protein",
        term=term,
        retmax=0
    )
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()

    return int(record["Count"])


# tax_ids to query at NCBI: every descendant collected in `found`
taxids = found

results = []


# Query the NCBI protein database to check whether each tax_id has a protein
# annotated as HutH (EC 4.3.1.3). Only the count of matching proteins is retrieved;
# if the count is greater than 0, HutH is recorded as present.
# NOTE(review): confirm that "[EC Number]" is a valid search field for db="protein"
# (e.g. via Entrez.einfo) — TODO verify.


for taxid in taxids:
    term = f"txid{taxid}[Organism:exp] AND 4.3.1.3[EC Number]"

    results.append({
        "tax_id": taxid,
        "HutH": count_protein_hits(term) > 0})

    time.sleep(0.34)  # NCBI rate limit

# Collect the per-tax_id results into a DataFrame
df_out = pd.DataFrame(results)

In [16]:
# Preview the per-tax_id HutH flags.
df_out.head()

Unnamed: 0,tax_id,HutH
0,2834432,False
1,2834433,False
2,2834434,False
3,2834435,False
4,2834436,False


In [17]:
# One row is appended per queried tax_id, so this should equal len(found).
len(df_out)

903

Merge the HutH results with the names DataFrame where `name_class == 'scientific name'`. 

This adds the strain names to the output: `bifido_df_names` includes the tax_id, the HutH presence flag, and the name.

In [18]:
# Left-join the HutH results with the scientific-name rows of `names`,
# keeping every row of df_out and adding its name_txt.
bifido_df_names = df_out.merge(
    names.loc[names['name_class'] == 'scientific name', ['tax_id', 'name_txt']],
    on='tax_id',
    how='left',
)
bifido_df_names.head()

Unnamed: 0,tax_id,HutH,name_txt
0,2834432,False,Bifidobacterium pongonis
1,2834433,False,Bifidobacterium saguinibicoloris
2,2834434,False,Bifidobacterium simiiventris
3,2834435,False,Bifidobacterium miconis
4,2834436,False,Bifidobacterium pluvialisilvae


Merge the `bifido_df_names` DataFrame with the `nodes` DataFrame to create a dataset containing each tax_id, its HutH presence, and its corresponding name and rank.

In [19]:
# Left-join on tax_id to attach the rank from `nodes` to every row.
final_bifido_df = bifido_df_names.merge(
    nodes.loc[:, ['tax_id', 'rank']],
    on='tax_id',
    how='left',
)
final_bifido_df.head(10)

Unnamed: 0,tax_id,HutH,name_txt,rank
0,2834432,False,Bifidobacterium pongonis,species
1,2834433,False,Bifidobacterium saguinibicoloris,species
2,2834434,False,Bifidobacterium simiiventris,species
3,2834435,False,Bifidobacterium miconis,species
4,2834436,False,Bifidobacterium pluvialisilvae,species
5,2834437,False,Bifidobacterium miconisargentati,species
6,1798154,False,Bifidobacterium sp. LMM_I9,species
7,1798155,False,Bifidobacterium aerophilum,species
8,1798157,False,Bifidobacterium avesanii,species
9,1798158,False,Bifidobacterium ramosum,species


### Save the results

Export the Final DataFrame to a `.csv` file for later analysis or sharing.

In [None]:
folder_path = Path('../data/processed')
# to_csv does not create missing directories, so make sure the output folder exists first.
folder_path.mkdir(parents=True, exist_ok=True)
final_bifido_df.to_csv(folder_path/"histidine_ammonia_lyase_presence_all_descendants.csv", index=False)