In [None]:
from pathlib import Path
import re
from Bio import Entrez
import time
import pandas as pd
import tarfile

In [None]:
# Define paths and NCBI Entrez settings used by the cells below

# Pipeline output folder containing all .pdb structures. By default also contains .json files
pipeline_colabfold_dir = Path("../BASEL_data/colabfold")

# Output archive (gzip-compressed tarball) that receives the renamed .pdb files
output_tar = Path("../BASEL_data/colabfold_structures.tar.gz")

# Contact address attached to Entrez requests.
# NOTE(review): this looks like a placeholder - replace with a real address before running.
Entrez.email = "mail@mail.com"

In [None]:
# Collect, rename and compress all relevant pdb files into a new .tar.gz archive
# New names will be: accession_prot_protein_(un)known.pdb
# (the numeric index between the protein and the status is dropped)
# Also grab all accessions, used to get taxonomy info later
count = 0
accession_list = set()
skipped = []  # names of .pdb files that do not follow the expected naming scheme

# Compiled once, outside the loop: the pattern is the same for every file.
pattern = re.compile(
    r"^(?P<accession>.+?)_prot_(?P<protein>.+?)_\d+_(?P<status>known|unknown)"
)

with tarfile.open(output_tar, "w:gz") as tar:
    # Sorted so the member order inside the archive is the same on every run;
    # glob() yields files in arbitrary, filesystem-dependent order.
    for file in sorted(pipeline_colabfold_dir.glob("*.pdb")):
        m = pattern.match(file.name.strip())
        if m is None:
            # Not a pipeline structure file (or named differently): record it
            # and move on instead of crashing on m.group() half-way through.
            skipped.append(file.name)
            continue

        accession = m.group("accession")
        newname = f"{accession}_prot_{m.group('protein')}_{m.group('status')}.pdb"

        accession_list.add(accession)

        tar.add(file, arcname=newname)
        count += 1

print(f"Sent {count} files from {pipeline_colabfold_dir} to {output_tar}")
if skipped:
    print(f"Skipped {len(skipped)} .pdb files with unexpected names:")
    for name in skipped:
        print(f"  {name}")

In [None]:
# Number of distinct accessions collected from the archived file names
len(accession_list)

In [None]:
# Grab taxonomy from NCBI for phages in accession_list
#
# For each accession, the nucleotide summary is fetched to obtain the TaxId and
# title; when a TaxId is present, the taxonomy record is fetched to obtain
# family, genus and scientific name. An accession whose lookup fails is still
# written out, with every field except "Accession" left empty.


def _read_first(handle):
    """Parse an Entrez XML handle and return its first record.

    The handle is closed even when parsing fails or the result is empty, so a
    bad response does not leak the underlying connection.
    """
    try:
        return Entrez.read(handle)[0]
    finally:
        handle.close()


def _metadata_row(acc, organism=None, taxid=None, family=None, genus=None, species=None):
    """Build one output row; fields that were not retrieved stay None."""
    return {
        "Accession": acc,
        "ScientificName": organism,
        "TaxID": taxid,
        "Family": family,
        "Genus": genus,
        "Species": species,
    }


results = []

# Sorted so the progress log and the row order of the TSV are reproducible;
# iteration order of a set of strings can differ between kernel sessions.
accessions = sorted(accession_list)

for i, acc in enumerate(accessions, start=1):
    print(f"[{i}/{len(accessions)}] Fetching taxonomy for: {acc}")
    try:
        # Get the record summary
        record = _read_first(Entrez.esummary(db="nucleotide", id=acc, retmode="xml"))

        taxid = record.get("TaxId", None)
        # NOTE(review): "Title" is preferred over "Organism" here, so the
        # "ScientificName" column may hold the sequence title - confirm intent.
        organism = record.get("Title", record.get("Organism", None))

        # Get taxonomy lineage
        family = genus = species = None
        if taxid:
            tax_record = _read_first(
                Entrez.efetch(db="taxonomy", id=taxid, retmode="xml")
            )
            lineage = {
                t["Rank"]: t["ScientificName"] for t in tax_record.get("LineageEx", [])
            }
            family = lineage.get("family")
            genus = lineage.get("genus")
            species = tax_record.get("ScientificName", None)

        row = _metadata_row(acc, organism, taxid, family, genus, species)

    except Exception as e:
        # Best-effort: report the failure and keep a placeholder row so every
        # accession still appears in the output table.
        print(f"Error fetching {acc}: {e}")
        row = _metadata_row(acc)

    finally:
        # Pause after every accession, including failed ones, so a run of
        # errors does not turn into back-to-back requests against NCBI.
        time.sleep(0.2)

    results.append(row)

# Save results
df = pd.DataFrame(results)
df.to_csv("../BASEL_data/pipeline_phage_metadata.tsv", sep="\t", index=False)