In [1]:
import gzip
import shutil
from pathlib import Path

import pandas as pd
import requests
from Bio import SeqIO

# 1. Download protein FASTA and GenBank annotation
#
# Both files are fetched from the NCBI genomes directory for assembly
# GCF_000195955.2 and written to the current working directory under their
# remote names. Only the GenBank flat file is decompressed to disk
# ("genomic.gbff"); the protein FASTA stays gzipped and is read via gzip later.
base = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/195/955/GCF_000195955.2_ASM19595v2/"
files = {
    "faa":    "GCF_000195955.2_ASM19595v2_protein.faa.gz",
    "gbff":   "GCF_000195955.2_ASM19595v2_genomic.gbff.gz"
}
for key, fname in files.items():
    outname = fname
    # The context manager releases the streamed connection when done; the
    # timeout keeps a stalled server from hanging the cell indefinitely.
    with requests.get(base + fname, stream=True, timeout=60) as resp:
        # Fail here on 404/5xx instead of saving an error page as a ".gz" file
        # that would only blow up later inside gzip.
        resp.raise_for_status()
        with open(outname, "wb") as out:
            for block in resp.iter_content(8192):
                out.write(block)
    # decompress GenBank only
    if key == "gbff":
        with gzip.open(outname, "rb") as f_in, open("genomic.gbff", "wb") as f_out:
            # Stream in chunks rather than loading the whole file into memory.
            shutil.copyfileobj(f_in, f_out)

# 2. Parse protein FASTA into DataFrame
#
# One row per FASTA record. The gzipped file is opened in text mode inside a
# `with` block so the handle is closed once all records are read; SeqIO.parse
# yields records lazily, so the list is built before the block exits.
with gzip.open(files["faa"], "rt") as faa_handle:
    prot_records = SeqIO.parse(faa_handle, "fasta")
    prot_data = [
        {
            # last "|"-separated field of the record id; this is the join key
            # used against the GenBank protein_id qualifier
            "protein_id": rec.id.split("|")[-1],
            "fasta_description": rec.description,
            "sequence": str(rec.seq),
            "length": len(rec.seq),
        }
        for rec in prot_records
    ]
df_faa = pd.DataFrame(prot_data)

# 3. Parse GenBank for CDS qualifiers
#
# Walk every record in the flat file and keep one row per CDS feature. For each
# qualifier of interest take the first element of its value, falling back to ""
# when the qualifier is absent.
gbk_path = "genomic.gbff"
cds_qualifier_keys = ("protein_id", "gene", "locus_tag", "product")
rows = [
    {name: feature.qualifiers.get(name, [""])[0] for name in cds_qualifier_keys}
    for record in SeqIO.parse(gbk_path, "genbank")   # each contig/record
    for feature in record.features
    if feature.type == "CDS"                          # coding sequences only
]
df_gb = pd.DataFrame(rows)

# 4. Merge FASTA & annotation tables
#
# Left join keyed on protein_id: every FASTA row is kept, and the gene /
# locus_tag / product columns are filled in where a CDS with the same
# protein_id exists.
df_annot = df_faa.merge(df_gb, how="left", on="protein_id")

# 5. Inspect the merged table
df_annot.head()


Unnamed: 0,protein_id,fasta_description,sequence,length,gene,locus_tag,product
0,NP_214515.1,NP_214515.1 chromosomal replication initiator ...,MTDDPGSGFTTVWNAVVSELNGDPKVDDGPSSDANLSAPLTPQQRA...,507,dnaA,Rv0001,chromosomal replication initiator protein DnaA
1,NP_214516.1,NP_214516.1 DNA polymerase III subunit beta [M...,MDAATTRVGLTDLTFRLLRESFADAVSWVAKNLPARPAVPVLSGVL...,402,dnaN,Rv0002,DNA polymerase III subunit beta
2,NP_214517.1,NP_214517.1 DNA replication/repair protein Rec...,MYVRHLGLRDFRSWACVDLELHPGRTVFVGPNGYGKTNLIEALWYS...,385,recF,Rv0003,DNA replication/repair protein RecF
3,NP_214518.1,NP_214518.1 hypothetical protein Rv0004 [Mycob...,MTGSVDRPDQNRGERSMKSPGLDLVRRTLDEARAAARARGQDAGRG...,187,,Rv0004,hypothetical protein
4,NP_214519.2,NP_214519.2 DNA gyrase subunit B [Mycobacteriu...,MAAQKKKAQDEYGAASITILEGLEAVRKRPGMYIGSTGERGLHHLI...,675,gyrB,Rv0005,DNA gyrase subunit B


In [2]:
# Tag every row with its source genome as the first column.
# Guarded so that re-running this cell is a no-op: DataFrame.insert raises
# ValueError when the column already exists.
if 'genome_type' not in df_annot.columns:
    df_annot.insert(0, 'genome_type', "H37Rv reference")
df_annot.head()

Unnamed: 0,genome_type,protein_id,fasta_description,sequence,length,gene,locus_tag,product
0,H37Rv reference,NP_214515.1,NP_214515.1 chromosomal replication initiator ...,MTDDPGSGFTTVWNAVVSELNGDPKVDDGPSSDANLSAPLTPQQRA...,507,dnaA,Rv0001,chromosomal replication initiator protein DnaA
1,H37Rv reference,NP_214516.1,NP_214516.1 DNA polymerase III subunit beta [M...,MDAATTRVGLTDLTFRLLRESFADAVSWVAKNLPARPAVPVLSGVL...,402,dnaN,Rv0002,DNA polymerase III subunit beta
2,H37Rv reference,NP_214517.1,NP_214517.1 DNA replication/repair protein Rec...,MYVRHLGLRDFRSWACVDLELHPGRTVFVGPNGYGKTNLIEALWYS...,385,recF,Rv0003,DNA replication/repair protein RecF
3,H37Rv reference,NP_214518.1,NP_214518.1 hypothetical protein Rv0004 [Mycob...,MTGSVDRPDQNRGERSMKSPGLDLVRRTLDEARAAARARGQDAGRG...,187,,Rv0004,hypothetical protein
4,H37Rv reference,NP_214519.2,NP_214519.2 DNA gyrase subunit B [Mycobacteriu...,MAAQKKKAQDEYGAASITILEGLEAVRKRPGMYIGSTGERGLHHLI...,675,gyrB,Rv0005,DNA gyrase subunit B


In [3]:
# Persist the annotated proteome without the index. The staging directory is
# created first because to_parquet does not create missing parent directories,
# which would make this cell fail on a fresh checkout.
staging_path = Path('staging/reference_proteome.parquet')
staging_path.parent.mkdir(parents=True, exist_ok=True)
df_annot.to_parquet(staging_path, index=False)

In [4]:
# Read the staged parquet file back and preview the first five rows.
df_proteome = pd.read_parquet(path='staging/reference_proteome.parquet')
df_proteome.head(5)

Unnamed: 0,genome_type,protein_id,fasta_description,sequence,length,gene,locus_tag,product
0,H37Rv reference,NP_214515.1,NP_214515.1 chromosomal replication initiator ...,MTDDPGSGFTTVWNAVVSELNGDPKVDDGPSSDANLSAPLTPQQRA...,507,dnaA,Rv0001,chromosomal replication initiator protein DnaA
1,H37Rv reference,NP_214516.1,NP_214516.1 DNA polymerase III subunit beta [M...,MDAATTRVGLTDLTFRLLRESFADAVSWVAKNLPARPAVPVLSGVL...,402,dnaN,Rv0002,DNA polymerase III subunit beta
2,H37Rv reference,NP_214517.1,NP_214517.1 DNA replication/repair protein Rec...,MYVRHLGLRDFRSWACVDLELHPGRTVFVGPNGYGKTNLIEALWYS...,385,recF,Rv0003,DNA replication/repair protein RecF
3,H37Rv reference,NP_214518.1,NP_214518.1 hypothetical protein Rv0004 [Mycob...,MTGSVDRPDQNRGERSMKSPGLDLVRRTLDEARAAARARGQDAGRG...,187,,Rv0004,hypothetical protein
4,H37Rv reference,NP_214519.2,NP_214519.2 DNA gyrase subunit B [Mycobacteriu...,MAAQKKKAQDEYGAASITILEGLEAVRKRPGMYIGSTGERGLHHLI...,675,gyrB,Rv0005,DNA gyrase subunit B
