In [1]:
import gzip
from Bio import SeqIO
import pandas as pd

# Biopython reports strand as +1 / -1. Any other value (None for unstranded or
# mixed-strand compound locations, 0 for unknown) has no entry here and is
# stored as a missing value instead of being mislabelled "-" or raising on the
# `None > 0` comparison.
STRAND_SYMBOLS = {1: "+", -1: "-"}

# Open the gzipped GenBank file in text mode and materialise every record
# while the handle is still open.
with gzip.open("raw/GCF_000195955.2_ASM19595v2_genomic.gbff.gz", "rt") as handle:
    records = list(SeqIO.parse(handle, "genbank"))

# One row per "gene" or "CDS" feature. A locus normally has both feature
# types, so it appears twice: the "gene" row has no product / protein_id,
# the "CDS" row carries them.
genes = []
for record in records:
    for feature in record.features:
        if feature.type not in ("gene", "CDS"):
            continue

        qualifiers = feature.qualifiers
        location = feature.location

        genes.append({
            "genome_type": "H37Rv reference",
            # Qualifier values are lists; take the first entry and turn a
            # missing / empty value into None.
            "gene_id": qualifiers.get("gene", [""])[0] or None,
            "locus_tag": qualifiers.get("locus_tag", [""])[0] or None,
            # Coordinates are stored exactly as Biopython reports them
            # (the first gene in the output starts at 0).
            "start_pos": int(location.start),
            "end_pos": int(location.end),
            "strand": STRAND_SYMBOLS.get(location.strand),
            # Unlike the other qualifiers, a missing product stays "".
            "product": qualifiers.get("product", [""])[0],
            "protein_id": qualifiers.get("protein_id", [""])[0] or None,
        })

# Build the table, drop rows that are identical in every column, and
# renumber the index from 0.
df_genes = pd.DataFrame(genes).drop_duplicates().reset_index(drop=True)
df_genes.head()

Unnamed: 0,genome_type,gene_id,locus_tag,start_pos,end_pos,strand,product,protein_id
0,H37Rv reference,dnaA,Rv0001,0,1524,+,,
1,H37Rv reference,dnaA,Rv0001,0,1524,+,chromosomal replication initiator protein DnaA,NP_214515.1
2,H37Rv reference,dnaN,Rv0002,2051,3260,+,,
3,H37Rv reference,dnaN,Rv0002,2051,3260,+,DNA polymerase III subunit beta,NP_214516.1
4,H37Rv reference,recF,Rv0003,3279,4437,+,,


In [2]:
# Persist the gene table to the staging area; the positional index is not written.
df_genes.to_parquet("staging/reference_genes.parquet", index=False)

In [3]:
# Reload the staged table from disk (rebinding df_genes) and render it.
df_genes = pd.read_parquet("staging/reference_genes.parquet")

display(df_genes)

Unnamed: 0,genome_type,gene_id,locus_tag,start_pos,end_pos,strand,product,protein_id
0,H37Rv reference,dnaA,Rv0001,0,1524,+,,
1,H37Rv reference,dnaA,Rv0001,0,1524,+,chromosomal replication initiator protein DnaA,NP_214515.1
2,H37Rv reference,dnaN,Rv0002,2051,3260,+,,
3,H37Rv reference,dnaN,Rv0002,2051,3260,+,DNA polymerase III subunit beta,NP_214516.1
4,H37Rv reference,recF,Rv0003,3279,4437,+,,
...,...,...,...,...,...,...,...,...
7909,H37Rv reference,,Rv3922c,4410052,4410415,-,membrane protein insertion efficiency factor,NP_218439.1
7910,H37Rv reference,rnpA,Rv3923c,4410411,4410789,-,,
7911,H37Rv reference,rnpA,Rv3923c,4410411,4410789,-,ribonuclease P protein component,NP_218440.3
7912,H37Rv reference,rpmH,Rv3924c,4410785,4410929,-,,
