In [1]:
from Bio import SeqIO
import pandas as pd
import gzip
# Parse every record of the gzipped H37Rv RefSeq GenBank flat file into memory.
with gzip.open("raw/GCF_000195955.2_ASM19595v2_genomic.gbff.gz", "rt") as handle:
    records = list(SeqIO.parse(handle, "genbank"))

# Feature keys that are collected under their own name.
# "promoter" and "UTR" are kept for backward compatibility, but current
# GenBank/INSDC files spell UTRs as "5'UTR" / "3'UTR" and annotate promoters
# as `regulatory` features carrying /regulatory_class="promoter" (handled
# separately below). Without these, only ncRNA features would ever match.
DIRECT_FEATURE_TYPES = {"promoter", "UTR", "5'UTR", "3'UTR", "ncRNA"}

# Explicit column order so the frame keeps its schema even when nothing matches.
REG_COLUMNS = [
    "genome_type",
    "genome_position",
    "element_type",
    "associated_gene",
    "sequence",
]

reg_elements = []
for record in records:
    for feature in record.features:
        if feature.type in DIRECT_FEATURE_TYPES:
            element_type = feature.type
        elif (
            feature.type == "regulatory"
            and "promoter" in feature.qualifiers.get("regulatory_class", [])
        ):
            # Report modern promoter annotations under the legacy label so the
            # `element_type` vocabulary matches the original filter's intent.
            element_type = "promoter"
        else:
            continue

        reg_elements.append(
            {
                "genome_type": "H37Rv reference",
                # Biopython locations are 0-based; `start` is the leftmost
                # coordinate of the feature regardless of strand.
                "genome_position": int(feature.location.start),
                "element_type": element_type,
                # Empty string when the feature has no /locus_tag qualifier.
                "associated_gene": feature.qualifiers.get("locus_tag", [""])[0],
                # extract() applies the feature's location (including strand)
                # to the parent record before the sequence is stringified.
                "sequence": str(feature.location.extract(record).seq),
            }
        )

df_reg = pd.DataFrame(reg_elements, columns=REG_COLUMNS)
display(df_reg.head())

Unnamed: 0,genome_type,genome_position,element_type,associated_gene,sequence
0,H37Rv reference,293603,ncRNA,RVnc0008,CGGATAGCCCCGTGTTGTTGTCTGACCCCCGACCCCGACGGCAATG...
1,H37Rv reference,704186,ncRNA,RVnc0005,CGGGACTCCTGAGAAGGATCCTGTAGGCCGCAGCCCCACCCACGGG...
2,H37Rv reference,918263,ncRNA,RVnc0002,CATAGAGGACGGAGTCGGTGAGGCTCTCCGCGAAATAGTGGCCCTG...
3,H37Rv reference,1220387,ncRNA,RVnc0034,AGCACACGCCATCCACCGTTGCACGTCTGCACCGCAAACGGCAGCC...
4,H37Rv reference,1283692,ncRNA,RVnc0012,TCGATGCCGGTCGGAAGATGTGCCTGCACACCTGGCTCGGCGCCCA...


In [2]:
# Persist the extracted regulatory-element table to the staging area,
# dropping the positional index from the written file.
df_reg.to_parquet(
    path="staging/reference_regulatory.parquet",
    index=False,
)