In [1]:
from pathlib import Path

import pandas as pd

# GTF records carry nine tab-separated fields and no header row, so the
# field names are supplied by hand, in file order.
columns = [
    "seqname",
    "source",
    "feature",
    "start",
    "end",
    "score",
    "strand",
    "frame",
    "attributes",
]

# Load the gzipped annotation into one frame, one row per GTF record.
# NOTE(review): pandas applies `comment` anywhere in a line, not only at the
# start, so a '#' inside an attribute value would cut that row short --
# confirm this file has none.
df_gtf = pd.read_csv(
    "raw/GCF_000195955.2_ASM19595v2_genomic.gtf.gz",
    names=columns,
    sep="\t",
    comment="#",
    compression="gzip",
)

# Build the exon table in one chain:
#   1. keep only rows whose feature is "exon";
#   2. pull gene_id / transcript_id out of the free-text attributes column
#      (e.g. gene_id "Rv0001"; transcript_id "Rv0001_t1";);
#   3. rename the coordinate columns and keep the five fields of interest;
#   4. number exons 1..n within each transcript, in the order the rows appear.
# NOTE(review): step 4 relies on the file's row order -- confirm that is the
# intended numbering for '-' strand transcripts.
df_exons = (
    df_gtf.loc[df_gtf["feature"] == "exon"]
    .assign(
        gene_id=lambda rows: rows["attributes"].str.extract(r'gene_id "([^"]+)"', expand=False),
        transcript_id=lambda rows: rows["attributes"].str.extract(r'transcript_id "([^"]+)"', expand=False),
    )
    .rename(columns={"start": "start_pos", "end": "end_pos"})
    .loc[:, ["gene_id", "transcript_id", "start_pos", "end_pos", "strand"]]
    .assign(exon_number=lambda rows: rows.groupby("transcript_id").cumcount() + 1)
)

# Tag every row with the genome it came from, as the leading column.
df_exons.insert(0, 'genome_type', "H37Rv reference")

df_exons.head()

Unnamed: 0,genome_type,gene_id,transcript_id,start_pos,end_pos,strand,exon_number
30,H37Rv reference,Rvnt01,unassigned_transcript_8,10887,10960,+,1
33,H37Rv reference,Rvnt02,unassigned_transcript_9,11112,11184,+,1
88,H37Rv reference,Rvnt03,unassigned_transcript_23,25644,25726,+,1
1008,H37Rv reference,RVnc0008,unassigned_transcript_253,293604,293705,+,1
1308,H37Rv reference,Rvnt04,unassigned_transcript_328,386204,386274,-,1


In [2]:
# Persist the exon table for downstream steps. The frame's index still holds
# the row positions from the original GTF read, so it is left out of the file.
staging_path = Path('staging/reference_exons.parquet')

# Make sure the output folder exists so the write also succeeds on a fresh
# checkout where staging/ has not been created yet.
staging_path.parent.mkdir(parents=True, exist_ok=True)

df_exons.to_parquet(staging_path, index=False)

In [3]:
# Read the staged file back to confirm the write round-trips.
df_validation = pd.read_parquet('staging/reference_exons.parquet')

# Cheap round-trip checks against the in-memory table that was just written:
# same columns in the same order, and the same number of rows.
assert list(df_validation.columns) == list(df_exons.columns), "staged columns differ from df_exons"
assert len(df_validation) == len(df_exons), "staged row count differs from df_exons"

df_validation.head()

Unnamed: 0,genome_type,gene_id,transcript_id,start_pos,end_pos,strand,exon_number
0,H37Rv reference,Rvnt01,unassigned_transcript_8,10887,10960,+,1
1,H37Rv reference,Rvnt02,unassigned_transcript_9,11112,11184,+,1
2,H37Rv reference,Rvnt03,unassigned_transcript_23,25644,25726,+,1
3,H37Rv reference,RVnc0008,unassigned_transcript_253,293604,293705,+,1
4,H37Rv reference,Rvnt04,unassigned_transcript_328,386204,386274,-,1
