In [69]:
import cerberus
import pandas as pd

In [70]:
# Load the merged transcript table (tab-separated despite the .gtf extension)
df = pd.read_table('merged_smol.gtf')

In [71]:
# Label each row with a unique, position-based transcript id
df['tid'] = [f'transcript_{pos}' for pos in range(len(df))]
df

Unnamed: 0,Chromosome,Strand,ic,tss,tes,source,tid
0,GL000008.2,+,83545-83859-84014-85566-85625-88635-88695-129984,83369,130597,"iq_GM22305_1,espresso_GM22305_1",transcript_0
1,GL000008.2,+,83545-83926-84014-85442-85477-85566-85625-199427,83374,199914,"iq_HG01567_1,iq_GM22234_1",transcript_1
2,GL000008.2,+,83545-83926-84014-85456-85477-85566-85625-1351...,83408,157165,"iq_GM22234_1,espresso_GM22234_1",transcript_2
3,GL000008.2,+,83545-83926-84014-85456-85477-85566-85625-173515,83391,173639,espresso_HG01567_1,transcript_3
4,GL000008.2,+,83545-83926-84014-85456-85477-85566-85625-88635,83350,88839,iq_GM22234_1,transcript_4
5,GL000008.2,+,83545-83926-84014-85566-85625-129984,83378,130204,espresso_HG03732_1,transcript_5
6,GL000008.2,+,83545-83926-84014-85566-85625-155429-155531-15...,83381,157165,"iq_GM22234_1,iq_HG03732_1,espresso_GM22234_1,e...",transcript_6
7,GL000008.2,+,83545-83926-84014-85566-85625-173515,83375,175317,espresso_GM22234_1,transcript_7
8,GL000008.2,+,83545-83926-84014-85566-85625-173515-173643-19...,83378,199914,iq_GM22234_1,transcript_8
9,chrY,-,3218694-3187915-3187827-3185395,3218774,3184977,"iq_HG02261_1,iq_GM19390_1,espresso_HG02261_1,e...",transcript_9


In [68]:
# data = {
#     "Chromosome": ["chr"],
#     "Strand": ["+"],
#     "ic": ["2-3-7-10"],
#     "tid": 'test',
#     "tss": [1],
#     "tes": [15],
#     "source": ["source_x"]
# }

# df = pd.DataFrame(data)

In [77]:
# Peek at the last five transcripts of the table
df.tail(n=5)

Unnamed: 0,Chromosome,Strand,ic,tss,tes,source,tid
14,chrY,-,386367-361590-361404-347693-347589-347336-3472...,386906,333932,"iq_GM22300_1,iq_GM19352_1",transcript_14
15,chrY,-,57203181-57202020,57203404,57201134,iq_HG03729_1,transcript_15
16,chrY,-,57203181-57202145,57203397,57201134,"iq_HG04217_1,iq_HG03729_1,iq_HG02293_1",transcript_16
17,chrY,-,7076386-7067885,7076490,7067600,espresso_HG02261_1,transcript_17
18,chrY,-,965270-964752,965366,964423,iq_HG01975_1,transcript_18


In [80]:
def _parse_intron_chain(ic):
    """Return the coordinates of a '-'-separated intron-chain string as ints.

    A missing value, an empty string or a bare '-' yields an empty list, which
    the caller treats as a transcript without introns.

    Raises:
        ValueError: if a coordinate is not an integer, or if the number of
            coordinates is odd (every intron needs a start and an end).
    """
    if pd.isna(ic):
        return []
    text = str(ic).strip()
    if text in ('', '-'):
        return []
    coords = [int(coord) for coord in text.split('-')]
    if len(coords) % 2:
        raise ValueError(
            f"intron chain {text!r} has an odd number of coordinates"
        )
    return coords


def process_table_to_gtf_df(df):
    """Expand a transcript table into one GTF-style exon row per exon.

    Args:
        df: DataFrame with the columns 'Chromosome', 'Strand', 'ic', 'tss',
            'tes' and 'source'. 'ic' is a '-'-separated string of intron
            boundaries listed in transcript order (intron 1 start, intron 1
            end, intron 2 start, ...); 'tss' and 'tes' are the transcript
            start and end coordinates.

    Returns:
        DataFrame with the columns 'Chromosome', 'Source', 'Feature', 'Start',
        'End', 'Score', 'Strand', 'Frame' and 'Attributes'. Every row is an
        'exon' feature with Start <= End. Gene and transcript ids are derived
        from the input row's index label. For '+' rows exons are emitted (and
        numbered) in transcript order; for any other strand that order is
        reversed, i.e. starting at the 'tes' end of the transcript.

    Raises:
        ValueError: if an intron chain is malformed (see _parse_intron_chain).
    """
    gtf_entries = []

    for index, row in df.iterrows():
        chrom = row['Chromosome']
        strand = row['Strand']
        ic_list = _parse_intron_chain(row['ic'])
        tss = int(row['tss'])
        tes = int(row['tes'])
        source = row['source']
        gene_id = f"gene_{index}"
        transcript_id = f"transcript_{index}"

        # In transcript order the boundaries alternate exon start / exon end:
        # tss, intron1 start, intron1 end, ..., intronN end, tes. Consecutive
        # pairs are therefore exactly the exons, whatever the strand, and a
        # transcript without introns collapses to the single pair (tss, tes).
        boundaries = [tss, *ic_list, tes]
        exon_coords = [
            (boundaries[i], boundaries[i + 1])
            for i in range(0, len(boundaries), 2)
        ]
        if strand != '+':
            # Emit reverse-strand exons starting from the tes end so that the
            # rows come out in ascending genomic order.
            exon_coords.reverse()

        # Ensure each exon has start <= end and create GTF entries
        for i, (start, end) in enumerate(exon_coords):
            start, end = min(start, end), max(start, end)
            attributes = f'gene_id "{gene_id}"; transcript_id "{transcript_id}"; exon_number "{i+1}";'
            gtf_entries.append([chrom, source, 'exon', start, end, '.', strand, '.', attributes])

    # Convert the list of GTF entries to a DataFrame
    gtf_df = pd.DataFrame(gtf_entries, columns=['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Attributes'])

    return gtf_df


In [81]:
# Expand the transcript table into per-exon GTF-style rows
gtf_df = df.pipe(process_table_to_gtf_df)

In [82]:
# Match the complete attribute token literally, so that an id which is a prefix
# of another id (e.g. transcript_1 vs transcript_10) cannot select extra rows.
gtf_df.loc[gtf_df.Attributes.str.contains('transcript_id "transcript_0";', regex=False)]

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,Attributes
0,GL000008.2,"iq_GM22305_1,espresso_GM22305_1",exon,83369,83545,.,+,.,"gene_id ""gene_0""; transcript_id ""transcript_0""..."
1,GL000008.2,"iq_GM22305_1,espresso_GM22305_1",exon,83859,84014,.,+,.,"gene_id ""gene_0""; transcript_id ""transcript_0""..."
2,GL000008.2,"iq_GM22305_1,espresso_GM22305_1",exon,85566,85625,.,+,.,"gene_id ""gene_0""; transcript_id ""transcript_0""..."
3,GL000008.2,"iq_GM22305_1,espresso_GM22305_1",exon,88635,88695,.,+,.,"gene_id ""gene_0""; transcript_id ""transcript_0""..."
4,GL000008.2,"iq_GM22305_1,espresso_GM22305_1",exon,129984,130597,.,+,.,"gene_id ""gene_0""; transcript_id ""transcript_0""..."


In [83]:
# Source row for transcript_0, for comparison with its exon rows
df[df['tid'] == 'transcript_0']

Unnamed: 0,Chromosome,Strand,ic,tss,tes,source,tid
0,GL000008.2,+,83545-83859-84014-85566-85625-88635-88695-129984,83369,130597,"iq_GM22305_1,espresso_GM22305_1",transcript_0


In [84]:
# Match the complete attribute token literally, so that an id which is a prefix
# of another id (e.g. transcript_17 vs transcript_170) cannot select extra rows.
gtf_df.loc[gtf_df.Attributes.str.contains('transcript_id "transcript_17";', regex=False)]

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,Attributes
79,chrY,espresso_HG02261_1,exon,7067600,7067885,.,-,.,"gene_id ""gene_17""; transcript_id ""transcript_1..."
80,chrY,espresso_HG02261_1,exon,7076386,7076490,.,-,.,"gene_id ""gene_17""; transcript_id ""transcript_1..."


In [85]:
# Source row for transcript_17, for comparison with its exon rows
df[df['tid'] == 'transcript_17']

Unnamed: 0,Chromosome,Strand,ic,tss,tes,source,tid
17,chrY,-,7076386-7067885,7076490,7067600,espresso_HG02261_1,transcript_17
