In [96]:
import pyranges as pr
import pandas as pd
import cerberus

In [97]:
def make_exon_df(n,c,e,s,g,t,nt=None,ag1=None,ag2=None):
    """
    Build a table of exon records for a single test transcript.

    Parameters:
        n: Number of exons. Not used by this function; kept so existing
            positional calls keep working.
        c: Values for the 'Chromosome' column (one per exon).
        e: Sequence of exon coordinate pairs; element 0 and element 1 of
            each pair are the two ends of the exon, in either order.
        s: Values for the 'Strand' column. Only the first entry is used to
            choose the sort direction.
        g: Value written to both 'gene_name' and 'gene_id'.
        t: Value written to both 'transcript_id' and 'transcript_name'.
        nt: Optional value for a 'new_transcript_id' column.
        ag1: Optional value for an 'ag1' column.
        ag2: Optional value for an 'ag2' column.
            The three optional columns are only added when the value is
            truthy.

    Returns:
        pandas DataFrame with one row per exon. 'Start' holds the smaller
        and 'End' the larger coordinate of each pair, and these two columns
        come last. Rows are sorted by 'Start', ascending for '+' strand and
        descending for '-' strand; the original row index labels are kept.

    Raises:
        ValueError: if the first strand is not '+' or '-', or if an exon is
            not an indexable pair.
    """
    # Fail with a clear message instead of an unbound `ascending` later on.
    strand = s[0] if len(s) > 0 else None
    if strand not in ('+', '-'):
        raise ValueError(
            "strand must be '+' or '-', got {!r}".format(strand))

    # Surface malformed exon input as an error rather than opening a debugger.
    try:
        raw_starts = [exon[0] for exon in e]
        raw_ends = [exon[1] for exon in e]
    except (TypeError, IndexError, KeyError) as err:
        raise ValueError(
            "each exon in `e` must be a (start, end) pair, got {!r}".format(e)
        ) from err

    df = pd.DataFrame()
    df['Chromosome'] = c
    df['Start'] = raw_starts
    df['End'] = raw_ends
    df['Strand'] = s
    df['Feature'] = 'exon'
    for gene_col in ('gene_name', 'gene_id'):
        df[gene_col] = g
    for transcript_col in ('transcript_id', 'transcript_name'):
        df[transcript_col] = t
    if nt:
        df['new_transcript_id'] = nt
    if ag1:
        df['ag1'] = ag1
    if ag2:
        df['ag2'] = ag2

    # reorder exons and starts/ stops if needed: make Start <= End per row
    # and move both coordinate columns to the end of the frame
    coords = df[['Start', 'End']]
    df = df.drop(columns=['Start', 'End'])
    df['Start'] = coords.min(axis=1)
    df['End'] = coords.max(axis=1)

    # '+' strand exons are listed left-to-right, '-' strand right-to-left
    return df.sort_values(by='Start', ascending=(strand == '+'))

def make_hier_entry(df, how='t'):
    """
    Collapse exon rows into one row per transcript or per gene.

    Parameters:
        df: pandas DataFrame with 'Start' and 'End' columns plus the
            grouping columns for the requested level.
        how {'g','t'}: 't' groups by the transcript-level columns that are
            present in `df`; 'g' groups by 'Chromosome', 'Strand',
            'gene_name' and 'gene_id'.

    Returns:
        pandas DataFrame with the grouping columns, 'Start' (smallest
        coordinate in the group), 'End' (largest coordinate in the group)
        and 'Feature' ('transcript' or 'gene'). The input is not modified.

    Raises:
        ValueError: if `how` is neither 't' nor 'g'.
    """
    if how == 't':
        candidate_cols = ['Chromosome', 'Strand', 'gene_name',
                          'gene_id', 'transcript_id', 'transcript_name',
                          'tss_id', 'tes_id',
                          'new_transcript_id', 'original_transcript_id',
                          'original_transcript_name', 'ag1', 'ag2']
        # Keep only the columns that exist, in the listed order, so the
        # output column and row order is the same on every run.
        gb_cols = [col for col in candidate_cols if col in df.columns]
        feature = 'transcript'
    elif how == 'g':
        gb_cols = ['Chromosome', 'Strand', 'gene_name',
                   'gene_id']
        feature = 'gene'
    else:
        raise ValueError("how must be 't' or 'g', got {!r}".format(how))

    # Row-wise min/max so the result is right even if Start > End in a row.
    spans = df[gb_cols].copy()
    spans['min_coord'] = df[['Start', 'End']].min(axis=1)
    spans['max_coord'] = df[['Start', 'End']].max(axis=1)

    # NOTE: pandas groupby drops rows whose grouping key is NaN by default.
    agg_dict = {'min_coord': 'min', 'max_coord': 'max'}
    t_df = spans.groupby(gb_cols).agg(agg_dict).reset_index()
    t_df = t_df.rename(columns={'min_coord': 'Start', 'max_coord': 'End'})
    t_df['Feature'] = feature

    return t_df

def make_test_gtf(ts):
    """
    Assemble a GTF-style table from a list of per-transcript exon tables.

    Parameters:
        ts: list of exon DataFrames (as produced by make_exon_df).

    Returns:
        The result of cerberus.sort_gtf applied to the exon rows stacked
        with one derived row per transcript and one per gene.
    """
    exon_df = pd.concat(ts)

    # derive the higher-level records from the exons
    transcript_df = make_hier_entry(exon_df, how='t')
    gene_df = make_hier_entry(exon_df, how='g')

    # Ordering is delegated to cerberus.sort_gtf; per the original note it is
    # expected to sort by gene id, transcript id, feature rank
    # (gene=0, transcript=1, exon=2), then start coordinate.
    stacked = pd.concat([exon_df, transcript_df, gene_df])
    return cerberus.sort_gtf(stacked)


# need:
# transcripts that don't need to be aggregated
# transcripts that do need to be aggregated

# One row per test transcript:
# (exon coords before shifting, coordinate shift, gene, original transcript id,
#  new transcript id, ag1, ag2, tss_id, tes_id)
transcript_specs = [
    # t1 - transcript that won't need to be aggregated
    ([[1,10], [20,30]], 0, 'g1', 'g1_t1', 'g1[1,1,1]', 'known', 'p1', 'g1_1', 'g1_1'),
    # t3, g2
    ([[1,55]], 200, 'g2', 'g2_t3', 'g2[1,1,3]', 'novel', 'p1', 'g2_1', 'g2_3'),
    # t5n-- 5/3' ends longer/ shorter, same IC
    ([[20,30], [40,100]], 300, 'g3', 'g3_t5', 'g3[1,1,1]', 'known', 'p3', 'g3_1', 'g3_1'),
    # t7n-- 5/3' ends longer/ shorter, same IC
    ([[20,30], [40,45]], 400, 'g4', 'g4_t7', 'g4[1,1,1]', 'known', 'p3', 'g4_1', 'g4_1'),
]

ts = []
for base_exons, shift, g, t, nt, ag1, ag2, tss_id, tes_id in transcript_specs:
    # move each gene to its own coordinate window on chr1
    e = [[coord + shift for coord in exon] for exon in base_exons]
    n = len(e)
    c = ['chr1'] * n
    s = ['+'] * n
    df = make_exon_df(n,c,e,s,g,t,nt,ag1,ag2)
    df['tss_id'] = tss_id
    df['tes_id'] = tes_id
    ts.append(df)

# make test gtf
# The ids passed as `t` become the "original" ids and the new transcript id
# takes over as transcript_id / transcript_name.
test_df = make_test_gtf(ts).rename(
    columns={'transcript_id': 'original_transcript_id',
             'transcript_name': 'original_transcript_name',
             'new_transcript_id': 'transcript_id'})
test_df['transcript_name'] = test_df['transcript_id']
test_df['Source'] = 'test_1'
test_df = pr.PyRanges(test_df)
test_df.to_gtf('test_gtf_1.gtf')



In [98]:
# One row per test transcript:
# (exon coords before shifting, coordinate shift, gene, original transcript id,
#  new transcript id, ag1, ag2, tss_id, tes_id)
transcript_specs = [
    # t2 - transcript with shorter 5'
    ([[8,10], [20,30]], 0, 'g1', 'g1_t2', 'g1[2,1,1]', 'known', 'p1', 'g1_1', 'g1_1'),
    # t4, g2 monoexonic <80% overlap
    ([[50,85]], 200, 'g2', 'g2_t4', 'g2[1,1,3]', 'novel', 'p1', 'g2_1', 'g2_3'),
    # t9, g2 monoexonic <80% overlap
    # NOTE(review): labelled t9 but reuses the id 'g2_t4' (and t4's comment),
    # so it shares every transcript-level key with the t4 row above.
    # Looks like a copy-paste leftover - confirm whether 'g2_t9' was intended.
    ([[50,150]], 200, 'g2', 'g2_t4', 'g2[1,1,3]', 'novel', 'p1', 'g2_1', 'g2_3'),
    # t6n-- 5/3' ends longer/ shorter, same IC
    ([[1,30], [40,50]], 300, 'g3', 'g3_t6', 'g3[2,1,2]', 'known', 'p3', 'g3_1', 'g3_1'),
    # t8
    ([[10,25], [35,45]], 400, 'g4', 'g4_t8', 'g4[1,2,1]', 'known', 'p3', 'g4_1', 'g4_1'),
]

ts = []
for base_exons, shift, g, t, nt, ag1, ag2, tss_id, tes_id in transcript_specs:
    # move each gene to its own coordinate window on chr1
    e = [[coord + shift for coord in exon] for exon in base_exons]
    n = len(e)
    c = ['chr1'] * n
    s = ['+'] * n
    df = make_exon_df(n,c,e,s,g,t,nt,ag1,ag2)
    df['tss_id'] = tss_id
    df['tes_id'] = tes_id
    ts.append(df)

# make test gtf
# The ids passed as `t` become the "original" ids and the new transcript id
# takes over as transcript_id / transcript_name.
test_df = make_test_gtf(ts).rename(
    columns={'transcript_id': 'original_transcript_id',
             'transcript_name': 'original_transcript_name',
             'new_transcript_id': 'transcript_id'})
test_df['transcript_name'] = test_df['transcript_id']
# NOTE(review): no 'Source' column is set for this GTF, unlike test_gtf_1.gtf
# ('test_1') - confirm this is intentional.
test_df = pr.PyRanges(test_df)
test_df.to_gtf('test_gtf_2.gtf')

In [99]:
# Read the two generated GTFs back and stack them into a single table.
# Concatenating once from a list (rather than growing an empty DataFrame
# inside the loop) avoids repeated copying and does not rely on pandas
# ignoring an empty frame when it works out the result dtypes.
df = pd.concat(
    [pr.read_gtf(gtf).df for gtf in ['test_gtf_1.gtf', 'test_gtf_2.gtf']],
    axis=0)

In [100]:
# Wrap the combined table as a PyRanges object and write the merged GTF.
combined_ranges = pr.PyRanges(df)
df = combined_ranges  # `df` keeps pointing at the PyRanges object afterwards
combined_ranges.to_gtf('test_gtf.gtf')