In [36]:
import pandas as pd
import pyranges as pr

In [37]:
# fname = '/Users/fairliereese/mortazavi_lab/ref/gencode.v39/gencode.v39.annotation.gtf'
# v39 = pr.read_gtf(fname, duplicate_attr=True)

In [38]:
# v39.head()

In [40]:
def get_ic(gtf_pr):
    """
    Get a hyphen-separated representation of each transcript's intron chain
    from a PyRanges GTF

    Parameters:
        gtf_pr (pyranges PyRanges): GTF PyRanges object

    Returns:
        df (pandas DataFrame): DataFrame detailing intron chain, gene, strand,
            chromosome, and transcript that intron chain was seen in
    """
    df = gtf_pr.df.copy(deep=True)

    # restrict to exon entries
    df = df.loc[df.Feature == 'exon']
    cols = ['Chromosome', 'Strand', 'Start', 'End', 'transcript_id', 'gene_id']
    df = df[cols]

    # melt to isolate individual coordinates
    df = pd.melt(df, id_vars=['Chromosome', 'Strand', 'transcript_id', 'gene_id'],
                value_vars=['Start', 'End'],
                value_name='Coord')
    df.drop('variable', axis=1, inplace=True)

    # sort to order coordinates correctly
    df.Coord = df.Coord.astype(int)
    fwd = df.loc[df.Strand == '+'].copy(deep=True)
    rev = df.loc[df.Strand == '-'].copy(deep=True)

    fwd.sort_values(by=['Chromosome', 'transcript_id', 'Coord'],
                    ascending=[True, True, True], inplace=True)
    rev.sort_values(by=['Chromosome', 'transcript_id', 'Coord'],
                    ascending=[True, True, False], inplace=True)
    df = pd.concat([fwd, rev])

    # create intron chain strings
    df.Coord = df.Coord.astype(str)
    df = df.groupby(['Chromosome', 'Strand',
                     'transcript_id', 'gene_id'], observed=True)['Coord'].apply('-'.join).reset_index()

    # remove tss and tes from intron chain
    df['temp'] = df.Coord.str.split('-', n=1, expand=True)[1]
    df['ic'] = df.temp.str.rsplit('-', n=1, expand=True)[0]

    return df

In [65]:
# TALON transcripts
fname = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/talon/human_known_nic_nnc_talon.gtf'
talon = pr.read_gtf(fname, duplicate_attr=True)

nov_df = get_ic(talon)

# limit to just NIC and NNC transcripts
talon = talon.df
nov_tids = talon.loc[(talon.NNC_transcript=='TRUE')|(talon.NNC_transcript=='TRUE')].transcript_id.tolist()
nov_df = nov_df.loc[nov_df.transcript_id.isin(nov_tids)]

In [67]:
# GENCODE transcripts 
fname = '/Users/fairliereese/mortazavi_lab/ref/gencode.v39/gencode.v39.annotation.gtf'
genc = pr.read_gtf(fname)
genc_df = get_ic(genc)

In [68]:
nov_df.head()

Unnamed: 0,Chromosome,Strand,transcript_id,gene_id,Coord,temp,ic
0,SIRV1,+,ENCODEHT000206942,ENCODEHG000058846,10647-10791-10882-11057-11434-11606,10791-10882-11057-11434-11606,10791-10882-11057-11434
1,SIRV1,-,ENCODEHT000206867,ENCODEHG000058837,10790-10647-10366-10282-7814-7552-6813-6560-64...,10647-10366-10282-7814-7552-6813-6560-6473-633...,10647-10366-10282-7814-7552-6813-6560-6473-633...
2,SIRV1,-,ENCODEHT000206868,ENCODEHG000058837,10790-10444-10366-10282-7814-7552-6813-6560-64...,10444-10366-10282-7814-7552-6813-6560-6473-633...,10444-10366-10282-7814-7552-6813-6560-6473-633...
3,SIRV1,-,ENCODEHT000206870,ENCODEHG000058837,10790-10553-7808-7552-1484-999,10553-7808-7552-1484-999,10553-7808-7552-1484
5,SIRV3,+,ENCODEHT000206896,ENCODEHG000058844,1980-2005-4568-4779-6057-7988-8127-8207-8755-8937,2005-4568-4779-6057-7988-8127-8207-8755-8937,2005-4568-4779-6057-7988-8127-8207-8755


In [69]:
genc_df.head()

Unnamed: 0,Chromosome,Strand,transcript_id,gene_id,Coord,temp,ic
0,chr1,+,ENST00000003912.7,ENSG00000001461.17,24415802-24415904-24419290-24419640-24433103-2...,24415904-24419290-24419640-24433103-24433213-2...,24415904-24419290-24419640-24433103-24433213-2...
1,chr1,+,ENST00000008440.9,ENSG00000010072.16,231338255-231338604-231339768-231339868-231351...,231338604-231339768-231339868-231351303-231352521,231338604-231339768-231339868-231351303
2,chr1,+,ENST00000009105.5,ENSG00000008118.10,209583716-209583932-209594954-209595075-209599...,209583932-209594954-209595075-209599982-209600...,209583932-209594954-209595075-209599982-209600...
3,chr1,+,ENST00000010299.10,ENSG00000009780.16,27726056-27726161-27727471-27727536-27730231-2...,27726161-27727471-27727536-27730231-27730333-2...,27726161-27727471-27727536-27730231-27730333-2...
4,chr1,+,ENST00000011700.10,ENSG00000048707.15,12277120-12278038-12279498-12279650-12282704-1...,12278038-12279498-12279650-12282704-12283736-1...,12278038-12279498-12279650-12282704-12283736-1...


In [70]:
# which intron chains from the novel data exist in the new gencode annotation?
nov_df.drop(['temp', 'Coord'], axis=1, inplace=True)
genc_df.drop(['temp', 'Coord'], axis=1, inplace=True)

nov_df = nov_df.merge(genc_df, how='left', on=['Chromosome', 'Strand', 'ic'], suffixes=('_talon', '_genc'))

In [71]:
nov_df.head()

Unnamed: 0,Chromosome,Strand,transcript_id_talon,gene_id_talon,ic,transcript_id_genc,gene_id_genc
0,SIRV1,+,ENCODEHT000206942,ENCODEHG000058846,10791-10882-11057-11434,,
1,SIRV1,-,ENCODEHT000206867,ENCODEHG000058837,10647-10366-10282-7814-7552-6813-6560-6473-633...,,
2,SIRV1,-,ENCODEHT000206868,ENCODEHG000058837,10444-10366-10282-7814-7552-6813-6560-6473-633...,,
3,SIRV1,-,ENCODEHT000206870,ENCODEHG000058837,10553-7808-7552-1484,,
4,SIRV3,+,ENCODEHT000206896,ENCODEHG000058844,2005-4568-4779-6057-7988-8127-8207-8755,,


In [77]:
temp = nov_df.loc[~nov_df.transcript_id_genc.isnull()]
n = len(temp.index)
supp_tids = temp.transcript_id_genc.str.split('.', n=1, expand=True)[0].tolist()
print('Found {} novel transcripts w/ their intron chains supported by v39'.format(n))

Found 389 novel transcripts w/ their intron chains supported by v39


In [78]:
n = len(nov_df.loc[~(nov_df.Chromosome.str.contains('SIRV'))&~(nov_df.Chromosome.str.contains('ERCC'))].index)
print('{} novel transcripts (w/o sirv / ercc) total'.format(n))

8285 novel transcripts (w/o sirv / ercc) total


In [79]:
# were these transcripts designated at novel because these transcripts just didn't exist in prior versions of the annotation?
fname = '/Users/fairliereese/mortazavi_lab/data/rnawg/refs/gencode_v29_sirv4_ercc.gtf'
v29 = pr.read_gtf(fname)

In [80]:
fname = '/Users/fairliereese/mortazavi_lab/ref/gencode.v39/gencode.v39.annotation.gtf'
v39 = pr.read_gtf(fname)

In [81]:
supp_tids[:5]

['ENST00000394517',
 'ENST00000685968',
 'ENST00000690421',
 'ENST00000677378',
 'ENST00000678689']

In [83]:
v29 = v29.df
v29['tid_stable'] = v29.transcript_id.str.split('.', n=1, expand=True)[0]

In [85]:
temp = v29.loc[v29.tid_stable.isin(supp_tids)]
temp = temp.loc[temp.Feature == 'transcript']
n = len(temp.index)
print('{} transcripts whose intron chains are supported in v39 also are annotated in v29'.format(n))

59 transcripts whose intron chains are supported in v39 also are annotated in v29
