In [1]:
import pandas as pd
import pyranges as pr

In [2]:
def get_ic(gtf_pr):
    """
    Get a hyphen-separated representation of each transcript's intron chain
    from a PyRanges GTF

    Parameters:
        gtf_pr (pyranges PyRanges): GTF PyRanges object

    Returns:
        df (pandas DataFrame): DataFrame detailing intron chain, gene, strand,
            chromosome, and transcript that intron chain was seen in
    """
    df = gtf_pr.df.copy(deep=True)

    # restrict to exon entries
    df = df.loc[df.Feature == 'exon']
    cols = ['Chromosome', 'Strand', 'Start', 'End', 'transcript_id', 'gene_id']
    df = df[cols]

    # melt to isolate individual coordinates
    df = pd.melt(df, id_vars=['Chromosome', 'Strand', 'transcript_id', 'gene_id'],
                value_vars=['Start', 'End'],
                value_name='Coord')
    df.drop('variable', axis=1, inplace=True)

    # sort to order coordinates correctly
    df.Coord = df.Coord.astype(int)
    fwd = df.loc[df.Strand == '+'].copy(deep=True)
    rev = df.loc[df.Strand == '-'].copy(deep=True)

    fwd.sort_values(by=['Chromosome', 'transcript_id', 'Coord'],
                    ascending=[True, True, True], inplace=True)
    rev.sort_values(by=['Chromosome', 'transcript_id', 'Coord'],
                    ascending=[True, True, False], inplace=True)
    df = pd.concat([fwd, rev])

    # create intron chain strings
    df.Coord = df.Coord.astype(str)
    df = df.groupby(['Chromosome', 'Strand',
                     'transcript_id', 'gene_id'], observed=True)['Coord'].apply('-'.join).reset_index()

    # remove tss and tes from intron chain
    df['temp'] = df.Coord.str.split('-', n=1, expand=True)[1]
    df['ic'] = df.temp.str.rsplit('-', n=1, expand=True)[0]

    return df

## How many filtered novel transcripts' intron chains are supported in v39?

In [94]:
# TALON transcripts
fname = '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/talon/human_known_nic_nnc_talon.gtf'
talon = pr.read_gtf(fname, duplicate_attr=True)

nov_df = get_ic(talon)

# limit to just NIC and NNC transcripts
talon = talon.df
nov_tids = talon.loc[(talon.NNC_transcript=='TRUE')|(talon.NNC_transcript=='TRUE')].transcript_id.tolist()
nov_df = nov_df.loc[nov_df.transcript_id.isin(nov_tids)]

In [95]:
# GENCODE transcripts 
fname = '/Users/fairliereese/mortazavi_lab/ref/gencode.v39/gencode.v39.annotation.gtf'
genc = pr.read_gtf(fname)
genc_df = get_ic(genc)

In [96]:
nov_df.head()

Unnamed: 0,Chromosome,Strand,transcript_id,gene_id,Coord,temp,ic
0,SIRV1,+,ENCODEHT000206942,ENCODEHG000058846,10647-10791-10882-11057-11434-11606,10791-10882-11057-11434-11606,10791-10882-11057-11434
1,SIRV1,-,ENCODEHT000206867,ENCODEHG000058837,10790-10647-10366-10282-7814-7552-6813-6560-64...,10647-10366-10282-7814-7552-6813-6560-6473-633...,10647-10366-10282-7814-7552-6813-6560-6473-633...
2,SIRV1,-,ENCODEHT000206868,ENCODEHG000058837,10790-10444-10366-10282-7814-7552-6813-6560-64...,10444-10366-10282-7814-7552-6813-6560-6473-633...,10444-10366-10282-7814-7552-6813-6560-6473-633...
3,SIRV1,-,ENCODEHT000206870,ENCODEHG000058837,10790-10553-7808-7552-1484-999,10553-7808-7552-1484-999,10553-7808-7552-1484
5,SIRV3,+,ENCODEHT000206896,ENCODEHG000058844,1980-2005-4568-4779-6057-7988-8127-8207-8755-8937,2005-4568-4779-6057-7988-8127-8207-8755-8937,2005-4568-4779-6057-7988-8127-8207-8755


In [97]:
genc_df.head()

Unnamed: 0,Chromosome,Strand,transcript_id,gene_id,Coord,temp,ic
0,chr1,+,ENST00000003912.7,ENSG00000001461.17,24415802-24415904-24419290-24419640-24433103-2...,24415904-24419290-24419640-24433103-24433213-2...,24415904-24419290-24419640-24433103-24433213-2...
1,chr1,+,ENST00000008440.9,ENSG00000010072.16,231338255-231338604-231339768-231339868-231351...,231338604-231339768-231339868-231351303-231352521,231338604-231339768-231339868-231351303
2,chr1,+,ENST00000009105.5,ENSG00000008118.10,209583716-209583932-209594954-209595075-209599...,209583932-209594954-209595075-209599982-209600...,209583932-209594954-209595075-209599982-209600...
3,chr1,+,ENST00000010299.10,ENSG00000009780.16,27726056-27726161-27727471-27727536-27730231-2...,27726161-27727471-27727536-27730231-27730333-2...,27726161-27727471-27727536-27730231-27730333-2...
4,chr1,+,ENST00000011700.10,ENSG00000048707.15,12277120-12278038-12279498-12279650-12282704-1...,12278038-12279498-12279650-12282704-12283736-1...,12278038-12279498-12279650-12282704-12283736-1...


In [98]:
# which intron chains from the novel data exist in the new gencode annotation?
nov_df.drop(['temp', 'Coord'], axis=1, inplace=True)
genc_df.drop(['temp', 'Coord'], axis=1, inplace=True)

nov_df = nov_df.merge(genc_df, how='left', on=['Chromosome', 'Strand', 'ic'], suffixes=('_talon', '_genc'))

In [99]:
nov_df.head()

Unnamed: 0,Chromosome,Strand,transcript_id_talon,gene_id_talon,ic,transcript_id_genc,gene_id_genc
0,SIRV1,+,ENCODEHT000206942,ENCODEHG000058846,10791-10882-11057-11434,,
1,SIRV1,-,ENCODEHT000206867,ENCODEHG000058837,10647-10366-10282-7814-7552-6813-6560-6473-633...,,
2,SIRV1,-,ENCODEHT000206868,ENCODEHG000058837,10444-10366-10282-7814-7552-6813-6560-6473-633...,,
3,SIRV1,-,ENCODEHT000206870,ENCODEHG000058837,10553-7808-7552-1484,,
4,SIRV3,+,ENCODEHT000206896,ENCODEHG000058844,2005-4568-4779-6057-7988-8127-8207-8755,,


In [100]:
temp = nov_df.loc[~nov_df.transcript_id_genc.isnull()]
n = len(temp.index)
supp_tids = temp.transcript_id_genc.str.split('.', n=1, expand=True)[0].tolist()
print('Found {} novel transcripts w/ their intron chains supported by v39'.format(n))

Found 389 novel transcripts w/ their intron chains supported by v39


In [101]:
n = len(nov_df.loc[~(nov_df.Chromosome.str.contains('SIRV'))&~(nov_df.Chromosome.str.contains('ERCC'))].index)
print('{} novel transcripts (w/o sirv / ercc) total'.format(n))

8285 novel transcripts (w/o sirv / ercc) total


In [117]:
temp[['transcript_id_talon', 'transcript_id_genc']].head()

Unnamed: 0,transcript_id_talon,transcript_id_genc
23,ENCODEHT000222627,ENST00000394517.7
25,ENCODEHT000238573,ENST00000685968.1
51,ENCODEHT000494112,ENST00000690421.1
100,ENCODEHT000670446,ENST00000677378.1
101,ENCODEHT000670479,ENST00000678689.1


## How many of these stable transcript ids exist in v29 and in v39? 

In [79]:
# were these transcripts designated at novel because these transcripts just didn't exist in prior versions of the annotation?
fname = '/Users/fairliereese/mortazavi_lab/data/rnawg/refs/gencode_v29_sirv4_ercc.gtf'
v29 = pr.read_gtf(fname)

In [80]:
fname = '/Users/fairliereese/mortazavi_lab/ref/gencode.v39/gencode.v39.annotation.gtf'
v39 = pr.read_gtf(fname)

In [81]:
supp_tids[:5]

['ENST00000394517',
 'ENST00000685968',
 'ENST00000690421',
 'ENST00000677378',
 'ENST00000678689']

In [85]:
v29 = v29.df
v29['tid_stable'] = v29.transcript_id.str.split('.', n=1, expand=True)[0]

temp = v29.loc[v29.tid_stable.isin(supp_tids)]
temp = temp.loc[temp.Feature == 'transcript']
n = len(temp.index)
print('{} transcripts whose intron chains are supported in v39 also are annotated in v29'.format(n))

59 transcripts whose intron chains are supported in v39 also are annotated in v29


In [86]:
v29.head()

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid,gene_biotype,exon_assignment,tid_stable
0,ERCC-00002,ERCC,exon,0,1045,0,+,.,ERCC-00002A,,...,,,,,,,,ERCC_spike_in,,DQ459430
1,ERCC-00003,ERCC,exon,0,1007,0,+,.,ERCC-00003A,,...,,,,,,,,ERCC_spike_in,,DQ516784
2,ERCC-00004,ERCC,exon,0,507,0,+,.,ERCC-00004A,,...,,,,,,,,ERCC_spike_in,,DQ516752
3,ERCC-00009,ERCC,exon,0,968,0,+,.,ERCC-00009A,,...,,,,,,,,ERCC_spike_in,,DQ668364
4,ERCC-00012,ERCC,exon,0,978,0,+,.,ERCC-00012A,,...,,,,,,,,ERCC_spike_in,,DQ883670


In [87]:
v29.Feature.unique()

array(['exon', 'gene', 'transcript', 'CDS', 'start_codon', 'stop_codon',
       'UTR', 'Selenocysteine'], dtype=object)

In [88]:
v29.loc[v29.Feature == 'Selenocysteine']

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid,gene_biotype,exon_assignment,tid_stable
20380,chr1,HAVANA,Selenocysteine,25802092,25802095,.,+,.,ENSG00000162430.16,protein_coding,...,seleno,OTTHUMT00000019314.2,,,,ENSP00000355141.2,CCDS41282.1,,,ENST00000361547
20381,chr1,HAVANA,Selenocysteine,25812788,25812791,.,+,.,ENSG00000162430.16,protein_coding,...,seleno,OTTHUMT00000019314.2,,,,ENSP00000355141.2,CCDS41282.1,,,ENST00000361547
20413,chr1,HAVANA,Selenocysteine,25812788,25812791,.,+,.,ENSG00000162430.16,protein_coding,...,seleno,OTTHUMT00000019315.2,,,,ENSP00000363434.1,CCDS41283.1,,,ENST00000374315
46423,chr1,HAVANA,Selenocysteine,53904703,53904706,.,+,.,ENSG00000211452.10,protein_coding,...,seleno,OTTHUMT00000388166.2,,,,ENSP00000432797.1,,,,ENST00000529589
46435,chr1,HAVANA,Selenocysteine,53904703,53904706,.,+,.,ENSG00000211452.10,protein_coding,...,seleno,OTTHUMT00000023247.3,,,,ENSP00000354643.3,CCDS41339.1,,,ENST00000361921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2620238,chr22,HAVANA,Selenocysteine,19877110,19877113,.,-,.,ENSG00000184470.20,protein_coding,...,seleno,OTTHUMT00000314903.4,,,,ENSP00000383365.1,CCDS42981.1,,,ENST00000400521
2620286,chr22,HAVANA,Selenocysteine,19877110,19877113,.,-,.,ENSG00000184470.20,protein_coding,...,seleno,OTTHUMT00000314927.3,,,,ENSP00000485499.2,,,,ENST00000485358
2620306,chr22,HAVANA,Selenocysteine,19877110,19877113,.,-,.,ENSG00000184470.20,protein_coding,...,seleno,OTTHUMT00000314928.3,,,,ENSP00000485466.2,,,,ENST00000462843
2629658,chr22,HAVANA,Selenocysteine,31105950,31105953,.,-,.,ENSG00000198832.10,protein_coding,...,seleno,OTTHUMT00000321789.1,,,,ENSP00000384564.1,CCDS43003.1,,,ENST00000402395


## How many more reads would be known if we were using v39?

In [3]:
import sqlite3
import pandas as pd
import pyranges as pr

In [4]:
db = '../human.db'
annot = '../human_talon_read_annot.tsv'
genome_build = 'hg38'

In [5]:
def get_ic_from_talon_db(db, annot, genome_build):
    conn = sqlite3.connect(db)

    # get edge annotations
    query = """
            SELECT 
                e.edge_ID,
                loc1.chromosome,
                MIN(loc1.position,loc2.position),
                MAX(loc1.position,loc2.position),
                e.strand
             FROM edge e
             LEFT JOIN location loc1 ON e.v1 = loc1.location_ID
             LEFT JOIN location loc2 ON e.v2 = loc2.location_ID
             WHERE loc1.genome_build = '""" + genome_build + """' AND
             loc2.genome_build = '""" + genome_build + \
             """' AND e.edge_type = 'exon';"""

    e_df = pd.read_sql_query(query, conn)
    e_df.columns = ['eid', 'Chromosome', 'Start', 'End', 'Strand']
    e_df.eid = e_df.eid.astype(int)

    # get transcript annotations 
    query = """
            SELECT 
                   start_exon,
                   end_exon,
                   jn_path,
                   transcript_ID,
                   gene_ID
            FROM transcripts
            """
    t_df = pd.read_sql_query(query, conn)
    t_df['temp'] = t_df.jn_path.str.split(',')
    t_df.drop('jn_path', axis=1, inplace=True)
    t_df = t_df.explode(column='temp')
    t_df = t_df.melt(id_vars=['transcript_ID', 'gene_ID'],
              value_vars=['temp', 'start_exon', 'end_exon'])
    t_df.drop('variable', axis=1, inplace=True)
    t_df.rename({'value': 'temp'}, axis=1, inplace=True)
    t_df.drop_duplicates(inplace=True)
    t_df.rename({'transcript_ID': 'transcript_id',
                 'gene_ID': 'gene_id'},
                axis=1, inplace=True)

    # drop things w/o intron chains (ie monoexonic transcripts)
    t_df = t_df.loc[~t_df.temp.isnull()]

    t_df.temp = t_df.temp.astype(int)

    # add information about each coordinate
    t_df = t_df.merge(e_df, how='left', left_on='temp', right_on='eid')

    # drop all introns
    t_df = t_df.loc[~t_df.eid.isnull()]

    # format as pyranges
    t_df.drop(['temp', 'eid'], axis=1, inplace=True)
    t_df['Feature'] = 'exon'
    t_df.Start = t_df.Start - 1
    t_df = pr.PyRanges(t_df)

    df = get_ic(t_df)
    
    # get novelty type of each transcript
    query = """
            SELECT 
                ID,
                value
            FROM transcript_annotations
            WHERE transcript_annotations.attribute = 'transcript_status'
                AND transcript_annotations.value = 'KNOWN'
            """
    known_df = pd.read_sql_query(query, conn)
    
    known_df.rename({'ID': 'transcript_id'}, axis=1, inplace=True)
    known_df.drop('value', axis=1, inplace=True)
    known_df['v29'] = True
    
    df = df.merge(known_df, how='left', on='transcript_id')    
    
    # everything that wasn't known gets "false"
    df.v29.fillna(value=False, inplace=True)

    return df

In [6]:
df = get_ic_from_talon_db(db, annot, genome_build)

In [7]:
df.to_csv('human_talon_ics.tsv', sep='\t', index=False)

In [8]:
df = pd.read_csv('human_talon_ics.tsv', sep='\t')

In [9]:
# restrict to only known transcripts
df = df.loc[df.v29 == False]

In [10]:
df.loc[df.transcript_id == 222627, 'ic'].tolist()

['1059782-1060276-1060393-1061019-1061117-1063078-1063201-1065829']

In [11]:
# get intron chains from v39 transcripts
fname = '/Users/fairliereese/mortazavi_lab/ref/gencode.v39/gencode.v39.annotation.gtf'
genc = pr.read_gtf(fname)
genc_df = get_ic(genc)

In [12]:
genc_df.head()
genc_df.to_csv('gencode_v39_ics.tsv', sep='\t', index=False)

In [13]:
df = df.merge(genc_df, how='left', on=['Chromosome', 'Strand', 'ic'], suffixes=('_talon', '_genc'))

In [15]:
print('from talon gtf')
print(temp.loc[temp.transcript_id_talon == 'ENCODEHT000222627', ['Chromosome', 'Strand']])
temp.loc[temp.transcript_id_talon == 'ENCODEHT000222627', 'ic'].tolist()

In [12]:
print('from gencode v39')
print(genc_df.loc[genc_df.transcript_id == 'ENST00000394517.7', ['Chromosome', 'Strand']])
genc_df.loc[genc_df.transcript_id == 'ENST00000394517.7', 'ic'].tolist()

from gencode v39
     Chromosome Strand
2141       chr1      +


['1059782-1060276-1060393-1061019-1061117-1063078-1063201-1065829']

In [16]:
print('from talon db')
print(df.loc[df.transcript_id_talon == 222627, ['Chromosome', 'Strand']])
df.loc[df.transcript_id_talon == 222627, 'ic'].tolist()

from talon db
     Chromosome Strand
1287       chr1      +


['1059782-1060276-1060393-1061019-1061117-1063078-1063201-1065829']

In [17]:
df.head()

Unnamed: 0,Chromosome,Strand,transcript_id_talon,gene_id_talon,Coord_talon,temp_talon,ic,v29,transcript_id_genc,gene_id_genc,Coord_genc,temp_genc
0,ERCC-00002,+,206802,58798,0-1035,1035,1035,False,,,,
1,ERCC-00002,+,206838,58798,0-61-141-1035,61-141-1035,61-141,False,,,,
2,ERCC-00002,+,206839,58798,0-344-399-1035,344-399-1035,344-399,False,,,,
3,ERCC-00002,+,1149583,58798,0-887-942-1035,887-942-1035,887-942,False,,,,
4,ERCC-00002,+,1149585,58798,0-195-510-1035,195-510-1035,195-510,False,,,,


In [18]:
df.loc[df.transcript_id_talon == 222627]

Unnamed: 0,Chromosome,Strand,transcript_id_talon,gene_id_talon,Coord_talon,temp_talon,ic,v29,transcript_id_genc,gene_id_genc,Coord_genc,temp_genc
1287,chr1,+,222627,72,1059733-1059782-1060276-1060393-1061019-106111...,1059782-1060276-1060393-1061019-1061117-106307...,1059782-1060276-1060393-1061019-1061117-106307...,False,ENST00000394517.7,ENSG00000217801.11,1059707-1059782-1060276-1060393-1061019-106111...,1059782-1060276-1060393-1061019-1061117-106307...


In [19]:
df['v39'] = False
df.loc[~df.transcript_id_genc.isnull(), 'v39'] = True

In [20]:
df.loc[df.v39 == True]
v39_tids = df.loc[df.v39 == True, 'transcript_id_talon'].tolist()

In [21]:
# how many reads do these transcripts correspond to in the read_annot file?
read_df = pd.read_csv('../human_talon_read_annot.tsv', sep='\t', usecols=[10])

In [35]:
total_reads = len(read_df.index)
print(total_reads)

181558331


In [22]:
nov_v39_reads = len(read_df.loc[read_df.transcript_ID.isin(v39_tids)].index)

In [23]:
known_reads = len(read_df.index)-nov_v39_reads

In [24]:
print(nov_v39_reads)

2058984


In [25]:
print(known_reads)

179499347


In [38]:
print((nov_v39_reads/total_reads)*100)

1.134061978130874


In [26]:
len(v39_tids)

16324

In [27]:
v39_tids[:5]

[208393, 208410, 208584, 208597, 208599]

In [30]:
# and then how many of these table transcript IDs are in v29 but these reads were
# not assigned to those models (presumably because of differences at the ends)
temp = df.loc[~df.transcript_id_genc.isnull()]
supp_tids = temp.transcript_id_genc.str.split('.', n=1, expand=True)[0].tolist()

In [31]:
len(supp_tids)

16324

In [32]:
# were these transcripts designated at novel because these transcripts just didn't exist in prior versions of the annotation?
fname = '/Users/fairliereese/mortazavi_lab/data/rnawg/refs/gencode_v29_sirv4_ercc.gtf'
v29 = pr.read_gtf(fname)

In [33]:
fname = '/Users/fairliereese/mortazavi_lab/ref/gencode.v39/gencode.v39.annotation.gtf'
v39 = pr.read_gtf(fname)

In [34]:
v29 = v29.df
v29['tid_stable'] = v29.transcript_id.str.split('.', n=1, expand=True)[0]

temp = v29.loc[v29.tid_stable.isin(supp_tids)]
temp = temp.loc[temp.Feature == 'transcript']
n = len(temp.index)
print('{} transcripts whose intron chains are supported in v39 also are annotated in v29'.format(n))

2318 transcripts whose intron chains are supported in v39 also are annotated in v29


## Does pyranges lengthen intervals when it reads GTFs in? 
Yes [gh issue](https://github.com/biocore-ntnu/pyranges/pull/260)

In [17]:
beep = pr.read_gtf('confusing_transcript_talon.gtf')

In [18]:
beep

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_name,...,transcript_name,talon_transcript,NNC_transcript,exon_number,exon_id,talon_exon,exon_status,source,ont,tag
0,chr1,HAVANA,gene,1059733,1066441,.,+,.,ENSG00000217801.9,AL390719.1,...,,,,,,,,,,
1,chr1,TALON,transcript,1059733,1066441,.,+,.,ENSG00000217801.9,AL390719.1,...,ENCODEHT000222627,222627.0,True,,,,,,,
2,chr1,TALON,exon,1059733,1059782,.,+,.,ENSG00000217801.9,AL390719.1,...,ENCODEHT000222627,222627.0,,1.0,971902,971902.0,NOVEL,,,
3,chr1,TALON,exon,1060276,1060393,.,+,.,ENSG00000217801.9,AL390719.1,...,ENCODEHT000222627,222627.0,,2.0,971874,971874.0,NOVEL,,,
4,chr1,HAVANA,exon,1061019,1061117,.,+,.,ENSG00000217801.9,AL390719.1,...,ENCODEHT000222627,222627.0,,3.0,ENSE00001630784.1,936.0,KNOWN,HAVANA,,
5,chr1,HAVANA,exon,1063078,1063201,.,+,.,ENSG00000217801.9,AL390719.1,...,ENCODEHT000222627,222627.0,,4.0,ENSE00001613074.1,933.0,KNOWN,HAVANA,PGO:0000019,basic
6,chr1,TALON,exon,1065829,1066441,.,+,.,ENSG00000217801.9,AL390719.1,...,ENCODEHT000222627,222627.0,,5.0,971887,971887.0,NOVEL,,,
